"""Experiment 7: Attractor dynamics for noise-tolerant recall.

Current architecture: heteroassociative, one-shot (W @ cue → target)
Problem: noisy cue → noisy recall, no error correction

Fix: Use attractor dynamics (like real CA3 recurrent network).

Approach 1: Autoassociative + heteroassociative
  - Store patterns as attractors: W_auto += outer(pattern, pattern)
  - Noisy cue → iterate W_auto until convergence → clean cue
  - Then: W_hetero @ clean_cue → target

Approach 2: Recurrent settling with inhibition
  - W stores associations
  - Recall: iterate (W @ code → WTA → W @ code → ...) with lateral inhibition
  - Network settles into clean attractor state

Approach 3: Modern Hopfield (softmax energy)
  - Replace linear W @ x with softmax-based attention over stored patterns
  - Exponential storage capacity, natural noise tolerance

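Approach 4: Hebbian + recurrent cleanup with learned inhibition
  - W for associations + lateral inhibition matrix for competition

Note: the implementation sections below number these differently
(Modern Hopfield is labelled Approach 2, recurrent inhibition Approach 3),
and no variant with a separate lateral inhibition matrix is implemented here.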
"""

import sys
import time
from pathlib import Path

import torch
import torch.nn as nn
import numpy as np

# Prefer the GPU, but fall back to CPU so the experiment still runs on
# machines without CUDA instead of failing at the first tensor allocation.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def cosine(a, b):
    """Return the cosine similarity between two vectors as a Python float.

    If either input has zero norm the result is 0.0. Intended for 1-D
    tensors: each input gets a leading batch axis before the comparison.
    """
    for vec in (a, b):
        if vec.norm() == 0:
            return 0.0
    batched_a = a.unsqueeze(0)
    batched_b = b.unsqueeze(0)
    similarity = nn.functional.cosine_similarity(batched_a, batched_b)
    return similarity.item()


def winner_take_all(x, k):
    """Binarize ``x`` along its last dimension, keeping only the top-k entries.

    Returns a tensor shaped like ``x`` that holds 1.0 at the positions of the
    ``k`` largest values along the last dimension and 0.0 everywhere else.
    """
    winners = torch.topk(x, k, dim=-1).indices
    mask = torch.zeros_like(x)
    # scatter_ writes in place and returns the same tensor.
    return mask.scatter_(-1, winners, 1.0)


# ===== Approach 1: Autoassociative cleanup + heteroassociative recall =====

class AttractorMemory:
    """Two-stage associative memory: clean up the cue, then map it to a target.

    W_auto:   autoassociative weights (cue → cue); every learned cue code is
              accumulated as an outer product with itself.
    W_hetero: heteroassociative weights (cue → target).

    Recall path: noisy_cue → settle with W_auto → clean_cue → W_hetero → target
    """

    def __init__(self, input_dim, code_dim=16384, k=50):
        """Allocate the random projection and two zeroed weight matrices.

        input_dim: size of the incoming embedding vectors.
        code_dim:  size of the sparse code space.
        k:         number of active units kept by winner-take-all.
        """
        self.k = k
        self.code_dim = code_dim
        # Fixed random projection into code space, scaled by 1/sqrt(input_dim).
        scale = 1.0 / input_dim**0.5
        self.proj = torch.randn(input_dim, code_dim, device=DEVICE) * scale
        # Cue-cleanup (autoassociative) weights.
        self.W_auto = torch.zeros(code_dim, code_dim, device=DEVICE)
        # Cue → target (heteroassociative) weights.
        self.W_hetero = torch.zeros(code_dim, code_dim, device=DEVICE)

    def sep(self, x):
        """Project an embedding into code space and keep the top-k units."""
        projected = x @ self.proj
        return winner_take_all(projected, self.k)

    def learn(self, cue_emb, target_emb):
        """Store one cue/target pair in both weight matrices."""
        cue_code = self.sep(cue_emb)
        target_code = self.sep(target_emb)
        # The cue becomes an attractor of the cleanup network.
        self.W_auto += torch.outer(cue_code, cue_code)
        # Rows index target units, columns index cue units.
        self.W_hetero += torch.outer(target_code, cue_code)

    def settle(self, code, W, steps=10):
        """Repeatedly apply ``W`` followed by winner-take-all.

        Stops early once an iteration leaves the code unchanged; otherwise
        returns the code reached after ``steps`` iterations.
        """
        state = code
        for _ in range(steps):
            candidate = winner_take_all(W @ state, self.k)
            if not (candidate != state).any():
                return state  # fixed point reached
            state = candidate
        return state

    def recall(self, query_emb, settle_steps=10):
        """Encode the query, settle it in W_auto, then read out via W_hetero."""
        noisy_code = self.sep(query_emb)
        cleaned = self.settle(noisy_code, self.W_auto, steps=settle_steps)
        return winner_take_all(self.W_hetero @ cleaned, self.k)

    def recall_no_settle(self, query_emb):
        """Baseline recall: read out via W_hetero without the cleanup phase."""
        query_code = self.sep(query_emb)
        return winner_take_all(self.W_hetero @ query_code, self.k)


# ===== Approach 2: Modern Hopfield-inspired attention =====

class HopfieldMemory:
    """Modern-Hopfield-style memory: softmax attention over stored patterns.

    Rather than a linear ``W @ query`` readout, recall computes
    ``softmax(beta * query @ stored_cues^T)`` and uses those weights to mix
    the stored patterns. Cue and target codes are kept explicitly in lists,
    and every state is passed through winner-take-all so the codes stay in
    the same sparse format as the Hebbian variants in this file.
    """

    def __init__(self, input_dim, code_dim=16384, k=50, beta=8.0):
        """Create the random projection and empty pattern stores.

        input_dim: size of the incoming embedding vectors.
        code_dim:  size of the sparse code space.
        k:         number of active units kept by winner-take-all.
        beta:      multiplier applied to the scores before the softmax.
        """
        self.k = k
        self.code_dim = code_dim
        self.beta = beta
        # Fixed random projection into code space, scaled by 1/sqrt(input_dim).
        scale = 1.0 / input_dim**0.5
        self.proj = torch.randn(input_dim, code_dim, device=DEVICE) * scale
        self.stored_cue_codes = []
        self.stored_target_codes = []

    def sep(self, x):
        """Project an embedding into code space and keep the top-k units."""
        projected = x @ self.proj
        return winner_take_all(projected, self.k)

    def learn(self, cue_emb, target_emb):
        """Append the sparse codes of one cue/target pair to the stores."""
        self.stored_cue_codes.append(self.sep(cue_emb))
        self.stored_target_codes.append(self.sep(target_emb))

    def recall(self, query_emb, steps=3):
        """Refine the query over the stored cues, then read out a target code.

        Returns an all-zero vector of length ``code_dim`` when nothing has
        been stored yet.
        """
        if not self.stored_cue_codes:
            return torch.zeros(self.code_dim, device=DEVICE)

        cues = torch.stack(self.stored_cue_codes)  # [N, code_dim]
        targets = torch.stack(self.stored_target_codes)

        def attend(state):
            # Softmax weights over the N stored cues for the current state.
            return torch.softmax(self.beta * (state @ cues.T), dim=0)

        state = self.sep(query_emb)  # [code_dim]
        for _ in range(steps):
            # Move toward the attention-weighted mix of stored cues.
            state = winner_take_all(attend(state) @ cues, self.k)

        # Final readout: the same attention weights, applied to the targets.
        return winner_take_all(attend(state) @ targets, self.k)


# ===== Approach 3: Recurrent Hebbian with lateral inhibition =====

class RecurrentHebbianMemory:
    """Single Hebbian matrix with a global inhibition term during recall.

    One matrix ``W`` holds both the cue → target association and a
    half-strength cue → cue term. Recall repeatedly applies ``W``, subtracts
    a scaled mean activation, and re-sparsifies with winner-take-all.
    """

    def __init__(self, input_dim, code_dim=16384, k=50, inhibition=0.1):
        """Create the random projection and a zeroed weight matrix.

        input_dim:  size of the incoming embedding vectors.
        code_dim:   size of the sparse code space.
        k:          number of active units kept by winner-take-all.
        inhibition: multiplier on the mean activation subtracted in recall.
        """
        self.k = k
        self.code_dim = code_dim
        self.inhibition = inhibition
        # Fixed random projection into code space, scaled by 1/sqrt(input_dim).
        scale = 1.0 / input_dim**0.5
        self.proj = torch.randn(input_dim, code_dim, device=DEVICE) * scale
        self.W = torch.zeros(code_dim, code_dim, device=DEVICE)

    def sep(self, x):
        """Project an embedding into code space and keep the top-k units."""
        projected = x @ self.proj
        return winner_take_all(projected, self.k)

    def learn(self, cue_emb, target_emb):
        """Add one cue/target pair to ``W``."""
        cue_code = self.sep(cue_emb)
        target_code = self.sep(target_emb)
        # Association: cue units drive target units.
        self.W += torch.outer(target_code, cue_code)
        # Half-strength self-association of the cue, used while settling.
        self.W += torch.outer(cue_code, cue_code) * 0.5

    def recall(self, query_emb, steps=5):
        """Run ``steps`` rounds of excitation, inhibition and winner-take-all."""
        state = self.sep(query_emb)
        for _ in range(steps):
            drive = self.W @ state
            # NOTE(review): the subtracted term is one scalar shared by every
            # unit, and a uniform shift does not change which units rank in
            # the top-k, so `inhibition` appears to have no effect on the
            # result. Confirm whether a non-uniform inhibition was intended.
            shifted = drive - drive.mean() * self.inhibition
            state = winner_take_all(shifted, self.k)
        return state


# ===== Test harness =====

def _is_top_match(recalled, target_codes, index):
    """Return True when row ``index`` of ``target_codes`` is the row most
    cosine-similar to ``recalled``."""
    sims = nn.functional.cosine_similarity(recalled.unsqueeze(0), target_codes, dim=-1)
    return sims.argmax().item() == index


def build_and_test(MemClass, model, n_test_pairs=10, n_background=0,
                   label="", **kwargs):
    """Unified test for all memory architectures.

    Stores a fixed set of cue/target sentence pairs (plus optional synthetic
    background pairs) in a fresh ``MemClass`` instance, then checks top-1
    recall for the exact cues and for paraphrased cues.

    Args:
        MemClass: memory class constructed as ``MemClass(embed_dim, **kwargs)``
            and exposing ``learn(cue, target)``, ``recall(query)`` and
            ``sep(embedding)``.
        model: embedding model providing ``get_sentence_embedding_dimension()``
            and ``encode(...)``.
        n_test_pairs: slice bound applied to the 10 built-in pairs.
        n_background: number of synthetic distractor pairs stored as well.
        label: name printed in the result line.
        **kwargs: forwarded to ``MemClass``.

    Returns:
        ``(exact_accuracy, paraphrase_accuracy)`` as fractions in [0, 1].

    Raises:
        ValueError: if ``n_test_pairs`` selects no pairs.
    """
    pairs = [
        ("What's the weather like today?", "User checks weather every morning"),
        ("Let's deploy the new version", "Deployment uses GitHub Actions with k3s"),
        ("The database is slow again", "Missing index on users table"),
        ("I need to fix the auth bug", "JWT tokens with 24h expiry in Redis"),
        ("The API returns 500 errors", "OOM in the Python worker"),
        ("Let's set up monitoring", "Prometheus + Grafana on OCI cluster"),
        ("Tests are failing in CI", "CI needs postgres service container"),
        ("Memory usage is too high", "Leak in websocket handler"),
        ("Help with Docker setup", "docker-compose for dev, k3s for prod"),
        ("Log files are too large", "Logs rotate daily, shipped to Loki"),
    ][:n_test_pairs]

    paraphrases = [
        "How's the weather outside?",
        "We should push the new release",
        "DB performance is terrible",
        "There's a login bug to fix",
        "Getting internal server errors",
        "We need better observability",
        "CI tests keep breaking",
        "Service using too much RAM",
        "Docker configuration help",
        "Logs eating up disk space",
    ][:n_test_pairs]

    if not pairs:
        # Fail early with a clear message instead of a stack/ZeroDivision
        # error further down.
        raise ValueError("n_test_pairs selected no test pairs")

    embed_dim = model.get_sentence_embedding_dimension()
    mem = MemClass(embed_dim, **kwargs)

    # Store test memories
    cue_embs = model.encode([cue for cue, _ in pairs], convert_to_tensor=True,
                            normalize_embeddings=True, device=DEVICE)
    target_embs = model.encode([target for _, target in pairs], convert_to_tensor=True,
                               normalize_embeddings=True, device=DEVICE)
    for cue_emb, target_emb in zip(cue_embs, target_embs):
        mem.learn(cue_emb, target_emb)

    # Store background noise
    if n_background > 0:
        bg_cues = [f"Background task {i} about topic {i%20}" for i in range(n_background)]
        bg_targets = [f"Background fact {i} detail {i%10}" for i in range(n_background)]
        bg_cue_embs = model.encode(bg_cues, convert_to_tensor=True,
                                   normalize_embeddings=True, device=DEVICE, batch_size=256)
        bg_target_embs = model.encode(bg_targets, convert_to_tensor=True,
                                      normalize_embeddings=True, device=DEVICE, batch_size=256)
        for bg_cue_emb, bg_target_emb in zip(bg_cue_embs, bg_target_embs):
            mem.learn(bg_cue_emb, bg_target_emb)

    # Test: recalled codes are scored only against the test targets' codes
    # (background targets are not candidates).
    target_codes = torch.stack([mem.sep(t) for t in target_embs])
    para_embs = model.encode(paraphrases, convert_to_tensor=True,
                             normalize_embeddings=True, device=DEVICE)

    exact_correct = 0
    para_correct = 0

    for i, (cue_emb, para_emb) in enumerate(zip(cue_embs, para_embs)):
        # Exact cue
        if _is_top_match(mem.recall(cue_emb), target_codes, i):
            exact_correct += 1
        # Paraphrased cue
        if _is_top_match(mem.recall(para_emb), target_codes, i):
            para_correct += 1

    n = len(pairs)
    print(f"  {label} (bg={n_background}): "
          f"Exact={exact_correct}/{n} ({exact_correct/n:.0%}), "
          f"Para={para_correct}/{n} ({para_correct/n:.0%})")
    return exact_correct / n, para_correct / n


def main():
    """Run every memory architecture at several background-memory scales.

    Loads the ``all-MiniLM-L6-v2`` sentence-embedding model, then for each
    background size evaluates the flat Hebbian baseline, the attractor
    memory, the Hopfield memory (three beta values) and the recurrent memory
    (three inhibition values) via ``build_and_test``, which prints one result
    line per configuration.
    """
    print("=" * 60)
    print("Experiment 7: Attractor Dynamics")
    print("=" * 60)

    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)

    # Baseline: flat Hebbian (no settling). Defined once here rather than
    # being re-created on every iteration of the scale loop below.
    class FlatHebbian:
        def __init__(self, input_dim, code_dim=16384, k=50):
            self.k = k
            self.code_dim = code_dim
            self.proj = (torch.randn(input_dim, code_dim, device=DEVICE)
                         * (1.0 / input_dim**0.5))
            self.W = torch.zeros(code_dim, code_dim, device=DEVICE)

        def sep(self, x):
            return winner_take_all(x @ self.proj, self.k)

        def learn(self, c, t):
            self.W += torch.outer(self.sep(t), self.sep(c))

        def recall(self, q):
            code = self.sep(q)
            return winner_take_all(self.W @ code, self.k)

    # Test each architecture at different scales
    for bg in [0, 100, 500, 1000]:
        print(f"\n=== Background memories: {bg} ===")

        # Baseline: one-shot heteroassociative recall
        build_and_test(FlatHebbian, model, n_background=bg,
                       label="Flat Hebbian", code_dim=16384, k=50)

        # Approach 1: Autoassociative cleanup
        build_and_test(AttractorMemory, model, n_background=bg,
                       label="Attractor (auto+hetero)", code_dim=16384, k=50)

        # Approach 2: Modern Hopfield
        for beta in [4.0, 8.0, 16.0]:
            build_and_test(HopfieldMemory, model, n_background=bg,
                           label=f"Hopfield (β={beta})", code_dim=16384, k=50,
                           beta=beta)

        # Approach 3: Recurrent with inhibition
        for inhib in [0.1, 0.5, 1.0]:
            build_and_test(RecurrentHebbianMemory, model, n_background=bg,
                           label=f"Recurrent (inhib={inhib})", code_dim=16384, k=50,
                           inhibition=inhib)


# Run the experiment only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()