"""Experiment 5: Performance benchmarks.

Measure:
1. Learning throughput (memories/second)
2. Recall latency (ms per query)
3. GPU memory usage at different scales
4. Multi-hop latency vs hops
5. End-to-end: embed + separate + recall pipeline
"""
import sys
import time
import json
from pathlib import Path

import torch
import torch.nn as nn
import numpy as np

# Fall back to CPU so the script still runs (slowly) on machines without CUDA;
# on a GPU box behavior is unchanged.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
RESULTS_DIR = Path(__file__).parent.parent / "doc"


def _sync():
    """Drain pending CUDA work before reading the clock; no-op on CPU.

    Without this, timings would measure kernel *launch* rather than execution.
    """
    if torch.cuda.is_available():
        torch.cuda.synchronize()


def winner_take_all(x, k):
    """Return a 0/1 tensor of x's shape with ones at the top-k entries.

    Top-k is taken along the last dimension; all other entries are zero.
    """
    _, idx = x.topk(k, dim=-1)
    out = torch.zeros_like(x)
    out.scatter_(-1, idx, 1.0)
    return out


class BenchMemory:
    """Minimal Hebbian associative memory, used only for benchmarking.

    A fixed random projection maps inputs to a sparse k-winner code;
    ``learn`` accumulates outer products of (target code, cue code) into
    the weight matrix W; ``recall`` iterates W on the query code.
    """

    def __init__(self, input_dim, code_dim, k):
        self.k = k
        self.code_dim = code_dim
        # Random projection scaled by 1/sqrt(input_dim) so projected
        # activations stay O(1) regardless of input dimensionality.
        self.proj = (torch.randn(input_dim, code_dim, device=DEVICE)
                     * (1.0 / input_dim**0.5))
        # Hebbian weights: sum of target-code (x) cue-code outer products.
        self.W = torch.zeros(code_dim, code_dim, device=DEVICE)

    def sep(self, x):
        """Sparse code for x: project, then keep only the top-k units."""
        return winner_take_all(x @ self.proj, self.k)

    def learn(self, cue, target):
        """One-shot Hebbian association of cue -> target (1-D inputs)."""
        self.W += torch.outer(self.sep(target), self.sep(cue))

    def recall(self, query, hops=1):
        """Recall the code associated with query, chaining ``hops`` steps."""
        code = self.sep(query)
        for _ in range(hops):
            code = winner_take_all(self.W @ code, self.k)
        return code


def benchmark_learn(input_dim, code_dim, k, n_memories):
    """Measure learning throughput.

    Returns:
        (memories_per_second, elapsed_seconds)
    """
    mem = BenchMemory(input_dim, code_dim, k)
    cues = torch.randn(n_memories, input_dim, device=DEVICE)
    targets = torch.randn(n_memories, input_dim, device=DEVICE)

    _sync()
    # perf_counter: monotonic and high-resolution, unlike time.time().
    t0 = time.perf_counter()
    for i in range(n_memories):
        mem.learn(cues[i], targets[i])
    _sync()
    dt = time.perf_counter() - t0
    return n_memories / dt, dt


def benchmark_recall(input_dim, code_dim, k, n_memories, n_queries=1000, hops=1):
    """Measure recall latency.

    Returns:
        Mean latency in milliseconds per query.
    """
    mem = BenchMemory(input_dim, code_dim, k)
    # Pre-fill with random associations so W is realistically dense.
    for _ in range(n_memories):
        c = torch.randn(input_dim, device=DEVICE)
        t = torch.randn(input_dim, device=DEVICE)
        mem.learn(c, t)

    queries = torch.randn(n_queries, input_dim, device=DEVICE)
    _sync()
    t0 = time.perf_counter()
    for i in range(n_queries):
        mem.recall(queries[i], hops=hops)
    _sync()
    dt = time.perf_counter() - t0
    return dt / n_queries * 1000  # ms per query


def benchmark_memory_usage(input_dim, code_dims):
    """Measure GPU memory at different code_dim values.

    Returns:
        Dict mapping code_dim -> size breakdown in MB. The allocator
        stats are only meaningful on CUDA; on CPU they read as zero.
    """
    results = {}
    cuda = torch.cuda.is_available()
    for cd in code_dims:
        if cuda:
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
        before = torch.cuda.memory_allocated() if cuda else 0

        mem = BenchMemory(input_dim, cd, k=50)
        # Learn 1000 memories so transient allocations show up in the peak.
        for _ in range(1000):
            c = torch.randn(input_dim, device=DEVICE)
            t = torch.randn(input_dim, device=DEVICE)
            mem.learn(c, t)

        after = torch.cuda.memory_allocated() if cuda else 0
        peak = torch.cuda.max_memory_allocated() if cuda else 0

        # Theoretical sizes assume float32 (4 bytes per element).
        w_size = cd * cd * 4 / 1024**2  # MB
        proj_size = input_dim * cd * 4 / 1024**2  # MB
        total_allocated = (after - before) / 1024**2

        results[cd] = {
            "W_size_MB": w_size,
            "proj_size_MB": proj_size,
            "total_allocated_MB": total_allocated,
            "peak_MB": peak / 1024**2,
        }
        print(f"  code_dim={cd:>6}: W={w_size:.0f}MB, proj={proj_size:.0f}MB, "
              f"total={total_allocated:.0f}MB")
        del mem
    return results


def main():
    print("=" * 60)
    print("Experiment 5: Performance Benchmarks")
    print("=" * 60)

    input_dim = 384  # MiniLM dimension

    # Test 1: Learning throughput
    print("\n=== Learning Throughput ===")
    for code_dim, k in [(8192, 50), (16384, 50), (32768, 50)]:
        for n in [1000, 5000, 10000]:
            rate, dt = benchmark_learn(input_dim, code_dim, k, n)
            print(f"  code={code_dim}, k={k}, N={n:>5}: "
                  f"{rate:>8.0f} memories/s ({dt:.2f}s)")
        torch.cuda.empty_cache()

    # Test 2: Recall latency
    print("\n=== Recall Latency ===")
    for code_dim, k in [(8192, 50), (16384, 50), (32768, 50)]:
        for n_mem in [100, 1000, 10000]:
            ms = benchmark_recall(input_dim, code_dim, k, n_mem, n_queries=1000)
            print(f"  code={code_dim}, k={k}, N={n_mem:>5}: {ms:.3f} ms/query")
        torch.cuda.empty_cache()

    # Test 3: Multi-hop latency
    print("\n=== Multi-hop Latency ===")
    for hops in [1, 2, 3, 5, 10]:
        ms = benchmark_recall(input_dim, 16384, 50, 1000, n_queries=1000,
                              hops=hops)
        print(f"  hops={hops:>2}: {ms:.3f} ms/query")

    # Test 4: GPU Memory
    print("\n=== GPU Memory Usage ===")
    benchmark_memory_usage(input_dim, [4096, 8192, 16384, 32768, 65536])

    # Test 5: End-to-end with sentence-transformers (imported lazily so the
    # rest of the benchmarks work without the package installed).
    print("\n=== End-to-End Pipeline Latency ===")
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
    mem = BenchMemory(384, 16384, 50)

    # Pre-fill 1000 memories: each sentence is associated with the next one.
    sentences = [f"This is test sentence number {i}" for i in range(1000)]
    embs = model.encode(sentences, convert_to_tensor=True,
                        normalize_embeddings=True, device=DEVICE)
    for i in range(1000):
        mem.learn(embs[i], embs[min(i + 1, 999)])

    # Benchmark the full single-query pipeline (embed + recall).
    query = "What is the test sentence?"
    n_runs = 100

    _sync()
    t0 = time.perf_counter()
    for _ in range(n_runs):
        q_emb = model.encode([query], convert_to_tensor=True,
                             normalize_embeddings=True, device=DEVICE)[0]
        recalled = mem.recall(q_emb, hops=1)
    _sync()
    dt = (time.perf_counter() - t0) / n_runs * 1000

    # Breakdown: time embedding and recall separately.
    t_embed = 0
    t_recall = 0
    for _ in range(n_runs):
        _sync()
        t1 = time.perf_counter()
        q_emb = model.encode([query], convert_to_tensor=True,
                             normalize_embeddings=True, device=DEVICE)[0]
        _sync()
        t2 = time.perf_counter()
        recalled = mem.recall(q_emb, hops=1)
        _sync()
        t3 = time.perf_counter()
        t_embed += t2 - t1
        t_recall += t3 - t2
    t_embed = t_embed / n_runs * 1000
    t_recall = t_recall / n_runs * 1000

    print(f"  Total: {dt:.1f} ms/query")
    print(f"  Embedding: {t_embed:.1f} ms")
    print(f"  Recall: {t_recall:.3f} ms")
    print(f"  Ratio: embedding is {t_embed/t_recall:.0f}x slower than recall")


if __name__ == "__main__":
    main()