"""Experiment 5: Performance benchmarks.

Measure:
1. Learning throughput (memories/second)
2. Recall latency (ms per query)
3. GPU memory usage at different scales
4. Multi-hop latency vs hops
5. End-to-end: embed + separate + recall pipeline
"""
import sys
import time
import json
from pathlib import Path

import torch
import torch.nn as nn
import numpy as np

# Fall back to CPU so the script still runs (slowly) on machines without CUDA;
# on a GPU box behavior is unchanged.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
RESULTS_DIR = Path(__file__).parent.parent / "doc"


def _sync():
    """Drain pending CUDA work before reading the clock; no-op on CPU.

    Without this, timings would measure kernel *launch* rather than execution.
    """
    if torch.cuda.is_available():
        torch.cuda.synchronize()


def winner_take_all(x, k):
    """Return a 0/1 tensor of x's shape with ones at the top-k entries.

    Top-k is taken along the last dimension; all other entries are zero.
    """
    _, idx = x.topk(k, dim=-1)
    out = torch.zeros_like(x)
    out.scatter_(-1, idx, 1.0)
    return out


class BenchMemory:
    """Minimal Hebbian associative memory, used only for benchmarking.

    A fixed random projection maps inputs to a sparse k-winner code;
    ``learn`` accumulates outer products of (target code, cue code) into
    the weight matrix W; ``recall`` iterates W on the query code.
    """

    def __init__(self, input_dim, code_dim, k):
        self.k = k
        self.code_dim = code_dim
        # Random projection scaled by 1/sqrt(input_dim) so projected
        # activations stay O(1) regardless of input dimensionality.
        self.proj = (torch.randn(input_dim, code_dim, device=DEVICE)
                     * (1.0 / input_dim**0.5))
        # Hebbian weights: sum of target-code (x) cue-code outer products.
        self.W = torch.zeros(code_dim, code_dim, device=DEVICE)

    def sep(self, x):
        """Sparse code for x: project, then keep only the top-k units."""
        return winner_take_all(x @ self.proj, self.k)

    def learn(self, cue, target):
        """One-shot Hebbian association of cue -> target (1-D inputs)."""
        self.W += torch.outer(self.sep(target), self.sep(cue))

    def recall(self, query, hops=1):
        """Recall the code associated with query, chaining ``hops`` steps."""
        code = self.sep(query)
        for _ in range(hops):
            code = winner_take_all(self.W @ code, self.k)
        return code


def benchmark_learn(input_dim, code_dim, k, n_memories):
    """Measure learning throughput.

    Returns:
        (memories_per_second, elapsed_seconds)
    """
    mem = BenchMemory(input_dim, code_dim, k)
    cues = torch.randn(n_memories, input_dim, device=DEVICE)
    targets = torch.randn(n_memories, input_dim, device=DEVICE)

    _sync()
    # perf_counter: monotonic and high-resolution, unlike time.time().
    t0 = time.perf_counter()
    for i in range(n_memories):
        mem.learn(cues[i], targets[i])
    _sync()
    dt = time.perf_counter() - t0
    return n_memories / dt, dt


def benchmark_recall(input_dim, code_dim, k, n_memories, n_queries=1000, hops=1):
    """Measure recall latency.

    Returns:
        Mean latency in milliseconds per query.
    """
    mem = BenchMemory(input_dim, code_dim, k)
    # Pre-fill with random associations so W is realistically dense.
    for _ in range(n_memories):
        c = torch.randn(input_dim, device=DEVICE)
        t = torch.randn(input_dim, device=DEVICE)
        mem.learn(c, t)

    queries = torch.randn(n_queries, input_dim, device=DEVICE)
    _sync()
    t0 = time.perf_counter()
    for i in range(n_queries):
        mem.recall(queries[i], hops=hops)
    _sync()
    dt = time.perf_counter() - t0
    return dt / n_queries * 1000  # ms per query


def benchmark_memory_usage(input_dim, code_dims):
    """Measure GPU memory at different code_dim values.

    Returns:
        Dict mapping code_dim -> size breakdown in MB. The allocator
        stats are only meaningful on CUDA; on CPU they read as zero.
    """
    results = {}
    cuda = torch.cuda.is_available()
    for cd in code_dims:
        if cuda:
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
        before = torch.cuda.memory_allocated() if cuda else 0

        mem = BenchMemory(input_dim, cd, k=50)
        # Learn 1000 memories so transient allocations show up in the peak.
        for _ in range(1000):
            c = torch.randn(input_dim, device=DEVICE)
            t = torch.randn(input_dim, device=DEVICE)
            mem.learn(c, t)

        after = torch.cuda.memory_allocated() if cuda else 0
        peak = torch.cuda.max_memory_allocated() if cuda else 0

        # Theoretical sizes assume float32 (4 bytes per element).
        w_size = cd * cd * 4 / 1024**2  # MB
        proj_size = input_dim * cd * 4 / 1024**2  # MB
        total_allocated = (after - before) / 1024**2

        results[cd] = {
            "W_size_MB": w_size,
            "proj_size_MB": proj_size,
            "total_allocated_MB": total_allocated,
            "peak_MB": peak / 1024**2,
        }
        print(f"  code_dim={cd:>6}: W={w_size:.0f}MB, proj={proj_size:.0f}MB, "
              f"total={total_allocated:.0f}MB")
        del mem
    return results


def main():
    print("=" * 60)
    print("Experiment 5: Performance Benchmarks")
    print("=" * 60)

    input_dim = 384  # MiniLM dimension

    # Test 1: Learning throughput
    print("\n=== Learning Throughput ===")
    for code_dim, k in [(8192, 50), (16384, 50), (32768, 50)]:
        for n in [1000, 5000, 10000]:
            rate, dt = benchmark_learn(input_dim, code_dim, k, n)
            print(f"  code={code_dim}, k={k}, N={n:>5}: "
                  f"{rate:>8.0f} memories/s ({dt:.2f}s)")
        torch.cuda.empty_cache()

    # Test 2: Recall latency
    print("\n=== Recall Latency ===")
    for code_dim, k in [(8192, 50), (16384, 50), (32768, 50)]:
        for n_mem in [100, 1000, 10000]:
            ms = benchmark_recall(input_dim, code_dim, k, n_mem, n_queries=1000)
            print(f"  code={code_dim}, k={k}, N={n_mem:>5}: {ms:.3f} ms/query")
        torch.cuda.empty_cache()

    # Test 3: Multi-hop latency
    print("\n=== Multi-hop Latency ===")
    for hops in [1, 2, 3, 5, 10]:
        ms = benchmark_recall(input_dim, 16384, 50, 1000, n_queries=1000,
                              hops=hops)
        print(f"  hops={hops:>2}: {ms:.3f} ms/query")

    # Test 4: GPU Memory
    print("\n=== GPU Memory Usage ===")
    benchmark_memory_usage(input_dim, [4096, 8192, 16384, 32768, 65536])

    # Test 5: End-to-end with sentence-transformers (imported lazily so the
    # rest of the benchmarks work without the package installed).
    print("\n=== End-to-End Pipeline Latency ===")
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
    mem = BenchMemory(384, 16384, 50)

    # Pre-fill 1000 memories: each sentence is associated with the next one.
    sentences = [f"This is test sentence number {i}" for i in range(1000)]
    embs = model.encode(sentences, convert_to_tensor=True,
                        normalize_embeddings=True, device=DEVICE)
    for i in range(1000):
        mem.learn(embs[i], embs[min(i + 1, 999)])

    # Benchmark the full single-query pipeline (embed + recall).
    query = "What is the test sentence?"
    n_runs = 100

    _sync()
    t0 = time.perf_counter()
    for _ in range(n_runs):
        q_emb = model.encode([query], convert_to_tensor=True,
                             normalize_embeddings=True, device=DEVICE)[0]
        recalled = mem.recall(q_emb, hops=1)
    _sync()
    dt = (time.perf_counter() - t0) / n_runs * 1000

    # Breakdown: time embedding and recall separately.
    t_embed = 0
    t_recall = 0
    for _ in range(n_runs):
        _sync()
        t1 = time.perf_counter()
        q_emb = model.encode([query], convert_to_tensor=True,
                             normalize_embeddings=True, device=DEVICE)[0]
        _sync()
        t2 = time.perf_counter()
        recalled = mem.recall(q_emb, hops=1)
        _sync()
        t3 = time.perf_counter()
        t_embed += t2 - t1
        t_recall += t3 - t2
    t_embed = t_embed / n_runs * 1000
    t_recall = t_recall / n_runs * 1000

    print(f"  Total: {dt:.1f} ms/query")
    print(f"  Embedding: {t_embed:.1f} ms")
    print(f"  Recall: {t_recall:.3f} ms")
    print(f"  Ratio: embedding is {t_embed/t_recall:.0f}x slower than recall")


if __name__ == "__main__":
    main()