nuonuo/experiments/exp11_scale_ceiling.py
Fam Zheng d923aa1e31 NuoNuo: Hippocampal memory module prototype
Hopfield + Hebbian hybrid memory system for LLMs.
Two nights of experiments (16 iterations), validated on LongMemEval (ICLR 2025).

Architecture:
- Single-hop: Two-Stage Hopfield (NN top-20 → softmax settle)
- Multi-hop: Hebbian W matrix with WTA pattern separation
- 64% on LongMemEval (500 questions), retrieval-only, no LLM dependency
- 4ms latency @ 20K memories, ~1GB VRAM

Key findings:
- Hopfield attention solved noise tolerance (20% → 100% vs flat Hebbian)
- WTA pattern separation enables 20K+ capacity
- Multi-hop associative chains (6 hops, CosSim=1.0) — RAG can't do this
- MiniLM-L6 is optimal (discrimination gap > absolute similarity)
- Paraphrase cue augmentation: 55% → 100% on synthetic, 36% → 64% on benchmark
- SNN encoder viable (CosSim 0.99) but not needed for current architecture
2026-04-07 10:37:24 +01:00

238 lines
8.3 KiB
Python

"""Experiment P3: Breaking the 20K 80% ceiling.
Hypothesis: NN pre-filter (top-20) misses the correct cue at large scale.
Tests:
1. Oracle analysis: is the correct cue in top-K? What K is needed?
2. Hierarchical memory: cluster memories, route query to relevant cluster
3. Re-ranking: top-K NN → cross-similarity re-rank → Hopfield on re-ranked
4. Multiple projections: ensemble of NN lookups with different random projections
"""
import sys
import time
from pathlib import Path
import torch
import torch.nn as nn
import numpy as np
# All encoding and retrieval runs on the GPU.
DEVICE = "cuda"
# (cue, target) pairs: the cue is what a user might say, the target is the
# stored memory that should be retrieved for it. Index in this list is the
# memory id used for scoring.
PAIRS = [
    ("What's the weather like today?", "User checks weather every morning"),
    ("Let's deploy the new version", "Deployment uses GitHub Actions with k3s"),
    ("The database is slow again", "Missing index on users table"),
    ("I need to fix the authentication bug", "JWT tokens with 24h expiry in Redis"),
    ("The API returns 500 errors", "OOM in the Python worker"),
    ("Let's set up monitoring", "Prometheus + Grafana on OCI"),
    ("Tests failing in CI", "CI needs postgres service container"),
    ("Memory usage too high", "Leak in websocket handler"),
    ("Help with Docker setup", "docker-compose for dev, k3s for prod"),
    ("Log files too large", "Logs rotate daily, shipped to Loki"),
]
# One paraphrase per PAIRS entry, in the same order — these are the retrieval
# queries; a prediction is correct when it recovers the matching index.
PARAPHRASES = [
    "How's the weather?", "Ship the release", "DB performance terrible",
    "Fix the login issue", "Server errors everywhere", "Need observability",
    "CI tests breaking", "Service using too much RAM", "Docker config help",
    "Logs eating disk space",
]
def cosine(a, b):
    """Return the cosine similarity of two 1-D tensors as a Python float."""
    row_a = a.unsqueeze(0)
    row_b = b.unsqueeze(0)
    sim = nn.functional.cosine_similarity(row_a, row_b)
    return sim.item()
def load_model():
    """Load the MiniLM sentence encoder onto DEVICE and return it."""
    # Imported lazily so the module can be inspected without the dependency.
    from sentence_transformers import SentenceTransformer
    encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
    return encoder
def build_memory(model, n_bg):
    """Build memory with test pairs + background.

    Encodes the PAIRS cues/targets and the PARAPHRASES, then optionally adds
    n_bg synthetic background memories (ids offset by 100 so they never
    collide with the real pair ids 0..len(PAIRS)-1).
    Returns (cue_mat, target_mat, mids, cue_embs, target_embs, para_embs).
    """
    enc = dict(convert_to_tensor=True, normalize_embeddings=True, device=DEVICE)
    cue_embs = model.encode([p[0] for p in PAIRS], **enc)
    target_embs = model.encode([p[1] for p in PAIRS], **enc)
    para_embs = model.encode(PARAPHRASES, **enc)
    all_mids = list(range(len(PAIRS)))
    if n_bg > 0:
        topics = ["server", "db", "api", "fe", "be", "cache",
                  "queue", "net", "store", "auth", "docker", "k8s"]
        bg_cues = [f"The {topics[i%len(topics)]} has issue {i}" for i in range(n_bg)]
        bg_targets = [f"Fix {topics[i%len(topics)]} issue {i}" for i in range(n_bg)]
        bg_c = model.encode(bg_cues, batch_size=256, **enc)
        bg_t = model.encode(bg_targets, batch_size=256, **enc)
        cue_mat = torch.cat([cue_embs, bg_c])
        target_mat = torch.cat([target_embs, bg_t])
        all_mids.extend(100 + i for i in range(n_bg))
    else:
        cue_mat = torch.stack(list(cue_embs))
        target_mat = torch.stack(list(target_embs))
    return cue_mat, target_mat, all_mids, cue_embs, target_embs, para_embs
def test_topk_coverage(model, n_bg_list):
    """Is the correct cue in top-K? What K do we need?

    Oracle analysis: for each background scale and each K, count how often the
    ground-truth cue id survives a plain top-K nearest-neighbour pre-filter.
    """
    print("=== Test 1: Top-K Coverage Analysis ===\n")
    n = len(PARAPHRASES)
    for n_bg in n_bg_list:
        cue_mat, target_mat, mids, cue_embs, target_embs, para_embs = build_memory(model, n_bg)
        for K in [5, 10, 20, 50, 100, 200]:
            k = min(K, len(mids))
            in_topk = 0
            for i in range(n):
                top_idx = (para_embs[i] @ cue_mat.T).topk(k).indices
                # Paraphrase i is covered when memory id i is among the top-k.
                if i in (mids[j] for j in top_idx.tolist()):
                    in_topk += 1
            print(f" N={n_bg+len(PAIRS):>6}, K={K:>3}: "
                  f"{in_topk}/{n} ({in_topk/n:.0%}) correct cue in top-K")
        print()
def test_two_stage_topk(model, n_bg):
    """Vary K in two-stage Hopfield to find optimal.

    Stage 1: top-K nearest-neighbour pre-filter over all cues.
    Stage 2: 3 Hopfield settle iterations (softmax attention over candidate
    cues at inverse temperature 16) followed by a per-memory-id attention-mass
    vote; the heaviest id is the prediction, correct when it equals the
    paraphrase index.
    """
    print(f"\n=== Test 2: Two-Stage K Optimization (bg={n_bg}) ===\n")
    cue_mat, target_mat, mids, cue_embs, target_embs, para_embs = build_memory(model, n_bg)
    n = len(PARAPHRASES)
    for K in [5, 10, 20, 50, 100, 200]:
        correct = 0
        for i in range(n):
            sims = para_embs[i] @ cue_mat.T
            k = min(K, len(mids))
            _, top_idx = sims.topk(k)
            cand_cues = cue_mat[top_idx]
            # NOTE: candidate targets were unpacked here but never used;
            # retrieval accuracy is scored on memory ids alone.
            cand_mids = [mids[j] for j in top_idx.tolist()]
            # Hopfield settle: iterate attention-weighted retrieval,
            # re-normalizing the state each step.
            xi = para_embs[i]
            for _ in range(3):
                scores = 16.0 * (xi @ cand_cues.T)
                attn = torch.softmax(scores, dim=0)
                xi = attn @ cand_cues
                xi = nn.functional.normalize(xi, dim=0)
            # Final attention distribution over candidates after settling.
            scores = 16.0 * (xi @ cand_cues.T)
            attn = torch.softmax(scores, dim=0)
            # Vote: accumulate attention mass per memory id.
            mid_scores = {}
            for j, mid in enumerate(cand_mids):
                mid_scores[mid] = mid_scores.get(mid, 0) + attn[j].item()
            best_mid = max(mid_scores, key=mid_scores.get)
            if best_mid == i:
                correct += 1
        print(f" K={K:>3}: {correct}/{n} ({correct/n:.0%})")
def test_hierarchical(model, n_bg):
    """Cluster memories by topic, route query to relevant cluster.

    Runs a simple spherical k-means over the cue embeddings, routes each
    paraphrase to its top-3 clusters, then applies the top-20 NN + Hopfield
    settle pipeline within the routed candidates only.
    """
    print(f"\n=== Test 3: Hierarchical Memory (bg={n_bg}) ===\n")
    cue_mat, target_mat, mids, cue_embs, target_embs, para_embs = build_memory(model, n_bg)
    # Roughly one cluster per 100 memories, at least 10.
    n_clusters = max(10, (n_bg + len(PAIRS)) // 100)
    # K-means (simple implementation): embeddings are unit-norm, so
    # 1 - dot product is the cosine distance.
    N = cue_mat.shape[0]
    centroids = cue_mat[torch.randperm(N)[:n_clusters]].clone()
    for _ in range(20):
        dists = 1 - cue_mat @ centroids.T  # cosine distance
        assignments = dists.argmin(dim=1)
        for c in range(n_clusters):
            mask = assignments == c
            if mask.sum() > 0:
                # Re-normalize the mean so centroids stay on the unit sphere.
                centroids[c] = nn.functional.normalize(cue_mat[mask].mean(dim=0), dim=0)
    # Route query to top-3 clusters, then Hopfield within.
    correct = 0
    for i in range(len(PARAPHRASES)):
        # Find relevant clusters for this paraphrase.
        cluster_sims = para_embs[i] @ centroids.T
        top_clusters = cluster_sims.topk(3).indices
        # Gather (deduplicated) candidate indices from the routed clusters.
        cand_idx = []
        for c in top_clusters:
            cand_idx.extend((assignments == c).nonzero().squeeze(-1).tolist())
        cand_idx = list(set(cand_idx))
        if not cand_idx:
            continue  # no candidates routed -> counted as a miss
        # Hopfield on candidates (targets are not needed — scoring is by id).
        cand_cues = cue_mat[cand_idx]
        cand_mids = [mids[j] for j in cand_idx]
        # Local top-K NN pre-filter within the routed candidates.
        K = min(20, len(cand_idx))
        sims = para_embs[i] @ cand_cues.T
        _, top_local = sims.topk(K)
        local_cues = cand_cues[top_local]
        local_mids = [cand_mids[j] for j in top_local.tolist()]
        # Hopfield settle (3 iterations, inverse temperature 16).
        xi = para_embs[i]
        for _ in range(3):
            scores = 16.0 * (xi @ local_cues.T)
            attn = torch.softmax(scores, dim=0)
            xi = attn @ local_cues
            xi = nn.functional.normalize(xi, dim=0)
        scores = 16.0 * (xi @ local_cues.T)
        attn = torch.softmax(scores, dim=0)
        # Vote: accumulate attention mass per memory id.
        mid_scores = {}
        for j, mid in enumerate(local_mids):
            mid_scores[mid] = mid_scores.get(mid, 0) + attn[j].item()
        best_mid = max(mid_scores, key=mid_scores.get)
        if best_mid == i:
            correct += 1
    n = len(PARAPHRASES)
    print(f" Hierarchical (clusters={n_clusters}): {correct}/{n} ({correct/n:.0%})")
def main():
    """Run all three ceiling-breaking experiments end to end."""
    banner = "=" * 60
    print(banner)
    print("Experiment P3: Breaking the 20K Ceiling")
    print(banner)
    model = load_model()
    # Test 1: Top-K coverage across a sweep of background sizes.
    test_topk_coverage(model, [0, 500, 2000, 5000, 10000, 20000])
    # Test 2: K optimization at increasing scale.
    for bg in (2000, 10000, 20000):
        test_two_stage_topk(model, bg)
    # Test 3: Hierarchical routing at the same scales.
    for bg in (2000, 10000, 20000):
        test_hierarchical(model, bg)
# Script entry point.
if __name__ == "__main__":
    main()