# Hopfield + Hebbian hybrid memory system for LLMs.
# Two nights of experiments (16 iterations), validated on LongMemEval (ICLR 2025).
#
# Architecture:
# - Single-hop: Two-Stage Hopfield (NN top-20 → softmax settle)
# - Multi-hop: Hebbian W matrix with WTA pattern separation
# - 64% on LongMemEval (500 questions), retrieval-only, no LLM dependency
# - 4ms latency @ 20K memories, ~1GB VRAM
#
# Key findings:
# - Hopfield attention solved noise tolerance (20% → 100% vs flat Hebbian)
# - WTA pattern separation enables 20K+ capacity
# - Multi-hop associative chains (6 hops, CosSim=1.0) — RAG can't do this
# - MiniLM-L6 is optimal (discrimination gap > absolute similarity)
# - Paraphrase cue augmentation: 55% → 100% on synthetic, 36% → 64% on benchmark
# - SNN encoder viable (CosSim 0.99) but not needed for current architecture
"""Experiment P6: Multi-turn conversation simulation.
|
|
|
|
Simulate a realistic multi-day conversation scenario:
|
|
- Day 1: User discusses database issues
|
|
- Day 2: User works on deployment
|
|
- Day 3: User comes back with a related question → should recall Day 1 context
|
|
- Day 4: User asks about something mentioned in passing on Day 1
|
|
|
|
Test: cross-session recall, context accumulation, multi-hop across days.
|
|
"""
import sys
import time
from pathlib import Path

import torch
import torch.nn as nn
import numpy as np

# Make <repo>/src and <repo> importable when this file is run as a script.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
sys.path.insert(0, str(Path(__file__).parent.parent))

from nuonuo.hippocampus import HippocampalMemory
from llm import generate_paraphrases_heuristic
DEVICE = "cuda"
|
|
|
|
|
|
def load_model():
    """Load the all-MiniLM-L6-v2 sentence encoder onto DEVICE."""
    from sentence_transformers import SentenceTransformer

    encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
    return encoder
def emb(model, text):
    """Encode *text* into a single L2-normalized embedding tensor."""
    encoded = model.encode(
        [text],
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=DEVICE,
    )
    return encoded[0]
def store_with_augmentation(mem, model, cue, target, timestamp=0.0):
    """Embed a cue/target pair, add heuristic cue paraphrases, and store it.

    Returns whatever ``mem.store`` returns.
    """
    cue_vec = emb(model, cue)
    target_vec = emb(model, target)
    paraphrases = generate_paraphrases_heuristic(cue, n=3)
    # No paraphrases → pass None so the store treats the cue as unaugmented.
    variants = [emb(model, p) for p in paraphrases] if paraphrases else None
    return mem.store(
        cue_vec,
        target_vec,
        cue_variants=variants,
        metadata={"cue": cue, "target": target},
        timestamp=timestamp,
    )
def test_recall(mem, model, query, expected_target_substr):
    """Check whether recalling *query* surfaces the expected target text.

    Returns ``(hit, similarity, target_text)``: a hit means the expected
    substring appears (case-insensitively) in the target of any top-3 result.
    On a miss the best result's target (or a placeholder) is returned.
    """
    needle = expected_target_substr.lower()
    results = mem.recall(emb(model, query), top_k=3)
    for result in results:
        if needle in result.metadata.get("target", "").lower():
            return True, result.similarity, result.metadata["target"]
    if not results:
        return False, 0.0, "no results"
    return False, 0.0, results[0].metadata.get("target", "???")
def main():
    """Run the P6 multi-turn conversation experiment end to end.

    Stores three simulated days of conversational memories, then checks
    cross-session recall, multi-hop chaining across days, and behavior
    under a conflicting update. Results are printed to stdout.
    """
    print("=" * 60)
    print("Experiment P6: Multi-turn Conversation")
    print("=" * 60)

    model = load_model()
    mem = HippocampalMemory(embed_dim=384)

    # ===== Day 1: Database troubleshooting session =====
    print("\n--- Day 1: Database Troubleshooting ---")
    day1_memories = [
        ("The database is really slow", "The users table is missing an index on created_at"),
        ("What's the query that's slow?", "SELECT * FROM users WHERE created_at > ? ORDER BY created_at"),
        ("How many rows in the users table?", "About 2.3 million rows, growing 10K per day"),
        ("Who has access to the database?", "Only the backend team: Alice, Bob, and Charlie"),
        ("What's the database host?", "PostgreSQL on db.internal:5432, running version 15.2"),
    ]
    for cue, target in day1_memories:
        store_with_augmentation(mem, model, cue, target, timestamp=1.0)

    # ===== Day 2: Deployment work =====
    print("--- Day 2: Deployment ---")
    day2_memories = [
        ("How do we deploy?", "Blue-green deployment via GitHub Actions, config in .github/workflows/deploy.yml"),
        ("What's the rollback procedure?", "Switch the load balancer back to the previous blue/green slot"),
        ("Where are the deployment logs?", "GitHub Actions logs, also mirrored to Loki at loki.internal:3100"),
        ("Who approves production deploys?", "Requires approval from Alice or David in the #deploys channel"),
    ]
    for cue, target in day2_memories:
        store_with_augmentation(mem, model, cue, target, timestamp=2.0)

    # ===== Day 3: Monitoring setup =====
    print("--- Day 3: Monitoring ---")
    day3_memories = [
        ("Set up monitoring for the database", "Prometheus scrapes pg_exporter on db.internal:9187, dashboard in Grafana"),
        ("What alerts do we have?", "PagerDuty alerts for: CPU>80%, disk>90%, replication lag>30s"),
        ("Where's the Grafana dashboard?", "grafana.internal/d/postgres-overview, login with SSO"),
    ]
    for cue, target in day3_memories:
        store_with_augmentation(mem, model, cue, target, timestamp=3.0)

    print(f"\nTotal memories: {mem.stats()}")

    # ===== Test: Cross-session recall =====
    print("\n=== Cross-session Recall Tests ===\n")

    tests = [
        # (query, expected_substring, description)
        # Day 1 recall
        ("DB is slow again", "index", "Day 1: DB slow → index"),
        ("How big is the users table?", "million", "Day 1: table size"),
        ("Who can access the database?", "Alice", "Day 1: DB access"),
        ("What Postgres version?", "15.2", "Day 1: PG version"),

        # Day 2 recall
        ("How to deploy the new version?", "blue-green", "Day 2: deploy method"),
        ("How to rollback?", "load balancer", "Day 2: rollback"),
        ("Who approves deploys?", "Alice", "Day 2: deploy approval"),

        # Day 3 recall
        ("Where's the monitoring dashboard?", "grafana", "Day 3: Grafana URL"),
        ("What alerts are configured?", "PagerDuty", "Day 3: alerts"),

        # Cross-day inference
        ("The database is slow, what index is missing?", "created_at", "Cross: DB slow → specific index"),
        ("I need to check deploy logs", "Loki", "Cross: deploy logs → Loki"),
        ("Database monitoring exporter", "pg_exporter", "Cross: DB + monitoring"),
    ]

    correct = 0
    for query, expected, desc in tests:
        found, sim, got = test_recall(mem, model, query, expected)
        status = "✓" if found else "✗"
        if found:
            correct += 1
        # sim is 0.0 on a miss (see test_recall), so misses print [0.00].
        print(f"  {status} [{sim:.2f}] {desc}")
        if not found:
            print(f"      Expected '{expected}', got: '{got[:50]}...'")

    n = len(tests)
    print(f"\n  Total: {correct}/{n} ({correct/n:.0%})")

    # ===== Test: Multi-hop across days =====
    print("\n=== Multi-hop Across Days ===\n")

    # Store explicit chains across days
    # Day 1: "DB slow" → "missing index"
    # Day 3: "monitoring DB" → "pg_exporter"
    # Chain: "DB slow" → (hop1) "missing index" → ... can we reach monitoring?

    # Actually, multi-hop needs explicit chain links. Let's store some:
    store_with_augmentation(mem, model,
                            "The missing index caused the slow query",
                            "Added index and set up monitoring to prevent recurrence",
                            timestamp=3.5)

    # NOTE(review): assumes recall_chain results expose .hop_distance and
    # .metadata like recall results do — confirm against HippocampalMemory.
    chain = mem.recall_chain(emb(model, "database is slow"), hops=3)
    print("  Chain from 'database is slow':")
    for r in chain:
        print(f"    hop {r.hop_distance}: {r.metadata.get('target', '?')[:60]}...")

    # ===== Test: Memory conflicts =====
    print("\n=== Memory Update / Conflict ===\n")

    # Store contradicting info
    store_with_augmentation(mem, model,
                            "What Postgres version?", "Upgraded to PostgreSQL 16.1 last night",
                            timestamp=4.0)

    # Which version does it recall?
    results = mem.recall(emb(model, "What Postgres version are we running?"), top_k=2)
    print("  Query: 'What Postgres version?'")
    for r in results:
        print(f"    [{r.similarity:.2f}] {r.metadata.get('target', '?')}")
    print("  Note: Both old (15.2) and new (16.1) returned — recency sorting needed")
if __name__ == "__main__":
|
|
main()
|