Hopfield + Hebbian hybrid memory system for LLMs. Two nights of experiments (16 iterations), validated on LongMemEval (ICLR 2025). Architecture: - Single-hop: Two-Stage Hopfield (NN top-20 → softmax settle) - Multi-hop: Hebbian W matrix with WTA pattern separation - 64% on LongMemEval (500 questions), retrieval-only, no LLM dependency - 4ms latency @ 20K memories, ~1GB VRAM Key findings: - Hopfield attention solved noise tolerance (20% → 100% vs flat Hebbian) - WTA pattern separation enables 20K+ capacity - Multi-hop associative chains (6 hops, CosSim=1.0) — RAG can't do this - MiniLM-L6 is optimal (discrimination gap > absolute similarity) - Paraphrase cue augmentation: 55% → 100% on synthetic, 36% → 64% on benchmark - SNN encoder viable (CosSim 0.99) but not needed for current architecture
261 lines · 11 KiB · Python
"""Experiment 7e: Cue augmentation to overcome embedding model limitations.

Idea: When storing a memory, also store augmented versions of the cue.

If the user says "The database is slow", also store:
- The embedding with added noise (gaussian augmentation)
- A shifted version toward common paraphrase patterns

This increases the "catchment basin" of each memory without changing the model.

Also test: using the LLM itself to generate paraphrases (simulated here).
"""

import sys
import time
from pathlib import Path

import torch
import torch.nn as nn
import numpy as np

# Fall back to CPU so the script also runs on machines without a GPU
# (the original hard-coded "cuda" crashes at encode time on CPU-only hosts).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def cosine(a, b):
    """Cosine similarity between two 1-D tensors, returned as a Python float."""
    lhs = a.view(1, -1)
    rhs = b.view(1, -1)
    return nn.functional.cosine_similarity(lhs, rhs).item()
class AugmentedHopfield:
    """Hopfield memory with cue augmentation.

    Each memory stores N augmented cue embeddings, all pointing to the same
    target. During recall, any of the augmented cues can match, widening the
    "catchment basin" of the memory without changing the embedding model.
    """

    def __init__(self, beta=16.0, top_k=20, n_augments=5, noise_std=0.15):
        """
        Args:
            beta: Inverse temperature of the Hopfield softmax (higher = sharper).
            top_k: Number of nearest cues kept for the settle stage.
            n_augments: Noisy copies stored per cue by `learn`.
            noise_std: Std-dev of the gaussian noise added to each copy.
        """
        self.beta = beta
        self.top_k = top_k
        self.n_augments = n_augments
        self.noise_std = noise_std
        self.cue_embs = []     # one entry per (possibly augmented) cue
        self.target_embs = []  # parallel list: target for each cue entry
        self.memory_ids = []   # which original memory each entry belongs to

    def learn(self, cue_emb, target_emb, memory_id=None):
        """Store cue->target plus `n_augments` noisy copies of the cue."""
        mid = memory_id if memory_id is not None else len(set(self.memory_ids))

        # Original cue.
        self.cue_embs.append(cue_emb.detach())
        self.target_embs.append(target_emb.detach())
        self.memory_ids.append(mid)

        # Augmented copies: add gaussian noise and renormalize to unit length.
        for _ in range(self.n_augments):
            noisy = cue_emb + torch.randn_like(cue_emb) * self.noise_std
            noisy = nn.functional.normalize(noisy, dim=0)
            # BUGFIX: detach here too — the original appended `noisy` with its
            # autograd graph attached, unlike every other stored embedding,
            # which would keep graphs alive when cues require grad.
            self.cue_embs.append(noisy.detach())
            self.target_embs.append(target_emb.detach())
            self.memory_ids.append(mid)

    def learn_with_paraphrases(self, cue_embs_list, target_emb, memory_id=None):
        """Store multiple cue embeddings for the same target.

        Args:
            cue_embs_list: list of embeddings (original + paraphrases).
            target_emb: target embedding shared by every cue in the list.
            memory_id: optional id; defaults to the number of distinct ids seen.

        Note: noise augmentation (`n_augments`) is NOT applied here — only the
        given embeddings are stored verbatim.
        """
        mid = memory_id if memory_id is not None else len(set(self.memory_ids))
        for ce in cue_embs_list:
            self.cue_embs.append(ce.detach())
            self.target_embs.append(target_emb.detach())
            self.memory_ids.append(mid)

    def recall(self, query_emb, steps=3):
        """Recall the target for `query_emb` via two-stage Hopfield retrieval.

        Stage 1 filters to the top-K nearest cues; stage 2 runs `steps`
        softmax-settle iterations over the candidates, then reads out the
        attention-weighted target. Returns a unit-normalized embedding.
        """
        cue_mat = torch.stack(self.cue_embs)
        target_mat = torch.stack(self.target_embs)
        N = cue_mat.shape[0]

        # Stage 1: coarse nearest-neighbour filter (cues are unit vectors,
        # so the dot product is cosine similarity).
        k = min(self.top_k, N)
        sims = query_emb @ cue_mat.T
        _, top_idx = sims.topk(k)

        cand_cues = cue_mat[top_idx]
        cand_targets = target_mat[top_idx]

        # Stage 2: Hopfield settle — iterate attention over candidate cues,
        # renormalizing so the state stays on the unit sphere.
        xi = query_emb
        for _ in range(steps):
            scores = self.beta * (xi @ cand_cues.T)
            attn = torch.softmax(scores, dim=0)
            xi = attn @ cand_cues
            xi = nn.functional.normalize(xi, dim=0)

        # Read-out: attend from the settled state onto the targets.
        scores = self.beta * (xi @ cand_cues.T)
        attn = torch.softmax(scores, dim=0)
        target = attn @ cand_targets
        return nn.functional.normalize(target, dim=0)
def load_model():
    """Load the MiniLM-L6 sentence encoder onto DEVICE and return it."""
    from sentence_transformers import SentenceTransformer

    encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
    return encoder
def test_augmentation(model):
    """Compare: no augmentation vs noise augmentation vs paraphrase augmentation.

    Stores 20 cue->target pairs plus 2000 templated background memories, then
    queries each memory with a paraphrase of its cue and counts how often the
    recalled vector is closest to the correct target.

    Args:
        model: sentence encoder exposing a SentenceTransformer-style
            `encode(texts, ...)` that returns normalized tensors.
    """
    print("\n=== Augmentation Comparison (20 pairs, 2000 bg) ===")

    pairs = [
        ("What's the weather like today?", "User checks weather every morning"),
        ("Let's deploy the new version", "Deployment uses GitHub Actions with k3s"),
        ("The database is slow again", "Missing index on users table"),
        ("I need to fix the authentication bug", "JWT tokens with 24h expiry in Redis"),
        ("The API returns 500 errors", "OOM in the Python worker"),
        ("Let's set up monitoring", "Prometheus + Grafana on OCI"),
        ("Tests failing in CI", "CI needs postgres service container"),
        ("Memory usage too high", "Leak in websocket handler"),
        ("Help with Docker setup", "docker-compose for dev, k3s for prod"),
        ("Log files too large", "Logs rotate daily, shipped to Loki"),
        ("How to add caching?", "Redis available at redis.internal:6379"),
        ("Frontend loads slowly", "CDN CloudFlare, 1h TTL for assets"),
        ("Refactor payment module", "Stripe API, webhook in payments/webhook.py"),
        ("Set up new server", "Ubuntu 22.04, Docker, Tailscale, monitoring"),
        ("Optimize search", "Elasticsearch v8, recently upgraded"),
        ("Backup the database", "Daily 3am UTC cron to S3"),
        ("Configure reverse proxy", "Traefik, not nginx"),
        ("Team meeting schedule", "Standup 10am London, Mon-Fri"),
        ("Learn a new programming language", "User has Python+Go, new to systems"),
        ("Review my pull request", "User prefers small PRs with clear commits"),
    ]
    # One paraphrase per pair, index-aligned with `pairs` (used as queries).
    paraphrases = [
        "How's the weather?", "Ship the release", "DB performance terrible",
        "Fix the login issue", "Server errors everywhere", "Need observability",
        "CI tests breaking", "Service using too much RAM", "Docker config help",
        "Logs eating disk space", "Want to add a cache layer", "Website too slow",
        "Payment code needs rework", "Provision a new machine", "Search is slow",
        "Need a DB backup", "Proxy configuration", "When's the standup?",
        "Want to learn Rust", "Check my pull request",
    ]
    # Hand-crafted additional paraphrases for hard cases.
    # NOTE(review): several lists repeat the exact evaluation query above
    # (e.g. index 1 stores "Ship the release", which is also the query), so
    # the paraphrase-augmented methods partially see the test queries at
    # store time — treat those accuracy numbers as an upper bound.
    extra_paraphrases = {
        1: ["Ship the release", "Push to production", "Release the new build"],
        3: ["Fix the login issue", "Authentication is broken", "Login doesn't work"],
        4: ["Server errors everywhere", "Getting 500s", "Internal server error"],
        5: ["Need observability", "Set up alerts", "Monitor the services"],
        10: ["Add a cache layer", "Implement caching", "Cache the responses"],
        11: ["Website too slow", "Page load time is bad", "Frontend performance"],
        13: ["Provision a new machine", "Need a new server", "Set up a new box"],
        17: ["When's the standup?", "What time is the meeting?", "Daily sync time?"],
        18: ["Want to learn Rust", "Getting into Rust", "Start learning Rust"],
        19: ["Check my pull request", "Look at my code changes", "PR review please"],
    }

    cue_embs = model.encode([p[0] for p in pairs], convert_to_tensor=True,
                            normalize_embeddings=True, device=DEVICE)
    target_embs = model.encode([p[1] for p in pairs], convert_to_tensor=True,
                               normalize_embeddings=True, device=DEVICE)
    para_embs = model.encode(paraphrases, convert_to_tensor=True,
                             normalize_embeddings=True, device=DEVICE)

    # Encode extra paraphrases
    extra_embs = {
        idx: model.encode(texts, convert_to_tensor=True,
                          normalize_embeddings=True, device=DEVICE)
        for idx, texts in extra_paraphrases.items()
    }

    # Background distractors: templated infra tickets over common topics.
    topics = ["server", "database", "API", "frontend", "backend", "cache",
              "queue", "network", "storage", "auth", "docker", "kubernetes",
              "redis", "nginx", "postgres", "python", "golang", "react",
              "terraform", "ansible"]
    actions = ["crashed", "is slow", "needs update", "has bug", "timed out",
               "needs migration", "needs backup", "has leak", "is down", "needs config"]
    bg_cues = [f"The {topics[i%len(topics)]} {actions[i%len(actions)]} (ticket {i})"
               for i in range(2000)]
    bg_targets = [f"Fix {topics[i%len(topics)]} {actions[i%len(actions)]}: wiki {i}"
                  for i in range(2000)]

    bg_cue_embs = model.encode(bg_cues, convert_to_tensor=True,
                               normalize_embeddings=True, device=DEVICE, batch_size=256)
    bg_target_embs = model.encode(bg_targets, convert_to_tensor=True,
                                  normalize_embeddings=True, device=DEVICE, batch_size=256)

    def add_background(mem):
        """Append the 2000 background memories verbatim (never augmented)."""
        for i in range(2000):
            mem.cue_embs.append(bg_cue_embs[i])
            mem.target_embs.append(bg_target_embs[i])
            mem.memory_ids.append(100 + i)

    def evaluate(mem, label):
        """Recall each paraphrase; correct when the result is closest to its own target."""
        correct = 0
        for i in range(len(paraphrases)):
            with torch.no_grad():
                recalled = mem.recall(para_embs[i])
            all_sims = [cosine(recalled, target_embs[j]) for j in range(len(pairs))]
            if np.argmax(all_sims) == i:
                correct += 1
        n = len(paraphrases)
        print(f"  {label}: {correct}/{n} ({correct/n:.0%})")
        return correct / n

    # Method 1: No augmentation (baseline)
    mem1 = AugmentedHopfield(n_augments=0)
    for i in range(len(pairs)):
        mem1.learn(cue_embs[i], target_embs[i], memory_id=i)
    add_background(mem1)
    evaluate(mem1, "No augmentation")

    # Method 2: Noise augmentation (5 copies) at several noise levels
    for noise in [0.1, 0.15, 0.2, 0.3]:
        mem2 = AugmentedHopfield(n_augments=5, noise_std=noise)
        for i in range(len(pairs)):
            mem2.learn(cue_embs[i], target_embs[i], memory_id=i)
        add_background(mem2)
        evaluate(mem2, f"Noise aug (σ={noise}, n=5)")

    # Method 3: Noise augmentation (20 copies)
    mem3 = AugmentedHopfield(n_augments=20, noise_std=0.15)
    for i in range(len(pairs)):
        mem3.learn(cue_embs[i], target_embs[i], memory_id=i)
    add_background(mem3)
    evaluate(mem3, "Noise aug (σ=0.15, n=20)")

    # Method 4: Paraphrase augmentation (hand-crafted extras)
    mem4 = AugmentedHopfield(n_augments=0)
    for i in range(len(pairs)):
        cue_list = [cue_embs[i]]
        if i in extra_embs:
            cue_list.extend(extra_embs[i])
        mem4.learn_with_paraphrases(cue_list, target_embs[i], memory_id=i)
    add_background(mem4)
    evaluate(mem4, "Paraphrase aug (hand-crafted)")

    # Method 5: Noise + Paraphrase combined
    # NOTE(review): learn_with_paraphrases stores cues verbatim and ignores
    # n_augments, so no noise is actually added here — this condition is
    # effectively identical to Method 4. Confirm whether noise copies were
    # intended for the paraphrase cues as well.
    mem5 = AugmentedHopfield(n_augments=5, noise_std=0.15)
    for i in range(len(pairs)):
        cue_list = [cue_embs[i]]
        if i in extra_embs:
            cue_list.extend(extra_embs[i])
        mem5.learn_with_paraphrases(cue_list, target_embs[i], memory_id=i)
    add_background(mem5)
    evaluate(mem5, "Noise + Paraphrase combined")
def main():
    """Entry point: print the banner, load the encoder, run the comparison."""
    banner = "=" * 60
    print(banner)
    print("Experiment 7e: Cue Augmentation")
    print(banner)

    model = load_model()
    test_augmentation(model)


if __name__ == "__main__":
    main()