nuonuo/experiments/exp11_scale_ceiling.py
Fam Zheng d923aa1e31 NuoNuo: Hippocampal memory module prototype
Hopfield + Hebbian hybrid memory system for LLMs.
Two nights of experiments (16 iterations), validated on LongMemEval (ICLR 2025).

Architecture:
- Single-hop: Two-Stage Hopfield (NN top-20 → softmax settle)
- Multi-hop: Hebbian W matrix with WTA pattern separation
- 64% on LongMemEval (500 questions), retrieval-only, no LLM dependency
- 4ms latency @ 20K memories, ~1GB VRAM

Key findings:
- Hopfield attention solved noise tolerance (20% → 100% vs flat Hebbian)
- WTA pattern separation enables 20K+ capacity
- Multi-hop associative chains (6 hops, CosSim=1.0) — RAG can't do this
- MiniLM-L6 is optimal (discrimination gap > absolute similarity)
- Paraphrase cue augmentation: 55% → 100% on synthetic, 36% → 64% on benchmark
- SNN encoder viable (CosSim 0.99) but not needed for current architecture
2026-04-07 10:37:24 +01:00

238 lines
8.3 KiB
Python

"""Experiment P3: Breaking the 20K 80% ceiling.
Hypothesis: NN pre-filter (top-20) misses the correct cue at large scale.
Tests:
1. Oracle analysis: is the correct cue in top-K? What K is needed?
2. Hierarchical memory: cluster memories, route query to relevant cluster
3. Re-ranking: top-K NN → cross-similarity re-rank → Hopfield on re-ranked
4. Multiple projections: ensemble of NN lookups with different random projections
"""
import sys
import time
from pathlib import Path
import torch
import torch.nn as nn
import numpy as np
# All encoding and retrieval runs on the GPU.
DEVICE = "cuda"
# (cue, target) pairs: the cue is what a user might say, the target is the
# stored memory that should be retrieved for it. Index in this list is the
# memory id used for scoring.
PAIRS = [
    ("What's the weather like today?", "User checks weather every morning"),
    ("Let's deploy the new version", "Deployment uses GitHub Actions with k3s"),
    ("The database is slow again", "Missing index on users table"),
    ("I need to fix the authentication bug", "JWT tokens with 24h expiry in Redis"),
    ("The API returns 500 errors", "OOM in the Python worker"),
    ("Let's set up monitoring", "Prometheus + Grafana on OCI"),
    ("Tests failing in CI", "CI needs postgres service container"),
    ("Memory usage too high", "Leak in websocket handler"),
    ("Help with Docker setup", "docker-compose for dev, k3s for prod"),
    ("Log files too large", "Logs rotate daily, shipped to Loki"),
]
# One paraphrase per PAIRS entry, in the same order — these are the retrieval
# queries; a prediction is correct when it recovers the matching index.
PARAPHRASES = [
    "How's the weather?", "Ship the release", "DB performance terrible",
    "Fix the login issue", "Server errors everywhere", "Need observability",
    "CI tests breaking", "Service using too much RAM", "Docker config help",
    "Logs eating disk space",
]
def cosine(a, b):
    """Return the cosine similarity of two 1-D tensors as a Python float."""
    row_a = a.unsqueeze(0)
    row_b = b.unsqueeze(0)
    sim = nn.functional.cosine_similarity(row_a, row_b)
    return sim.item()
def load_model():
    """Load the MiniLM sentence encoder onto DEVICE and return it."""
    # Imported lazily so the module can be inspected without the dependency.
    from sentence_transformers import SentenceTransformer
    encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
    return encoder
def build_memory(model, n_bg):
    """Build memory with test pairs + background.

    Encodes the PAIRS cues/targets and the PARAPHRASES, then optionally adds
    n_bg synthetic background memories (ids offset by 100 so they never
    collide with the real pair ids 0..len(PAIRS)-1).
    Returns (cue_mat, target_mat, mids, cue_embs, target_embs, para_embs).
    """
    enc = dict(convert_to_tensor=True, normalize_embeddings=True, device=DEVICE)
    cue_embs = model.encode([p[0] for p in PAIRS], **enc)
    target_embs = model.encode([p[1] for p in PAIRS], **enc)
    para_embs = model.encode(PARAPHRASES, **enc)
    all_mids = list(range(len(PAIRS)))
    if n_bg > 0:
        topics = ["server", "db", "api", "fe", "be", "cache",
                  "queue", "net", "store", "auth", "docker", "k8s"]
        bg_cues = [f"The {topics[i%len(topics)]} has issue {i}" for i in range(n_bg)]
        bg_targets = [f"Fix {topics[i%len(topics)]} issue {i}" for i in range(n_bg)]
        bg_c = model.encode(bg_cues, batch_size=256, **enc)
        bg_t = model.encode(bg_targets, batch_size=256, **enc)
        cue_mat = torch.cat([cue_embs, bg_c])
        target_mat = torch.cat([target_embs, bg_t])
        all_mids.extend(100 + i for i in range(n_bg))
    else:
        cue_mat = torch.stack(list(cue_embs))
        target_mat = torch.stack(list(target_embs))
    return cue_mat, target_mat, all_mids, cue_embs, target_embs, para_embs
def test_topk_coverage(model, n_bg_list):
    """Is the correct cue in top-K? What K do we need?

    Oracle analysis: for each background scale and each K, count how often the
    ground-truth cue id survives a plain top-K nearest-neighbour pre-filter.
    """
    print("=== Test 1: Top-K Coverage Analysis ===\n")
    n = len(PARAPHRASES)
    for n_bg in n_bg_list:
        cue_mat, target_mat, mids, cue_embs, target_embs, para_embs = build_memory(model, n_bg)
        for K in [5, 10, 20, 50, 100, 200]:
            k = min(K, len(mids))
            in_topk = 0
            for i in range(n):
                top_idx = (para_embs[i] @ cue_mat.T).topk(k).indices
                # Paraphrase i is covered when memory id i is among the top-k.
                if i in (mids[j] for j in top_idx.tolist()):
                    in_topk += 1
            print(f" N={n_bg+len(PAIRS):>6}, K={K:>3}: "
                  f"{in_topk}/{n} ({in_topk/n:.0%}) correct cue in top-K")
        print()
def test_two_stage_topk(model, n_bg):
    """Vary K in two-stage Hopfield to find optimal.

    Stage 1: top-K nearest-neighbour pre-filter over all cues.
    Stage 2: 3 Hopfield settle iterations (softmax attention over candidate
    cues at inverse temperature 16) followed by a per-memory-id attention-mass
    vote; the heaviest id is the prediction, correct when it equals the
    paraphrase index.
    """
    print(f"\n=== Test 2: Two-Stage K Optimization (bg={n_bg}) ===\n")
    cue_mat, target_mat, mids, cue_embs, target_embs, para_embs = build_memory(model, n_bg)
    n = len(PARAPHRASES)
    for K in [5, 10, 20, 50, 100, 200]:
        correct = 0
        for i in range(n):
            sims = para_embs[i] @ cue_mat.T
            k = min(K, len(mids))
            _, top_idx = sims.topk(k)
            cand_cues = cue_mat[top_idx]
            # NOTE: candidate targets were unpacked here but never used;
            # retrieval accuracy is scored on memory ids alone.
            cand_mids = [mids[j] for j in top_idx.tolist()]
            # Hopfield settle: iterate attention-weighted retrieval,
            # re-normalizing the state each step.
            xi = para_embs[i]
            for _ in range(3):
                scores = 16.0 * (xi @ cand_cues.T)
                attn = torch.softmax(scores, dim=0)
                xi = attn @ cand_cues
                xi = nn.functional.normalize(xi, dim=0)
            # Final attention distribution over candidates after settling.
            scores = 16.0 * (xi @ cand_cues.T)
            attn = torch.softmax(scores, dim=0)
            # Vote: accumulate attention mass per memory id.
            mid_scores = {}
            for j, mid in enumerate(cand_mids):
                mid_scores[mid] = mid_scores.get(mid, 0) + attn[j].item()
            best_mid = max(mid_scores, key=mid_scores.get)
            if best_mid == i:
                correct += 1
        print(f" K={K:>3}: {correct}/{n} ({correct/n:.0%})")
def test_hierarchical(model, n_bg):
    """Cluster memories by topic, route query to relevant cluster.

    Runs a simple spherical k-means over the cue embeddings, routes each
    paraphrase to its top-3 clusters, then applies the top-20 NN + Hopfield
    settle pipeline within the routed candidates only.
    """
    print(f"\n=== Test 3: Hierarchical Memory (bg={n_bg}) ===\n")
    cue_mat, target_mat, mids, cue_embs, target_embs, para_embs = build_memory(model, n_bg)
    # Roughly one cluster per 100 memories, at least 10.
    n_clusters = max(10, (n_bg + len(PAIRS)) // 100)
    # K-means (simple implementation): embeddings are unit-norm, so
    # 1 - dot product is the cosine distance.
    N = cue_mat.shape[0]
    centroids = cue_mat[torch.randperm(N)[:n_clusters]].clone()
    for _ in range(20):
        dists = 1 - cue_mat @ centroids.T  # cosine distance
        assignments = dists.argmin(dim=1)
        for c in range(n_clusters):
            mask = assignments == c
            if mask.sum() > 0:
                # Re-normalize the mean so centroids stay on the unit sphere.
                centroids[c] = nn.functional.normalize(cue_mat[mask].mean(dim=0), dim=0)
    # Route query to top-3 clusters, then Hopfield within.
    correct = 0
    for i in range(len(PARAPHRASES)):
        # Find relevant clusters for this paraphrase.
        cluster_sims = para_embs[i] @ centroids.T
        top_clusters = cluster_sims.topk(3).indices
        # Gather (deduplicated) candidate indices from the routed clusters.
        cand_idx = []
        for c in top_clusters:
            cand_idx.extend((assignments == c).nonzero().squeeze(-1).tolist())
        cand_idx = list(set(cand_idx))
        if not cand_idx:
            continue  # no candidates routed -> counted as a miss
        # Hopfield on candidates (targets are not needed — scoring is by id).
        cand_cues = cue_mat[cand_idx]
        cand_mids = [mids[j] for j in cand_idx]
        # Local top-K NN pre-filter within the routed candidates.
        K = min(20, len(cand_idx))
        sims = para_embs[i] @ cand_cues.T
        _, top_local = sims.topk(K)
        local_cues = cand_cues[top_local]
        local_mids = [cand_mids[j] for j in top_local.tolist()]
        # Hopfield settle (3 iterations, inverse temperature 16).
        xi = para_embs[i]
        for _ in range(3):
            scores = 16.0 * (xi @ local_cues.T)
            attn = torch.softmax(scores, dim=0)
            xi = attn @ local_cues
            xi = nn.functional.normalize(xi, dim=0)
        scores = 16.0 * (xi @ local_cues.T)
        attn = torch.softmax(scores, dim=0)
        # Vote: accumulate attention mass per memory id.
        mid_scores = {}
        for j, mid in enumerate(local_mids):
            mid_scores[mid] = mid_scores.get(mid, 0) + attn[j].item()
        best_mid = max(mid_scores, key=mid_scores.get)
        if best_mid == i:
            correct += 1
    n = len(PARAPHRASES)
    print(f" Hierarchical (clusters={n_clusters}): {correct}/{n} ({correct/n:.0%})")
def main():
    """Run all three ceiling-breaking experiments end to end."""
    banner = "=" * 60
    print(banner)
    print("Experiment P3: Breaking the 20K Ceiling")
    print(banner)
    model = load_model()
    # Test 1: Top-K coverage across a sweep of background sizes.
    test_topk_coverage(model, [0, 500, 2000, 5000, 10000, 20000])
    # Test 2: K optimization at increasing scale.
    for bg in (2000, 10000, 20000):
        test_two_stage_topk(model, bg)
    # Test 3: Hierarchical routing at the same scales.
    for bg in (2000, 10000, 20000):
        test_hierarchical(model, bg)
# Script entry point.
if __name__ == "__main__":
    main()