nuonuo/experiments/exp08_llm_integration.py
Fam Zheng d923aa1e31 NuoNuo: Hippocampal memory module prototype
Hopfield + Hebbian hybrid memory system for LLMs.
Two nights of experiments (16 iterations), validated on LongMemEval (ICLR 2025).

Architecture:
- Single-hop: Two-Stage Hopfield (NN top-20 → softmax settle)
- Multi-hop: Hebbian W matrix with WTA pattern separation
- 64% on LongMemEval (500 questions), retrieval-only, no LLM dependency
- 4ms latency @ 20K memories, ~1GB VRAM

Key findings:
- Hopfield attention solved noise tolerance (20% → 100% vs flat Hebbian)
- WTA pattern separation enables 20K+ capacity
- Multi-hop associative chains (6 hops, CosSim=1.0) — RAG can't do this
- MiniLM-L6 is optimal (discrimination gap > absolute similarity)
- Paraphrase cue augmentation: 55% → 100% on synthetic, 36% → 64% on benchmark
- SNN encoder viable (CosSim 0.99) but not needed for current architecture
2026-04-07 10:37:24 +01:00

214 lines
7.5 KiB
Python

"""Experiment P0: LLM Integration — end-to-end memory-augmented conversation.
Tests:
1. Memory extraction (heuristic fallback since LLM gateway is down)
2. Paraphrase generation (heuristic fallback)
3. End-to-end: conversation → extract → store → recall → inject
4. Multi-turn conversation simulation
"""
import sys
import time
from pathlib import Path
import torch
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
sys.path.insert(0, str(Path(__file__).parent.parent))
from nuonuo.hippocampus import HippocampalMemory
from llm import (LLMClient, extract_memories_heuristic, extract_memories_llm,
generate_paraphrases_heuristic, generate_paraphrases_llm,
format_recalled_memories)
# Prefer the GPU but fall back to CPU so the experiment still runs on
# machines without CUDA (the original hard-coded "cuda", which raises at
# encode time on CPU-only hosts).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def load_model():
    """Load the MiniLM-L6 sentence encoder onto DEVICE.

    The import is kept local so the heavy dependency is only pulled in
    when the model is actually needed.
    """
    from sentence_transformers import SentenceTransformer
    encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
    return encoder
def emb(model, text):
    """Encode a single string and return its normalized embedding tensor."""
    batch = model.encode(
        [text],
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=DEVICE,
    )
    return batch[0]
def test_heuristic_extraction():
    """Exercise the LLM-free memory extractor on canned conversation turns."""
    print("=== Test 1: Heuristic Memory Extraction ===\n")
    sample_turns = [
        ("How do I deploy to production?",
         "Use the blue-green deployment pipeline via GitHub Actions. The config is in .github/workflows/deploy.yml"),
        ("The database is really slow today",
         "Check for missing indexes on the users table. Last time this happened it was the created_at column."),
        ("Hi, how are you?",
         "I'm doing well, thanks!"),
        ("What port does Redis run on?",
         "Redis is on port 6379 at redis.internal"),
        ("Fix the auth bug please",
         "The auth service uses JWT tokens with 24h expiry stored in Redis. The bug was in token refresh logic."),
    ]
    for question, answer in sample_turns:
        extracted = extract_memories_heuristic(question, answer)
        print(f" User: {question[:50]}...")
        # Small-talk turns ("Hi, how are you?") should yield nothing.
        if not extracted:
            print(f" → (nothing extracted)")
        else:
            for mem in extracted:
                print(f" → CUE: {mem.cue[:40]}... | TARGET: {mem.target[:50]}... | IMP: {mem.importance}")
        print()
def test_heuristic_paraphrases():
    """Exercise the LLM-free paraphrase generator on sample queries."""
    print("=== Test 2: Heuristic Paraphrase Generation ===\n")
    samples = (
        "How do I deploy to production?",
        "The database is slow",
        "Can you fix the authentication bug?",
        "I need to configure nginx",
        "Let's set up monitoring for the server",
    )
    for sentence in samples:
        variants = generate_paraphrases_heuristic(sentence, n=3)
        print(f" Original: {sentence}")
        for variant in variants:
            print(variant)
        print()
def test_end_to_end(model):
    """Full pipeline: conversation → extract → store → recall → inject.

    Phase 1 walks scripted conversation turns, extracts cue/target memory
    pairs (via the LLM gateway when reachable, heuristics otherwise),
    augments each cue with paraphrases, and stores the embeddings in
    HippocampalMemory.  Phase 2 issues novel paraphrased queries and prints
    single-hop recall, multi-hop chain recall, and the formatted
    context-injection string.

    Args:
        model: sentence encoder compatible with ``emb`` (MiniLM-L6,
            384-dim output — matches ``embed_dim=384`` below).
    """
    print("=== Test 3: End-to-End Pipeline ===\n")
    memory = HippocampalMemory(embed_dim=384)
    llm = LLMClient()  # Will fail gracefully if gateway down
    # Simulate a few conversation turns
    turns = [
        ("How do I deploy to production?",
         "Use blue-green deployment via GitHub Actions. Config in .github/workflows/deploy.yml"),
        ("The database is really slow",
         "Check for missing indexes on users table, especially created_at column"),
        ("What port does Redis run on?",
         "Redis is on port 6379 at redis.internal"),
        ("Fix the auth bug",
         "Auth uses JWT tokens with 24h expiry in Redis. Bug was in token refresh."),
        ("How do I backup the database?",
         "Backups run daily at 3am UTC via cron job to S3. Config in /etc/cron.d/db-backup"),
    ]
    # Phase 1: Learn from conversations
    print("--- Phase 1: Learning from conversations ---")
    for user_msg, assistant_msg in turns:
        # Extract memories (LLM if the gateway is up, heuristic fallback otherwise)
        if llm.available:
            memories = extract_memories_llm(llm, user_msg, assistant_msg)
        else:
            memories = extract_memories_heuristic(user_msg, assistant_msg)
        for mem_item in memories:
            # Generate paraphrases of the cue to widen recall coverage
            if llm.available:
                paras = generate_paraphrases_llm(llm, mem_item.cue, n=3)
            else:
                paras = generate_paraphrases_heuristic(mem_item.cue, n=3)
            # Embed and store
            cue_emb = emb(model, mem_item.cue)
            target_emb = emb(model, mem_item.target)
            para_embs = [emb(model, p) for p in paras] if paras else None
            mid = memory.store(
                cue_emb, target_emb,
                cue_variants=para_embs,
                metadata={"cue": mem_item.cue, "target": mem_item.target,
                          "importance": mem_item.importance},
            )
            print(f" Stored [{mid}]: {mem_item.cue[:40]}... → {mem_item.target[:40]}...")
            if paras:
                print(f" + {len(paras)} paraphrases: {[p[:30] for p in paras]}")
    print(f"\n Total: {memory.stats()}")
    # Phase 2: Recall — queries are deliberate paraphrases of the stored cues
    print("\n--- Phase 2: Recall from new queries ---")
    queries = [
        "DB performance is terrible",
        "How to push a new release?",
        "What's the Redis connection info?",
        "The login system has a problem",
        "Need to create a database backup",
        "Where's the deployment config?",
    ]
    for query in queries:
        query_emb = emb(model, query)
        # Single-hop recall
        results = memory.recall(query_emb, top_k=2)
        # Multi-hop
        chain = memory.recall_chain(query_emb, hops=2)
        # Append chain hits that single-hop missed.  Hoist the id set out of
        # the comprehension: the original rebuilt it once per chain element.
        seen_ids = {r.memory_id for r in results}
        all_results = results + [r for r in chain if r.memory_id not in seen_ids]
        context = format_recalled_memories(all_results)
        print(f"\n Query: \"{query}\"")
        if results:
            print(f" Top result: {results[0].metadata.get('target', '?')[:60]}...")
            print(f" Similarity: {results[0].similarity:.3f}")
        if chain and len(chain) > 1:
            print(f" Chain hop 2: {chain[1].metadata.get('target', '?')[:60]}...")
        if context:
            print(f" Context injection:\n {context.replace(chr(10), chr(10) + ' ')}")
def test_llm_live(model):
    """Round-trip extraction and paraphrasing through the live LLM gateway.

    Skips cleanly when the gateway is unreachable.

    NOTE(review): ``model`` is accepted for signature parity with the other
    test functions but is not used in this body.
    """
    print("\n=== Test 4: Live LLM Integration ===\n")
    client = LLMClient()
    if not client.available:
        print(" LLM Gateway not available. Skipping live test.")
        print(" To test: ensure https://ste-jarvis.tiktok-row.net/llm/v1 is reachable")
        return
    # Test extraction
    user_msg = "The payment webhook keeps failing with a 502 error"
    assistant_msg = "The webhook endpoint at /api/payments/webhook is behind nginx. Check if the upstream timeout is too short — payment processing can take up to 30 seconds."
    extracted = extract_memories_llm(client, user_msg, assistant_msg)
    print(f" Extracted {len(extracted)} memories from live LLM:")
    for item in extracted:
        print(f" CUE: {item.cue} | TARGET: {item.target[:60]}... | IMP: {item.importance}")
    # Test paraphrase
    if not extracted:
        return
    variants = generate_paraphrases_llm(client, extracted[0].cue, n=3)
    print(f"\n Paraphrases for '{extracted[0].cue}':")
    for variant in variants:
        print(variant)
def main():
    """Run all four integration tests in order (heuristic first, live last)."""
    banner = "=" * 60
    print(banner)
    print("Experiment P0: LLM Integration")
    print(banner)
    encoder = load_model()
    test_heuristic_extraction()
    test_heuristic_paraphrases()
    test_end_to_end(encoder)
    test_llm_live(encoder)


if __name__ == "__main__":
    main()