improve: KB search — replace per-call embedding subprocess with a persistent HTTP embedding server and batch article indexing
This commit is contained in:
parent
fe1370230f
commit
69ad06ca5b
@ -1,26 +1,66 @@
|
||||
#!/usr/bin/env python3
"""Embedding HTTP server. Loads model once at startup, serves requests on port 8199.

POST /embed {"texts": ["text1", "text2", ...]}
Response: {"embeddings": [[0.1, 0.2, ...], ...]}

GET /health -> 200 OK
"""
import json
import socket
from http.server import HTTPServer, BaseHTTPRequestHandler

from sentence_transformers import SentenceTransformer

MODEL_NAME = "all-MiniLM-L6-v2"
PORT = 8199

# Set once in main() before the server starts accepting requests;
# module-level so EmbedHandler can reach it.
model = None


class EmbedHandler(BaseHTTPRequestHandler):
    """Handles POST /embed (JSON in/out) and GET health probes."""

    def do_POST(self):
        # Read the JSON request body; Content-Length may be absent -> treat as empty.
        length = int(self.headers.get("Content-Length", 0))
        body = self.rfile.read(length)
        data = json.loads(body)
        texts = data.get("texts", [])

        if not texts:
            # Empty input short-circuits: no model call needed.
            result = {"embeddings": []}
        else:
            # normalize_embeddings=True returns unit vectors, so a dot
            # product downstream is equivalent to cosine similarity.
            embeddings = model.encode(texts, normalize_embeddings=True)
            result = {"embeddings": embeddings.tolist()}

        resp = json.dumps(result).encode()
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(resp)))
        self.end_headers()
        self.wfile.write(resp)

    def do_GET(self):
        # Health check: any GET path answers 200 "ok".
        self.send_response(200)
        self.send_header("Content-Type", "text/plain")
        self.end_headers()
        self.wfile.write(b"ok")

    def log_message(self, format, *args):
        # Suppress per-request logs
        pass


class DualStackHTTPServer(HTTPServer):
    """HTTPServer bound on IPv6 with V6ONLY cleared, so it also accepts IPv4."""

    address_family = socket.AF_INET6

    def server_bind(self):
        # Clear IPV6_V6ONLY so the "::" socket serves IPv4 clients too.
        self.socket.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, 0)
        super().server_bind()


def main():
    global model
    # Load model once at startup; every request then reuses it, avoiding
    # the per-invocation model-load cost of the old stdin-based script.
    print(f"Loading model {MODEL_NAME}...", flush=True)
    model = SentenceTransformer(MODEL_NAME)
    print(f"Model loaded, serving on port {PORT}", flush=True)

    server = DualStackHTTPServer(("::", PORT), EmbedHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Clean exit on Ctrl-C.
        pass


if __name__ == "__main__":
    main()
68
src/kb.rs
68
src/kb.rs
@ -1,6 +1,5 @@
|
||||
use anyhow::Result;
|
||||
use sqlx::sqlite::SqlitePool;
|
||||
use std::process::Stdio;
|
||||
|
||||
const TOP_K: usize = 5;
|
||||
|
||||
@ -30,21 +29,34 @@ impl KbManager {
|
||||
|
||||
/// Re-index a single article: delete its old chunks, chunk the content, embed, store
|
||||
pub async fn index(&self, article_id: &str, content: &str) -> Result<()> {
|
||||
// Delete only this article's chunks
|
||||
sqlx::query("DELETE FROM kb_chunks WHERE article_id = ?")
|
||||
.bind(article_id)
|
||||
.execute(&self.pool)
|
||||
.await?;
|
||||
self.index_batch(&[(article_id.to_string(), content.to_string())]).await
|
||||
}
|
||||
|
||||
let chunks = split_chunks(content);
|
||||
if chunks.is_empty() {
|
||||
/// Batch re-index multiple articles in one embedding call (avoids repeated model loading).
|
||||
pub async fn index_batch(&self, articles: &[(String, String)]) -> Result<()> {
|
||||
// Collect all chunks with their article_id
|
||||
let mut all_chunks: Vec<(String, Chunk)> = Vec::new(); // (article_id, chunk)
|
||||
for (article_id, content) in articles {
|
||||
sqlx::query("DELETE FROM kb_chunks WHERE article_id = ?")
|
||||
.bind(article_id)
|
||||
.execute(&self.pool)
|
||||
.await?;
|
||||
|
||||
let chunks = split_chunks(content);
|
||||
for chunk in chunks {
|
||||
all_chunks.push((article_id.clone(), chunk));
|
||||
}
|
||||
}
|
||||
|
||||
if all_chunks.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let texts: Vec<String> = chunks.iter().map(|c| c.content.clone()).collect();
|
||||
// Single embedding call for all chunks
|
||||
let texts: Vec<String> = all_chunks.iter().map(|(_, c)| c.content.clone()).collect();
|
||||
let embeddings = compute_embeddings(&texts).await?;
|
||||
|
||||
for (chunk, embedding) in chunks.iter().zip(embeddings.into_iter()) {
|
||||
for ((article_id, chunk), embedding) in all_chunks.iter().zip(embeddings.into_iter()) {
|
||||
let vec_bytes = embedding_to_bytes(&embedding);
|
||||
sqlx::query(
|
||||
"INSERT INTO kb_chunks (id, article_id, title, content, embedding) VALUES (?, ?, ?, ?, ?)",
|
||||
@ -58,7 +70,7 @@ impl KbManager {
|
||||
.await?;
|
||||
}
|
||||
|
||||
tracing::info!("KB indexed article {}: {} chunks", article_id, chunks.len());
|
||||
tracing::info!("KB indexed {} articles, {} total chunks", articles.len(), all_chunks.len());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@ -138,30 +150,28 @@ impl KbManager {
|
||||
}
|
||||
}
|
||||
|
||||
/// Call Python script to compute embeddings
|
||||
/// Call embedding HTTP server
|
||||
async fn compute_embeddings(texts: &[String]) -> Result<Vec<Vec<f32>>> {
|
||||
let embed_url = std::env::var("TORI_EMBED_URL")
|
||||
.unwrap_or_else(|_| "http://127.0.0.1:8199".to_string());
|
||||
let client = reqwest::Client::new();
|
||||
let input = serde_json::json!({ "texts": texts });
|
||||
|
||||
let mut child = tokio::process::Command::new("/app/venv/bin/python")
|
||||
.arg("/app/scripts/embed.py")
|
||||
.stdin(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.spawn()?;
|
||||
let resp = client
|
||||
.post(format!("{}/embed", embed_url))
|
||||
.json(&input)
|
||||
.timeout(std::time::Duration::from_secs(300))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("Embedding server request failed (is embed.py running?): {}", e))?;
|
||||
|
||||
if let Some(mut stdin) = child.stdin.take() {
|
||||
use tokio::io::AsyncWriteExt;
|
||||
stdin.write_all(input.to_string().as_bytes()).await?;
|
||||
if !resp.status().is_success() {
|
||||
let status = resp.status();
|
||||
let body = resp.text().await.unwrap_or_default();
|
||||
anyhow::bail!("Embedding server error {}: {}", status, body);
|
||||
}
|
||||
|
||||
let output = child.wait_with_output().await?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
anyhow::bail!("Embedding script failed: {}", stderr);
|
||||
}
|
||||
|
||||
let result: serde_json::Value = serde_json::from_slice(&output.stdout)?;
|
||||
let result: serde_json::Value = resp.json().await?;
|
||||
let embeddings: Vec<Vec<f32>> = result["embeddings"]
|
||||
.as_array()
|
||||
.ok_or_else(|| anyhow::anyhow!("Invalid embedding output"))?
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user