From 1ee35b4d19bc3f378c10f3dc59eff32e05198b78 Mon Sep 17 00:00:00 2001
From: Fam Zheng <fam@euphon.net>
Date: Sun, 17 May 2026 22:47:06 +0100
Subject: [PATCH] =?UTF-8?q?notes(asr):=20overlap=20=E5=88=87=E7=89=87=20+?=
 =?UTF-8?q?=20LLM=20=E6=8B=BC=E6=8E=A5=E5=8E=BB=E9=87=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- ffmpeg 用 -ss/-t 顺序切 65s 段，stride 55s（10s overlap）；录音总时长 ≤ chunk_seconds + 5（默认即 70s）时整段不切，直接把原文件送 ASR
- 串行喂外部 ASR 后，把全部 chunk_texts 喂一次 LLM 让它去重 + 修边界字
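- 只有一段时直接返回该段 ASR 文本，不调 LLM；LLM 未配置（缺 LLM_GATEWAY/LLM_TOKEN）、调用失败或返回空文本时 fallback 到朴素拼接（换行 join，重叠部分不去重），不卡流程。注意：fallback 时响应里的 stitched_by 仍然是 'llm'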
- sidecar 注入 LLM_GATEWAY/LLM_MODEL/LLM_TOKEN env
---
 apps/notes/feishu/server.py | 164 +++++++++++++++++++++++++++---------
 apps/notes/k8s/all.yaml     |   9 ++
 2 files changed, 132 insertions(+), 41 deletions(-)

diff --git a/apps/notes/feishu/server.py b/apps/notes/feishu/server.py
index ed43aac..4535a18 100644
--- a/apps/notes/feishu/server.py
+++ b/apps/notes/feishu/server.py
@@ -31,12 +31,13 @@ def healthz():
 
 class TranscribeReq(BaseModel):
     audio_path: str
-    chunk_seconds: int = 60  # 60s ≈ 1-1.5 MB m4a，远低于 ASR 限制
+    chunk_seconds: int = 65       # 单段长度，远低于 Qwen3-ASR 8192-token cache（~7min）
+    overlap_seconds: int = 10     # 相邻段重叠，给 LLM stitching 留 anchor
 
 
 @app.post('/transcribe')
 def transcribe(req: TranscribeReq):
-    """ffmpeg 切片 → 串行喂外部 ASR → 拼接 transcript。"""
+    """ffmpeg 切 overlap 片 → 串行 ASR → LLM 拼接去重。"""
     src = Path(req.audio_path)
     if not src.exists():
         raise HTTPException(400, f'audio not found: {src}')
@@ -48,62 +49,143 @@ def transcribe(req: TranscribeReq):
     tmp = Path(tempfile.gettempdir()) / f'transcribe-{uuid.uuid4().hex}'
     tmp.mkdir(parents=True)
     try:
-        # 用 ffmpeg segment：直接 copy stream（fast & 不损失质量）
-        # 个别情况下 -c copy 在某些容器格式下切不精准，回退 re-encode 到 aac
-        ext = src.suffix.lstrip('.') or 'm4a'
-        chunk_pattern = f'chunk_%03d.{ext}'
+        # 1) 用 ffprobe 拿总时长
+        out = subprocess.check_output(
+            ['ffprobe', '-v', 'quiet', '-show_entries', 'format=duration',
+             '-of', 'csv=p=0', str(src)],
+            timeout=60,
+        )
         try:
-            subprocess.run(
-                ['ffmpeg', '-y', '-i', str(src),
-                 '-f', 'segment', '-segment_time', str(req.chunk_seconds),
-                 '-c', 'copy', '-reset_timestamps', '1',
-                 str(tmp / chunk_pattern)],
-                check=True, capture_output=True, timeout=180,
-            )
-        except subprocess.CalledProcessError:
-            # fallback: re-encode AAC，慢但稳
-            log.warning("ffmpeg -c copy 失败，回退 re-encode")
-            for p in tmp.glob(f'chunk_*.{ext}'):
-                p.unlink(missing_ok=True)
-            subprocess.run(
-                ['ffmpeg', '-y', '-i', str(src),
-                 '-f', 'segment', '-segment_time', str(req.chunk_seconds),
-                 '-c:a', 'aac', '-b:a', '64k', '-ac', '1', '-ar', '16000',
-                 '-reset_timestamps', '1',
-                 str(tmp / 'chunk_%03d.m4a')],
-                check=True, capture_output=True, timeout=600,
-            )
-            ext = 'm4a'
+            duration = float(out.decode().strip())
+        except ValueError:
+            raise HTTPException(500, f'ffprobe duration parse: {out!r}')
+        log.info("duration=%.1fs", duration)
 
-        chunks = sorted(tmp.glob(f'chunk_*.{ext}'))
-        if not chunks:
-            raise HTTPException(500, 'ffmpeg produced 0 chunks')
-        log.info("split %s → %d chunks", src.name, len(chunks))
+        # 2) 切 chunk_seconds 段，stride = chunk_seconds - overlap_seconds
+        stride = max(1, req.chunk_seconds - req.overlap_seconds)
+        ext = src.suffix.lstrip('.') or 'm4a'
+        chunks_meta = []
+        i = 0
+        start = 0.0
+        # 短录音单段够：不切，直接整段
+        single_shot = duration <= req.chunk_seconds + 5
+        if single_shot:
+            chunks_meta = [{'start': 0.0, 'path': src, 'idx': 0}]
+        else:
+            while start < duration:
+                cp = tmp / f'chunk_{i:03d}.{ext}'
+                # -ss 在 -i 前：input seek，快；-c copy 不重新编码
+                try:
+                    subprocess.run(
+                        ['ffmpeg', '-y', '-ss', f'{start:.2f}',
+                         '-t', f'{req.chunk_seconds}',
+                         '-i', str(src), '-c', 'copy', str(cp)],
+                        check=True, capture_output=True, timeout=120,
+                    )
+                except subprocess.CalledProcessError:
+                    subprocess.run(
+                        ['ffmpeg', '-y', '-ss', f'{start:.2f}',
+                         '-t', f'{req.chunk_seconds}',
+                         '-i', str(src),
+                         '-c:a', 'aac', '-b:a', '64k', '-ac', '1', '-ar', '16000',
+                         str(cp)],
+                        check=True, capture_output=True, timeout=180,
+                    )
+                if cp.stat().st_size < 1024:
+                    break
+                chunks_meta.append({'start': start, 'path': cp, 'idx': i})
+                start += stride
+                i += 1
 
-        all_text = []
-        for i, c in enumerate(chunks, 1):
-            log.info("ASR chunk %d/%d (%s, %d KB)", i, len(chunks), c.name, c.stat().st_size // 1024)
-            with open(c, 'rb') as f:
+        if not chunks_meta:
+            raise HTTPException(500, 'no chunks produced')
+        log.info("chunks=%d, stride=%ds, overlap=%ds",
+                 len(chunks_meta), stride, req.overlap_seconds)
+
+        # 3) 串行 ASR
+        chunk_texts = []
+        for m in chunks_meta:
+            log.info("ASR chunk %d/%d (start=%.1fs, %dKB)",
+                     m['idx'] + 1, len(chunks_meta), m['start'],
+                     m['path'].stat().st_size // 1024)
+            with open(m['path'], 'rb') as f:
                 r = requests.post(
                     asr_url,
                     headers={'Authorization': f'Bearer {asr_token}'},
-                    files={'file': (c.name, f, 'audio/mp4')},
+                    files={'file': (m['path'].name, f, 'audio/mp4')},
                     data={'model': 'qwen3-asr', 'response_format': 'json'},
                     timeout=300,
                 )
             if not r.ok:
-                raise HTTPException(502, f'ASR chunk {i} {r.status_code}: {r.text[:300]}')
+                raise HTTPException(502, f'ASR chunk {m["idx"]} {r.status_code}: {r.text[:300]}')
             try:
                 text = r.json().get('text', '').strip()
             except Exception:
-                raise HTTPException(502, f'ASR chunk {i} bad json: {r.text[:200]}')
-            all_text.append(text)
-        full = '\n'.join(t for t in all_text if t)
-        return {'text': full, 'chunks': len(chunks)}
+                raise HTTPException(502, f'ASR chunk {m["idx"]} bad json: {r.text[:200]}')
+            chunk_texts.append(text)
+
+        # 4) 单段直接返回
+        if len(chunk_texts) == 1:
+            return {'text': chunk_texts[0], 'chunks': 1, 'stitched_by': 'single'}
+
+        # 5) LLM 拼接（gemma 一次性看所有 chunks 去重 + 拼）
+        stitched = llm_stitch(chunk_texts, req.overlap_seconds)
+        return {
+            'text': stitched,
+            'chunks': len(chunk_texts),
+            'stitched_by': 'llm',
+        }
     finally:
         shutil.rmtree(tmp, ignore_errors=True)
 
 
+def llm_stitch(chunks: list[str], overlap_seconds: int) -> str:
+    """让 LLM 把相邻段重叠部分去重 + 修正边界字。失败 fallback 朴素拼接。"""
+    gw = os.environ.get('LLM_GATEWAY', '').rstrip('/')
+    tok = os.environ.get('LLM_TOKEN', '')
+    model = os.environ.get('LLM_MODEL', 'gemma-4-31b-it')
+    naive = '\n'.join(chunks)
+    if not gw or not tok:
+        log.warning("LLM not configured, fall back to naive concat")
+        return naive
+
+    parts = []
+    for i, c in enumerate(chunks):
+        parts.append(f"段 {i + 1}：\n{c}")
+    user = (
+        f"下面是一段会议录音的 ASR 转写，被切成 {len(chunks)} 段。"
+        f"相邻段有约 {overlap_seconds} 秒（几句话）的重叠。\n\n"
+        + "\n\n".join(parts)
+        + "\n\n请把所有段拼成一段连续文本：去掉相邻段交界处的重复、"
+          "修正明显 ASR 错字（结合上下文）、补回被切断的词。\n"
+          "不要加任何解释、标题、段号；只输出拼好的连续文本。"
+    )
+    payload = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": "你是 ASR 转写后处理助手，专门做去重拼接和错字修正。"},
+            {"role": "user", "content": user},
+        ],
+        "temperature": 0.1,
+    }
+    try:
+        r = requests.post(
+            gw + '/chat/completions',
+            headers={'Authorization': f'Bearer {tok}'},
+            json=payload,
+            timeout=600,
+        )
+        if not r.ok:
+            log.warning("stitch LLM %s: %s", r.status_code, r.text[:200])
+            return naive
+        d = r.json()
+        text = d['choices'][0]['message']['content'].strip()
+        return text or naive
+    except Exception as e:
+        log.warning("stitch LLM call failed: %s", e)
+        return naive
+
+
 class ConvertReq(BaseModel):
     md_path: str
     title: Optional[str] = None
diff --git a/apps/notes/k8s/all.yaml b/apps/notes/k8s/all.yaml
index 27ace1c..695dc7e 100644
--- a/apps/notes/k8s/all.yaml
+++ b/apps/notes/k8s/all.yaml
@@ -116,6 +116,15 @@ spec:
                 secretKeyRef:
                   name: notes-creds
                   key: asr_token
+            - name: LLM_GATEWAY
+              value: http://3.135.65.204:8848/v1
+            - name: LLM_MODEL
+              value: gemma-4-31b-it
+            - name: LLM_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: notes-creds
+                  key: llm_token
           readinessProbe:
             httpGet: { path: /healthz, port: feishu }
             initialDelaySeconds: 3