notes(asr): 切片串行 ASR 绕单文件大小限制

ASR server 会直接以 500 拒绝大文件（15MB / ~15min 的录音，4.7s 即返回 500），不是处理超时。改成：sidecar 装 ffmpeg → /transcribe endpoint 把音频切成 60s 段 → 串行调外部 ASR → 拼接 transcript。notes 主容器 call_asr 改成 POST 到 sidecar /transcribe（timeout 1h，给长录音留余地）。

- feishu sidecar Dockerfile：加 ffmpeg + requests
- server.py：加 TranscribeReq；-c copy 失败时 fallback 到 re-encode AAC
- main.rs：删除 asr_url/asr_token 字段（now sidecar concern）
- k8s manifest：ASR_URL/ASR_TOKEN 从主容器移到 feishu sidecar env
2026-05-17 22:38:05 +01:00
parent e5a87cc65f
commit 688ccdc76f
4 changed files with 104 additions and 39 deletions
@@ -4,7 +4,7 @@
 FROM node:20-slim

 RUN apt-get update && apt-get install -y --no-install-recommends \
-    python3 python3-pip python3-markdown ca-certificates curl \
+    python3 python3-pip python3-markdown ca-certificates curl ffmpeg \
    && rm -rf /var/lib/apt/lists/*

 # lark-cli postinstall 调 curl 下二进制，没 curl 会报 spawnSync ENOENT
@@ -12,7 +12,8 @@ RUN npm install -g @larksuite/cli@1.0.29

 RUN pip install --no-cache-dir --break-system-packages \
    fastapi==0.115.6 \
-    uvicorn==0.34.0
+    uvicorn==0.34.0 \
+    requests==2.32.3

 COPY markdown-to-feishu /usr/local/bin/markdown-to-feishu
 RUN chmod +x /usr/local/bin/markdown-to-feishu
@@ -1,17 +1,19 @@
-"""notes feishu sidecar：HTTP 包一层 markdown-to-feishu。
-
-POST /convert  {md_path, title?, existing_doc_id?}
-  → 跑 markdown-to-feishu，parse 最后那段 JSON，返回 {doc_id, url}
+"""notes 多用途 sidecar：
+  POST /transcribe — 用 ffmpeg 切片 + 串行调外部 ASR，绕过单请求大小限制
+  POST /convert    — markdown-to-feishu，把会议纪要 push 飞书 docx
 """

 import json
 import logging
 import os
-import re
+import shutil
 import subprocess
+import tempfile
+import uuid
 from pathlib import Path
 from typing import Optional

+import requests
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel

@@ -27,6 +29,81 @@ def healthz():
    return {'ok': True}


+class TranscribeReq(BaseModel):
+    audio_path: str
+    chunk_seconds: int = 60  # 60s ≈ 1-1.5 MB m4a，远低于 ASR 限制
+
+
+@app.post('/transcribe')
+def transcribe(req: TranscribeReq):
+    """ffmpeg 切片 → 串行喂外部 ASR → 拼接 transcript。"""
+    src = Path(req.audio_path)
+    if not src.exists():
+        raise HTTPException(400, f'audio not found: {src}')
+    asr_url = os.environ.get('ASR_URL', '')
+    asr_token = os.environ.get('ASR_TOKEN', '')
+    if not asr_url or not asr_token:
+        raise HTTPException(500, 'ASR_URL/ASR_TOKEN not configured in sidecar')
+
+    tmp = Path(tempfile.gettempdir()) / f'transcribe-{uuid.uuid4().hex}'
+    tmp.mkdir(parents=True)
+    try:
+        # 用 ffmpeg segment：直接 copy stream（fast & 不损失质量）
+        # 个别情况下 -c copy 在某些容器格式下切不精准，回退 re-encode 到 aac
+        ext = src.suffix.lstrip('.') or 'm4a'
+        chunk_pattern = f'chunk_%03d.{ext}'
+        try:
+            subprocess.run(
+                ['ffmpeg', '-y', '-i', str(src),
+                 '-f', 'segment', '-segment_time', str(req.chunk_seconds),
+                 '-c', 'copy', '-reset_timestamps', '1',
+                 str(tmp / chunk_pattern)],
+                check=True, capture_output=True, timeout=180,
+            )
+        except subprocess.CalledProcessError:
+            # fallback: re-encode AAC，慢但稳
+            log.warning("ffmpeg -c copy 失败，回退 re-encode")
+            for p in tmp.glob(f'chunk_*.{ext}'):
+                p.unlink(missing_ok=True)
+            subprocess.run(
+                ['ffmpeg', '-y', '-i', str(src),
+                 '-f', 'segment', '-segment_time', str(req.chunk_seconds),
+                 '-c:a', 'aac', '-b:a', '64k', '-ac', '1', '-ar', '16000',
+                 '-reset_timestamps', '1',
+                 str(tmp / 'chunk_%03d.m4a')],
+                check=True, capture_output=True, timeout=600,
+            )
+            ext = 'm4a'
+
+        chunks = sorted(tmp.glob(f'chunk_*.{ext}'))
+        if not chunks:
+            raise HTTPException(500, 'ffmpeg produced 0 chunks')
+        log.info("split %s → %d chunks", src.name, len(chunks))
+
+        all_text = []
+        for i, c in enumerate(chunks, 1):
+            log.info("ASR chunk %d/%d (%s, %d KB)", i, len(chunks), c.name, c.stat().st_size // 1024)
+            with open(c, 'rb') as f:
+                r = requests.post(
+                    asr_url,
+                    headers={'Authorization': f'Bearer {asr_token}'},
+                    files={'file': (c.name, f, 'audio/mp4')},
+                    data={'model': 'qwen3-asr', 'response_format': 'json'},
+                    timeout=300,
+                )
+            if not r.ok:
+                raise HTTPException(502, f'ASR chunk {i} {r.status_code}: {r.text[:300]}')
+            try:
+                text = r.json().get('text', '').strip()
+            except Exception:
+                raise HTTPException(502, f'ASR chunk {i} bad json: {r.text[:200]}')
+            all_text.append(text)
+        full = '\n'.join(t for t in all_text if t)
+        return {'text': full, 'chunks': len(chunks)}
+    finally:
+        shutil.rmtree(tmp, ignore_errors=True)
+
+
 class ConvertReq(BaseModel):
    md_path: str
    title: Optional[str] = None
@@ -72,8 +72,6 @@ spec:
              value: /data/app.db
            - name: BLOBS_DIR
              value: /data/blobs
-            - name: ASR_URL
-              value: http://18.159.112.195:8848/v1/audio/transcriptions
            - name: LLM_GATEWAY
              value: http://3.135.65.204:8848/v1
            - name: LLM_MODEL
@@ -83,11 +81,6 @@ spec:
                secretKeyRef:
                  name: notes-creds
                  key: passphrase
-            - name: ASR_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: notes-creds
-                  key: asr_token
            - name: LLM_TOKEN
              valueFrom:
                secretKeyRef:
@@ -115,6 +108,14 @@ spec:
          ports:
            - containerPort: 8002
              name: feishu
+          env:
+            - name: ASR_URL
+              value: http://18.159.112.195:8848/v1/audio/transcriptions
+            - name: ASR_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: notes-creds
+                  key: asr_token
          readinessProbe:
            httpGet: { path: /healthz, port: feishu }
            initialDelaySeconds: 3
@@ -31,8 +31,6 @@ struct AppState {
    db: Arc<Mutex<Connection>>,
    blobs_dir: PathBuf,
    passphrase: String,
-    asr_url: String,
-    asr_token: String,
    llm_gateway: String,
    llm_token: String,
    llm_model: String,
@@ -53,9 +51,7 @@ async fn main() -> std::io::Result<()> {
    if passphrase.is_empty() {
        tracing::warn!("PASSPHRASE not set — all /api/* will return 401");
    }
-    let asr_url = std::env::var("ASR_URL")
-        .unwrap_or_else(|_| "http://18.159.112.195:8848/v1/audio/transcriptions".into());
-    let asr_token = std::env::var("ASR_TOKEN").unwrap_or_default();
+    // ASR 现在由 sidecar 调（切片串行），主容器不再直接调外部 ASR
    let llm_gateway =
        std::env::var("LLM_GATEWAY").unwrap_or_else(|_| "http://3.135.65.204:8848/v1".into());
    let llm_token = std::env::var("LLM_TOKEN").unwrap_or_default();
@@ -95,8 +91,6 @@ async fn main() -> std::io::Result<()> {
        db: Arc::new(Mutex::new(conn)),
        blobs_dir,
        passphrase,
-        asr_url,
-        asr_token,
        llm_gateway,
        llm_token,
        llm_model,
@@ -465,38 +459,30 @@ fn set_status(s: &AppState, id: i64, status: &str, transcript: Option<&str>, err
 async fn call_asr(
    s: &AppState,
    path: &std::path::Path,
-    filename: &str,
+    _filename: &str,
 ) -> Result<String, String> {
-    let bytes = tokio::fs::read(path).await.map_err(|e| e.to_string())?;
-    let part = reqwest::multipart::Part::bytes(bytes)
-        .file_name(filename.to_string())
-        .mime_str("audio/mpeg")
-        .map_err(|e| e.to_string())?;
-    let form = reqwest::multipart::Form::new()
-        .text("model", "qwen3-asr")
-        .text("response_format", "json")
-        .part("file", part);
-
+    // 走 sidecar /transcribe：sidecar 用 ffmpeg 切片 + 串行调外部 ASR，绕过 ASR server 单文件大小限制
+    let url = format!("{}/transcribe", s.feishu_url.trim_end_matches('/'));
+    let payload = json!({ "audio_path": path.to_string_lossy() });
    let resp = s
        .http
-        .post(&s.asr_url)
-        .bearer_auth(&s.asr_token)
-        .multipart(form)
-        .timeout(std::time::Duration::from_secs(600))
+        .post(&url)
+        .json(&payload)
+        .timeout(std::time::Duration::from_secs(3600))
        .send()
        .await
-        .map_err(|e| format!("connect: {e}"))?;
+        .map_err(|e| format!("connect sidecar: {e}"))?;
    if !resp.status().is_success() {
        let st = resp.status();
        let body = resp.text().await.unwrap_or_default();
-        return Err(format!("ASR {st}: {body}"));
+        return Err(format!("sidecar /transcribe {st}: {body}"));
    }
    let v: Value = resp.json().await.map_err(|e| format!("decode: {e}"))?;
    let text = v
        .get("text")
        .and_then(|x| x.as_str())
        .map(|s| s.to_string())
-        .ok_or_else(|| format!("ASR response no 'text': {v}"))?;
+        .ok_or_else(|| format!("no 'text' in response: {v}"))?;
    Ok(text)
 }