class TranscribeReq(BaseModel):
    """Request body for the POST /transcribe endpoint."""

    # Path of the audio file to transcribe; the handler answers 400 if it does not exist.
    audio_path: str
    # Length of each chunk; far below the Qwen3-ASR 8192-token cache (~7 min).
    chunk_seconds: int = 65
    # Overlap between adjacent chunks, giving the LLM stitching step an anchor.
    overlap_seconds: int = 10
'-segment_time', str(req.chunk_seconds), - '-c', 'copy', '-reset_timestamps', '1', - str(tmp / chunk_pattern)], - check=True, capture_output=True, timeout=180, - ) - except subprocess.CalledProcessError: - # fallback: re-encode AAC,慢但稳 - log.warning("ffmpeg -c copy 失败,回退 re-encode") - for p in tmp.glob(f'chunk_*.{ext}'): - p.unlink(missing_ok=True) - subprocess.run( - ['ffmpeg', '-y', '-i', str(src), - '-f', 'segment', '-segment_time', str(req.chunk_seconds), - '-c:a', 'aac', '-b:a', '64k', '-ac', '1', '-ar', '16000', - '-reset_timestamps', '1', - str(tmp / 'chunk_%03d.m4a')], - check=True, capture_output=True, timeout=600, - ) - ext = 'm4a' + duration = float(out.decode().strip()) + except ValueError: + raise HTTPException(500, f'ffprobe duration parse: {out!r}') + log.info("duration=%.1fs", duration) - chunks = sorted(tmp.glob(f'chunk_*.{ext}')) - if not chunks: - raise HTTPException(500, 'ffmpeg produced 0 chunks') - log.info("split %s → %d chunks", src.name, len(chunks)) + # 2) 切 chunk_seconds 段,stride = chunk_seconds - overlap_seconds + stride = max(1, req.chunk_seconds - req.overlap_seconds) + ext = src.suffix.lstrip('.') or 'm4a' + chunks_meta = [] + i = 0 + start = 0.0 + # 短录音单段够:不切,直接整段 + single_shot = duration <= req.chunk_seconds + 5 + if single_shot: + chunks_meta = [{'start': 0.0, 'path': src, 'idx': 0}] + else: + while start < duration: + cp = tmp / f'chunk_{i:03d}.{ext}' + # -ss 在 -i 前:input seek,快;-c copy 不重新编码 + try: + subprocess.run( + ['ffmpeg', '-y', '-ss', f'{start:.2f}', + '-t', f'{req.chunk_seconds}', + '-i', str(src), '-c', 'copy', str(cp)], + check=True, capture_output=True, timeout=120, + ) + except subprocess.CalledProcessError: + subprocess.run( + ['ffmpeg', '-y', '-ss', f'{start:.2f}', + '-t', f'{req.chunk_seconds}', + '-i', str(src), + '-c:a', 'aac', '-b:a', '64k', '-ac', '1', '-ar', '16000', + str(cp)], + check=True, capture_output=True, timeout=180, + ) + if cp.stat().st_size < 1024: + break + chunks_meta.append({'start': start, 
def llm_stitch(chunks: list[str], overlap_seconds: int) -> str:
    """Merge overlapping ASR chunk transcripts into one continuous text via an LLM.

    The chunks are sent in a single chat-completion request that asks the
    model to remove the duplicated text at chunk boundaries, fix obvious ASR
    typos, and restore words that were cut in half.

    This function is best-effort and never raises: whenever the LLM cannot
    be used (not configured, HTTP error, malformed or empty response) it
    falls back to a plain newline join of the non-empty chunks.

    Args:
        chunks: Per-chunk transcripts in playback order. Empty or
            whitespace-only entries (e.g. silent chunks) are ignored.
        overlap_seconds: Approximate overlap between adjacent chunks; only
            used to describe the input to the model in the prompt.

    Returns:
        The stitched transcript, or the naive newline-joined concatenation
        when stitching is unnecessary or fails.

    Environment:
        LLM_GATEWAY: Base URL of an OpenAI-compatible API (no trailing slash
            needed); ``/chat/completions`` is appended.
        LLM_TOKEN: Bearer token for the gateway.
        LLM_MODEL: Model name; defaults to ``gemma-4-31b-it``.
    """
    # Drop empty transcripts up front: they would otherwise leave blank lines
    # in the fallback text and show up as empty numbered sections in the prompt.
    pieces = [c for c in chunks if c and c.strip()]
    naive = '\n'.join(pieces)

    # Nothing to de-duplicate with fewer than two real chunks; skip the
    # (slow, fallible) LLM round trip entirely.
    if len(pieces) < 2:
        return naive

    gw = os.environ.get('LLM_GATEWAY', '').rstrip('/')
    tok = os.environ.get('LLM_TOKEN', '')
    model = os.environ.get('LLM_MODEL', 'gemma-4-31b-it')
    if not gw or not tok:
        log.warning("LLM not configured, fall back to naive concat")
        return naive

    # Number the sections from 1 so the model can tell where each chunk starts.
    parts = [f"段 {i}:\n{c}" for i, c in enumerate(pieces, 1)]
    user = (
        f"下面是一段会议录音的 ASR 转写,被切成 {len(pieces)} 段。"
        f"相邻段有约 {overlap_seconds} 秒(几句话)的重叠。\n\n"
        + "\n\n".join(parts)
        + "\n\n请把所有段拼成一段连续文本:去掉相邻段交界处的重复、"
        "修正明显 ASR 错字(结合上下文)、补回被切断的词。\n"
        "不要加任何解释、标题、段号;只输出拼好的连续文本。"
    )
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": "你是 ASR 转写后处理助手,专门做去重拼接和错字修正。"},
            {"role": "user", "content": user},
        ],
        # Low temperature: this is an editing task, not a creative one.
        "temperature": 0.1,
    }
    try:
        r = requests.post(
            gw + '/chat/completions',
            headers={'Authorization': f'Bearer {tok}'},
            json=payload,
            timeout=600,
        )
        if not r.ok:
            log.warning("stitch LLM %s: %s", r.status_code, r.text[:200])
            return naive
        d = r.json()
        text = d['choices'][0]['message']['content'].strip()
        # An empty completion is treated like a failure.
        return text or naive
    except Exception as e:
        # Deliberately broad: network errors, bad JSON and unexpected response
        # shapes must all degrade to the naive transcript instead of failing
        # the whole transcription request.
        log.warning("stitch LLM call failed: %s", e)
        return naive