- ffmpeg 用 -ss/-t 顺序切 65s 段,stride 55s(10s overlap);单段 ≤70s 整段不切
- 串行喂外部 ASR 后,把全部 chunk_texts 喂一次 LLM 让它去重 + 修边界字
- 单段直接返回 naive,LLM 失败也 fallback naive,不卡流程
- sidecar 注入 LLM_GATEWAY/LLM_MODEL/LLM_TOKEN env
This commit is contained in:
+123
-41
@@ -31,12 +31,13 @@ def healthz():
|
|||||||
|
|
||||||
class TranscribeReq(BaseModel):
|
class TranscribeReq(BaseModel):
|
||||||
audio_path: str
|
audio_path: str
|
||||||
chunk_seconds: int = 60 # 60s ≈ 1-1.5 MB m4a,远低于 ASR 限制
|
chunk_seconds: int = 65 # 单段长度,远低于 Qwen3-ASR 8192-token cache(~7min)
|
||||||
|
overlap_seconds: int = 10 # 相邻段重叠,给 LLM stitching 留 anchor
|
||||||
|
|
||||||
|
|
||||||
@app.post('/transcribe')
|
@app.post('/transcribe')
|
||||||
def transcribe(req: TranscribeReq):
|
def transcribe(req: TranscribeReq):
|
||||||
"""ffmpeg 切片 → 串行喂外部 ASR → 拼接 transcript。"""
|
"""ffmpeg 切 overlap 片 → 串行 ASR → LLM 拼接去重。"""
|
||||||
src = Path(req.audio_path)
|
src = Path(req.audio_path)
|
||||||
if not src.exists():
|
if not src.exists():
|
||||||
raise HTTPException(400, f'audio not found: {src}')
|
raise HTTPException(400, f'audio not found: {src}')
|
||||||
@@ -48,62 +49,143 @@ def transcribe(req: TranscribeReq):
|
|||||||
tmp = Path(tempfile.gettempdir()) / f'transcribe-{uuid.uuid4().hex}'
|
tmp = Path(tempfile.gettempdir()) / f'transcribe-{uuid.uuid4().hex}'
|
||||||
tmp.mkdir(parents=True)
|
tmp.mkdir(parents=True)
|
||||||
try:
|
try:
|
||||||
# 用 ffmpeg segment:直接 copy stream(fast & 不损失质量)
|
# 1) 用 ffprobe 拿总时长
|
||||||
# 个别情况下 -c copy 在某些容器格式下切不精准,回退 re-encode 到 aac
|
out = subprocess.check_output(
|
||||||
ext = src.suffix.lstrip('.') or 'm4a'
|
['ffprobe', '-v', 'quiet', '-show_entries', 'format=duration',
|
||||||
chunk_pattern = f'chunk_%03d.{ext}'
|
'-of', 'csv=p=0', str(src)],
|
||||||
|
timeout=60,
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
subprocess.run(
|
duration = float(out.decode().strip())
|
||||||
['ffmpeg', '-y', '-i', str(src),
|
except ValueError:
|
||||||
'-f', 'segment', '-segment_time', str(req.chunk_seconds),
|
raise HTTPException(500, f'ffprobe duration parse: {out!r}')
|
||||||
'-c', 'copy', '-reset_timestamps', '1',
|
log.info("duration=%.1fs", duration)
|
||||||
str(tmp / chunk_pattern)],
|
|
||||||
check=True, capture_output=True, timeout=180,
|
|
||||||
)
|
|
||||||
except subprocess.CalledProcessError:
|
|
||||||
# fallback: re-encode AAC,慢但稳
|
|
||||||
log.warning("ffmpeg -c copy 失败,回退 re-encode")
|
|
||||||
for p in tmp.glob(f'chunk_*.{ext}'):
|
|
||||||
p.unlink(missing_ok=True)
|
|
||||||
subprocess.run(
|
|
||||||
['ffmpeg', '-y', '-i', str(src),
|
|
||||||
'-f', 'segment', '-segment_time', str(req.chunk_seconds),
|
|
||||||
'-c:a', 'aac', '-b:a', '64k', '-ac', '1', '-ar', '16000',
|
|
||||||
'-reset_timestamps', '1',
|
|
||||||
str(tmp / 'chunk_%03d.m4a')],
|
|
||||||
check=True, capture_output=True, timeout=600,
|
|
||||||
)
|
|
||||||
ext = 'm4a'
|
|
||||||
|
|
||||||
chunks = sorted(tmp.glob(f'chunk_*.{ext}'))
|
# 2) 切 chunk_seconds 段,stride = chunk_seconds - overlap_seconds
|
||||||
if not chunks:
|
stride = max(1, req.chunk_seconds - req.overlap_seconds)
|
||||||
raise HTTPException(500, 'ffmpeg produced 0 chunks')
|
ext = src.suffix.lstrip('.') or 'm4a'
|
||||||
log.info("split %s → %d chunks", src.name, len(chunks))
|
chunks_meta = []
|
||||||
|
i = 0
|
||||||
|
start = 0.0
|
||||||
|
# 短录音单段够:不切,直接整段
|
||||||
|
single_shot = duration <= req.chunk_seconds + 5
|
||||||
|
if single_shot:
|
||||||
|
chunks_meta = [{'start': 0.0, 'path': src, 'idx': 0}]
|
||||||
|
else:
|
||||||
|
while start < duration:
|
||||||
|
cp = tmp / f'chunk_{i:03d}.{ext}'
|
||||||
|
# -ss 在 -i 前:input seek,快;-c copy 不重新编码
|
||||||
|
try:
|
||||||
|
subprocess.run(
|
||||||
|
['ffmpeg', '-y', '-ss', f'{start:.2f}',
|
||||||
|
'-t', f'{req.chunk_seconds}',
|
||||||
|
'-i', str(src), '-c', 'copy', str(cp)],
|
||||||
|
check=True, capture_output=True, timeout=120,
|
||||||
|
)
|
||||||
|
except subprocess.CalledProcessError:
|
||||||
|
subprocess.run(
|
||||||
|
['ffmpeg', '-y', '-ss', f'{start:.2f}',
|
||||||
|
'-t', f'{req.chunk_seconds}',
|
||||||
|
'-i', str(src),
|
||||||
|
'-c:a', 'aac', '-b:a', '64k', '-ac', '1', '-ar', '16000',
|
||||||
|
str(cp)],
|
||||||
|
check=True, capture_output=True, timeout=180,
|
||||||
|
)
|
||||||
|
if cp.stat().st_size < 1024:
|
||||||
|
break
|
||||||
|
chunks_meta.append({'start': start, 'path': cp, 'idx': i})
|
||||||
|
start += stride
|
||||||
|
i += 1
|
||||||
|
|
||||||
all_text = []
|
if not chunks_meta:
|
||||||
for i, c in enumerate(chunks, 1):
|
raise HTTPException(500, 'no chunks produced')
|
||||||
log.info("ASR chunk %d/%d (%s, %d KB)", i, len(chunks), c.name, c.stat().st_size // 1024)
|
log.info("chunks=%d, stride=%ds, overlap=%ds",
|
||||||
with open(c, 'rb') as f:
|
len(chunks_meta), stride, req.overlap_seconds)
|
||||||
|
|
||||||
|
# 3) 串行 ASR
|
||||||
|
chunk_texts = []
|
||||||
|
for m in chunks_meta:
|
||||||
|
log.info("ASR chunk %d/%d (start=%.1fs, %dKB)",
|
||||||
|
m['idx'] + 1, len(chunks_meta), m['start'],
|
||||||
|
m['path'].stat().st_size // 1024)
|
||||||
|
with open(m['path'], 'rb') as f:
|
||||||
r = requests.post(
|
r = requests.post(
|
||||||
asr_url,
|
asr_url,
|
||||||
headers={'Authorization': f'Bearer {asr_token}'},
|
headers={'Authorization': f'Bearer {asr_token}'},
|
||||||
files={'file': (c.name, f, 'audio/mp4')},
|
files={'file': (m['path'].name, f, 'audio/mp4')},
|
||||||
data={'model': 'qwen3-asr', 'response_format': 'json'},
|
data={'model': 'qwen3-asr', 'response_format': 'json'},
|
||||||
timeout=300,
|
timeout=300,
|
||||||
)
|
)
|
||||||
if not r.ok:
|
if not r.ok:
|
||||||
raise HTTPException(502, f'ASR chunk {i} {r.status_code}: {r.text[:300]}')
|
raise HTTPException(502, f'ASR chunk {m["idx"]} {r.status_code}: {r.text[:300]}')
|
||||||
try:
|
try:
|
||||||
text = r.json().get('text', '').strip()
|
text = r.json().get('text', '').strip()
|
||||||
except Exception:
|
except Exception:
|
||||||
raise HTTPException(502, f'ASR chunk {i} bad json: {r.text[:200]}')
|
raise HTTPException(502, f'ASR chunk {m["idx"]} bad json: {r.text[:200]}')
|
||||||
all_text.append(text)
|
chunk_texts.append(text)
|
||||||
full = '\n'.join(t for t in all_text if t)
|
|
||||||
return {'text': full, 'chunks': len(chunks)}
|
# 4) 单段直接返回
|
||||||
|
if len(chunk_texts) == 1:
|
||||||
|
return {'text': chunk_texts[0], 'chunks': 1, 'stitched_by': 'single'}
|
||||||
|
|
||||||
|
# 5) LLM 拼接(gemma 一次性看所有 chunks 去重 + 拼)
|
||||||
|
stitched = llm_stitch(chunk_texts, req.overlap_seconds)
|
||||||
|
return {
|
||||||
|
'text': stitched,
|
||||||
|
'chunks': len(chunk_texts),
|
||||||
|
'stitched_by': 'llm',
|
||||||
|
}
|
||||||
finally:
|
finally:
|
||||||
shutil.rmtree(tmp, ignore_errors=True)
|
shutil.rmtree(tmp, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
def llm_stitch(chunks: list[str], overlap_seconds: int) -> str:
|
||||||
|
"""让 LLM 把相邻段重叠部分去重 + 修正边界字。失败 fallback 朴素拼接。"""
|
||||||
|
gw = os.environ.get('LLM_GATEWAY', '').rstrip('/')
|
||||||
|
tok = os.environ.get('LLM_TOKEN', '')
|
||||||
|
model = os.environ.get('LLM_MODEL', 'gemma-4-31b-it')
|
||||||
|
naive = '\n'.join(chunks)
|
||||||
|
if not gw or not tok:
|
||||||
|
log.warning("LLM not configured, fall back to naive concat")
|
||||||
|
return naive
|
||||||
|
|
||||||
|
parts = []
|
||||||
|
for i, c in enumerate(chunks):
|
||||||
|
parts.append(f"段 {i + 1}:\n{c}")
|
||||||
|
user = (
|
||||||
|
f"下面是一段会议录音的 ASR 转写,被切成 {len(chunks)} 段。"
|
||||||
|
f"相邻段有约 {overlap_seconds} 秒(几句话)的重叠。\n\n"
|
||||||
|
+ "\n\n".join(parts)
|
||||||
|
+ "\n\n请把所有段拼成一段连续文本:去掉相邻段交界处的重复、"
|
||||||
|
"修正明显 ASR 错字(结合上下文)、补回被切断的词。\n"
|
||||||
|
"不要加任何解释、标题、段号;只输出拼好的连续文本。"
|
||||||
|
)
|
||||||
|
payload = {
|
||||||
|
"model": model,
|
||||||
|
"messages": [
|
||||||
|
{"role": "system", "content": "你是 ASR 转写后处理助手,专门做去重拼接和错字修正。"},
|
||||||
|
{"role": "user", "content": user},
|
||||||
|
],
|
||||||
|
"temperature": 0.1,
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
r = requests.post(
|
||||||
|
gw + '/chat/completions',
|
||||||
|
headers={'Authorization': f'Bearer {tok}'},
|
||||||
|
json=payload,
|
||||||
|
timeout=600,
|
||||||
|
)
|
||||||
|
if not r.ok:
|
||||||
|
log.warning("stitch LLM %s: %s", r.status_code, r.text[:200])
|
||||||
|
return naive
|
||||||
|
d = r.json()
|
||||||
|
text = d['choices'][0]['message']['content'].strip()
|
||||||
|
return text or naive
|
||||||
|
except Exception as e:
|
||||||
|
log.warning("stitch LLM call failed: %s", e)
|
||||||
|
return naive
|
||||||
|
|
||||||
|
|
||||||
class ConvertReq(BaseModel):
|
class ConvertReq(BaseModel):
|
||||||
md_path: str
|
md_path: str
|
||||||
title: Optional[str] = None
|
title: Optional[str] = None
|
||||||
|
|||||||
@@ -116,6 +116,15 @@ spec:
|
|||||||
secretKeyRef:
|
secretKeyRef:
|
||||||
name: notes-creds
|
name: notes-creds
|
||||||
key: asr_token
|
key: asr_token
|
||||||
|
- name: LLM_GATEWAY
|
||||||
|
value: http://3.135.65.204:8848/v1
|
||||||
|
- name: LLM_MODEL
|
||||||
|
value: gemma-4-31b-it
|
||||||
|
- name: LLM_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: notes-creds
|
||||||
|
key: llm_token
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
httpGet: { path: /healthz, port: feishu }
|
httpGet: { path: /healthz, port: feishu }
|
||||||
initialDelaySeconds: 3
|
initialDelaySeconds: 3
|
||||||
|
|||||||
Reference in New Issue
Block a user