#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = ["requests"]
# ///
"""Generate voice audio using IndexTTS2 with a fixed reference voice.

Usage:
    ./gen_voice --schema
    ./gen_voice '{"text":"你好世界"}'
    ./gen_voice 你好世界
"""

import json
import os
import sys
import time

import requests

INDEXTTS_URL = "http://100.107.41.75:7860"
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
REF_AUDIO = os.path.join(SCRIPT_DIR, "..", "assets", "ref_voice.mp3")
OUTPUT_DIR = os.path.expanduser("~/down")

# Cache the uploaded reference path to avoid re-uploading on every run.
_CACHE_FILE = "/tmp/noc_gen_voice_ref_cache.json"

# (connect, read) timeouts in seconds, so a dead server cannot hang the
# script forever. The read timeout bounds the gap between received bytes,
# not the total duration, so the stream timeout is kept generous.
_PROBE_TIMEOUT = 3
_REQUEST_TIMEOUT = (10, 120)
_STREAM_TIMEOUT = (10, 300)

SCHEMA = {
    "name": "gen_voice",
    "description": "Generate speech audio from text using voice cloning (IndexTTS2). Returns the file path of the generated wav. Use send_file to send it to the user.",
    "parameters": {
        "type": "object",
        "properties": {
            "text": {
                "type": "string",
                "description": "The text to synthesize into speech",
            },
        },
        "required": ["text"],
    },
}


def _cached_ref_path() -> str | None:
    """Return the cached server-side path if the server still serves it."""
    try:
        with open(_CACHE_FILE, encoding="utf-8") as f:
            cached = json.load(f)["path"]
        probe = requests.head(
            f"{INDEXTTS_URL}/gradio_api/file={cached}",
            timeout=_PROBE_TIMEOUT,
        )
    except (OSError, ValueError, KeyError, TypeError, requests.RequestException):
        # Missing/corrupt cache or unreachable server: treat as a cache miss
        # and let the caller upload again.
        return None
    return cached if probe.status_code == 200 else None


def get_ref_path() -> str:
    """Return the server-side path of the reference audio.

    The reference file is uploaded once and the returned path is cached in
    ``_CACHE_FILE``. The cached path is reused only while a HEAD request for
    it answers 200; otherwise the file is uploaded again.

    Raises:
        OSError: if the reference audio cannot be read.
        requests.RequestException: if the upload fails.
    """
    cached = _cached_ref_path()
    if cached is not None:
        return cached

    with open(REF_AUDIO, "rb") as f:
        resp = requests.post(
            f"{INDEXTTS_URL}/gradio_api/upload",
            files={"files": f},
            timeout=_REQUEST_TIMEOUT,
        )
    resp.raise_for_status()
    ref_path = resp.json()[0]

    # The cache is only an optimization: failing to write it must not fail
    # a synthesis whose upload already succeeded.
    try:
        with open(_CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump({"path": ref_path}, f)
    except OSError:
        pass
    return ref_path


def _unique_output_path() -> str:
    """Return a timestamped wav path in OUTPUT_DIR that does not exist yet."""
    stamp = time.strftime("%Y%m%d_%H%M%S")
    candidate = os.path.join(OUTPUT_DIR, f"tts_{stamp}.wav")
    suffix = 1
    # Timestamps have one-second resolution; avoid overwriting a file
    # produced by another run within the same second.
    while os.path.exists(candidate):
        candidate = os.path.join(OUTPUT_DIR, f"tts_{stamp}_{suffix}.wav")
        suffix += 1
    return candidate


def _download_wav(url: str) -> str:
    """Download the wav at *url* into OUTPUT_DIR and return the local path."""
    wav = requests.get(url, timeout=_REQUEST_TIMEOUT)
    wav.raise_for_status()
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    out_path = _unique_output_path()
    with open(out_path, "wb") as f:
        f.write(wav.content)
    return out_path


def synthesize(text: str) -> str:
    """Synthesize *text* with the reference voice and return the wav path.

    Submits a job to the ``synthesize`` endpoint, reads the result from the
    server-sent event stream, downloads the generated audio into
    ``OUTPUT_DIR`` and returns the path of the written file.

    Raises:
        RuntimeError: if the server reports a failure or the stream ends
            without yielding a file URL.
        requests.RequestException: on HTTP or connection errors.
    """
    ref = get_ref_path()
    file_data = {"path": ref, "meta": {"_type": "gradio.FileData"}}

    # Submit the job.
    resp = requests.post(
        f"{INDEXTTS_URL}/gradio_api/call/synthesize",
        json={
            "data": [
                text,
                file_data,  # spk_audio
                file_data,  # emo_audio
                0.5,  # emo_alpha
                0, 0, 0, 0, 0, 0, 0, 0.8,  # emotions (calm=0.8)
                False,  # use_emo_text
                "",  # emo_text
                False,  # use_random
            ]
        },
        timeout=_REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    event_id = resp.json()["event_id"]

    # Read the result from the SSE stream; the context manager releases the
    # connection even when we return or raise mid-stream.
    with requests.get(
        f"{INDEXTTS_URL}/gradio_api/call/synthesize/{event_id}",
        stream=True,
        timeout=_STREAM_TIMEOUT,
    ) as result_resp:
        result_resp.raise_for_status()
        # Event streams are UTF-8; set it explicitly so iter_lines always
        # yields correctly decoded str rather than relying on a guess.
        result_resp.encoding = "utf-8"

        event = None
        for line in result_resp.iter_lines(decode_unicode=True):
            if not line:
                event = None  # a blank line terminates an SSE event
                continue
            if line.startswith("event: "):
                event = line[len("event: "):].strip()
                continue
            if not line.startswith("data: "):
                continue
            if event == "heartbeat":
                # Assumed keep-alive event with a null payload (per the
                # Gradio API guide) -- not a failure.
                continue

            data = json.loads(line[len("data: "):])
            if event == "error" or data is None:
                detail = (
                    "server returned null"
                    if data is None
                    else f"server error: {data}"
                )
                raise RuntimeError(f"TTS synthesis failed ({detail})")

            if isinstance(data, list) and data and isinstance(data[0], dict):
                url = data[0].get("url", "")
                if url:
                    return _download_wav(url)

    raise RuntimeError("No result received from TTS server")


def main() -> None:
    """Command-line entry point.

    Accepts ``--help``/``-h``, ``--schema``, a JSON object with a ``text``
    key, or plain text spread over the remaining arguments. Prints the
    generated file path on success; on failure prints an error message to
    stdout and exits with status 1.
    """
    if len(sys.argv) < 2 or sys.argv[1] in ("--help", "-h"):
        print(__doc__.strip())
        sys.exit(0)

    if sys.argv[1] == "--schema":
        print(json.dumps(SCHEMA, ensure_ascii=False))
        sys.exit(0)

    arg = sys.argv[1]
    if not arg.startswith("{"):
        text = " ".join(sys.argv[1:])
    else:
        try:
            args = json.loads(arg)
        except json.JSONDecodeError as e:
            print(f"Invalid JSON: {e}")
            sys.exit(1)
        text = args.get("text", "")

    # Reject missing, non-string and whitespace-only text before contacting
    # the server.
    if not isinstance(text, str) or not text.strip():
        print("Error: text is required")
        sys.exit(1)

    try:
        path = synthesize(text)
    except Exception as e:
        # Top-level boundary: report any failure as a one-line message.
        print(f"Error: {e}")
        sys.exit(1)
    print(path)


if __name__ == "__main__":
    main()