""" Generate the W1 audio smoke dataset: a handful of 5s sine-wave clips paired with deterministic transcripts. Why synthetic instead of real speech: W1 only proves the forward path (WhisperEncoder -> Projector -> GPT prepend) and that the projector's gradient flows into a decreasing loss on a tiny fixed set. Real speech adds a network dependency to a step that should be reproducible offline. W2 swaps in LibriSpeech. Audio files land under data/audio_smoke/wavs/ (gitignored). The manifest data/audio_smoke/manifest.jsonl is the only artifact committed. Usage: python -m scripts.audio_smoke_data """ import argparse import json import wave from pathlib import Path import numpy as np SAMPLES = [ (220.0, "low tone"), (330.0, "mid low tone"), (440.0, "middle tone"), (660.0, "mid high tone"), (880.0, "high tone"), ] SR = 16000 DURATION_S = 5.0 def synth_sine(freq_hz, duration_s=DURATION_S, sr=SR): """Sine + 2nd harmonic + a sliver of noise so Whisper sees non-degenerate frames (a pure tone collapses to a near-constant log-mel).""" t = np.arange(int(sr * duration_s)) / sr x = 0.5 * np.sin(2 * np.pi * freq_hz * t) + 0.25 * np.sin(2 * np.pi * 2 * freq_hz * t) rng = np.random.default_rng(int(freq_hz)) x = x + 0.01 * rng.standard_normal(len(x)) return x.astype(np.float32) def write_wav_pcm16(path, audio, sr=SR): """Write mono PCM16 WAV using the stdlib (no scipy/soundfile dependency).""" pcm = np.clip(audio, -1.0, 1.0) pcm = (pcm * 32767.0).astype(np.int16) with wave.open(str(path), "wb") as w: w.setnchannels(1) w.setsampwidth(2) w.setframerate(sr) w.writeframes(pcm.tobytes()) def generate_synthetic(data_dir): data_dir = Path(data_dir) wav_dir = data_dir / "wavs" wav_dir.mkdir(parents=True, exist_ok=True) manifest_path = data_dir / "manifest.jsonl" with open(manifest_path, "w") as f: for freq, text in SAMPLES: name = f"sine_{int(freq):04d}.wav" wav_path = wav_dir / name if not wav_path.exists(): write_wav_pcm16(wav_path, synth_sine(freq)) f.write(json.dumps({"wav": f"wavs/{name}", "text": text, "sr": SR}) + "\n") print(f"Wrote {len(SAMPLES)} samples to {data_dir}") return manifest_path if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--data-dir", default="data/audio_smoke") args = parser.parse_args() generate_synthetic(args.data_dir)