Tune the data mixture a bit and load the optimizer state by default when running SFT. These were confirmed to be the best settings in SFT sweeps.

2026-02-16 20:23:04 +00:00
parent 77f8fb8303
commit 1415fb7617
2 changed files with 26 additions and 10 deletions
@@ -186,6 +186,9 @@ def load_optimizer_state(source, device, rank, model_tag=None, step=None):
    if step is None:
        step = find_last_step(checkpoint_dir)
    optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
+    if not os.path.exists(optimizer_path):
+        log0(f"Optimizer checkpoint not found: {optimizer_path}")
+        return None
    log0(f"Loading optimizer state from {optimizer_path}")
    optimizer_data = torch.load(optimizer_path, map_location=device)
    return optimizer_data