From 98eed6df189e395056c34621043d082878df392f Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Thu, 5 Feb 2026 18:14:09 +0000
Subject: [PATCH] bring back an assert guarding against bad param sizing

---
 nanochat/optim.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nanochat/optim.py b/nanochat/optim.py
index 4cc2a1f..42d862b 100644
--- a/nanochat/optim.py
+++ b/nanochat/optim.py
@@ -377,6 +377,7 @@ class DistMuonAdamW(torch.optim.Optimizer):
                 param_infos[p] = dict(future=future, grad_slice=grad, is_small=True)
             else:
                 # Large params: reduce_scatter
+                assert grad.shape[0] % world_size == 0, f"AdamW reduce_scatter requires shape[0] ({grad.shape[0]}) divisible by world_size ({world_size})"
                 rank_size = grad.shape[0] // world_size
                 grad_slice = torch.empty_like(grad[:rank_size])
                 future = dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()