From e1dafc510f122d5c31c38a3c96e45e544f47930f Mon Sep 17 00:00:00 2001
From: Yamahammer <137644546+Yamahammer@users.noreply.github.com>
Date: Fri, 16 Jan 2026 21:50:34 -0500
Subject: [PATCH] Reduce token waste in BOS bestfit by cropping shortest doc (#445)

When no document fits the remaining row space, crop the shortest
document in the buffer instead of the first. This minimizes discarded
tokens.

Co-authored-by: Claude Opus 4.5
---
 nanochat/dataloader.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/nanochat/dataloader.py b/nanochat/dataloader.py
index 562d517..3e89893 100644
--- a/nanochat/dataloader.py
+++ b/nanochat/dataloader.py
@@ -178,8 +178,9 @@ def tokenizing_distributed_data_loader_with_state_bos_bestfit(
                     doc = doc_buffer.pop(best_idx)
                     row.extend(doc)
                 else:
-                    # No doc fits - crop first doc to fill remaining
-                    doc = doc_buffer.pop(0)
+                    # No doc fits - crop shortest in buffer to fill remaining and minimize waste
+                    shortest_idx = min(range(len(doc_buffer)), key=lambda i: len(doc_buffer[i]))
+                    doc = doc_buffer.pop(shortest_idx)
                     row.extend(doc[:remaining])
 
             rows.append(row[:row_capacity])