Allow base_loss to report the loss of an arbitrary Hugging Face model, similar to base_eval. This required improving the dataloader so that it simply takes a tokenizer instead of loading the nanochat one itself. It is much better this way anyway.

2026-01-12 03:10:13 +00:00
parent aa95fb2e03
commit 21608ec51e
4 changed files with 73 additions and 15 deletions
@@ -103,9 +103,10 @@ class HuggingFaceTokenizer:
    def id_to_token(self, id):
        # look up a single token id via the wrapped tokenizer and hand back its result unchanged
        backend = self.tokenizer
        token = backend.id_to_token(id)
        return token

-    def _encode_one(self, text, prepend=None, append=None):
+    def _encode_one(self, text, prepend=None, append=None, num_threads=None):
        # encode a single string
        # prepend/append can be either a string of a special token or a token id directly.
+        # num_threads is ignored (only used by the nanochat Tokenizer for parallel encoding)
        assert isinstance(text, str)
        ids = []
        if prepend is not None: