allow base_loss to report the loss of any arbitrary huggingface model similar to base_eval. had to change dataloader to be a lot better and just take tokenizer, not load the nanochat one. much better this way anyway
This commit is contained in:
@@ -5,9 +5,8 @@ import pyarrow.parquet as pq
|
||||
|
||||
from nanochat.common import get_dist_info
|
||||
from nanochat.dataset import list_parquet_files
|
||||
from nanochat.tokenizer import get_tokenizer
|
||||
|
||||
def tokenizing_distributed_data_loader_with_state(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda", resume_state_dict=None):
|
||||
def tokenizing_distributed_data_loader_with_state(tokenizer, B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda", resume_state_dict=None):
|
||||
"""
|
||||
Stream pretraining text from parquet files, tokenize, yield training batches.
|
||||
|
||||
@@ -62,8 +61,6 @@ def tokenizing_distributed_data_loader_with_state(B, T, split, tokenizer_threads
|
||||
|
||||
# Now emit batches of tokens.
|
||||
needed_tokens = B * T + 1 # +1 is because we also need the target at the last token
|
||||
# get the tokenizer and the bos token
|
||||
tokenizer = get_tokenizer()
|
||||
bos_token = tokenizer.get_bos_token_id()
|
||||
# scratch buffer holds the tokens for one iteration
|
||||
token_buffer = deque() # we stream tokens on the right and pop from the left
|
||||
|
||||
@@ -103,9 +103,10 @@ class HuggingFaceTokenizer:
|
||||
def id_to_token(self, id):
|
||||
return self.tokenizer.id_to_token(id)
|
||||
|
||||
def _encode_one(self, text, prepend=None, append=None):
|
||||
def _encode_one(self, text, prepend=None, append=None, num_threads=None):
|
||||
# encode a single string
|
||||
# prepend/append can be either a string of a special token or a token id directly.
|
||||
# num_threads is ignored (only used by the nanochat Tokenizer for parallel encoding)
|
||||
assert isinstance(text, str)
|
||||
ids = []
|
||||
if prepend is not None:
|
||||
|
||||
Reference in New Issue
Block a user