1
0
mirror of https://github.com/osmarks/nanogpt-experiments.git synced 2024-11-13 05:19:58 +00:00

Merge pull request #420 from vinjn/fix-371-enc-is-not-defined

Move enc to global namespace to fix #371
This commit is contained in:
Andrej 2024-02-27 09:27:01 -08:00 committed by GitHub
commit 325be85d9b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -16,6 +16,8 @@ num_proc = 8
# it is better than 1 usually though
num_proc_load_dataset = num_proc
enc = tiktoken.get_encoding("gpt2")
if __name__ == '__main__':
# takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)
@@ -38,7 +40,6 @@ if __name__ == '__main__':
# })
# we now want to tokenize the dataset. first define the encoding function (gpt2 bpe)
enc = tiktoken.get_encoding("gpt2")
def process(example):
ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe