1
0
mirror of https://github.com/osmarks/nanogpt-experiments.git synced 2024-12-18 06:00:29 +00:00

Merge pull request #305 from okuvshynov/fix_osx_dataload

nanogpt: fix multiprocessing in load_dataset on os x
This commit is contained in:
Andrej 2023-06-17 20:26:35 -07:00 committed by GitHub
commit 4eb7a96b07
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -16,6 +16,7 @@ num_proc = 8
 # it is better than 1 usually though
 num_proc_load_dataset = num_proc
+if __name__ == '__main__':
 # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
 dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)