mirror of
https://github.com/osmarks/nanogpt-experiments.git
synced 2024-11-10 20:09:58 +00:00
Merge pull request #301 from okuvshynov/master
[easy] allow multithreading in load_dataset
This commit is contained in:
commit
41d7014f7d
@ -11,8 +11,13 @@ from datasets import load_dataset # huggingface datasets
|
|||||||
# good number to use is ~order number of cpu cores // 2
|
# good number to use is ~order number of cpu cores // 2
|
||||||
num_proc = 8
|
num_proc = 8
|
||||||
|
|
||||||
|
# number of workers in load_dataset() call
|
||||||
|
# best number might be different from num_proc above as it also depends on network speed.
|
||||||
|
# a value greater than 1 is usually better, though
|
||||||
|
num_proc_load_dataset = num_proc
|
||||||
|
|
||||||
# takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
|
# takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
|
||||||
dataset = load_dataset("openwebtext")
|
dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)
|
||||||
|
|
||||||
# owt by default only contains the 'train' split, so create a test split
|
# owt by default only contains the 'train' split, so create a test split
|
||||||
split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)
|
split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)
|
||||||
|
Loading…
Reference in New Issue
Block a user