Merge pull request #301 from okuvshynov/master

[easy] allow multithreading in load_dataset
2025-11-15 14:47:13 +00:00 · 2023-06-16 18:30:03 -07:00
parent 7339b904ef bb7e96754a
commit 41d7014f7d
1 changed files with 6 additions and 1 deletions
--- a/data/openwebtext/prepare.py
+++ b/data/openwebtext/prepare.py
@@ -11,8 +11,13 @@ from datasets import load_dataset # huggingface datasets
 # good number to use is ~order number of cpu cores // 2
 num_proc = 8

+# number of workers in load_dataset() call
+# best number might be different from num_proc above as it also depends on network speed.
+# it is usually better than 1 though
+num_proc_load_dataset = num_proc
+
 # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
-dataset = load_dataset("openwebtext")
+dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)

 # owt by default only contains the 'train' split, so create a test split
 split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)