From bb7e96754a57ba53a99c902c3c5396f4acd7cf97 Mon Sep 17 00:00:00 2001
From: Oleksandr Kuvshynov <661042+okuvshynov@users.noreply.github.com>
Date: Fri, 16 Jun 2023 20:00:17 -0400
Subject: [PATCH] nanogpt: allow multithreading in load dataset

---
 data/openwebtext/prepare.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/data/openwebtext/prepare.py b/data/openwebtext/prepare.py
index aad8170..eebfe53 100644
--- a/data/openwebtext/prepare.py
+++ b/data/openwebtext/prepare.py
@@ -11,8 +11,13 @@ from datasets import load_dataset # huggingface datasets
 # good number to use is ~order number of cpu cores // 2
 num_proc = 8
 
+# number of workers in load_dataset() call
+# best number might be different from num_proc above as it also depends on NW speed.
+# it is better than 1 usually though
+num_proc_load_dataset = num_proc
+
 # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
-dataset = load_dataset("openwebtext")
+dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)
 
 # owt by default only contains the 'train' split, so create a test split
 split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)