mirror of
				https://github.com/osmarks/nanogpt-experiments.git
				synced 2025-10-31 15:23:01 +00:00 
			
		
		
		
	Merge pull request #301 from okuvshynov/master
[easy] allow multithreading in load_dataset
This commit is contained in:
		| @@ -11,8 +11,13 @@ from datasets import load_dataset # huggingface datasets | |||||||
| # a good number to use is on the order of (number of CPU cores) // 2 | # a good number to use is on the order of (number of CPU cores) // 2 | ||||||
| num_proc = 8 | num_proc = 8 | ||||||
|  |  | ||||||
|  | # number of workers in load_dataset() call | ||||||
|  | # the best number might differ from num_proc above, as it also depends on network speed. | ||||||
|  | # it is usually better than 1, though | ||||||
|  | num_proc_load_dataset = num_proc | ||||||
|  |  | ||||||
| # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769) | # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769) | ||||||
| dataset = load_dataset("openwebtext") | dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset) | ||||||
|  |  | ||||||
| # owt by default only contains the 'train' split, so create a test split | # owt by default only contains the 'train' split, so create a test split | ||||||
| split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True) | split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Andrej
					Andrej