From 7c6ea8409e44deb188b4c3213c01fd1e9174537a Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Fri, 30 Dec 2022 22:18:20 +0000
Subject: [PATCH] simplify the prepare script a lot, write only using one
 process, seems sufficient for now. ty @LaihoE for suggestion and @proger for
 flagging

---
 README.md                   |  1 +
 data/openwebtext/prepare.py | 40 ++++++++++++----------------------------
 2 files changed, 13 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 378feb7..16069cf 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ Dependencies:
 - `pip install datasets` for huggingface datasets <3 (if you want to download + preprocess OpenWebText)
 - `pip install tiktoken` for OpenAI's fast bpe code <3
 - `pip install wandb` for optional logging <3
+- `pip install tqdm`
 
 ## usage
 
diff --git a/data/openwebtext/prepare.py b/data/openwebtext/prepare.py
index b962126..81ca13a 100644
--- a/data/openwebtext/prepare.py
+++ b/data/openwebtext/prepare.py
@@ -1,15 +1,14 @@
 # saves the openwebtext dataset to a binary file for training. following was helpful:
 # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
 
-import mmap
-import subprocess
+from tqdm import tqdm
 import numpy as np
 import tiktoken
 from datasets import load_dataset # huggingface datasets
 
-# number of workers in .map() calls
-# good number to use is ~order num_cpu_cores()
-num_proc = 16
+# number of workers in .map() call
+# good number to use is ~order number of cpu cores // 2
+num_proc = 8
 
 # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
 dataset = load_dataset("openwebtext")
@@ -49,32 +48,17 @@ tokenized = split_dataset.map(
 
 # concatenate all the ids in each dataset into one large file we can use for training
 for split, dset in tokenized.items():
-
-    offset = np.cumsum(dset['len']).tolist()
-    total = offset[-1] # total number of tokens in the dataset
-    dset = dset.add_column('offset', offset)
-
-    # preallocate space in a temporary file to store the concatenated ids
+    arr_len = np.sum(dset['len'])
     filename = f'{split}.bin'
     dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
-    bytes_per_token = 2 # i.e. np.dtype(dtype).itemsize
-    subprocess.run(['truncate', '-s', str(total * bytes_per_token), filename], check=True)
+    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
 
-    # write the ids to the file
-    def write_to_file(example):
-        with open(filename, 'r+b') as f:
-            arr_len = len(example['ids'])
-            start = example['offset'] - arr_len
-            mm = mmap.mmap(f.fileno(), 0)
-            arr = np.ndarray((arr_len,), dtype=dtype, buffer=mm, offset=bytes_per_token * start)
-            arr[:] = example['ids']
-            mm.flush()
-
-    dset.map(
-        write_to_file,
-        desc=f"writing {split} split to file {filename}",
-        num_proc=num_proc,
-    )
+    print(f"writing {filename}...")
+    idx = 0
+    for example in tqdm(dset):
+        arr[idx : idx + example['len']] = example['ids']
+        idx += example['len']
+    arr.flush()
 
 # train.bin is ~17GB, val.bin ~8.5MB
 # train has ~9B tokens (9,035,582,198)
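
Note, not part of the patch: a minimal sketch of how the resulting train.bin / val.bin
files could be read back later, assuming the same np.uint16 dtype that the new prepare.py
writes with; the block_size value and the random sampling are illustrative assumptions,
not something taken from this commit.

    # sketch: read tokens back from a .bin file written by the memmap-based prepare.py
    import numpy as np

    data = np.memmap('train.bin', dtype=np.uint16, mode='r')  # no full load into RAM
    print(f"{len(data):,} tokens")

    # grab one contiguous block of tokens, e.g. as a single training example
    block_size = 1024  # illustrative value, an assumption rather than part of the patch
    i = np.random.randint(0, len(data) - block_size)
    x = data[i : i + block_size].astype(np.int64)  # cast up for downstream use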