# saves the openwebtext dataset to a binary file for training. the following was helpful:
# https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
import mmap
import subprocess

import numpy as np
import tiktoken
from datasets import load_dataset  # huggingface datasets

# number of workers in .map() calls
# a good number to use is on the order of num_cpu_cores()
num_proc = 16

# takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
dataset = load_dataset("openwebtext")

# owt by default only contains the 'train' split, so create a test split
split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)
split_dataset['val'] = split_dataset.pop('test')  # rename the test split to val

# this results in:
# >>> split_dataset
# DatasetDict({
#     train: Dataset({
#         features: ['text'],
#         num_rows: 8009762
#     })
#     val: Dataset({
#         features: ['text'],
#         num_rows: 4007
#     })
# })

# we now want to tokenize the dataset. first define the encoding function (gpt2 bpe)
enc = tiktoken.get_encoding("gpt2")

def process(example):
    ids = enc.encode_ordinary(example['text'])  # encode_ordinary ignores any special tokens
    ids.append(enc.eot_token)  # add the end of text token, e.g. 50256 for gpt2 bpe
    out = {'ids': ids, 'len': len(ids)}
    return out

# tokenize the dataset
tokenized = split_dataset.map(
    process,
    remove_columns=['text'],
    desc="tokenizing the splits",
    num_proc=num_proc,
)

# concatenate all the ids in each split into one large binary file we can use for training
for split, dset in tokenized.items():
    # running total of tokens gives each document its (exclusive) end offset in the file
    offset = np.cumsum(dset['len']).tolist()
    total = offset[-1]  # total number of tokens in this split
    dset = dset.add_column('offset', offset)

    # preallocate space in the output file that will hold the concatenated ids
    filename = f'{split}.bin'
    dtype = np.uint16  # (can do since enc.max_token_value == 50256 is < 2**16)
    bytes_per_token = 2  # i.e. np.dtype(dtype).itemsize
    subprocess.run(['truncate', '-s', str(total * bytes_per_token), filename], check=True)

    # write each document's ids into its slice of the memory-mapped file
    def write_to_file(example):
        with open(filename, 'r+b') as f:
            arr_len = len(example['ids'])
            start = example['offset'] - arr_len  # offset is the end position, so back up by the length
            mm = mmap.mmap(f.fileno(), 0)
            arr = np.ndarray((arr_len,), dtype=dtype, buffer=mm, offset=bytes_per_token * start)
            arr[:] = example['ids']
            mm.flush()

    dset.map(
        write_to_file,
        desc=f"writing {split} split to file {filename}",
        num_proc=num_proc,
    )

# train.bin is ~17GB, val.bin ~8.5MB
# train has ~9B tokens (9,035,582,198)
# val has ~4M tokens (4,434,897)

# to read the bin files later, e.g. with numpy:
# m = np.memmap('train.bin', dtype=np.uint16, mode='r')
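
# optional sanity check (a minimal sketch added here, not part of the original pipeline):
# read the freshly written bin files back with np.memmap, print the token counts so they
# can be compared against the totals above, and decode the first few tokens to make sure
# they look like text.
for split in ['train', 'val']:
    data = np.memmap(f'{split}.bin', dtype=np.uint16, mode='r')
    print(f"{split}.bin: {len(data):,} tokens")
    print(enc.decode(data[:32].tolist()))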
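
# a minimal sketch of how a training loop might sample batches from the bin file;
# train_data, block_size and batch_size are illustrative names/values, not defined above:
# train_data = np.memmap('train.bin', dtype=np.uint16, mode='r')
# block_size, batch_size = 1024, 8
# ix = np.random.randint(len(train_data) - block_size, size=batch_size)
# x = np.stack([train_data[i : i + block_size].astype(np.int64) for i in ix])
# y = np.stack([train_data[i + 1 : i + 1 + block_size].astype(np.int64) for i in ix])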