# nanogpt-experiments/data/openwebtext/prepare.py

# saves the openwebtext dataset to binary files (one .bin per split) for training.
# the following was helpful:
# https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py

import mmap
import subprocess
import numpy as np
import tiktoken
from datasets import load_dataset # huggingface datasets

# how many worker processes each .map() call below fans out to;
# something on the order of the machine's cpu core count is a sensible value
num_proc = 16

# load openwebtext through huggingface datasets
# (occupies 54GB in the huggingface .cache dir; 8,013,769 documents, i.e. about 8M)
dataset = load_dataset("openwebtext")

# openwebtext only comes with a 'train' split, so carve a small held-out
# split off of it; the fixed seed keeps the shuffle/split reproducible
split_dataset = dataset["train"].train_test_split(
    shuffle=True,
    seed=2357,
    test_size=0.0005,
)
# train_test_split names the held-out part 'test'; expose it as 'val' instead
split_dataset['val'] = split_dataset.pop('test')

# this results in:
# >>> split_dataset
# DatasetDict({
#     train: Dataset({
#         features: ['text'],
#         num_rows: 8009762
#     })
#     val: Dataset({
#         features: ['text'],
#         num_rows: 4007
#     })
# })

# next step is tokenization: build the gpt2 bpe encoder once at module level
# and define the per-document function that .map() will apply
enc = tiktoken.get_encoding("gpt2")
def process(example):
    """Tokenize one document with the gpt2 bpe encoder.

    Reads example['text'] and returns a dict holding the token ids under
    'ids' (with the end-of-text token appended) and their count under 'len'.
    """
    # encode_ordinary ignores any special tokens that appear in the text
    tokens = enc.encode_ordinary(example['text'])
    # mark the document boundary with the end of text token (e.g. 50256 for gpt2 bpe)
    tokens.append(enc.eot_token)
    return {'ids': tokens, 'len': len(tokens)}

# apply process to every document of every split, spread over num_proc workers;
# the raw 'text' column is removed from the result
tokenized = split_dataset.map(
    process,
    num_proc=num_proc,
    desc="tokenizing the splits",
    remove_columns=['text'],
)

# concatenate all the ids in each dataset into one large file we can use for training
for split, dset in tokenized.items():

    # offset[i] is the number of tokens in documents 0..i inclusive, i.e. the position
    # (in tokens) at which document i ends inside the concatenated stream.
    # accumulate in 64 bits explicitly: the train split holds ~9B tokens, which does
    # not fit in the 32-bit default integer that numpy uses on some platforms
    offset = np.cumsum(dset['len'], dtype=np.int64).tolist()
    total = offset[-1] # total number of tokens in the dataset
    dset = dset.add_column('offset', offset)

    # preallocate the output file at its final size, so that every worker below can
    # write its own documents into their own region of the file
    filename = f'{split}.bin'
    dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
    bytes_per_token = np.dtype(dtype).itemsize # derived from dtype so the two cannot drift apart
    # size the file from python rather than shelling out to the `truncate` binary,
    # which is not installed on every platform; opening with 'wb' also discards any
    # stale file left behind by a previous run
    with open(filename, 'wb') as f:
        f.truncate(total * bytes_per_token)

    # write the ids to the file
    def write_to_file(example):
        """Copy one document's token ids into its slot of the preallocated file.

        example['offset'] is the end position (in tokens) of this document in the
        concatenated stream, so the document starts len(example['ids']) tokens earlier.
        Returns None; it is called only for its side effect on the file.
        """
        arr_len = len(example['ids'])
        start = example['offset'] - arr_len
        # map only this document's byte range of the file (np.memmap takes care of
        # aligning the offset) instead of mapping the whole multi-GB file per document
        arr = np.memmap(
            filename,
            dtype=dtype,
            mode='r+',
            offset=bytes_per_token * start,
            shape=(arr_len,),
        )
        arr[:] = example['ids']
        arr.flush()
        del arr # drop the reference to the mapping as soon as the data is flushed

    dset.map(
        write_to_file,
        desc=f"writing {split} split to file {filename}",
        num_proc=num_proc,
    )

# train.bin is ~17GB, val.bin ~8.5MB
# train has ~9B tokens (9,035,582,198)
# val has ~4M tokens (4,434,897)

# to read the bin files later, e.g. with numpy:
# m = np.memmap('train.bin', dtype=np.uint16, mode='r')