mirror of
https://github.com/osmarks/nanogpt-experiments.git
synced 2025-08-31 18:07:58 +00:00
first very bad commit
This commit is contained in:
84
data/openwebtext/prepare.py
Normal file
84
data/openwebtext/prepare.py
Normal file
@@ -0,0 +1,84 @@
|
||||
# Saves the openwebtext dataset to binary files (train.bin / val.bin) for training. The following reference was helpful:
|
||||
# https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
|
||||
|
||||
import mmap
|
||||
import subprocess
|
||||
import numpy as np
|
||||
import tiktoken
|
||||
from datasets import load_dataset # huggingface datasets
|
||||
|
||||
# Worker-process count handed to every .map() call in this script.
# A value on the order of the machine's CPU core count is a good choice.
num_proc = 16

# The openwebtext download occupies about 54GB in the huggingface .cache dir
# and contains roughly 8M documents (8,013,769).
dataset = load_dataset("openwebtext")

# openwebtext ships with only a 'train' split, so carve a small held-out
# split off it (fixed seed, so the split is reproducible) ...
split_dataset = dataset["train"].train_test_split(
    shuffle=True,
    seed=2357,
    test_size=0.0005,
)
# ... and store that held-out split under the key 'val' instead of 'test'.
split_dataset['val'] = split_dataset.pop('test')
|
||||
|
||||
# this results in:
|
||||
# >>> split_dataset
|
||||
# DatasetDict({
|
||||
# train: Dataset({
|
||||
# features: ['text'],
|
||||
# num_rows: 8009762
|
||||
# })
|
||||
# val: Dataset({
|
||||
# features: ['text'],
|
||||
# num_rows: 4007
|
||||
# })
|
||||
# })
|
||||
|
||||
# Tokenization step: set up the gpt2 bpe encoder, then define the function
# that is applied to every document.
enc = tiktoken.get_encoding("gpt2")


def process(example):
    """Tokenize a single document.

    example: a dataset row; only example['text'] is read.

    Returns a dict with
      'ids': the token ids of the text, produced by encode_ordinary (which
             ignores any special tokens), followed by the end-of-text token
             (e.g. 50256 for gpt2 bpe);
      'len': the number of entries in 'ids'.
    """
    token_ids = [*enc.encode_ordinary(example['text']), enc.eot_token]
    return {'ids': token_ids, 'len': len(token_ids)}
|
||||
|
||||
# Apply process() to every document of every split using num_proc workers.
# The raw 'text' column is dropped from the result.
tokenized = split_dataset.map(
    process,
    num_proc=num_proc,
    desc="tokenizing the splits",
    remove_columns=['text'],
)
|
||||
|
||||
# concatenate all the ids in each dataset into one large file we can use for training
for split, dset in tokenized.items():

    # Running sum of the document lengths: offset[i] is the token position at
    # which document i ENDS in the concatenated stream. int64 is requested
    # explicitly because the train split holds ~9B tokens, which does not fit
    # in the 32-bit default integer that some platforms use.
    offset = np.cumsum(dset['len'], dtype=np.int64).tolist()
    total = offset[-1]  # total number of tokens in the dataset
    dset = dset.add_column('offset', offset)

    # preallocate the output file at its final size so that the workers can
    # each fill in their own region of it
    filename = f'{split}.bin'
    dtype = np.uint16  # (can do since enc.max_token_value == 50256 is < 2**16)
    bytes_per_token = np.dtype(dtype).itemsize
    # Sized from Python rather than by shelling out to the `truncate`
    # binary, which is not available on every platform.
    with open(filename, 'wb') as f:
        f.truncate(total * bytes_per_token)

    # write the ids to the file
    def write_to_file(example):
        """Write one document's token ids into its slot of the output file.

        example: a dataset row with 'ids' (the token ids) and 'offset' (the
        position, counted in tokens, at which this document ends).

        The document occupies tokens [offset - len(ids), offset). These
        ranges are disjoint across documents, so several worker processes
        can write to the same file at once. Only this document's bytes are
        touched, and the file handle is closed on return; nothing is left
        mapped or open between calls.
        """
        ids = np.asarray(example['ids'], dtype=dtype)
        start = example['offset'] - len(ids)
        with open(filename, 'r+b') as f:
            f.seek(bytes_per_token * start)
            f.write(ids.tobytes())

    dset.map(
        write_to_file,
        desc=f"writing {split} split to file {filename}",
        num_proc=num_proc,
    )
|
||||
|
||||
# train.bin is ~17GB, val.bin ~8.5MB
|
||||
# train has ~9B tokens (9,035,582,198)
|
||||
# val has ~4M tokens (4,434,897)
|
||||
|
||||
# to read the bin files later, e.g. with numpy:
|
||||
# m = np.memmap('train.bin', dtype=np.uint16, mode='r')
|
15
data/openwebtext/readme.md
Normal file
15
data/openwebtext/readme.md
Normal file
@@ -0,0 +1,15 @@
|
||||
|
||||
## openwebtext dataset
|
||||
|
||||
after running `prepare.py` (preprocess) we get:
|
||||
|
||||
- train.bin is ~17GB, val.bin ~8.5MB
|
||||
- train has ~9B tokens (9,035,582,198)
|
||||
- val has ~4M tokens (4,434,897)
|
||||
|
||||
this came from 8,013,769 documents in total.
|
||||
|
||||
references:
|
||||
|
||||
- OpenAI's WebText dataset is discussed in [GPT-2 paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
|
||||
- [OpenWebText](https://skylion007.github.io/OpenWebTextCorpus/) dataset
|
Reference in New Issue
Block a user