mirror of https://github.com/osmarks/nanogpt-experiments.git synced 2024-09-21 03:39:44 +00:00

simplify the prepare script a lot, write only using one process, seems sufficient for now. ty @LaihoE for suggestion and @proger for flagging

This commit is contained in:
Andrej Karpathy 2022-12-30 22:18:20 +00:00
parent d8abd21258
commit 7c6ea8409e
2 changed files with 13 additions and 28 deletions

View File

@@ -11,6 +11,7 @@ Dependencies:
 - `pip install datasets` for huggingface datasets <3 (if you want to download + preprocess OpenWebText)
 - `pip install tiktoken` for OpenAI's fast bpe code <3
 - `pip install wandb` for optional logging <3
+- `pip install tqdm`
 ## usage

View File

@@ -1,15 +1,14 @@
 # saves the openwebtext dataset to a binary file for training. following was helpful:
 # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
-import mmap
-import subprocess
+from tqdm import tqdm
 import numpy as np
 import tiktoken
 from datasets import load_dataset # huggingface datasets
-# number of workers in .map() calls
-# good number to use is ~order num_cpu_cores()
-num_proc = 16
+# number of workers in .map() call
+# good number to use is ~order number of cpu cores // 2
+num_proc = 8
 # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
 dataset = load_dataset("openwebtext")
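The second hunk's header references a tokenization step that sits between these two hunks and is untouched by this commit. For orientation, here is a minimal sketch of what that step plausibly looks like, assuming GPT-2 BPE via tiktoken; `split_dataset` and `num_proc` come from the surrounding script, and the real code may differ in details:

import tiktoken

# sketch only: BPE-tokenize each document and record its token count
enc = tiktoken.get_encoding("gpt2")

def process(example):
    ids = enc.encode_ordinary(example['text'])  # encode_ordinary ignores special tokens
    ids.append(enc.eot_token)                   # append <|endoftext|> (id 50256)
    return {'ids': ids, 'len': len(ids)}

tokenized = split_dataset.map(
    process,
    remove_columns=['text'],
    desc="tokenizing the splits",
    num_proc=num_proc,
)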
@@ -49,32 +48,17 @@ tokenized = split_dataset.map(
 # concatenate all the ids in each dataset into one large file we can use for training
 for split, dset in tokenized.items():
-    offset = np.cumsum(dset['len']).tolist()
-    total = offset[-1] # total number of tokens in the dataset
-    dset = dset.add_column('offset', offset)
-    # preallocate space in a temporary file to store the concatenated ids
+    arr_len = np.sum(dset['len'])
     filename = f'{split}.bin'
     dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
-    bytes_per_token = 2 # i.e. np.dtype(dtype).itemsize
-    subprocess.run(['truncate', '-s', str(total * bytes_per_token), filename], check=True)
-    # write the ids to the file
-    def write_to_file(example):
-        with open(filename, 'r+b') as f:
-            arr_len = len(example['ids'])
-            start = example['offset'] - arr_len
-            mm = mmap.mmap(f.fileno(), 0)
-            arr = np.ndarray((arr_len,), dtype=dtype, buffer=mm, offset=bytes_per_token * start)
-            arr[:] = example['ids']
-            mm.flush()
-    dset.map(
-        write_to_file,
-        desc=f"writing {split} split to file {filename}",
-        num_proc=num_proc,
-    )
+    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
+    print(f"writing {filename}...")
+    idx = 0
+    for example in tqdm(dset):
+        arr[idx : idx + example['len']] = example['ids']
+        idx += example['len']
+    arr.flush()
 # train.bin is ~17GB, val.bin ~8.5MB
 # train has ~9B tokens (9,035,582,198)
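Since the new write path lays the token ids out as one flat uint16 array per split, the resulting .bin files can be mapped straight back into memory for training. A minimal read-back sketch follows; the file name and block size are illustrative, not taken from this diff:

import numpy as np

# map train.bin back as a flat array of uint16 token ids without loading it into RAM
data = np.memmap('train.bin', dtype=np.uint16, mode='r')
print(f"{len(data):,} tokens")  # ~9B for the train split

block_size = 1024  # illustrative context length
i = np.random.randint(len(data) - block_size)
x = data[i : i + block_size].astype(np.int64)          # model inputs
y = data[i + 1 : i + 1 + block_size].astype(np.int64)  # targets, shifted by one token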