Mirror of https://github.com/osmarks/nanogpt-experiments.git (synced 2024-12-18 14:10:28 +00:00)
simplify the prepare script a lot, write only using one process, seems sufficient for now. ty @LaihoE for suggestion and @proger for flagging
commit 7c6ea8409e
parent d8abd21258
README.md
@@ -11,6 +11,7 @@ Dependencies:
 - `pip install datasets` for huggingface datasets <3 (if you want to download + preprocess OpenWebText)
 - `pip install tiktoken` for OpenAI's fast bpe code <3
 - `pip install wandb` for optional logging <3
+- `pip install tqdm`
 
 ## usage
 
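The prepare.py hunk below replaces the multi-process mmap writer with a single sequential pass over a preallocated np.memmap. As a toy standalone sketch of that pattern (made-up documents and a hypothetical toy.bin path, not code from the commit):

```python
import numpy as np

# stand-in for tokenized documents; each inner list plays the role of example['ids']
docs = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]

# size the file up front from the total token count, then fill it sequentially
total = sum(len(d) for d in docs)
arr = np.memmap('toy.bin', dtype=np.uint16, mode='w+', shape=(total,))
idx = 0
for d in docs:
    arr[idx:idx + len(d)] = d
    idx += len(d)
arr.flush()  # one process, one pass; no mmap/offset bookkeeping needed
```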
data/openwebtext/prepare.py
@@ -1,15 +1,14 @@
 # saves the openwebtext dataset to a binary file for training. following was helpful:
 # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
 
-import mmap
-import subprocess
+from tqdm import tqdm
 import numpy as np
 import tiktoken
 from datasets import load_dataset # huggingface datasets
 
-# number of workers in .map() calls
-# good number to use is ~order num_cpu_cores()
-num_proc = 16
+# number of workers in .map() call
+# good number to use is ~order number of cpu cores // 2
+num_proc = 8
 
 # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
 dataset = load_dataset("openwebtext")
@@ -49,32 +48,17 @@ tokenized = split_dataset.map(
 
 # concatenate all the ids in each dataset into one large file we can use for training
 for split, dset in tokenized.items():
-
-    offset = np.cumsum(dset['len']).tolist()
-    total = offset[-1] # total number of tokens in the dataset
-    dset = dset.add_column('offset', offset)
-
-    # preallocate space in a temporary file to store the concatenated ids
+    arr_len = np.sum(dset['len'])
     filename = f'{split}.bin'
     dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
-    bytes_per_token = 2 # i.e. np.dtype(dtype).itemsize
-    subprocess.run(['truncate', '-s', str(total * bytes_per_token), filename], check=True)
+    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
 
-    # write the ids to the file
-    def write_to_file(example):
-        with open(filename, 'r+b') as f:
-            arr_len = len(example['ids'])
-            start = example['offset'] - arr_len
-            mm = mmap.mmap(f.fileno(), 0)
-            arr = np.ndarray((arr_len,), dtype=dtype, buffer=mm, offset=bytes_per_token * start)
-            arr[:] = example['ids']
-            mm.flush()
-
-    dset.map(
-        write_to_file,
-        desc=f"writing {split} split to file {filename}",
-        num_proc=num_proc,
-    )
+    print(f"writing {filename}...")
+    idx = 0
+    for example in tqdm(dset):
+        arr[idx : idx + example['len']] = example['ids']
+        idx += example['len']
+    arr.flush()
 
 # train.bin is ~17GB, val.bin ~8.5MB
 # train has ~9B tokens (9,035,582,198)
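For a quick sanity check of the output, the resulting train.bin / val.bin are just flat arrays of uint16 GPT-2 token ids, so they can be memory-mapped back and decoded. A minimal sketch, not part of the commit (the gpt2 encoding matches what prepare.py uses; the hard-coded 'val.bin' path assumes you run it from the same directory the script wrote to):

```python
import numpy as np
import tiktoken

# read the tokens back without loading the whole file into RAM
data = np.memmap('val.bin', dtype=np.uint16, mode='r')
print(f"{len(data):,} tokens")

# decode a short prefix to eyeball that the text survived tokenization
enc = tiktoken.get_encoding("gpt2")
print(enc.decode(data[:256].tolist()))
```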