mirror of https://github.com/osmarks/nanogpt-experiments.git synced 2024-09-21 03:39:44 +00:00

simplify the prepare script a lot, write only using one process, seems sufficient for now. ty @LaihoE for suggestion and @proger for flagging

This commit is contained in:
Andrej Karpathy 2022-12-30 22:18:20 +00:00
parent d8abd21258
commit 7c6ea8409e
2 changed files with 13 additions and 28 deletions

View File

@@ -11,6 +11,7 @@ Dependencies:
 - `pip install datasets` for huggingface datasets <3 (if you want to download + preprocess OpenWebText)
 - `pip install tiktoken` for OpenAI's fast bpe code <3
 - `pip install wandb` for optional logging <3
+- `pip install tqdm`
 ## usage

View File

@@ -1,15 +1,14 @@
 # saves the openwebtext dataset to a binary file for training. following was helpful:
 # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
-import mmap
-import subprocess
+from tqdm import tqdm
 import numpy as np
 import tiktoken
 from datasets import load_dataset # huggingface datasets
-# number of workers in .map() calls
-# good number to use is ~order num_cpu_cores()
-num_proc = 16
+# number of workers in .map() call
+# good number to use is ~order number of cpu cores // 2
+num_proc = 8
 # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
 dataset = load_dataset("openwebtext")
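The second hunk's header references a tokenization step that sits between these two hunks and is untouched by this commit. For orientation, here is a minimal sketch of what that step plausibly looks like, assuming GPT-2 BPE via tiktoken; `split_dataset` and `num_proc` come from the surrounding script, and the real code may differ in details:

import tiktoken

# sketch only: BPE-tokenize each document and record its token count
enc = tiktoken.get_encoding("gpt2")

def process(example):
    ids = enc.encode_ordinary(example['text'])  # encode_ordinary ignores special tokens
    ids.append(enc.eot_token)                   # append <|endoftext|> (id 50256)
    return {'ids': ids, 'len': len(ids)}

tokenized = split_dataset.map(
    process,
    remove_columns=['text'],
    desc="tokenizing the splits",
    num_proc=num_proc,
)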
@@ -49,32 +48,17 @@ tokenized = split_dataset.map(
 # concatenate all the ids in each dataset into one large file we can use for training
 for split, dset in tokenized.items():
-    offset = np.cumsum(dset['len']).tolist()
-    total = offset[-1] # total number of tokens in the dataset
-    dset = dset.add_column('offset', offset)
-    # preallocate space in a temporary file to store the concatenated ids
+    arr_len = np.sum(dset['len'])
     filename = f'{split}.bin'
     dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
-    bytes_per_token = 2 # i.e. np.dtype(dtype).itemsize
-    subprocess.run(['truncate', '-s', str(total * bytes_per_token), filename], check=True)
-    # write the ids to the file
-    def write_to_file(example):
-        with open(filename, 'r+b') as f:
-            arr_len = len(example['ids'])
-            start = example['offset'] - arr_len
-            mm = mmap.mmap(f.fileno(), 0)
-            arr = np.ndarray((arr_len,), dtype=dtype, buffer=mm, offset=bytes_per_token * start)
-            arr[:] = example['ids']
-            mm.flush()
-    dset.map(
-        write_to_file,
-        desc=f"writing {split} split to file {filename}",
-        num_proc=num_proc,
-    )
+    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
+    print(f"writing {filename}...")
+    idx = 0
+    for example in tqdm(dset):
+        arr[idx : idx + example['len']] = example['ids']
+        idx += example['len']
+    arr.flush()
 # train.bin is ~17GB, val.bin ~8.5MB
 # train has ~9B tokens (9,035,582,198)
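Since the new write path lays the token ids out as one flat uint16 array per split, the resulting .bin files can be mapped straight back into memory for training. A minimal read-back sketch follows; the file name and block size are illustrative, not taken from this diff:

import numpy as np

# map train.bin back as a flat array of uint16 token ids without loading it into RAM
data = np.memmap('train.bin', dtype=np.uint16, mode='r')
print(f"{len(data):,} tokens")  # ~9B for the train split

block_size = 1024  # illustrative context length
i = np.random.randint(len(data) - block_size)
x = data[i : i + block_size].astype(np.int64)          # model inputs
y = data[i + 1 : i + 1 + block_size].astype(np.int64)  # targets, shifted by one token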