Mirror of https://github.com/osmarks/nanogpt-experiments.git (synced 2024-11-10 20:09:58 +00:00)
simplify the prepare script a lot, write using only one process, seems sufficient for now. ty @LaihoE for the suggestion and @proger for flagging
parent d8abd21258
commit 7c6ea8409e
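In essence, the change replaces the parallel mmap-based writer with a single preallocated np.memmap that one process fills sequentially. A condensed, runnable sketch of that write path follows (a toy list stands in for the HuggingFace `tokenized` dataset, so the total length is computed with a comprehension instead of the real script's `dset['len']` column access):

    import numpy as np
    from tqdm import tqdm

    # toy stand-in for the tokenized HuggingFace splits produced earlier in prepare.py
    tokenized = {'val': [{'ids': [1, 2, 3], 'len': 3}, {'ids': [4, 5], 'len': 2}]}

    for split, dset in tokenized.items():
        arr_len = int(np.sum([ex['len'] for ex in dset]))  # total tokens in this split
        arr = np.memmap(f'{split}.bin', dtype=np.uint16, mode='w+', shape=(arr_len,))
        idx = 0
        for example in tqdm(dset):  # one process, writing sequentially
            arr[idx : idx + example['len']] = example['ids']
            idx += example['len']
        arr.flush()  # make sure everything is written to disk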
README.md
@@ -11,6 +11,7 @@ Dependencies:
 - `pip install datasets` for huggingface datasets <3 (if you want to download + preprocess OpenWebText)
 - `pip install tiktoken` for OpenAI's fast bpe code <3
 - `pip install wandb` for optional logging <3
+- `pip install tqdm`
 
 ## usage
 
data/openwebtext/prepare.py
@@ -1,15 +1,14 @@
 # saves the openwebtext dataset to a binary file for training. following was helpful:
 # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
 
-import mmap
-import subprocess
+from tqdm import tqdm
 import numpy as np
 import tiktoken
 from datasets import load_dataset # huggingface datasets
 
-# number of workers in .map() calls
-# good number to use is ~order num_cpu_cores()
-num_proc = 16
+# number of workers in .map() call
+# good number to use is ~order number of cpu cores // 2
+num_proc = 8
 
 # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
 dataset = load_dataset("openwebtext")
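The `num_proc` workers are spent on the tokenization `.map()` call referenced by the next hunk's context line (`tokenized = split_dataset.map(`). A rough sketch of that step, where the `process` function name and the exact keyword arguments are assumptions rather than taken from this diff, and `split_dataset`/`num_proc` come from earlier in the script:

    import tiktoken
    enc = tiktoken.get_encoding("gpt2")

    def process(example):
        # encode_ordinary ignores any special tokens in the raw text
        ids = enc.encode_ordinary(example['text'])
        ids.append(enc.eot_token)  # append the gpt2 end-of-text token (50256)
        return {'ids': ids, 'len': len(ids)}

    tokenized = split_dataset.map(
        process,
        remove_columns=['text'],
        desc="tokenizing the splits",
        num_proc=num_proc,
    )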
@@ -49,32 +48,17 @@ tokenized = split_dataset.map(
 
 # concatenate all the ids in each dataset into one large file we can use for training
 for split, dset in tokenized.items():
-
-    offset = np.cumsum(dset['len']).tolist()
-    total = offset[-1] # total number of tokens in the dataset
-    dset = dset.add_column('offset', offset)
-
-    # preallocate space in a temporary file to store the concatenated ids
+    arr_len = np.sum(dset['len'])
     filename = f'{split}.bin'
     dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
-    bytes_per_token = 2 # i.e. np.dtype(dtype).itemsize
-    subprocess.run(['truncate', '-s', str(total * bytes_per_token), filename], check=True)
+    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
 
-    # write the ids to the file
-    def write_to_file(example):
-        with open(filename, 'r+b') as f:
-            arr_len = len(example['ids'])
-            start = example['offset'] - arr_len
-            mm = mmap.mmap(f.fileno(), 0)
-            arr = np.ndarray((arr_len,), dtype=dtype, buffer=mm, offset=bytes_per_token * start)
-            arr[:] = example['ids']
-            mm.flush()
-
-    dset.map(
-        write_to_file,
-        desc=f"writing {split} split to file {filename}",
-        num_proc=num_proc,
-    )
+    print(f"writing {filename}...")
+    idx = 0
+    for example in tqdm(dset):
+        arr[idx : idx + example['len']] = example['ids']
+        idx += example['len']
+    arr.flush()
 
 # train.bin is ~17GB, val.bin ~8.5MB
 # train has ~9B tokens (9,035,582,198)
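The resulting .bin files are flat arrays of uint16 token ids (safe because the gpt2 tokenizer's max token value, 50256, is below 2**16, as the dtype comment above notes). A minimal sketch of reading one back, with the filename and slice length chosen purely for illustration:

    import numpy as np
    import tiktoken

    # memory-map the result read-only; no need to pull ~17GB of train.bin into RAM
    data = np.memmap('train.bin', dtype=np.uint16, mode='r')
    print(data.shape)  # (total number of tokens,)

    enc = tiktoken.get_encoding("gpt2")
    print(enc.decode(data[:16].tolist()))  # decode the first 16 token ids back to text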