first very bad commit

2025-10-23 11:37:38 +00:00 · 2022-12-28 00:58:19 +00:00
commit fe8042867c
6 changed files with 628 additions and 0 deletions
--- a/data/openwebtext/prepare.py
+++ b/data/openwebtext/prepare.py
@@ -0,0 +1,84 @@
+# tokenizes the openwebtext dataset and saves each split to a binary file (train.bin, val.bin) for training. following was helpful:
+# https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
+
+import mmap
+import subprocess
+import numpy as np
+import tiktoken
+from datasets import load_dataset # huggingface datasets
+
+# number of worker processes used by the .map() calls below
+# a good value is on the order of the number of cpu cores on this machine
+num_proc = 16
+
+# takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
+dataset = load_dataset("openwebtext")
+
+# owt by default only contains the 'train' split, so carve a small held-out split out of it (shuffled with a fixed seed)
+split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True) # holds out 0.05% of the documents
+split_dataset['val'] = split_dataset.pop('test') # rename the held-out 'test' split to 'val'
+
+# this results in:
+# >>> split_dataset
+# DatasetDict({
+#     train: Dataset({
+#         features: ['text'],
+#         num_rows: 8009762
+#     })
+#     val: Dataset({
+#         features: ['text'],
+#         num_rows: 4007
+#     })
+# })
+
+# we now want to tokenize the dataset. first define the encoding function (gpt2 bpe)
+enc = tiktoken.get_encoding("gpt2")
+def process(example): # maps one document (a dict with a 'text' field) to its token ids and their count
+    ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
+    ids.append(enc.eot_token) # terminate every document with the end of text token, e.g. 50256 for gpt2 bpe
+    out = {'ids': ids, 'len': len(ids)} # 'len' counts the appended end of text token too
+    return out
+
+# tokenize every split with num_proc workers; the raw 'text' column is dropped, leaving 'ids' and 'len'
+tokenized = split_dataset.map(
+    process,
+    remove_columns=['text'],
+    desc="tokenizing the splits",
+    num_proc=num_proc,
+)
+
+# concatenate all the ids of each split into one flat binary file ({split}.bin) we can use for training
+for split, dset in tokenized.items():
+
+    offset = np.cumsum(dset['len']).tolist() # offset[i] = token count of documents 0..i, i.e. the position where document i ends
+    total = offset[-1] # total number of tokens in the dataset
+    dset = dset.add_column('offset', offset) # attach each document's end position to its row
+
+    # preallocate the output file at its final size, so each document can be written directly into its own slot
+    filename = f'{split}.bin'
+    dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
+    bytes_per_token = 2 # i.e. np.dtype(dtype).itemsize
+    subprocess.run(['truncate', '-s', str(total * bytes_per_token), filename], check=True) # needs the external 'truncate' utility; check=True raises if it fails
+
+    # write one document's ids into its slot of the file (called once per example by dset.map below)
+    def write_to_file(example):
+        with open(filename, 'r+b') as f: # r+b: read/write the existing file without truncating it
+            arr_len = len(example['ids'])
+            start = example['offset'] - arr_len # 'offset' is where this document ends, so back up by its length
+            mm = mmap.mmap(f.fileno(), 0) # length 0 maps the whole file
+            arr = np.ndarray((arr_len,), dtype=dtype, buffer=mm, offset=bytes_per_token * start) # view onto this document's slot; offset is in bytes
+            arr[:] = example['ids'] # copy the ids into the mapped file
+            mm.flush() # flush the mapped changes back to the file
+
+    dset.map( # run for its side effect only: write_to_file returns nothing and the result is discarded
+        write_to_file,
+        desc=f"writing {split} split to file {filename}",
+        num_proc=num_proc,
+    )
+
+# train.bin is ~17GB, val.bin ~8.5MB
+# train has ~9B tokens (9,035,582,198)
+# val has ~4M tokens (4,434,897)
+
+# to read the bin files later, e.g. with numpy:
+# m = np.memmap('train.bin', dtype=np.uint16, mode='r')
--- a/data/openwebtext/readme.md
+++ b/data/openwebtext/readme.md
@@ -0,0 +1,15 @@
+
+## openwebtext dataset
+
+after running `prepare.py` (the preprocessing step) we get:
+
+- train.bin is ~17GB, val.bin ~8.5MB
+- train has ~9B tokens (9,035,582,198)
+- val has ~4M tokens (4,434,897)
+
+this came from 8,013,769 documents in total.
+
+references:
+
+- OpenAI's WebText dataset is discussed in [GPT-2 paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
+- [OpenWebText](https://skylion007.github.io/OpenWebTextCorpus/) dataset