use script-relative paths (built from os.path.dirname(__file__)) so that running the data prep scripts always creates files in the script's own folder, no matter where they are run from

2025-11-01 07:43:01 +00:00 · 2023-01-20 10:39:45 -08:00
parent 2c7806db6e
commit edb7a7eab0
3 changed files with 7 additions and 6 deletions
--- a/data/openwebtext/prepare.py
+++ b/data/openwebtext/prepare.py
@@ -1,6 +1,7 @@
 # saves the openwebtext dataset to a binary file for training. following was helpful:
 # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py

+import os
 from tqdm import tqdm
 import numpy as np
 import tiktoken
@@ -50,7 +51,7 @@ tokenized = split_dataset.map(
 # concatenate all the ids in each dataset into one large file we can use for training
 for split, dset in tokenized.items():
    arr_len = np.sum(dset['len'])
-    filename = f'{split}.bin'
+    filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
    dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))