mirror of
https://github.com/osmarks/nanogpt-experiments.git
synced 2025-09-01 18:37:58 +00:00
Use paths relative to the script file so that running the data prep scripts always creates the output files in the script's own folder, no matter where they are run from
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
# saves the openwebtext dataset to a binary file for training. following was helpful:
|
||||
# https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
|
||||
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
import numpy as np
|
||||
import tiktoken
|
||||
@@ -50,7 +51,7 @@ tokenized = split_dataset.map(
|
||||
# concatenate all the ids in each dataset into one large file we can use for training
|
||||
for split, dset in tokenized.items():
|
||||
arr_len = np.sum(dset['len'])
|
||||
filename = f'{split}.bin'
|
||||
filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
|
||||
dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
|
||||
arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
|
||||
|
||||
|
Reference in New Issue
Block a user