Mirror of https://github.com/osmarks/nanogpt-experiments.git (synced 2024-11-10 20:09:58 +00:00)
use relative paths so that the data prep scripts always create their files in the local folder, no matter where they are run from
This commit is contained in:
parent 2c7806db6e
commit edb7a7eab0
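In practice the change swaps bare relative filenames for paths anchored to the script's own directory via os.path.dirname(__file__). A minimal sketch of the pattern, with an illustrative example.bin filename that is not part of the commit:

import os

# Directory containing this script, regardless of the current working directory.
script_dir = os.path.dirname(__file__)

# Before this commit: a bare relative name, written into whatever directory
# the script happens to be invoked from.
out_path = 'example.bin'

# After this commit: anchored to the script's own folder.
out_path = os.path.join(script_dir, 'example.bin')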
data/openwebtext/prepare.py
@@ -1,6 +1,7 @@
 # saves the openwebtext dataset to a binary file for training. following was helpful:
 # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
 
+import os
 from tqdm import tqdm
 import numpy as np
 import tiktoken
@@ -50,7 +51,7 @@ tokenized = split_dataset.map(
 # concatenate all the ids in each dataset into one large file we can use for training
 for split, dset in tokenized.items():
     arr_len = np.sum(dset['len'])
-    filename = f'{split}.bin'
+    filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
     dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
     arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
 
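For context, these .bin files are raw token dumps that are typically read back by memory-mapping them with the same uint16 dtype. A hedged sketch of what a reader could look like; the data/openwebtext directory layout here is an assumption for illustration, not something this commit touches:

import os
import numpy as np

# Hypothetical reader for the files written above; the data/openwebtext
# location is an assumed layout, not part of this commit.
data_dir = os.path.join(os.path.dirname(__file__), 'data', 'openwebtext')
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
print(f"train split holds {len(train_data):,} tokens")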
data/shakespeare/prepare.py
@@ -25,8 +25,8 @@ print(f"val has {len(val_ids):,} tokens")
 # export to bin files
 train_ids = np.array(train_ids, dtype=np.uint16)
 val_ids = np.array(val_ids, dtype=np.uint16)
-train_ids.tofile('train.bin')
-val_ids.tofile('val.bin')
+train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
 
 # train.bin has 301,966 tokens
 # val.bin has 36,059 tokens
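Because tofile() writes the raw uint16 array with no header, the file on disk should be exactly two bytes per token. A small hypothetical sanity check, assuming the .bin file sits next to the script as in the diff above:

import os
import numpy as np

# Hypothetical sanity check: tofile() writes raw uint16 values with no header,
# so the file size should be exactly two bytes per token.
path = os.path.join(os.path.dirname(__file__), 'train.bin')
tokens = np.fromfile(path, dtype=np.uint16)
assert os.path.getsize(path) == tokens.size * np.dtype(np.uint16).itemsize
print(f"{tokens.size:,} tokens on disk")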
data/shakespeare_char/prepare.py
@@ -47,8 +47,8 @@ print(f"val has {len(val_ids):,} tokens")
 # export to bin files
 train_ids = np.array(train_ids, dtype=np.uint16)
 val_ids = np.array(val_ids, dtype=np.uint16)
-train_ids.tofile('train.bin')
-val_ids.tofile('val.bin')
+train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
 
 # save the meta information as well, to help us encode/decode later
 meta = {
@@ -56,7 +56,7 @@ meta = {
     'itos': itos,
     'stoi': stoi,
 }
-with open('meta.pkl', 'wb') as f:
+with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
     pickle.dump(meta, f)
 
 # length of dataset in characters: 1115394
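meta.pkl carries the character-level vocabulary (stoi and itos) that later scripts use to encode and decode text. A hedged sketch of loading it back, assuming the same script-relative location the diff writes to:

import os
import pickle

# Hypothetical consumer: rebuild the character-level encode/decode from meta.pkl.
# The location (next to this script) mirrors the path change in the diff above.
meta_path = os.path.join(os.path.dirname(__file__), 'meta.pkl')
with open(meta_path, 'rb') as f:
    meta = pickle.load(f)

stoi, itos = meta['stoi'], meta['itos']
encode = lambda s: [stoi[c] for c in s]              # string -> list of token ids
decode = lambda ids: ''.join(itos[i] for i in ids)   # token ids -> string

print(decode(encode("hello")))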