Mirror of https://github.com/osmarks/nanogpt-experiments.git
use relative paths so that running the data prep scripts always creates files in the local folder, no matter where they are run from
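The pattern applied throughout this commit: build output paths from __file__ (the script's own location on disk) rather than letting a bare relative path resolve against whatever directory the user happens to run from. A minimal standalone sketch, not part of the diff; the filename here is illustrative:

import os

# resolves against the current working directory, so it moves with `cd`
cwd_relative = 'train.bin'

# resolves against the directory containing this script, so it is stable
# no matter where the script is invoked from
script_relative = os.path.join(os.path.dirname(__file__), 'train.bin')

print('cwd-relative   :', os.path.abspath(cwd_relative))
print('script-relative:', script_relative)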
@@ -1,6 +1,7 @@
 # saves the openwebtext dataset to a binary file for training. following was helpful:
 # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
 
+import os
 from tqdm import tqdm
 import numpy as np
 import tiktoken
@@ -50,7 +51,7 @@ tokenized = split_dataset.map(
 # concatenate all the ids in each dataset into one large file we can use for training
 for split, dset in tokenized.items():
     arr_len = np.sum(dset['len'])
-    filename = f'{split}.bin'
+    filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
     dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
     arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
 
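The openwebtext hunk above preallocates a uint16 memmap sized to the total token count; the write loop itself falls outside the hunk, so the following is only a hedged sketch of how such an array is typically filled and flushed (the batch contents and demo filename are made up):

import numpy as np

arr_len = 1000  # stand-in for np.sum(dset['len'])
arr = np.memmap('demo.bin', dtype=np.uint16, mode='w+', shape=(arr_len,))

idx = 0
for batch in (np.zeros(600, dtype=np.uint16), np.ones(400, dtype=np.uint16)):
    arr[idx : idx + len(batch)] = batch  # append tokens contiguously
    idx += len(batch)
arr.flush()  # make sure everything reaches disk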
@@ -25,8 +25,8 @@ print(f"val has {len(val_ids):,} tokens")
 # export to bin files
 train_ids = np.array(train_ids, dtype=np.uint16)
 val_ids = np.array(val_ids, dtype=np.uint16)
-train_ids.tofile('train.bin')
-val_ids.tofile('val.bin')
+train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
 
 # train.bin has 301,966 tokens
 # val.bin has 36,059 tokens
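Reading such a .bin file back for training mirrors the write side: the dtype must match the uint16 chosen above, since the file stores raw token ids with no header. A sketch, assuming the file sits next to the reading script as this commit guarantees:

import os
import numpy as np

path = os.path.join(os.path.dirname(__file__), 'train.bin')
data = np.memmap(path, dtype=np.uint16, mode='r')  # read-only token stream
print(f"{len(data):,} tokens")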
@@ -47,8 +47,8 @@ print(f"val has {len(val_ids):,} tokens")
 # export to bin files
 train_ids = np.array(train_ids, dtype=np.uint16)
 val_ids = np.array(val_ids, dtype=np.uint16)
-train_ids.tofile('train.bin')
-val_ids.tofile('val.bin')
+train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
 
 # save the meta information as well, to help us encode/decode later
 meta = {
@@ -56,7 +56,7 @@ meta = {
     'itos': itos,
     'stoi': stoi,
 }
-with open('meta.pkl', 'wb') as f:
+with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)
 
 # length of dataset in characters:  1115394
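The saved meta.pkl can later be loaded with the same script-relative convention to recover the character/id mappings; a sketch assuming only the stoi and itos keys written above:

import os
import pickle

with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'rb') as f:
    meta = pickle.load(f)
stoi, itos = meta['stoi'], meta['itos']

ids = [stoi[c] for c in 'hi']         # encode text to integer ids
text = ''.join(itos[i] for i in ids)  # decode ids back to text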