From edb7a7eab0e06198d8288bc2f5d7fb06f76a5a57 Mon Sep 17 00:00:00 2001
From: DG
Date: Fri, 20 Jan 2023 10:39:45 -0800
Subject: [PATCH] use relative paths so that running the data prep scripts
 always create files in local folder, no matter where run from

---
 data/openwebtext/prepare.py      | 3 ++-
 data/shakespeare/prepare.py      | 4 ++--
 data/shakespeare_char/prepare.py | 6 +++---
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/data/openwebtext/prepare.py b/data/openwebtext/prepare.py
index 0aadbb8..7710bdf 100644
--- a/data/openwebtext/prepare.py
+++ b/data/openwebtext/prepare.py
@@ -1,6 +1,7 @@
 # saves the openwebtext dataset to a binary file for training. following was helpful:
 # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
 
+import os
 from tqdm import tqdm
 import numpy as np
 import tiktoken
@@ -50,7 +51,7 @@ tokenized = split_dataset.map(
 # concatenate all the ids in each dataset into one large file we can use for training
 for split, dset in tokenized.items():
     arr_len = np.sum(dset['len'])
-    filename = f'{split}.bin'
+    filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
     dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
     arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
 
diff --git a/data/shakespeare/prepare.py b/data/shakespeare/prepare.py
index 141e64a..06573ac 100644
--- a/data/shakespeare/prepare.py
+++ b/data/shakespeare/prepare.py
@@ -25,8 +25,8 @@ print(f"val has {len(val_ids):,} tokens")
 # export to bin files
 train_ids = np.array(train_ids, dtype=np.uint16)
 val_ids = np.array(val_ids, dtype=np.uint16)
-train_ids.tofile('train.bin')
-val_ids.tofile('val.bin')
+train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
 
 # train.bin has 301,966 tokens
 # val.bin has 36,059 tokens
diff --git a/data/shakespeare_char/prepare.py b/data/shakespeare_char/prepare.py
index 1a4e54d..918d1ee 100644
--- a/data/shakespeare_char/prepare.py
+++ b/data/shakespeare_char/prepare.py
@@ -47,8 +47,8 @@ print(f"val has {len(val_ids):,} tokens")
 # export to bin files
 train_ids = np.array(train_ids, dtype=np.uint16)
 val_ids = np.array(val_ids, dtype=np.uint16)
-train_ids.tofile('train.bin')
-val_ids.tofile('val.bin')
+train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
 
 # save the meta information as well, to help us encode/decode later
 meta = {
@@ -56,7 +56,7 @@ meta = {
     'itos': itos,
     'stoi': stoi,
 }
-with open('meta.pkl', 'wb') as f:
+with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
     pickle.dump(meta, f)
 
 # length of dataset in characters: 1115394
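
A note on the pattern used throughout this patch: every change replaces a bare relative filename with a path anchored to the directory of the script itself, via os.path.dirname(__file__), so output files land next to the prepare.py that produced them no matter what the caller's working directory is. A minimal standalone sketch of the idea (the filename out.bin and the payload are hypothetical, chosen for illustration; they are not from the patch):

    import os

    # __file__ is the path of the running script; dirname() yields the folder
    # that contains it. Joining output names onto that folder makes writes
    # land next to the script, regardless of the caller's working directory.
    out_path = os.path.join(os.path.dirname(__file__), 'out.bin')

    with open(out_path, 'wb') as f:
        f.write(b'\x00')  # placeholder payload, stands in for the real data

One edge case worth knowing: when a script is launched as `python prepare.py` from its own directory, os.path.dirname(__file__) may be an empty string on older Python versions (before 3.9, where __file__ of the main module is not guaranteed absolute); os.path.join('', 'out.bin') then collapses to 'out.bin', which still resolves to the script's folder in that situation, so the pattern degrades gracefully.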