From 6649b299ebba950ea68c03b91c33f75fd830adaf Mon Sep 17 00:00:00 2001 From: Laiho Date: Tue, 9 May 2023 16:36:59 +0300 Subject: [PATCH] np.sum overflows on Windows --- data/openwebtext/prepare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/openwebtext/prepare.py b/data/openwebtext/prepare.py index 8dc30e1..aad8170 100644 --- a/data/openwebtext/prepare.py +++ b/data/openwebtext/prepare.py @@ -50,7 +50,7 @@ tokenized = split_dataset.map( # concatenate all the ids in each dataset into one large file we can use for training for split, dset in tokenized.items(): - arr_len = np.sum(dset['len']) + arr_len = np.sum(dset['len'], dtype=np.uint64) filename = os.path.join(os.path.dirname(__file__), f'{split}.bin') dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16) arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))