1
0
mirror of https://github.com/osmarks/nanogpt-experiments.git synced 2024-11-10 20:09:58 +00:00
nanogpt-experiments/c4gzparse_nanogpt.py
2024-06-24 19:10:15 +00:00

57 lines
1.9 KiB
Python

import os
from tqdm import tqdm
import numpy as np
import tiktoken
import json
import gzip
enc = tiktoken.get_encoding("gpt2")
if __name__ == '__main__':
# takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
dataset = []
with gzip.open("c4-train.00000-of-01024.json.gz", "r") as file:
while line := file.readline():
try:
dataset.append(json.loads(line))
except EOFError:
pass
# we now want to tokenize the dataset. first define the encoding function (gpt2 bpe)
def process(example):
ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
ids.insert(0, enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe
# note: I think eot should be prepended not appended... hmm. it's called "eot" though...
out = {"ids": ids, "len": len(ids)}
return out
# tokenize the dataset
tokenized = [ process(x) for x in dataset ]
divider = len(tokenized) // 100
tokenized = {
"val": tokenized[:divider],
"train": tokenized[divider:]
}
# concatenate all the ids in each dataset into one large file we can use for training
for split, dset in tokenized.items():
arr_len = sum((d['len'] for d in dset))
filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
total_batches = 1024
idx = 0
for d in tqdm(dset, desc=f'writing {filename}'):
arr[idx : idx + d["len"]] = d["ids"]
idx += d["len"]
arr.flush()
# train.bin is ~17GB, val.bin ~8.5MB
# train has ~9B tokens (9,035,582,198)
# val has ~4M tokens (4,434,897)
# to read the bin files later, e.g. with numpy:
# m = np.memmap('train.bin', dtype=np.uint16, mode='r')