import json, numpy, math, functools, base64 real_bped = numpy.frombuffer(base64.b64decode('ZQAgAHQAIABzACAAdABoAGkAbgBkACAAZQByAGEAbgB5ACAAbwAgAG8AbgAgACAAbwB1AG8AcgBnACAAZQBuAGEAbAAEAQ4BYQAgAHIAZQBhAHIAaQACASAAAwF0AAkBcwB0AGkAAQFpAHQAbwBtAGEAdAAgAHcAeQAMAWgAYQBlAAUBZgAgACwAIABsAAgBdQBzAAcBBQFsACAAZQACAUkAIAAGASAAYwBoAG8AdwALAQsBZQBzAAMBAAFhAHMAZQBjAB4BIABrACAAdgAAAWwAaQB0AGkAbQAgACcAAQFsAG8ADQEgAHIAbwAnAAIBbAAAAW0AYQAKASAABwEgAGUAbAAEASAAYgB1AGEAAQFsAGUAYQBjAGkAYwBpAHMAFgEAAWkAIABtAAABDwEgAGEAYgBvACEBbgBvAGkAbAByAGEAaAAgAHAAOgEuACAAawAAAXUAbgBnAG8AQgEBAXYABgFvAHAAcwAbAaMA4gBpAAMBgwBbAV0BXQFzAGUAKwEgAGkAZABjABsBcwAJAW0ACAFvAGQAJAEAARoBOwFJACcAcgBpAGAAYAB3AGgAbwBsAGYAOQFiAGUAHwEzAXMAaABhAAIBdQByABABIwFlAG0AZQB4AGEAZwAKACwBYQBkAHMAaQB0AHQAYgAAAT4AIABgACAACgEAAU4BAQEGAQABdQBjAOIAjQAUAQABAwEVAXQABgEMAQEBbgA3AXAAIABuAGUAEAEgAAwBbABkAG8AYwA/ASkAKQBmAA0BAwFDAQ8BdABpAHIAcABwAAoBNwE0AVQBKgEgAGIAMAFhAAgBWgFlAC4ALgAkAQEBXAEgAHAAZQB0AHIAbwBmAHAAbwBtAGUAAwERAWEAZgFpACEBYwAKAR8BAQFkAGUAZAAJAXAAbACXAaMBMQAxAHMAdQB0AGEAiwEFAWQAaQBzAAEBZACUAWcAZQBtAG8AHAFpAHQAbwAPAQEBcQB1AG4ACQEdAWgAXgFeAQQBZwAwADAAYQBtAGMAZQFiAG8AbgAgAHkAJwFnAGgAdQBsAEEAQQApACAAPwAgAAcBZwBlAGEAAwFlABoBAAFoATYBcgAgAAwBbgB3AA0BEAFsAGoAmwFnAFABGwEgAG4AYAE+AD4AdQB0ABYBFQFPASYBawARAVoBAAFwABMBZgByAGUAdgBrAGUAbQBtAHAABgEvAC8AIAEXAXcAnAE8ADwAvgEAAWYAgQFwAHMABwEBAWEAeQBkAEcBYwANAXQAAgFAASYBbQBwABwBAAFoAHoBWAEIAToA4AFSAdEBYwAAAUYBIABiAXAAYQAFAZ0BWQEKACAAcwBwAGgAYAEeAcwBTAGGAWsAAgH3ATwBYwAHAWYABAFvAAUBOgAgACgAKAArACsA7wHmAQUC8QFSAWIAEAEmARgBcgBJAB0BBAGFARwBOwEYAXUAHwECAT0AIABMATwBdwByAHMAAAEWAUMBQwAgAGUAGAFuACsBdgEAARABYwEHAWQAYQBKAS0BAgFvAGIAcgAAAXcApgEwACAAdQBtAFYBAQLbAdIBEgFiAGMAIAAPAWQACgEjAW0ADQFwABQBLgBiAS0BcwA1AUoBZQBkAMIBAQEpAi8AsgEBAQoBAgF3ASwBEQEXAUABbAB3ACAAPQFUAWwAJgF0ACkBBgECAXgAIAAxADYAZwBpAHIBAAEEAQABMQGNAWEAkwH/ATcBYQBhAGgAAAF2ACkBaABpAHMABgFsAHkAHQGmAXYAZQAxADIAOAFvALoBugEEAWQALwFtAGoBYAAeAScAAwGAAWYAaQAUASAAJwIAAWYAVQGMAS0BZgBmAHMAYwAHAXkAYQBpADUBPgEDASkBYQB2AGkAJwBoAGgANQFtAIIBtQB2AGkAcAB1ABYBZQBsAMcBtAE+AXQAdQBzAC8AVQKHAXcAZQBmAGUAdwBxATMANABtAHUAzQEFAQQBdABiAAgBYgByADgBJgHiAIAAcwBvAGkAlgHiAIYAmQGiAcQBxAFzAG0A6QHqAeUBMgFmAGEAEwFzAbUBCQFiAC8BZwB1AAQBMgFjAG8AZQBwAAMCUwBtAHQBZAJ1AGcAAAFwARkBAwEgAC0BAQFwAHIAbQAHAWkAHQFrACAByQEIAW0ABAEoAEsAYwBsAB0BnAEPAQUBRQEyASgBsQHnARcBEwFjAGgAZQBrANMBZgAhAV8BAAFrABYCaQB6AOgBAgEiACAAbQCRAQoBDgFlAFgBaQBmAEwBIwFWAWwAYgA8AUUBawCgAXMArwEFARMBeAFkAGEBMgBiAAoAagFEACAArgJ9AXQAcgFkAAYBFAEFATEAIABMACAAswGwAU0CrQJhAHAAAwEMAnQAdwBtApYBYQFlAGwAFAFzAEABcAGuAQYBZQA0ACAAGgEIAWcAUQHyAd4BbAAaAXQAIAFkAAABdQBwAGMAcgBsAD0BdwDXAawChwHKAgkBAwFLAXkALQFPAh0CGgFpADAAMgBvAFsCOAA2ACoBBwF0AAABBwKlAjUBCgF1AIgBpAExAVMAIAB6AmQAMwAyAA0CmwJ3AJgBJAERAVABBAEEAUgBHwGTAaMCeQB5AGUAXwAgAB8BHwFhAGYAdAB5AHYCkABsAAQBaAEzAS8BAQESAXMAYgAgAJ8BSAFuAXoB4gCIAGkAPAEjAhkBNQAgAG8BFwFzACIBMQAwAAQBAQE9AQgBMwAgAHkB7QFpAG0AeQDIAXQAZQBzAGwBtwIhAWYAkgEyACAAggG6ABECygEeAR0BQQBQADYANQBpAAUBOQA5ADcAMwB3AFwBbAAcAR8BBQFWAQEBYgD2AWIAIwEHAkQBdwCuAXUAAAFOAgoA3AI/AgoBSAFrAQgBPQF5AAEBTQEwAXQADAFyAG0AAgFzAHkAFgMuAmYARQFjAK4BFQESAWgBNgIvAQABKQA7AIwBJwFnAG4ANQA1AIYCFwI2ADQARQFjANwBSwEiAVcBMwAwAJ4BCAFUAmMAkgEgAJkBfgFkADABOgA6AG8AbwBdAjYBcgBVAV8AXwDFAgYBEgFjABQBZAAtAQABRwGHAW4BSwFGAnIA4gCKAHAAaABmAGwA3wFzAGICAQFpAO0BmgGaAV8BbQA4AWMAdgBvAA0BCAEZA3sBcAAAAWsBSwFvAGsAdgAQAXMAYQH1AU8BOAA3ADkAMABiAW0AKgEwAWgAgAFfAicBFgGBAisBwAFuAREBAgOwAQwBwwISATgBcwAwARYBgAFtAAoBGAEUAW0AHAEdAQ0BMQA1AAQBGAHUAdQBDAF0ABoBAgFzADwCdgEnARUBLgF3AOwBqQJ5AUsCSwJuAC4ADAFzAOMB4wE5A3MBbAEFATABAQFjAHIBZwAPAS8ANwBZASAAcwBRARQBbgIPAWcAdwAAAU8BbAClAXQAaQFwAGEAdQBtAGUBdAARARMBBQFoAEABawEJAW8AWAF8AHwAjgGOAeIAjgCJA5UACwIVAmIAaQBFAWYCYgBsAHAAPAEWAUsBaQEtAv4CUQG3AcoBaAAVAXMCiwA9AccCbQAQAWkAGAEvAUcDRQEAAWsAJwEHAQgB8QIpAasBqwFiAOECEQEuATUBYwBiAHkAdgAgAGcACQEHA7UCXQAgACgBbwETAS8ByQE2AbkBdQJvADgBEwEAAUoCMgEYAWQAngEHASoBFAEJAoEBFwEuAR0BcgDaAXMAawF1Ar4CIQFsABsCBAIgAHMAGgIqAccBbgAaAosBZAC8AhIBaAASARABqwI2ADgALAEsAXUAAgHiALUAwwOXAIkB4QEyADAAZQAiAWsBgAFVAbICYgBzAI8BAAEPATUBbwBRAQ8B8wEGARwBbQAqAnQBAgFjAGQAYQAEAR0BAAE7ACAA3AEpAWQAmAEHAXQAzgEyAYkBIAFoAQUBdwCfAo0DcwF5ADICrQFUAWIAbAEtACAAQgFPASUBLgEUAQgBZgBpAfIBNgErAW4AYQBBARYBDAJiACkBdgARARoBEQFkACkBdQFwALsBAgFvADIB/AI8ATQBYwBEARQBQQJBAngCeAKEApECKwECAb8DCAF0AGkBcwBsAGMCCAF3AGEA1QEAAUcBUQHdATIB'), dtype=numpy.uint32) bped_inv = {} for i in range(256, 1024): x = real_bped[i - 256] a, b = x & 0xFFFF, x >> 16 bped_inv[a, b] = i N_bits = 14 X_bits = 2 ** N_bits mask = X_bits - 1 qfreqs = numpy.frombuffer(base64.b64decode('AQEBAQEBAQEBAUwBAQEBAQEBAQEBAQEBAQEBAQEBAQGyDTEUDwoKMko0ECE3RFY6N0tFNjctJTA4LxwXGSMeFAQsGB8WIRERDycNFxchEhweBR0jHwsKDQkHBioKIw0vLWd8pH6MbW9qciJPU2JjSYkJYaGdVDxJM0giEwwTDAECBQQEBgEBAgICAwUJAgEBAQIBAgQCAQMEAgEBAQEBBAIBBAECAgQBBgEBAgEBAQMBAQEEAQEBAgIBAQEBAQEBAQEGCQQDAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQ8GAQEBAQEBAQEBAQEDBAEBAQEBAQEBAQEBAQEBAUVbfj1ZOU0yPx1EIxZCFzxBZZRxN10ealWKLhxGKQoWXg5qMiJ0I0JlNTUSCR5MLhxIJSY1HS4FKygZFBQmOC8fSA8XOxk0Iyg/IBoRNhcTHRsXOwYfFzASHQEBDgEBIgkZCyIyEBEvARcBDRktFx8ZIQ4VGSQSHCAaDCAmHh8lBQsQJCQSEQcaGCIBDRQOHCEXEQsFIQsCEQ0LCQEMEhcVHRMBFRcBHh4eHQ0dGAQSDBENEg8QGxUbFQEUFRkFGRMZChkBGRgGEAgKGAULERcXBAYMBREWDRUWEAYJEAkOAQ8UBRUIAgoKCAgUDQUUARQBBA0UDAsBEhMSEhISEgURAxIFCQESARIMEQsCCBAREQkRERELBwkREAUKEAcQEA8PDwcPDg8CDwEJDw8IBw4ODgkPDg4HDg4NDg0HDQ4FDQEODQ0OBg4NDQgBDAQFBA0NDAwGAQwNDQ0MAwsGBwYMDAUHAQwFDAsMCwwMAQULCwwMBgsBAwwBDAMMCwsMCwoECwwFCwMLCgsLCwsKCwsKBgsLCgsKCwoKCgIKCgsECgoKAgoBCQoKAwoEAQEBCgkJBAkKAwoBCQkKCQQJAwkKCQkCCQIJBAcJAQkJCAkJCQkJCAkJCQkJCAkJAQkJCAkCCAkICAgGCAgJCAgHCAgIAQgICAgICQgJCAgDCAIIBwgBCAgIBwIICAYICAgHCAcICAcHCAEICAEHBwgHBwgIBwgIBwcHBwYICAgHBwcHBggGBwcGBQYGAQgHBwgHBwcHBwcHBwQBBwcHBwcGBwcHBwYHBgcGBwcGBwYHBgcHBwcHBwcGBAcHBgcHBgcDBgcEBwYGBwcHBwYGBwYGBgYFBgcHBwYGBAQBBgYHAQYGBgYGBgYGBQUGBgYGBgYCBgUFBQYGBgYGBQYGBgYFBgUFBQUGBgYGBgQFBAYFBgYBBQQFAQUFBQYFBQYGBgUGBQUEBQUGBQQFBQYFBgUFBQUGBQUFBQUFBQYGBgYFBQYEBQQFBQIDBQYFBQUFBQUEBQ=='), dtype=numpy.uint8) cdf = numpy.insert(numpy.cumsum(qfreqs, dtype=numpy.uint16), 0, 0) @functools.cache def icdf(y): for i, x in enumerate(cdf): if y < x: return i - 1 def decompress_bpe(s): s = numpy.array(s, dtype=numpy.uint16) while True: r = numpy.zeros(len(s) * 2, dtype=numpy.uint16) z = 0 for c in s: if c > 255: x = real_bped[c - 256] a, b = x & 0xFFFF, x >> 16 r[z] = a z += 1 r[z] = b z += 1 else: r[z] = c z += 1 if z == len(s): break s = r[:z] return bytes(s.astype(numpy.uint8)) def compress_bpe(s): s = numpy.array(bytearray(s), dtype=numpy.uint16) while True: r = numpy.zeros_like(s) z = 0 used = False for pair in zip(s, s[1:]): if used: used = False else: mp = bped_inv.get(pair) if mp: r[z] = mp z += 1 used = True else: r[z] = pair[0] z += 1 used = False if not used: r[z] = s[-1] z += 1 if z == len(s): break s = r[:z] return s def compress_ans(bpes): def C(x, s): s_count = int(qfreqs[s]) return (x // s_count) * X_bits + int(cdf[s]) + (x % s_count) x = 1 for symbol in bpes: x = C(x, symbol) return len(bpes), x def decompress_ans(ilen, num): def D(x): slot = x & mask sym = icdf(slot) prev_state = (x // X_bits) * int(qfreqs[sym]) + slot - int(cdf[sym]) return sym, prev_state x = num syms = [] for _ in range(ilen): sym, x = D(x) syms.append(sym) syms.reverse() return syms def compress(s): l, x = compress_ans(compress_bpe(s)) return l.to_bytes(4, "little") + x.to_bytes(math.ceil(x.bit_length() / 8), "little") def decompress(s): l, x = s[:4], s[4:] return decompress_bpe(decompress_ans(int.from_bytes(l, "little"), int.from_bytes(x, "little")))