random-stuff/code-guessing/compressor-test.py

95 lines
7.8 KiB
Python

import json, numpy, math, functools, base64
real_bped = numpy.frombuffer(base64.b64decode('ZQAgAHQAIABzACAAdABoAGkAbgBkACAAZQByAGEAbgB5ACAAbwAgAG8AbgAgACAAbwB1AG8AcgBnACAAZQBuAGEAbAAEAQ4BYQAgAHIAZQBhAHIAaQACASAAAwF0AAkBcwB0AGkAAQFpAHQAbwBtAGEAdAAgAHcAeQAMAWgAYQBlAAUBZgAgACwAIABsAAgBdQBzAAcBBQFsACAAZQACAUkAIAAGASAAYwBoAG8AdwALAQsBZQBzAAMBAAFhAHMAZQBjAB4BIABrACAAdgAAAWwAaQB0AGkAbQAgACcAAQFsAG8ADQEgAHIAbwAnAAIBbAAAAW0AYQAKASAABwEgAGUAbAAEASAAYgB1AGEAAQFsAGUAYQBjAGkAYwBpAHMAFgEAAWkAIABtAAABDwEgAGEAYgBvACEBbgBvAGkAbAByAGEAaAAgAHAAOgEuACAAawAAAXUAbgBnAG8AQgEBAXYABgFvAHAAcwAbAaMA4gBpAAMBgwBbAV0BXQFzAGUAKwEgAGkAZABjABsBcwAJAW0ACAFvAGQAJAEAARoBOwFJACcAcgBpAGAAYAB3AGgAbwBsAGYAOQFiAGUAHwEzAXMAaABhAAIBdQByABABIwFlAG0AZQB4AGEAZwAKACwBYQBkAHMAaQB0AHQAYgAAAT4AIABgACAACgEAAU4BAQEGAQABdQBjAOIAjQAUAQABAwEVAXQABgEMAQEBbgA3AXAAIABuAGUAEAEgAAwBbABkAG8AYwA/ASkAKQBmAA0BAwFDAQ8BdABpAHIAcABwAAoBNwE0AVQBKgEgAGIAMAFhAAgBWgFlAC4ALgAkAQEBXAEgAHAAZQB0AHIAbwBmAHAAbwBtAGUAAwERAWEAZgFpACEBYwAKAR8BAQFkAGUAZAAJAXAAbACXAaMBMQAxAHMAdQB0AGEAiwEFAWQAaQBzAAEBZACUAWcAZQBtAG8AHAFpAHQAbwAPAQEBcQB1AG4ACQEdAWgAXgFeAQQBZwAwADAAYQBtAGMAZQFiAG8AbgAgAHkAJwFnAGgAdQBsAEEAQQApACAAPwAgAAcBZwBlAGEAAwFlABoBAAFoATYBcgAgAAwBbgB3AA0BEAFsAGoAmwFnAFABGwEgAG4AYAE+AD4AdQB0ABYBFQFPASYBawARAVoBAAFwABMBZgByAGUAdgBrAGUAbQBtAHAABgEvAC8AIAEXAXcAnAE8ADwAvgEAAWYAgQFwAHMABwEBAWEAeQBkAEcBYwANAXQAAgFAASYBbQBwABwBAAFoAHoBWAEIAToA4AFSAdEBYwAAAUYBIABiAXAAYQAFAZ0BWQEKACAAcwBwAGgAYAEeAcwBTAGGAWsAAgH3ATwBYwAHAWYABAFvAAUBOgAgACgAKAArACsA7wHmAQUC8QFSAWIAEAEmARgBcgBJAB0BBAGFARwBOwEYAXUAHwECAT0AIABMATwBdwByAHMAAAEWAUMBQwAgAGUAGAFuACsBdgEAARABYwEHAWQAYQBKAS0BAgFvAGIAcgAAAXcApgEwACAAdQBtAFYBAQLbAdIBEgFiAGMAIAAPAWQACgEjAW0ADQFwABQBLgBiAS0BcwA1AUoBZQBkAMIBAQEpAi8AsgEBAQoBAgF3ASwBEQEXAUABbAB3ACAAPQFUAWwAJgF0ACkBBgECAXgAIAAxADYAZwBpAHIBAAEEAQABMQGNAWEAkwH/ATcBYQBhAGgAAAF2ACkBaABpAHMABgFsAHkAHQGmAXYAZQAxADIAOAFvALoBugEEAWQALwFtAGoBYAAeAScAAwGAAWYAaQAUASAAJwIAAWYAVQGMAS0BZgBmAHMAYwAHAXkAYQBpADUBPgEDASkBYQB2AGkAJwBoAGgANQFtAIIBtQB2AGkAcAB1ABYBZQBsAMcBtAE+AXQAdQBzAC8AVQKHAXcAZQBmAGUAdwBxATMANABtAHUAzQEFAQQBdABiAAgBYgByADgBJgHiAIAAcwBvAGkAlgHiAIYAmQGiAcQBxAFzAG0A6QHqAeUBMgFmAGEAEwFzAbUBCQFiAC8BZwB1AAQBMgFjAG8AZQBwAAMCUwBtAHQBZAJ1AGcAAAFwARkBAwEgAC0BAQFwAHIAbQAHAWkAHQFrACAByQEIAW0ABAEoAEsAYwBsAB0BnAEPAQUBRQEyASgBsQHnARcBEwFjAGgAZQBrANMBZgAhAV8BAAFrABYCaQB6AOgBAgEiACAAbQCRAQoBDgFlAFgBaQBmAEwBIwFWAWwAYgA8AUUBawCgAXMArwEFARMBeAFkAGEBMgBiAAoAagFEACAArgJ9AXQAcgFkAAYBFAEFATEAIABMACAAswGwAU0CrQJhAHAAAwEMAnQAdwBtApYBYQFlAGwAFAFzAEABcAGuAQYBZQA0ACAAGgEIAWcAUQHyAd4BbAAaAXQAIAFkAAABdQBwAGMAcgBsAD0BdwDXAawChwHKAgkBAwFLAXkALQFPAh0CGgFpADAAMgBvAFsCOAA2ACoBBwF0AAABBwKlAjUBCgF1AIgBpAExAVMAIAB6AmQAMwAyAA0CmwJ3AJgBJAERAVABBAEEAUgBHwGTAaMCeQB5AGUAXwAgAB8BHwFhAGYAdAB5AHYCkABsAAQBaAEzAS8BAQESAXMAYgAgAJ8BSAFuAXoB4gCIAGkAPAEjAhkBNQAgAG8BFwFzACIBMQAwAAQBAQE9AQgBMwAgAHkB7QFpAG0AeQDIAXQAZQBzAGwBtwIhAWYAkgEyACAAggG6ABECygEeAR0BQQBQADYANQBpAAUBOQA5ADcAMwB3AFwBbAAcAR8BBQFWAQEBYgD2AWIAIwEHAkQBdwCuAXUAAAFOAgoA3AI/AgoBSAFrAQgBPQF5AAEBTQEwAXQADAFyAG0AAgFzAHkAFgMuAmYARQFjAK4BFQESAWgBNgIvAQABKQA7AIwBJwFnAG4ANQA1AIYCFwI2ADQARQFjANwBSwEiAVcBMwAwAJ4BCAFUAmMAkgEgAJkBfgFkADABOgA6AG8AbwBdAjYBcgBVAV8AXwDFAgYBEgFjABQBZAAtAQABRwGHAW4BSwFGAnIA4gCKAHAAaABmAGwA3wFzAGICAQFpAO0BmgGaAV8BbQA4AWMAdgBvAA0BCAEZA3sBcAAAAWsBSwFvAGsAdgAQAXMAYQH1AU8BOAA3ADkAMABiAW0AKgEwAWgAgAFfAicBFgGBAisBwAFuAREBAgOwAQwBwwISATgBcwAwARYBgAFtAAoBGAEUAW0AHAEdAQ0BMQA1AAQBGAHUAdQBDAF0ABoBAgFzADwCdgEnARUBLgF3AOwBqQJ5AUsCSwJuAC4ADAFzAOMB4wE5A3MBbAEFATABAQFjAHIBZwAPAS8ANwBZASAAcwBRARQBbgIPAWcAdwAAAU8BbAClAXQAaQFwAGEAdQBtAGUBdAARARMBBQFoAEABawEJAW8AWAF8AHwAjgGOAeIAjgCJA5UACwIVAmIAaQBFAWYCYgBsAHAAPAEWAUsBaQEtAv4CUQG3AcoBaAAVAXMCiwA9AccCbQAQAWkAGAEvAUcDRQEAAWsAJwEHAQgB8QIpAasBqwFiAOECEQEuATUBYwBiAHkAdgAgAGcACQEHA7UCXQAgACgBbwETAS8ByQE2AbkBdQJvADgBEwEAAUoCMgEYAWQAngEHASoBFAEJAoEBFwEuAR0BcgDaAXMAawF1Ar4CIQFsABsCBAIgAHMAGgIqAccBbgAaAosBZAC8AhIBaAASARABqwI2ADgALAEsAXUAAgHiALUAwwOXAIkB4QEyADAAZQAiAWsBgAFVAbICYgBzAI8BAAEPATUBbwBRAQ8B8wEGARwBbQAqAnQBAgFjAGQAYQAEAR0BAAE7ACAA3AEpAWQAmAEHAXQAzgEyAYkBIAFoAQUBdwCfAo0DcwF5ADICrQFUAWIAbAEtACAAQgFPASUBLgEUAQgBZgBpAfIBNgErAW4AYQBBARYBDAJiACkBdgARARoBEQFkACkBdQFwALsBAgFvADIB/AI8ATQBYwBEARQBQQJBAngCeAKEApECKwECAb8DCAF0AGkBcwBsAGMCCAF3AGEA1QEAAUcBUQHdATIB'), dtype=numpy.uint32)
bped_inv = {}
for i in range(256, 1024):
x = real_bped[i - 256]
a, b = x & 0xFFFF, x >> 16
bped_inv[a, b] = i
N_bits = 14
X_bits = 2 ** N_bits
mask = X_bits - 1
qfreqs = numpy.frombuffer(base64.b64decode('AQEBAQEBAQEBAUwBAQEBAQEBAQEBAQEBAQEBAQEBAQGyDTEUDwoKMko0ECE3RFY6N0tFNjctJTA4LxwXGSMeFAQsGB8WIRERDycNFxchEhweBR0jHwsKDQkHBioKIw0vLWd8pH6MbW9qciJPU2JjSYkJYaGdVDxJM0giEwwTDAECBQQEBgEBAgICAwUJAgEBAQIBAgQCAQMEAgEBAQEBBAIBBAECAgQBBgEBAgEBAQMBAQEEAQEBAgIBAQEBAQEBAQEGCQQDAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQ8GAQEBAQEBAQEBAQEDBAEBAQEBAQEBAQEBAQEBAUVbfj1ZOU0yPx1EIxZCFzxBZZRxN10ealWKLhxGKQoWXg5qMiJ0I0JlNTUSCR5MLhxIJSY1HS4FKygZFBQmOC8fSA8XOxk0Iyg/IBoRNhcTHRsXOwYfFzASHQEBDgEBIgkZCyIyEBEvARcBDRktFx8ZIQ4VGSQSHCAaDCAmHh8lBQsQJCQSEQcaGCIBDRQOHCEXEQsFIQsCEQ0LCQEMEhcVHRMBFRcBHh4eHQ0dGAQSDBENEg8QGxUbFQEUFRkFGRMZChkBGRgGEAgKGAULERcXBAYMBREWDRUWEAYJEAkOAQ8UBRUIAgoKCAgUDQUUARQBBA0UDAsBEhMSEhISEgURAxIFCQESARIMEQsCCBAREQkRERELBwkREAUKEAcQEA8PDwcPDg8CDwEJDw8IBw4ODgkPDg4HDg4NDg0HDQ4FDQEODQ0OBg4NDQgBDAQFBA0NDAwGAQwNDQ0MAwsGBwYMDAUHAQwFDAsMCwwMAQULCwwMBgsBAwwBDAMMCwsMCwoECwwFCwMLCgsLCwsKCwsKBgsLCgsKCwoKCgIKCgsECgoKAgoBCQoKAwoEAQEBCgkJBAkKAwoBCQkKCQQJAwkKCQkCCQIJBAcJAQkJCAkJCQkJCAkJCQkJCAkJAQkJCAkCCAkICAgGCAgJCAgHCAgIAQgICAgICQgJCAgDCAIIBwgBCAgIBwIICAYICAgHCAcICAcHCAEICAEHBwgHBwgIBwgIBwcHBwYICAgHBwcHBggGBwcGBQYGAQgHBwgHBwcHBwcHBwQBBwcHBwcGBwcHBwYHBgcGBwcGBwYHBgcHBwcHBwcGBAcHBgcHBgcDBgcEBwYGBwcHBwYGBwYGBgYFBgcHBwYGBAQBBgYHAQYGBgYGBgYGBQUGBgYGBgYCBgUFBQYGBgYGBQYGBgYFBgUFBQUGBgYGBgQFBAYFBgYBBQQFAQUFBQYFBQYGBgUGBQUEBQUGBQQFBQYFBgUFBQUGBQUFBQUFBQYGBgYFBQYEBQQFBQIDBQYFBQUFBQUEBQ=='), dtype=numpy.uint8)
cdf = numpy.insert(numpy.cumsum(qfreqs, dtype=numpy.uint16), 0, 0)
@functools.cache
def icdf(y):
for i, x in enumerate(cdf):
if y < x: return i - 1
def decompress_bpe(s):
s = numpy.array(s, dtype=numpy.uint16)
while True:
r = numpy.zeros(len(s) * 2, dtype=numpy.uint16)
z = 0
for c in s:
if c > 255:
x = real_bped[c - 256]
a, b = x & 0xFFFF, x >> 16
r[z] = a
z += 1
r[z] = b
z += 1
else:
r[z] = c
z += 1
if z == len(s): break
s = r[:z]
return bytes(s.astype(numpy.uint8))
def compress_bpe(s):
s = numpy.array(bytearray(s), dtype=numpy.uint16)
while True:
r = numpy.zeros_like(s)
z = 0
used = False
for pair in zip(s, s[1:]):
if used:
used = False
else:
mp = bped_inv.get(pair)
if mp:
r[z] = mp
z += 1
used = True
else:
r[z] = pair[0]
z += 1
used = False
if not used:
r[z] = s[-1]
z += 1
if z == len(s): break
s = r[:z]
return s
def compress_ans(bpes):
def C(x, s):
s_count = int(qfreqs[s])
return (x // s_count) * X_bits + int(cdf[s]) + (x % s_count)
x = 1
for symbol in bpes:
x = C(x, symbol)
return len(bpes), x
def decompress_ans(ilen, num):
def D(x):
slot = x & mask
sym = icdf(slot)
prev_state = (x // X_bits) * int(qfreqs[sym]) + slot - int(cdf[sym])
return sym, prev_state
x = num
syms = []
for _ in range(ilen):
sym, x = D(x)
syms.append(sym)
syms.reverse()
return syms
def compress(s):
l, x = compress_ans(compress_bpe(s))
return l.to_bytes(4, "little") + x.to_bytes(math.ceil(x.bit_length() / 8), "little")
def decompress(s):
l, x = s[:4], s[4:]
return decompress_bpe(decompress_ans(int.from_bytes(l, "little"), int.from_bytes(x, "little")))