mirror of
https://github.com/osmarks/random-stuff
synced 2024-11-08 13:39:53 +00:00
95 lines
7.8 KiB
Python
95 lines
7.8 KiB
Python
|
import json, numpy, math, functools, base64
|
||
|
|
||
|
real_bped = numpy.frombuffer(base64.b64decode('ZQAgAHQAIABzACAAdABoAGkAbgBkACAAZQByAGEAbgB5ACAAbwAgAG8AbgAgACAAbwB1AG8AcgBnACAAZQBuAGEAbAAEAQ4BYQAgAHIAZQBhAHIAaQACASAAAwF0AAkBcwB0AGkAAQFpAHQAbwBtAGEAdAAgAHcAeQAMAWgAYQBlAAUBZgAgACwAIABsAAgBdQBzAAcBBQFsACAAZQACAUkAIAAGASAAYwBoAG8AdwALAQsBZQBzAAMBAAFhAHMAZQBjAB4BIABrACAAdgAAAWwAaQB0AGkAbQAgACcAAQFsAG8ADQEgAHIAbwAnAAIBbAAAAW0AYQAKASAABwEgAGUAbAAEASAAYgB1AGEAAQFsAGUAYQBjAGkAYwBpAHMAFgEAAWkAIABtAAABDwEgAGEAYgBvACEBbgBvAGkAbAByAGEAaAAgAHAAOgEuACAAawAAAXUAbgBnAG8AQgEBAXYABgFvAHAAcwAbAaMA4gBpAAMBgwBbAV0BXQFzAGUAKwEgAGkAZABjABsBcwAJAW0ACAFvAGQAJAEAARoBOwFJACcAcgBpAGAAYAB3AGgAbwBsAGYAOQFiAGUAHwEzAXMAaABhAAIBdQByABABIwFlAG0AZQB4AGEAZwAKACwBYQBkAHMAaQB0AHQAYgAAAT4AIABgACAACgEAAU4BAQEGAQABdQBjAOIAjQAUAQABAwEVAXQABgEMAQEBbgA3AXAAIABuAGUAEAEgAAwBbABkAG8AYwA/ASkAKQBmAA0BAwFDAQ8BdABpAHIAcABwAAoBNwE0AVQBKgEgAGIAMAFhAAgBWgFlAC4ALgAkAQEBXAEgAHAAZQB0AHIAbwBmAHAAbwBtAGUAAwERAWEAZgFpACEBYwAKAR8BAQFkAGUAZAAJAXAAbACXAaMBMQAxAHMAdQB0AGEAiwEFAWQAaQBzAAEBZACUAWcAZQBtAG8AHAFpAHQAbwAPAQEBcQB1AG4ACQEdAWgAXgFeAQQBZwAwADAAYQBtAGMAZQFiAG8AbgAgAHkAJwFnAGgAdQBsAEEAQQApACAAPwAgAAcBZwBlAGEAAwFlABoBAAFoATYBcgAgAAwBbgB3AA0BEAFsAGoAmwFnAFABGwEgAG4AYAE+AD4AdQB0ABYBFQFPASYBawARAVoBAAFwABMBZgByAGUAdgBrAGUAbQBtAHAABgEvAC8AIAEXAXcAnAE8ADwAvgEAAWYAgQFwAHMABwEBAWEAeQBkAEcBYwANAXQAAgFAASYBbQBwABwBAAFoAHoBWAEIAToA4AFSAdEBYwAAAUYBIABiAXAAYQAFAZ0BWQEKACAAcwBwAGgAYAEeAcwBTAGGAWsAAgH3ATwBYwAHAWYABAFvAAUBOgAgACgAKAArACsA7wHmAQUC8QFSAWIAEAEmARgBcgBJAB0BBAGFARwBOwEYAXUAHwECAT0AIABMATwBdwByAHMAAAEWAUMBQwAgAGUAGAFuACsBdgEAARABYwEHAWQAYQBKAS0BAgFvAGIAcgAAAXcApgEwACAAdQBtAFYBAQLbAdIBEgFiAGMAIAAPAWQACgEjAW0ADQFwABQBLgBiAS0BcwA1AUoBZQBkAMIBAQEpAi8AsgEBAQoBAgF3ASwBEQEXAUABbAB3ACAAPQFUAWwAJgF0ACkBBgECAXgAIAAxADYAZwBpAHIBAAEEAQABMQGNAWEAkwH/ATcBYQBhAGgAAAF2ACkBaABpAHMABgFsAHkAHQGmAXYAZQAxADIAOAFvALoBugEEAWQALwFtAGoBYAAeAScAAwGAAWYAaQAUASAAJwIAAWYAVQGMAS0BZgBmAHMAYwAHAXkAYQBpADUBPgEDASkBYQB2AGkAJwBoAGgANQFtAIIBtQB2AGkAcAB1ABYBZQBsAMcBtAE+AXQAdQBzAC8AVQKHAXcAZQBmAGUAdwBxATMANABtAHUAzQEFAQQBdABiAAgBYgByADgBJgHiAIAAcwBvAGkAlgHiAIYAmQGiAcQBxAFzAG0A6QHqAeUBMgFmAGEAEwFzAbUBCQFiAC8BZwB1AAQBMgFjAG8AZQBwAAMCUwBtAHQBZAJ1AGcAAAFwARkBAwEgAC0BAQFwAHIAbQAHAWkAHQFrACAByQEIAW0ABAEoAEsAYwBsAB0BnAEPAQUBRQEyASgBsQHnARcBEwFjAGgAZQBrANMBZgAhAV8BAAFrABYCaQB6AOgBAgEiACAAbQCRAQoBDgFlAFgBaQBmAEwBIwFWAWwAYgA8AUUBawCgAXMArwEFARMBeAFkAGEBMgBiAAoAagFEACAArgJ9AXQAcgFkAAYBFAEFATEAIABMACAAswGwAU0CrQJhAHAAAwEMAnQAdwBtApYBYQFlAGwAFAFzAEABcAGuAQYBZQA0ACAAGgEIAWcAUQHyAd4BbAAaAXQAIAFkAAABdQBwAGMAcgBsAD0BdwDXAawChwHKAgkBAwFLAXkALQFPAh0CGgFpADAAMgBvAFsCOAA2ACoBBwF0AAABBwKlAjUBCgF1AIgBpAExAVMAIAB6AmQAMwAyAA0CmwJ3AJgBJAERAVABBAEEAUgBHwGTAaMCeQB5AGUAXwAgAB8BHwFhAGYAdAB5AHYCkABsAAQBaAEzAS8BAQESAXMAYgAgAJ8BSAFuAXoB4gCIAGkAPAEjAhkBNQAgAG8BFwFzACIBMQAwAAQBAQE9AQgBMwAgAHkB7QFpAG0AeQDIAXQAZQBzAGwBtwIhAWYAkgEyACAAggG6ABECygEeAR0BQQBQADYANQBpAAUBOQA5ADcAMwB3AFwBbAAcAR8BBQFWAQEBYgD2AWIAIwEHAkQBdwCuAXUAAAFOAgoA3AI/AgoBSAFrAQgBPQF5AAEBTQEwAXQADAFyAG0AAgFzAHkAFgMuAmYARQFjAK4BFQESAWgBNgIvAQABKQA7AIwBJwFnAG4ANQA1AIYCFwI2ADQARQFjANwBSwEiAVcBMwAwAJ4BCAFUAmMAkgEgAJkBfgFkADABOgA6AG8AbwBdAjYBcgBVAV8AXwDFAgYBEgFjABQBZAAtAQABRwGHAW4BSwFGAnIA4gCKAHAAaABmAGwA3wFzAGICAQFpAO0BmgGaAV8BbQA4AWMAdgBvAA0BCAEZA3sBcAAAAWsBSwFvAGsAdgAQAXMAYQH1AU8BOAA3ADkAMABiAW0AKgEwAWgAgAFfAicBFgGBAisBwAFuAREBAgOwAQwBwwISATgBcwAwARYBgAFtAAoBGAEUAW0AHAEdAQ0BMQA1AAQBGAHUAdQBDAF0ABoBAgFzADwCdgEnARUBLgF3AOwBqQJ5AUsCSwJuAC4ADAFzAOMB4wE5A3MBbAEFATABAQFjAHIBZwAPAS8ANwBZASAAcwBRARQBbgIPAWcAdwAAAU8BbAClAXQAaQFwAGEAdQBtAGUBdAARARMBBQFoAEABawEJAW8AWAF8AHwAjgGOAeIAjgCJA5UACwIVAmIAaQBFAWYCYgBsAHAAPAEWAUsBaQEtAv4CUQG3AcoBaAAVAXMCiwA9AccCbQAQAWkAGAEvAUcDRQEAAWsAJwEHAQgB8QIpAasBqwFiAOECEQEuATUBYwBiAHkAdgAgAGcACQEHA7UCXQAgACgBbwETAS8ByQE2AbkBdQJvADgBEwEAAUoCMgEYAWQAngEHASoBFAEJAoEBFwEuAR0BcgDaAXMAawF1Ar4CIQFsABsCBAIgAHMAGgIqAccBbgAaAosBZAC8AhIBaAASARABqwI2ADgALAEsAXUAAgHiALUAwwOXAIkB4QEyADAAZQAiAWsBgAFVAbICYgBzAI8BAAEPATUBbwBRAQ8B8wEGARwBbQAqAnQBAgFjAGQAYQAEAR0BAAE7ACAA3AEpAWQAmAEHAXQAzgEyAYkBIAFoAQUBdwCfAo0DcwF5ADICrQFUAWIAbAEtACAAQgFPASUBLgEUAQgBZgBpAfIBNgErAW4AYQBBARYBDAJiACkBdgARARoBEQFkACkBdQFwALsBAgFvADIB/AI8ATQBYwBEARQBQQJBAngCeAKEApEC
|
||
|
bped_inv = {}
|
||
|
for i in range(256, 1024):
|
||
|
x = real_bped[i - 256]
|
||
|
a, b = x & 0xFFFF, x >> 16
|
||
|
bped_inv[a, b] = i
|
||
|
N_bits = 14
|
||
|
X_bits = 2 ** N_bits
|
||
|
mask = X_bits - 1
|
||
|
qfreqs = numpy.frombuffer(base64.b64decode('AQEBAQEBAQEBAUwBAQEBAQEBAQEBAQEBAQEBAQEBAQGyDTEUDwoKMko0ECE3RFY6N0tFNjctJTA4LxwXGSMeFAQsGB8WIRERDycNFxchEhweBR0jHwsKDQkHBioKIw0vLWd8pH6MbW9qciJPU2JjSYkJYaGdVDxJM0giEwwTDAECBQQEBgEBAgICAwUJAgEBAQIBAgQCAQMEAgEBAQEBBAIBBAECAgQBBgEBAgEBAQMBAQEEAQEBAgIBAQEBAQEBAQEGCQQDAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQ8GAQEBAQEBAQEBAQEDBAEBAQEBAQEBAQEBAQEBAUVbfj1ZOU0yPx1EIxZCFzxBZZRxN10ealWKLhxGKQoWXg5qMiJ0I0JlNTUSCR5MLhxIJSY1HS4FKygZFBQmOC8fSA8XOxk0Iyg/IBoRNhcTHRsXOwYfFzASHQEBDgEBIgkZCyIyEBEvARcBDRktFx8ZIQ4VGSQSHCAaDCAmHh8lBQsQJCQSEQcaGCIBDRQOHCEXEQsFIQsCEQ0LCQEMEhcVHRMBFRcBHh4eHQ0dGAQSDBENEg8QGxUbFQEUFRkFGRMZChkBGRgGEAgKGAULERcXBAYMBREWDRUWEAYJEAkOAQ8UBRUIAgoKCAgUDQUUARQBBA0UDAsBEhMSEhISEgURAxIFCQESARIMEQsCCBAREQkRERELBwkREAUKEAcQEA8PDwcPDg8CDwEJDw8IBw4ODgkPDg4HDg4NDg0HDQ4FDQEODQ0OBg4NDQgBDAQFBA0NDAwGAQwNDQ0MAwsGBwYMDAUHAQwFDAsMCwwMAQULCwwMBgsBAwwBDAMMCwsMCwoECwwFCwMLCgsLCwsKCwsKBgsLCgsKCwoKCgIKCgsECgoKAgoBCQoKAwoEAQEBCgkJBAkKAwoBCQkKCQQJAwkKCQkCCQIJBAcJAQkJCAkJCQkJCAkJCQkJCAkJAQkJCAkCCAkICAgGCAgJCAgHCAgIAQgICAgICQgJCAgDCAIIBwgBCAgIBwIICAYICAgHCAcICAcHCAEICAEHBwgHBwgIBwgIBwcHBwYICAgHBwcHBggGBwcGBQYGAQgHBwgHBwcHBwcHBwQBBwcHBwcGBwcHBwYHBgcGBwcGBwYHBgcHBwcHBwcGBAcHBgcHBgcDBgcEBwYGBwcHBwYGBwYGBgYFBgcHBwYGBAQBBgYHAQYGBgYGBgYGBQUGBgYGBgYCBgUFBQYGBgYGBQYGBgYFBgUFBQUGBgYGBgQFBAYFBgYBBQQFAQUFBQYFBQYGBgUGBQUEBQUGBQQFBQYFBgUFBQUGBQUFBQUFBQYGBgYFBQYEBQQFBQIDBQYFBQUFBQUEBQ=='), dtype=numpy.uint8)
|
||
|
|
||
|
cdf = numpy.insert(numpy.cumsum(qfreqs, dtype=numpy.uint16), 0, 0)
|
||
|
@functools.cache
|
||
|
def icdf(y):
|
||
|
for i, x in enumerate(cdf):
|
||
|
if y < x: return i - 1
|
||
|
|
||
|
def decompress_bpe(s):
|
||
|
s = numpy.array(s, dtype=numpy.uint16)
|
||
|
while True:
|
||
|
r = numpy.zeros(len(s) * 2, dtype=numpy.uint16)
|
||
|
z = 0
|
||
|
for c in s:
|
||
|
if c > 255:
|
||
|
x = real_bped[c - 256]
|
||
|
a, b = x & 0xFFFF, x >> 16
|
||
|
r[z] = a
|
||
|
z += 1
|
||
|
r[z] = b
|
||
|
z += 1
|
||
|
else:
|
||
|
r[z] = c
|
||
|
z += 1
|
||
|
if z == len(s): break
|
||
|
s = r[:z]
|
||
|
return bytes(s.astype(numpy.uint8))
|
||
|
|
||
|
def compress_bpe(s):
|
||
|
s = numpy.array(bytearray(s), dtype=numpy.uint16)
|
||
|
while True:
|
||
|
r = numpy.zeros_like(s)
|
||
|
z = 0
|
||
|
used = False
|
||
|
for pair in zip(s, s[1:]):
|
||
|
if used:
|
||
|
used = False
|
||
|
else:
|
||
|
mp = bped_inv.get(pair)
|
||
|
if mp:
|
||
|
r[z] = mp
|
||
|
z += 1
|
||
|
used = True
|
||
|
else:
|
||
|
r[z] = pair[0]
|
||
|
z += 1
|
||
|
used = False
|
||
|
if not used:
|
||
|
r[z] = s[-1]
|
||
|
z += 1
|
||
|
if z == len(s): break
|
||
|
s = r[:z]
|
||
|
return s
|
||
|
|
||
|
def compress_ans(bpes):
|
||
|
def C(x, s):
|
||
|
s_count = int(qfreqs[s])
|
||
|
return (x // s_count) * X_bits + int(cdf[s]) + (x % s_count)
|
||
|
x = 1
|
||
|
for symbol in bpes:
|
||
|
x = C(x, symbol)
|
||
|
return len(bpes), x
|
||
|
|
||
|
def decompress_ans(ilen, num):
|
||
|
def D(x):
|
||
|
slot = x & mask
|
||
|
sym = icdf(slot)
|
||
|
prev_state = (x // X_bits) * int(qfreqs[sym]) + slot - int(cdf[sym])
|
||
|
return sym, prev_state
|
||
|
x = num
|
||
|
syms = []
|
||
|
for _ in range(ilen):
|
||
|
sym, x = D(x)
|
||
|
syms.append(sym)
|
||
|
syms.reverse()
|
||
|
return syms
|
||
|
|
||
|
def compress(s):
|
||
|
l, x = compress_ans(compress_bpe(s))
|
||
|
return l.to_bytes(4, "little") + x.to_bytes(math.ceil(x.bit_length() / 8), "little")
|
||
|
|
||
|
def decompress(s):
|
||
|
l, x = s[:4], s[4:]
|
||
|
return decompress_bpe(decompress_ans(int.from_bytes(l, "little"), int.from_bytes(x, "little")))
|