--[[ LUA MODULE compress.deflatelua - deflate (and gunzip/zlib) implemented in Lua. SYNOPSIS local DEFLATE = require 'compress.deflatelua' -- uncompress gzip file local fh = assert(io.open'foo.txt.gz', 'rb') local ofh = assert(io.open'foo.txt', 'wb') DEFLATE.gunzip {input=fh, output=ofh} fh:close(); ofh:close() -- can also uncompress from string including zlib and raw DEFLATE formats. DESCRIPTION This is a pure Lua implementation of decompressing the DEFLATE format, including the related zlib and gzip formats. Note: This library only supports decompression. Compression is not currently implemented. API Note: in the following functions, input stream `fh` may be a file handle, string, or an iterator function that returns strings. Output stream `ofh` may be a file handle or a function that consumes one byte (number 0..255) per call. DEFLATE.inflate {input=fh, output=ofh} Decompresses input stream `fh` in the DEFLATE format while writing to output stream `ofh`. DEFLATE is detailed in http://tools.ietf.org/html/rfc1951 . DEFLATE.gunzip {input=fh, output=ofh, disable_crc=disable_crc} Decompresses input stream `fh` with the gzip format while writing to output stream `ofh`. `disable_crc` (defaults to `false`) will disable CRC-32 checking to increase speed. gzip is detailed in http://tools.ietf.org/html/rfc1952 . DEFLATE.inflate_zlib {input=fh, output=ofh, disable_crc=disable_crc} Decompresses input stream `fh` with the zlib format while writing to output stream `ofh`. `disable_crc` (defaults to `false`) will disable CRC-32 checking to increase speed. zlib is detailed in http://tools.ietf.org/html/rfc1950 . DEFLATE.adler32(byte, crc) --> rcrc Returns adler32 checksum of byte `byte` (number 0..255) appended to string with adler32 checksum `crc`. This is internally used by `inflate_zlib`. ADLER32 in detailed in http://tools.ietf.org/html/rfc1950 . COMMAND LINE UTILITY A `gunziplua` command line utility (in folder `bin`) is also provided. This mimicks the *nix `gunzip` utility but is a pure Lua implementation that invokes this library. For help do gunziplua -h DEPENDENCIES Requires 'digest.crc32lua' (used for optional CRC-32 checksum checks). https://github.com/davidm/lua-digest-crc32lua Will use a bit library ('bit', 'bit32', 'bit.numberlua') if available. This is not that critical for this library but is required by digest.crc32lua. 'pythonic.optparse' is only required by the optional `gunziplua` command-line utilty for command line parsing. https://github.com/davidm/lua-pythonic-optparse INSTALLATION Copy the `compress` directory into your LUA_PATH. REFERENCES [1] DEFLATE Compressed Data Format Specification version 1.3 http://tools.ietf.org/html/rfc1951 [2] GZIP file format specification version 4.3 http://tools.ietf.org/html/rfc1952 [3] http://en.wikipedia.org/wiki/DEFLATE [4] pyflate, by Paul Sladen http://www.paul.sladen.org/projects/pyflate/ [5] Compress::Zlib::Perl - partial pure Perl implementation of Compress::Zlib http://search.cpan.org/~nwclark/Compress-Zlib-Perl/Perl.pm LICENSE (c) 2008-2011 David Manura. Licensed under the same terms as Lua (MIT). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. (end license) --]] local M = {_TYPE='module', _NAME='compress.deflatelua', _VERSION='0.3.20111128'} local assert = assert local error = error local ipairs = ipairs local pairs = pairs local print = print local require = require local tostring = tostring local type = type local setmetatable = setmetatable local io = io local math = math local table_sort = table.sort local math_max = math.max local string_char = string.char --[[ Requires the first module listed that exists, else raises like `require`. If a non-string is encountered, it is returned. Second return value is module name loaded (or ''). --]] local function requireany(...) local errs = {} for i = 1, select('#', ...) do local name = select(i, ...) if type(name) ~= 'string' then return name, '' end local ok, mod = pcall(require, name) if ok then return mod, name end errs[#errs+1] = mod end error(table.concat(errs, '\n'), 2) end --local crc32 = require "digest.crc32lua" . crc32_byte --local bit, name_ = requireany('bit', 'bit32', 'bit.numberlua', nil) local bit local crc32 local DEBUG = false -- Whether to use `bit` library functions in current module. -- Unlike the crc32 library, it doesn't make much difference in this module. local NATIVE_BITOPS = (bit ~= nil) local band, lshift, rshift if NATIVE_BITOPS then band = bit.band lshift = bit.lshift rshift = bit.rshift end local function warn(s) io.stderr:write(s, '\n') end local function debug(...) print('DEBUG', ...) end local function runtime_error(s, level) level = level or 1 error({s}, level+1) end local function make_outstate(outbs) local outstate = {} outstate.outbs = outbs outstate.window = {} outstate.window_pos = 1 return outstate end local function output(outstate, byte) -- debug('OUTPUT:', s) local window_pos = outstate.window_pos outstate.outbs(byte) outstate.window[window_pos] = byte outstate.window_pos = window_pos % 32768 + 1 -- 32K end local function noeof(val) return assert(val, 'unexpected end of file') end local function hasbit(bits, bit) return bits % (bit + bit) >= bit end local function memoize(f) local mt = {} local t = setmetatable({}, mt) function mt:__index(k) local v = f(k) t[k] = v return v end return t end -- small optimization (lookup table for powers of 2) local pow2 = memoize(function(n) return 2^n end) --local tbits = memoize( -- function(bits) -- return memoize( function(bit) return getbit(bits, bit) end ) -- end ) -- weak metatable marking objects as bitstream type local is_bitstream = setmetatable({}, {__mode='k'}) -- DEBUG -- prints LSB first --[[ local function bits_tostring(bits, nbits) local s = '' local tmp = bits local function f() local b = tmp % 2 == 1 and 1 or 0 s = s .. b tmp = (tmp - b) / 2 end if nbits then for i=1,nbits do f() end else while tmp ~= 0 do f() end end return s end --]] local function bytestream_from_file(fh) local o = {} function o:read() local sb = fh:read(1) if sb then return sb:byte() end end return o end local function bytestream_from_string(s) local i = 1 local o = {} function o:read() local by if i <= #s then by = s:byte(i) i = i + 1 end return by end return o end local function bytestream_from_function(f) local i = 0 local buffer = '' local o = {} function o:read() return f() -- i = i + 1 -- if i > #buffer then -- buffer = f() -- if not buffer then return end -- i = 1 -- end -- return buffer:byte(i,i) end return o end local function bitstream_from_bytestream(bys) local buf_byte = 0 local buf_nbit = 0 local o = {} function o:nbits_left_in_byte() return buf_nbit end if NATIVE_BITOPS then function o:read(nbits) nbits = nbits or 1 while buf_nbit < nbits do local byte = bys:read() if not byte then return end -- note: more calls also return nil buf_byte = buf_byte + lshift(byte, buf_nbit) buf_nbit = buf_nbit + 8 end local bits if nbits == 0 then bits = 0 elseif nbits == 32 then bits = buf_byte buf_byte = 0 else bits = band(buf_byte, rshift(0xffffffff, 32 - nbits)) buf_byte = rshift(buf_byte, nbits) end buf_nbit = buf_nbit - nbits return bits end else function o:read(nbits) nbits = nbits or 1 while buf_nbit < nbits do local byte = bys:read() if not byte then return end -- note: more calls also return nil buf_byte = buf_byte + pow2[buf_nbit] * byte buf_nbit = buf_nbit + 8 end local m = pow2[nbits] local bits = buf_byte % m buf_byte = (buf_byte - bits) / m buf_nbit = buf_nbit - nbits return bits end end is_bitstream[o] = true return o end local function get_bitstream(o) local bs if is_bitstream[o] then return o elseif io.type(o) == 'file' then bs = bitstream_from_bytestream(bytestream_from_file(o)) elseif type(o) == 'string' then bs = bitstream_from_bytestream(bytestream_from_string(o)) elseif type(o) == 'function' then bs = bitstream_from_bytestream(bytestream_from_function(o)) else runtime_error 'unrecognized type' end return bs end local function get_obytestream(o) local bs if io.type(o) == 'file' then bs = function(sbyte) o:write(string_char(sbyte)) end elseif type(o) == 'function' then bs = o else runtime_error('unrecognized type: ' .. tostring(o)) end return bs end local function HuffmanTable(init, is_full) local t = {} if is_full then for val,nbits in pairs(init) do if nbits ~= 0 then t[#t+1] = {val=val, nbits=nbits} --debug('*',val,nbits) end end else for i=1,#init-2,2 do local firstval, nbits, nextval = init[i], init[i+1], init[i+2] --debug(val, nextval, nbits) if nbits ~= 0 then for val=firstval,nextval-1 do t[#t+1] = {val=val, nbits=nbits} end end end end table_sort(t, function(a,b) return a.nbits == b.nbits and a.val < b.val or a.nbits < b.nbits end) -- assign codes local code = 1 -- leading 1 marker local nbits = 0 for i,s in ipairs(t) do if s.nbits ~= nbits then code = code * pow2[s.nbits - nbits] nbits = s.nbits end s.code = code --debug('huffman code:', i, s.nbits, s.val, code, bits_tostring(code)) code = code + 1 end local minbits = math.huge local look = {} for i,s in ipairs(t) do minbits = math.min(minbits, s.nbits) look[s.code] = s.val end --for _,o in ipairs(t) do -- debug(':', o.nbits, o.val) --end -- function t:lookup(bits) return look[bits] end local msb = NATIVE_BITOPS and function(bits, nbits) local res = 0 for i=1,nbits do res = lshift(res, 1) + band(bits, 1) bits = rshift(bits, 1) end return res end or function(bits, nbits) local res = 0 for i=1,nbits do local b = bits % 2 bits = (bits - b) / 2 res = res * 2 + b end return res end local tfirstcode = memoize( function(bits) return pow2[minbits] + msb(bits, minbits) end) function t:read(bs) local code = 1 -- leading 1 marker local nbits = 0 while 1 do if nbits == 0 then -- small optimization (optional) code = tfirstcode[noeof(bs:read(minbits))] nbits = nbits + minbits else local b = noeof(bs:read()) nbits = nbits + 1 code = code * 2 + b -- MSB first --[[NATIVE_BITOPS code = lshift(code, 1) + b -- MSB first --]] end --debug('code?', code, bits_tostring(code)) local val = look[code] if val then --debug('FOUND', val) return val end end end return t end local function parse_gzip_header(bs) -- local FLG_FTEXT = 2^0 local FLG_FHCRC = 2^1 local FLG_FEXTRA = 2^2 local FLG_FNAME = 2^3 local FLG_FCOMMENT = 2^4 local id1 = bs:read(8) local id2 = bs:read(8) if id1 ~= 31 or id2 ~= 139 then runtime_error 'not in gzip format' end local cm = bs:read(8) -- compression method local flg = bs:read(8) -- FLaGs local mtime = bs:read(32) -- Modification TIME local xfl = bs:read(8) -- eXtra FLags local os = bs:read(8) -- Operating System if DEBUG then debug("CM=", cm) debug("FLG=", flg) debug("MTIME=", mtime) -- debug("MTIME_str=",os.date("%Y-%m-%d %H:%M:%S",mtime)) -- non-portable debug("XFL=", xfl) debug("OS=", os) end if not os then runtime_error 'invalid header' end if hasbit(flg, FLG_FEXTRA) then local xlen = bs:read(16) local extra = 0 for i=1,xlen do extra = bs:read(8) end if not extra then runtime_error 'invalid header' end end local function parse_zstring(bs) repeat local by = bs:read(8) if not by then runtime_error 'invalid header' end until by == 0 end if hasbit(flg, FLG_FNAME) then parse_zstring(bs) end if hasbit(flg, FLG_FCOMMENT) then parse_zstring(bs) end if hasbit(flg, FLG_FHCRC) then local crc16 = bs:read(16) if not crc16 then runtime_error 'invalid header' end -- IMPROVE: check CRC. where is an example .gz file that -- has this set? if DEBUG then debug("CRC16=", crc16) end end end local function parse_zlib_header(bs) local cm = bs:read(4) -- Compression Method local cinfo = bs:read(4) -- Compression info local fcheck = bs:read(5) -- FLaGs: FCHECK (check bits for CMF and FLG) local fdict = bs:read(1) -- FLaGs: FDICT (present dictionary) local flevel = bs:read(2) -- FLaGs: FLEVEL (compression level) local cmf = cinfo * 16 + cm -- CMF (Compresion Method and flags) local flg = fcheck + fdict * 32 + flevel * 64 -- FLaGs if cm ~= 8 then -- not "deflate" runtime_error("unrecognized zlib compression method: " + cm) end if cinfo > 7 then runtime_error("invalid zlib window size: cinfo=" + cinfo) end local window_size = 2^(cinfo + 8) if (cmf*256 + flg) % 31 ~= 0 then runtime_error("invalid zlib header (bad fcheck sum)") end if fdict == 1 then runtime_error("FIX:TODO - FDICT not currently implemented") local dictid_ = bs:read(32) end return window_size end local function parse_huffmantables(bs) local hlit = bs:read(5) -- # of literal/length codes - 257 local hdist = bs:read(5) -- # of distance codes - 1 local hclen = noeof(bs:read(4)) -- # of code length codes - 4 local ncodelen_codes = hclen + 4 local codelen_init = {} local codelen_vals = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15} for i=1,ncodelen_codes do local nbits = bs:read(3) local val = codelen_vals[i] codelen_init[val] = nbits end local codelentable = HuffmanTable(codelen_init, true) local function decode(ncodes) local init = {} local nbits local val = 0 while val < ncodes do local codelen = codelentable:read(bs) --FIX:check nil? local nrepeat if codelen <= 15 then nrepeat = 1 nbits = codelen --debug('w', nbits) elseif codelen == 16 then nrepeat = 3 + noeof(bs:read(2)) -- nbits unchanged elseif codelen == 17 then nrepeat = 3 + noeof(bs:read(3)) nbits = 0 elseif codelen == 18 then nrepeat = 11 + noeof(bs:read(7)) nbits = 0 else error 'ASSERT' end for i=1,nrepeat do init[val] = nbits val = val + 1 end end local huffmantable = HuffmanTable(init, true) return huffmantable end local nlit_codes = hlit + 257 local ndist_codes = hdist + 1 local littable = decode(nlit_codes) local disttable = decode(ndist_codes) return littable, disttable end local tdecode_len_base local tdecode_len_nextrabits local tdecode_dist_base local tdecode_dist_nextrabits local function parse_compressed_item(bs, outstate, littable, disttable) local val = littable:read(bs) --debug(val, val < 256 and string_char(val)) if val < 256 then -- literal output(outstate, val) elseif val == 256 then -- end of block return true else if not tdecode_len_base then local t = {[257]=3} local skip = 1 for i=258,285,4 do for j=i,i+3 do t[j] = t[j-1] + skip end if i ~= 258 then skip = skip * 2 end end t[285] = 258 tdecode_len_base = t --for i=257,285 do debug('T1',i,t[i]) end end if not tdecode_len_nextrabits then local t = {} if NATIVE_BITOPS then for i=257,285 do local j = math_max(i - 261, 0) t[i] = rshift(j, 2) end else for i=257,285 do local j = math_max(i - 261, 0) t[i] = (j - (j % 4)) / 4 end end t[285] = 0 tdecode_len_nextrabits = t --for i=257,285 do debug('T2',i,t[i]) end end local len_base = tdecode_len_base[val] local nextrabits = tdecode_len_nextrabits[val] local extrabits = bs:read(nextrabits) local len = len_base + extrabits if not tdecode_dist_base then local t = {[0]=1} local skip = 1 for i=1,29,2 do for j=i,i+1 do t[j] = t[j-1] + skip end if i ~= 1 then skip = skip * 2 end end tdecode_dist_base = t --for i=0,29 do debug('T3',i,t[i]) end end if not tdecode_dist_nextrabits then local t = {} if NATIVE_BITOPS then for i=0,29 do local j = math_max(i - 2, 0) t[i] = rshift(j, 1) end else for i=0,29 do local j = math_max(i - 2, 0) t[i] = (j - (j % 2)) / 2 end end tdecode_dist_nextrabits = t --for i=0,29 do debug('T4',i,t[i]) end end local dist_val = disttable:read(bs) local dist_base = tdecode_dist_base[dist_val] local dist_nextrabits = tdecode_dist_nextrabits[dist_val] local dist_extrabits = bs:read(dist_nextrabits) local dist = dist_base + dist_extrabits --debug('BACK', len, dist) for i=1,len do local pos = (outstate.window_pos - 1 - dist) % 32768 + 1 -- 32K output(outstate, assert(outstate.window[pos], 'invalid distance')) end end return false end local function parse_block(bs, outstate) local bfinal = bs:read(1) local btype = bs:read(2) local BTYPE_NO_COMPRESSION = 0 local BTYPE_FIXED_HUFFMAN = 1 local BTYPE_DYNAMIC_HUFFMAN = 2 local BTYPE_RESERVED_ = 3 if DEBUG then debug('bfinal=', bfinal) debug('btype=', btype) end if btype == BTYPE_NO_COMPRESSION then bs:read(bs:nbits_left_in_byte()) local len = bs:read(16) local nlen_ = noeof(bs:read(16)) for i=1,len do local by = noeof(bs:read(8)) output(outstate, by) end elseif btype == BTYPE_FIXED_HUFFMAN or btype == BTYPE_DYNAMIC_HUFFMAN then local littable, disttable if btype == BTYPE_DYNAMIC_HUFFMAN then littable, disttable = parse_huffmantables(bs) else littable = HuffmanTable {0,8, 144,9, 256,7, 280,8, 288,nil} disttable = HuffmanTable {0,5, 32,nil} end repeat local is_done = parse_compressed_item( bs, outstate, littable, disttable) until is_done else runtime_error 'unrecognized compression type' end return bfinal ~= 0 end function M.inflate(t) local bs = get_bitstream(t.input) local outbs = get_obytestream(t.output) local outstate = make_outstate(outbs) repeat local is_final = parse_block(bs, outstate) until is_final end local inflate = M.inflate function M.gunzip(t) local bs = get_bitstream(t.input) local outbs = get_obytestream(t.output) local disable_crc = t.disable_crc if disable_crc == nil then disable_crc = false end parse_gzip_header(bs) local data_crc32 = 0 inflate{input=bs, output= disable_crc and outbs or function(byte) data_crc32 = crc32(byte, data_crc32) outbs(byte) end } bs:read(bs:nbits_left_in_byte()) local expected_crc32 = bs:read(32) local isize = bs:read(32) -- ignored if DEBUG then debug('crc32=', expected_crc32) debug('isize=', isize) end if not disable_crc and data_crc32 then if data_crc32 ~= expected_crc32 then runtime_error('invalid compressed data--crc error') end end if bs:read() then warn 'trailing garbage ignored' end end function M.adler32(byte, crc) local s1 = crc % 65536 local s2 = (crc - s1) / 65536 s1 = (s1 + byte) % 65521 s2 = (s2 + s1) % 65521 return s2*65536 + s1 end -- 65521 is the largest prime smaller than 2^16 function M.inflate_zlib(t) local bs = get_bitstream(t.input) local outbs = get_obytestream(t.output) local disable_crc = t.disable_crc if disable_crc == nil then disable_crc = false end local window_size_ = parse_zlib_header(bs) local data_adler32 = 1 inflate{input=bs, output= disable_crc and outbs or function(byte) data_adler32 = M.adler32(byte, data_adler32) outbs(byte) end } bs:read(bs:nbits_left_in_byte()) local b3 = bs:read(8) local b2 = bs:read(8) local b1 = bs:read(8) local b0 = bs:read(8) local expected_adler32 = ((b3*256 + b2)*256 + b1)*256 + b0 if DEBUG then debug('alder32=', expected_adler32) end if not disable_crc then if data_adler32 ~= expected_adler32 then runtime_error('invalid compressed data--crc error') end end if bs:read() then warn 'trailing garbage ignored' end end return M