<*
 DEFLATE compression implementation (RFC 1951).

 API:
 - fn char[]? decompress(Allocator allocator, char[] input)
 - fn void?   decompress_stream(InStream input, OutStream output)
 - fn char[]? compress(Allocator allocator, char[] input, )
 - struct Inflater: InStream for pull-based decompression (use `init` then `read`).
*>
module std::compression::deflate;
import std::io, std::math, std::bits, std::sort, std::collections::list;
import std::thread;

faultdef CORRUPTED_DATA;

<*
 Stateful DEFLATE inflater.
 Implements InStream for pull-based decompression.
*>
struct Inflater (InStream)
{
	StreamBitReader reader;
	InflaterState state;
	uint hlit, hdist, hclen;
	uint dyn_i;
	uint dyn_num_lengths;
	uint stored_len;
	uint match_len;
	uint match_dist;
	bool final;
	bool done;

	char[65536] window;
	ulong pos;
	ulong read_pos;

	Huffman* lit_ptr;
	Huffman* dist_ptr;

	char[19] code_lengths;
	char[320] lit_dist_lengths;

	Huffman dyn_lit;
	Huffman dyn_dist;
	Huffman dyn_code_huff;
}

fn void Inflater.init(&self, InStream input, char[] bit_reader_buf = {})
{
	*self = {};
	self.reader.init(input, bit_reader_buf);
	self.state = START_BLOCK;
	static OnceFlag run_once;
    run_once.call(fn void() {
    	build_fixed_huffman(&lit_fixed_global, &dist_fixed_global);
    });
}

fn usz? Inflater.read(&self, char[] buffer) @dynamic
{
	if (self.done && self.pos == self.read_pos) return 0;

	usz total_out = 0;
	while (total_out < buffer.len)
	{
		if (self.read_pos < self.pos)
		{
			ulong to_copy = math::min((ulong)(buffer.len - total_out), self.pos - self.read_pos);

			ulong start_idx = self.read_pos & 0xFFFF;
			ulong len = math::min(to_copy, 65536U - start_idx);

			buffer[total_out:len] = self.window[start_idx:len];

			if (len > usz.max) return CORRUPTED_DATA~;
			total_out = total_out.overflow_add((usz)len) ?? CORRUPTED_DATA~!;
			self.read_pos += len;

			if (len < to_copy)
			{
				buffer[total_out:to_copy - len] = self.window[:to_copy - len];

				total_out += (usz)(to_copy - len);
				self.read_pos += to_copy - len;
			}
			continue;
		}

		if (self.done) break;

		if (self.pos - self.read_pos >= 32768U) break;

		inflater_step(self)!;
	}

	return total_out;
}

fn char? Inflater.read_byte(&self) @dynamic
{
	char[1] b;
	if (try n = self.read(&b) && n > 0)
	{
		return b[0];
	}
	return io::EOF~;
}

fn char[]? decompress(Allocator allocator, char[] input) => @pool()
{
	if (!input.len) return allocator::new_array(allocator, char, 0);

	ByteReader mem_stream;
	mem_stream.init(input);

	Inflater* inflater = allocator::new(tmem, Inflater);
	char[4096] tmp_bit_buf;
	inflater.init(&mem_stream, &tmp_bit_buf);

	usz out_cap = input.len * 2;
	if (out_cap < 1024) out_cap = 1024;

	ByteWriter writer;
	writer.tinit();
	writer.ensure_capacity(out_cap)!;

	usz out_len = io::copy_to(inflater, &writer)!;
	if (out_len == 0) return allocator::new_array(allocator, char, 0);

	char[] result = allocator::alloc_array(allocator, char, out_len);
	result[..] = writer.array_view()[:out_len];
	return result;
}

<*
 Decompress stream using DEFLATE.
 Reads from `input`, writes to `output`.
 Uses blocking I/O and the temp allocator
*>
fn void? decompress_stream(InStream input, OutStream output) => @pool()
{
	Inflater* inflater = mem::tnew(Inflater);
	char[] bit_buf = mem::tnew(char[8192]);
	inflater.init(input, bit_buf);
	io::copy_to(inflater, output)!;
}

<*
 Compress data using the DEFLATE algorithm, this uses the temp allocator
 @param input : `The data to compress.`
 @param allocator : `The allocator to use.`
 @return `The compressed data.`
*>
fn char[]? compress(Allocator allocator, char[] input) => @pool()
{
	if (input.len == 0)
	{
		BitWriter writer;
		writer.init(allocator, 8);
		writer.write_bits(1, 1);
		writer.write_bits(1, 2);
		writer.write_huffman(0, 7);
		return writer.finish();
	}

	const uint MIN_MATCH = 3;
	const uint MAX_MATCH = 258;
	const uint WINDOW_SIZE = 32768;
	const uint HASH_SIZE = 32768;
	const uint HASH_MASK = HASH_SIZE - 1;
	const uint MAX_CHAIN = 16;
	const uint GOOD_MATCH = 32;
	const uint NICE_MATCH = 128;

	uint[] head = allocator::alloc_array(tmem, uint, HASH_SIZE);
	head[..] = 0xFFFFFFFF;

	uint[] prev = allocator::alloc_array(tmem, uint, WINDOW_SIZE);

	uint[] lit_freqs = mem::temp_array(uint, 286);
	uint[] dist_freqs = mem::temp_array(uint, 30);
	Token[] tokens = allocator::alloc_array(tmem, Token, input.len + 1);
	usz token_count = 0;

	uint hash = 0;
	if (input.len >= 2)
	{
		hash = ((uint)input[0] << 5) ^ (uint)input[1];
	}

	usz pos = 0;
	while (pos < input.len)
	{
		uint best_len = 0;
		uint best_dist = 0;

		if (pos + 2 < input.len)
		{
			hash = ((hash << 5) ^ (uint)input[pos + 2]) & HASH_MASK;
			uint match_head = head[hash];
			head[hash] = (uint)pos;
			prev[pos & 0x7FFF] = match_head;

			uint chain_len = 0;
			uint curr = match_head;
			while (curr != 0xFFFFFFFF && (uint)pos - curr < WINDOW_SIZE && chain_len < MAX_CHAIN)
			{
				chain_len++;
				if (pos + best_len < input.len && input[curr + best_len] == input[pos + best_len])
				{
					if (best_len < 3 || mem::load((ushort*)&input[pos + best_len - 1], 1) == mem::load((ushort*)&input[curr + best_len - 1], 1))
					{
						uint match_len = 0;
						while (match_len + 8 <= MAX_MATCH && pos + match_len + 8 <= input.len)
						{
							if (mem::load((ulong*)&input[curr + match_len], 1) == mem::load((ulong*)&input[pos + match_len], 1))
							{
								match_len += 8;
								continue;
							}
							break;
						}

						while (match_len < MAX_MATCH &&
							   pos + match_len < input.len &&
							   input[curr + match_len] == input[pos + match_len])
						{
							match_len++;
						}

						if (match_len >= MIN_MATCH && match_len > best_len)
						{
							best_len = match_len;
							best_dist = (uint)pos - curr;
							if (best_len >= NICE_MATCH) break;
						}
					}
				}
				curr = prev[curr & 0x7FFF];
				if (best_len >= GOOD_MATCH) chain_len++;
			}
		}

		if (best_len >= MIN_MATCH)
		{
			uint len_code;
			switch (best_len)
			{
				case 3..10:   len_code = 257 + best_len - 3;
				case 11..18:  len_code = 265 + (best_len - 11) / 2;
				case 19..34:  len_code = 269 + (best_len - 19) / 4;
				case 35..66:  len_code = 273 + (best_len - 35) / 8;
				case 67..130: len_code = 277 + (best_len - 67) / 16;
				case 131..257:len_code = 281 + (best_len - 131) / 32;
				default:      len_code = 285;
			}
			lit_freqs[len_code]++;

			uint dist_code;
			switch (best_dist)
			{
				case 1..4:        dist_code = best_dist - 1;
				case 5..8:        dist_code = 4 + (best_dist - 5) / 2;
				case 9..16:       dist_code = 6 + (best_dist - 9) / 4;
				case 17..32:      dist_code = 8 + (best_dist - 17) / 8;
				case 33..64:      dist_code = 10 + (best_dist - 33) / 16;
				case 65..128:     dist_code = 12 + (best_dist - 65) / 32;
				case 129..256:    dist_code = 14 + (best_dist - 129) / 64;
				case 257..512:    dist_code = 16 + (best_dist - 257) / 128;
				case 513..1024:   dist_code = 18 + (best_dist - 513) / 256;
				case 1025..2048:  dist_code = 20 + (best_dist - 1025) / 512;
				case 2049..4096:  dist_code = 22 + (best_dist - 2049) / 1024;
				case 4097..8192:  dist_code = 24 + (best_dist - 4097) / 2048;
				case 8193..16384: dist_code = 26 + (best_dist - 8193) / 4096;
				default:          dist_code = 28 + (best_dist - 16385) / 8192;
			}
			dist_freqs[dist_code]++;
			tokens[token_count++] = { (ushort)best_len, (ushort)best_dist };

			uint limit = best_len - 1;
			if (pos + limit + 2 < input.len)
			{
				for (uint k = 0; k < limit; k++)
				{
					pos++;
					hash = ((hash << 5) ^ (uint)input[pos + 2]) & HASH_MASK;
					prev[pos & 0x7FFF] = head[hash];
					head[hash] = (uint)pos;
				}
			}
			else
			{
				for (uint k = 0; k < limit; k++)
				{
					pos++;
					if (pos + 2 < input.len)
					{
						hash = ((hash << 5) ^ (uint)input[pos + 2]) & HASH_MASK;
						prev[pos & 0x7FFF] = head[hash];
						head[hash] = (uint)pos;
					}
				}
			}
			pos++;
		}
		else
		{
			lit_freqs[(uint)input[pos]]++;
			tokens[token_count++] = { (ushort)input[pos], 0 };
			pos++;
		}
	}
	lit_freqs[256] = 1;
	tokens[token_count++] = { 256, 0 };

	Huffman lit_huff, dist_huff;
	lit_huff.build_from_freqs(lit_freqs, 15);

	uint total_dist = 0;
	foreach (f : dist_freqs) total_dist += f;
	if (total_dist == 0) dist_freqs[0] = 1;
	dist_huff.build_from_freqs(dist_freqs, 15);

	BitWriter writer;
	writer.init(allocator, math::max((usz)input.len / 2, (usz)1024));
	writer.write_bits(1, 1);
	writer.write_bits(2, 2);

	char[] lit_lens = lit_huff.get_lengths(285);
	char[] dist_lens = dist_huff.get_lengths(29);

	usz hlit_count = 286;
	while (hlit_count > 257 && lit_lens[hlit_count - 1] == 0) hlit_count--;
	usz hdist_count = 30;
	while (hdist_count > 1 && dist_lens[hdist_count - 1] == 0) hdist_count--;

	writer.write_bits((uint)hlit_count - 257, 5);
	writer.write_bits((uint)hdist_count - 1, 5);

	List{Token} tree_tokens; tree_tokens.tinit();
	uint[] tree_freqs = mem::temp_array(uint, 19);

	char[] all_lens = mem::temp_array(char, hlit_count + hdist_count);
	all_lens[:hlit_count] = lit_lens[:hlit_count];
	all_lens[hlit_count:hdist_count] = dist_lens[:hdist_count];

	for (usz i = 0; i < all_lens.len;)
	{
		char len = all_lens[i];
		usz count = 1;
		while (i + count < all_lens.len && all_lens[i + count] == len) count++;

		if (len == 0)
		{
			while (count >= 11)
			{
				uint c = (uint)math::min(count, (usz)138);
				tree_tokens.push({ 18, (ushort)(c - 11) });
				tree_freqs[18]++;
				i += c; count -= c;
			}
			while (count >= 3)
			{
				uint c = (uint)math::min(count, (usz)10);
				tree_tokens.push({ 17, (ushort)(c - 3) });
				tree_freqs[17]++;
				i += c; count -= c;
			}
		}
		else if (count >= 4)
		{
			tree_tokens.push({ (ushort)len, 0 });
			tree_freqs[(uint)len]++;
			i++; count--;
			while (count >= 3)
			{
				uint c = (uint)math::min(count, (usz)6);
				tree_tokens.push({ 16, (ushort)(c - 3) });
				tree_freqs[16]++;
				i += c; count -= c;
			}
		}

		while (count--)
		{
			tree_tokens.push({ (ushort)len, 0 });
			tree_freqs[(uint)len]++;
			i++;
		}
	}

	Huffman tree_huff;
	tree_huff.build_from_freqs(tree_freqs, 7);
	char[] tree_lens = tree_huff.get_lengths(18);

	usz hclen_count = 19;
	while (hclen_count > 4 && tree_lens[ORDER[hclen_count - 1]] == 0) hclen_count--;
	writer.write_bits((uint)hclen_count - 4, 4);

	for (usz i = 0; i < hclen_count; i++)
	{
		writer.write_bits(tree_lens[ORDER[i]], 3);
	}

	Code[19] tree_codes;
	gen_canonical_codes(&tree_codes, tree_lens);
	foreach (t : tree_tokens)
	{
		writer.write_huffman(tree_codes[t.val].code, tree_codes[t.val].len);
		switch (t.val)
		{
			case 16:
				writer.write_bits(t.dist, 2);
			case 17:
				writer.write_bits(t.dist, 3);
			case 18:
				writer.write_bits(t.dist, 7);
		}
	}

	Code[286] lit_codes;
	gen_canonical_codes(&lit_codes, lit_lens[:hlit_count]);
	Code[30] dist_codes;
	gen_canonical_codes(&dist_codes, dist_lens[:hdist_count]);

	foreach (t : tokens[:token_count])
	{
		if (t.dist == 0)
		{
			writer.write_huffman(lit_codes[t.val].code, lit_codes[t.val].len);
		}
		else
		{
			uint best_len = t.val;
			uint best_dist = t.dist;

			uint len_code;
			uint len_extra_bits = 0;
			uint len_extra = 0;
			switch (best_len)
			{
				case 3..10:   len_code = 257 + best_len - 3;
				case 11..18:  len_code = 265 + (best_len - 11) / 2; len_extra_bits = 1; len_extra = (best_len - 11) % 2;
				case 19..34:  len_code = 269 + (best_len - 19) / 4; len_extra_bits = 2; len_extra = (best_len - 19) % 4;
				case 35..66:  len_code = 273 + (best_len - 35) / 8; len_extra_bits = 3; len_extra = (best_len - 35) % 8;
				case 67..130: len_code = 277 + (best_len - 67) / 16; len_extra_bits = 4; len_extra = (best_len - 67) % 16;
				case 131..257:len_code = 281 + (best_len - 131) / 32; len_extra_bits = 5; len_extra = (best_len - 131) % 32;
				default:      len_code = 285;
			}
			writer.write_huffman(lit_codes[len_code].code, lit_codes[len_code].len);
			if (len_extra_bits > 0) { writer.write_bits(len_extra, len_extra_bits); }

			uint dist_code;
			uint dist_extra_bits = 0;
			uint dist_extra = 0;
			switch (best_dist)
			{
				case 1..4:     dist_code = best_dist - 1;
				case 5..8:     dist_code = 4 + (best_dist - 5) / 2; dist_extra_bits = 1; dist_extra = (best_dist - 5) % 2;
				case 9..16:    dist_code = 6 + (best_dist - 9) / 4; dist_extra_bits = 2; dist_extra = (best_dist - 9) % 4;
				case 17..32:   dist_code = 8 + (best_dist - 17) / 8; dist_extra_bits = 3; dist_extra = (best_dist - 17) % 8;
				case 33..64:   dist_code = 10 + (best_dist - 33) / 16; dist_extra_bits = 4; dist_extra = (best_dist - 33) % 16;
				case 65..128:  dist_code = 12 + (best_dist - 65) / 32; dist_extra_bits = 5; dist_extra = (best_dist - 65) % 32;
				case 129..256: dist_code = 14 + (best_dist - 129) / 64; dist_extra_bits = 6; dist_extra = (best_dist - 129) % 64;
				case 257..512: dist_code = 16 + (best_dist - 257) / 128; dist_extra_bits = 7; dist_extra = (best_dist - 257) % 128;
				case 513..1024: dist_code = 18 + (best_dist - 513) / 256; dist_extra_bits = 8; dist_extra = (best_dist - 513) % 256;
				case 1025..2048: dist_code = 20 + (best_dist - 1025) / 512; dist_extra_bits = 9; dist_extra = (best_dist - 1025) % 512;
				case 2049..4096: dist_code = 22 + (best_dist - 2049) / 1024; dist_extra_bits = 10; dist_extra = (best_dist - 2049) % 1024;
				case 4097..8192: dist_code = 24 + (best_dist - 4097) / 2048; dist_extra_bits = 11; dist_extra = (best_dist - 4097) % 2048;
				case 8193..16384: dist_code = 26 + (best_dist - 8193) / 4096; dist_extra_bits = 12; dist_extra = (best_dist - 8193) % 4096;
				default:          dist_code = 28 + (best_dist - 16385) / 8192; dist_extra_bits = 13; dist_extra = (best_dist - 16385) % 8192;
			}
			writer.write_huffman(dist_codes[dist_code].code, dist_codes[dist_code].len);
			if (dist_extra_bits > 0) { writer.write_bits(dist_extra, dist_extra_bits); }
		}
	}

	return writer.finish();
}

/*-----------------------------------------------------------------------------*/

Huffman lit_fixed_global @private;
Huffman dist_fixed_global @private;


struct StreamBitReader @private
{
	InStream stream;
	char[] buffer;
	usz buf_pos;
	usz buf_len;
	ulong bit_buf;
	uint nbits;
	SetCursorFn set_cursor_fn;
}

enum InflaterState @private
{
	START_BLOCK,
	READ_STORED_LEN,
	COPY_STORED,
	READ_DYN_COUNTS,
	READ_DYN_CODELENS,
	READ_DYN_TREES,
	DECODE_SYMBOL,
	READ_DIST_SYM,
	COPY_MATCH,
	DONE
}

fn void StreamBitReader.init(&self, InStream reader, char[] buffer)
{
	*self = { .stream = reader, .buffer = buffer, .set_cursor_fn = &reader.set_cursor };
}
fn void StreamBitReader.close(&self)
{
	if (self.buf_len && self.buf_pos != self.buf_len && self.set_cursor_fn)
	{
		(void)self.set_cursor_fn(self.stream, (long)self.buf_pos - (long)self.buf_len, FROM_CURSOR);
	}
}

fn void? StreamBitReader.refill(&self) @inline
{
	if (self.buf_pos >= self.buf_len)
	{
		usz n = self.stream.read(self.buffer)!;
		if (!n) return io::EOF~;
		self.buf_len = n;
		self.buf_pos = 0;
	}
}

fn uint? StreamBitReader.read_bits(&self, uint count) @inline
{
	if (count == 0) return 0;
	if (self.nbits < count)
	{
		while (self.nbits <= 56)
		{
			if (self.buf_pos >= self.buf_len)
			{
				if (@catch(self.refill())) break;
			}
			self.bit_buf |= (ulong)self.buffer[self.buf_pos++] << self.nbits;
			self.nbits += 8;
		}
	}
	if (self.nbits < count) return CORRUPTED_DATA~;
	uint value = (uint)(self.bit_buf & ((1UL << count) - 1));
	self.bit_buf >>= count;
	self.nbits -= count;
	return value;
}

fn void StreamBitReader.align(&self) @inline
{
	uint skip = self.nbits % 8;
	self.bit_buf >>= skip;
	self.nbits -= skip;
}

struct BitWriter @private
{
	char[] data;
	usz len;
	ulong buffer;
	uint nbits;
	Allocator allocator;
}

fn void BitWriter.init(&self, Allocator allocator, usz initial_cap)
{
	self.allocator = allocator;
	self.data = allocator::alloc_array(allocator, char, initial_cap);
	self.len = 0;
	self.buffer = 0;
	self.nbits = 0;
}

fn void BitWriter.write_bits(&self, uint value, uint count)
{
	self.buffer |= (ulong)(value & ((1 << count) - 1)) << self.nbits;
	self.nbits += count;
	while (self.nbits >= 8)
	{
		if (self.len >= self.data.len)
		{
			usz new_cap = self.data.len;
			if (new_cap < 1024) new_cap = 1024;
			while (new_cap <= self.len)
			{
				if (new_cap > usz.max / 2) { new_cap = self.len + 1; break; }
				new_cap *= 2;
			}
			self.data = allocator::realloc_array(self.allocator, self.data.ptr, char, new_cap);
		}
		self.data[self.len++] = (char)(self.buffer & 0xFF);
		self.buffer >>= 8;
		self.nbits -= 8;
	}
}

fn void BitWriter.write_huffman(&self, uint code, uint len)
{
	uint rev = bits::reverse(code << (32 - len));
	self.write_bits(rev, len);
}

fn char[] BitWriter.finish(&self)
{
	if (self.nbits > 0)
	{
		if (self.len >= self.data.len)
		{
			usz new_cap = self.len + 1;
			self.data = allocator::realloc_array(self.allocator, self.data.ptr, char, new_cap);
		}
		self.data[self.len++] = (char)(self.buffer & 0xFF);
		self.buffer = 0;
		self.nbits = 0;
	}
	return self.data[:self.len];
}

struct Huffman @private
{
	ushort[16] counts;
	ushort[288] symbols;
}

fn void Huffman.build(&self, char[] lengths)
{
	ushort[16] offsets;
	self.counts = {};
	foreach (len : lengths)
	{
		if (len > 0 && len < 16) self.counts[len]++;
	}

	ushort offset = 0;
	for (uint i = 1; i < 16; i++)
	{
		offsets[i] = offset;
		offset += self.counts[i];
	}

	foreach (uint i, len : lengths)
	{
		if (len > 0 && len < 16)
		{
			ushort sym_idx = offsets[len]++;
			if (sym_idx < 288)
			{
				self.symbols[sym_idx] = (ushort)i;
			}
		}
	}
}

struct Token @private
{
	ushort val;
	ushort dist;
}

struct IndexMap @private
{
	usz index;
	uint freq;
}
fn bool IndexMap.less(&self, IndexMap other) => self.freq < other.freq;

<*
 Compute length-limited prefix code lengths using the Larmore–Hirschberg
 package-merge algorithm.
 This is a port of the C3 implementation by @konimarti (https://github.com/konimarti),
 which was based on the C implementation by Stephan Brumme
 (https://create.stephan-brumme.com/length-limited-prefix-codes/).
*>
fn char[] pkg_merge(Allocator allocator, uint[] freqs, uint max_bits) @private => @pool()
{
	List {IndexMap} map;
	map.tinit();

	usz n = 0;
	foreach (i, freq : freqs)
	{
		if (freq == 0) continue;
		map.push({i, freq});
		n++;
	}

	if (n == 0) return {};

	sort::quicksort(map.array_view());

	if (n == 1)
	{
		char[] blen = allocator::new_array(allocator, char, freqs.len);
		blen[map[0].index] = 1;
		return blen;
	}

	uint[] hist = mem::temp_array(uint, n);
	foreach (i, m : map) hist[i] = m.freq;

	usz max_elements = 2 * n;
	uint[] current = mem::temp_array(uint, max_elements);
	uint[] previous = mem::temp_array(uint, max_elements);
	ulong[] is_merged = mem::temp_array(ulong, max_elements);

	previous[:n] = hist[:n];
	usz num_previous = n;
	is_merged[:max_elements] = 0;
	usz num_relevant = 2 * n - 2;

	ulong mask = 1;
	for (uint bits = max_bits - 1; bits > 0; bits--)
	{
		num_previous &= (usz)~1;
		current[0] = hist[0];
		current[1] = hist[1];
		uint sum = current[0] + current[1];

		usz num_current = 2;
		usz num_hist = 2;
		usz num_merged = 0;

		while (true)
		{
			if (num_hist < n && hist[num_hist] <= sum)
			{
				current[num_current++] = hist[num_hist++];
				continue;
			}

			is_merged[num_current] |= mask;
			current[num_current] = sum;
			num_current++;

			num_merged++;
			if (num_merged * 2 >= num_previous) break;

			sum = previous[num_merged * 2] + previous[num_merged * 2 + 1];
		}

		while (num_hist < n) current[num_current++] = hist[num_hist++];
		mask <<= 1;

		if (num_previous >= num_relevant)
		{
			bool keep_going = false;
			for (usz i = num_relevant; i > 1; i--)
			{
				if (previous[i - 1] != current[i - 1])
				{
					keep_going = true;
					break;
				}
			}
			if (!keep_going) break;
		}

		@swap(previous, current);
		num_previous = num_current;
	}

	mask >>= 1;
	hist[..] = 0;

	usz num_analyze = num_relevant;
	while (mask != 0)
	{
		usz num_merged_loop = 0;
		hist[0]++;
		hist[1]++;

		usz symbol = 2;
		for (usz i = symbol; i < num_analyze; i++)
		{
			if ((is_merged[i] & mask) == 0)
			{
				hist[symbol]++;
				symbol++;
			}
			else
			{
				num_merged_loop++;
			}
		}
		num_analyze = 2 * num_merged_loop;
		mask >>= 1;
	}

	for (usz i = 0; i < num_analyze; i++) hist[i]++;

	char[] blen = allocator::new_array(allocator, char, freqs.len);
	foreach (i, m : map)
	{
		blen[m.index] = (char)hist[i];
	}

	return blen;
}

fn char[] Huffman.get_lengths(&self, usz max_sym)
{
	char[] blen = allocator::new_array(tmem, char, max_sym + 1);

	ushort index = 0;
	for (uint len = 1; len < 16; len++)
	{
		uint count = self.counts[len];
		for (uint i = 0; i < count; i++)
		{
			blen[self.symbols[index++]] = (char)len;
		}
	}
	return blen;
}

fn void Huffman.build_from_freqs(&self, uint[] freqs, uint max_bits) => @pool()
{
	char[] blen = pkg_merge(tmem, freqs, max_bits);
	self.build(blen);
}

fn void gen_canonical_codes(Code* codes, char[] blen) @private
{
	ushort[16] bl_count;
	foreach (len : blen)
	{
		if (len > 0) bl_count[len]++;
	}

	ushort[16] next_code;
	ushort code = 0;
	bl_count[0] = 0;
	for (uint bits = 1; bits <= 15; bits++)
	{
		code = (code + bl_count[bits - 1]) << 1;
		next_code[bits] = code;
	}

	foreach (uint n, len : blen)
	{
		if (len != 0)
		{
			uint c = next_code[len];
			codes[n].code = (ushort)c;
			codes[n].len = (ushort)len;
			next_code[len]++;
		}
		else
		{
			codes[n].code = 0;
			codes[n].len = 0;
		}
	}
}

fn void build_fixed_huffman(Huffman* lit, Huffman* dist) @private
{
	char[288] lit_lens;
	for (uint i = 0; i <= 143; i++) lit_lens[i] = 8;
	for (uint i = 144; i <= 255; i++) lit_lens[i] = 9;
	for (uint i = 256; i <= 279; i++) lit_lens[i] = 7;
	for (uint i = 280; i <= 287; i++) lit_lens[i] = 8;
	lit.build(&lit_lens);

	char[32] dist_lens;
	dist_lens[..] = 5;
	dist.build(&dist_lens);
}

struct Code @private
{
	ushort code;
	ushort len;
}

fn void gen_fixed_codes(Code* codes) @private
{
	char[288] lens;
	for (uint i = 0; i <= 143; i++) lens[i] = 8;
	for (uint i = 144; i <= 255; i++) lens[i] = 9;
	for (uint i = 256; i <= 279; i++) lens[i] = 7;
	for (uint i = 280; i <= 287; i++) lens[i] = 8;

	ushort[16] bl_count;
	for (uint i = 0; i < 288; i++)
	{
		if (lens[i] > 0) bl_count[lens[i]]++;
	}

	ushort[16] next_code;
	ushort code = 0;
	bl_count[0] = 0;
	for (uint bits = 1; bits <= 15; bits++)
	{
		code = (code + bl_count[bits - 1]) << 1;
		next_code[bits] = code;
	}

	for (uint n = 0; n < 288; n++)
	{
		uint len = lens[n];
		if (len != 0)
		{
			uint c = next_code[len];
			codes[n].code = (ushort)c;
			codes[n].len = (ushort)len;
			next_code[len]++;
		}
		else
		{
			codes[n].code = 0;
			codes[n].len = 0;
		}
	}
}

fn ushort? Huffman.decode_stream(&self, StreamBitReader* reader) @inline
{
	uint code = 0;
	uint first = 0;
	uint index = 0;
	for (uint len = 1; len < 16; len++)
	{
		code |= reader.read_bits(1)!;
		uint count = self.counts[len];
		if (code < first + count)
		{
			return self.symbols[index + (code - first)];
		}
		index += count;
		first += count;
		first <<= 1;
		code <<= 1;
	}
	return CORRUPTED_DATA~;
}

fn void? inflater_step(Inflater* self) @private
{
	switch (self.state)
	{
		case START_BLOCK:
			self.final = self.reader.read_bits(1)! != 0;
			switch (self.reader.read_bits(2)!)
			{
				case 0:
					self.state = READ_STORED_LEN;
				case 1:
					self.lit_ptr = &lit_fixed_global;
					self.dist_ptr = &dist_fixed_global;
					self.state = DECODE_SYMBOL;
				case 2:
					self.state = READ_DYN_COUNTS;
				default:
					return CORRUPTED_DATA~;
			}
		case READ_STORED_LEN:
			self.reader.align();
			self.stored_len = self.reader.read_bits(16)!;
			uint nlen = self.reader.read_bits(16)!;
			if (self.stored_len != (~nlen & 0xFFFF)) return CORRUPTED_DATA~;
			if (self.stored_len == 0)
			{
				self.state = self.final ? DONE : START_BLOCK;
				break;
			}
			self.state = COPY_STORED;
		case COPY_STORED:
			char c = (char)self.reader.read_bits(8)!;
			inflater_write_byte(self, c);
			self.stored_len--;
			if (self.stored_len == 0)
			{
				self.state = self.final ? DONE : START_BLOCK;
			}
		case READ_DYN_COUNTS:
			self.hlit = self.reader.read_bits(5)! + 257;
			self.hdist = self.reader.read_bits(5)! + 1;
			self.hclen = self.reader.read_bits(4)! + 4;
			self.dyn_i = 0;
			self.code_lengths = {};
			self.state = READ_DYN_CODELENS;

		case READ_DYN_CODELENS:
			self.code_lengths[ORDER[self.dyn_i]] = (char)self.reader.read_bits(3)!;
			self.dyn_i++;
			if (self.dyn_i >= self.hclen)
			{
				self.dyn_code_huff.build(&self.code_lengths);
				self.dyn_i = 0;
				self.dyn_num_lengths = self.hlit + self.hdist;
				if (self.dyn_num_lengths > 320) return CORRUPTED_DATA~;
				self.lit_dist_lengths[..] = 0;
				self.state = READ_DYN_TREES;
			}

		case READ_DYN_TREES:
			ushort sym = self.dyn_code_huff.decode_stream(&self.reader)!;
			switch (sym)
			{
				case 0..15:
					if (self.dyn_i >= 320) return CORRUPTED_DATA~;
					self.lit_dist_lengths[self.dyn_i++] = (char)sym;
				case 16:
					if (self.dyn_i == 0) return CORRUPTED_DATA~;
					uint repeat_count = self.reader.read_bits(2)! + 3;
					char prev = self.lit_dist_lengths[self.dyn_i - 1];
					if (self.dyn_i + repeat_count > self.dyn_num_lengths || self.dyn_i + repeat_count > 320) return CORRUPTED_DATA~;
					for (uint k = 0; k < repeat_count; k++) self.lit_dist_lengths[self.dyn_i++] = prev;
				case 17:
					uint zero_len = self.reader.read_bits(3)! + 3;
					if (self.dyn_i + zero_len > self.dyn_num_lengths || self.dyn_i + zero_len > 320) return CORRUPTED_DATA~;
					for (uint k = 0; k < zero_len; k++) self.lit_dist_lengths[self.dyn_i++] = 0;
				case 18:
					uint zero_len = self.reader.read_bits(7)! + 11;
					if (self.dyn_i + zero_len > self.dyn_num_lengths || self.dyn_i + zero_len > 320) return CORRUPTED_DATA~;
					for (uint k = 0; k < zero_len; k++) self.lit_dist_lengths[self.dyn_i++] = 0;
			}
			if (self.dyn_i >= self.dyn_num_lengths)
			{
				uint hlit = self.hlit;
				uint hdist = self.hdist;
				uint total = hlit + hdist;
				if (hlit > 286 || hdist > 32 || total > 320 || hlit > total)
				{
					return CORRUPTED_DATA~;
				}
				self.dyn_lit.build(self.lit_dist_lengths[0:hlit]);
				self.dyn_dist.build(self.lit_dist_lengths[hlit:hdist]);
				self.lit_ptr = &self.dyn_lit;
				self.dist_ptr = &self.dyn_dist;
				self.state = DECODE_SYMBOL;
			}

		case DECODE_SYMBOL:
			ushort symbol = self.lit_ptr.decode_stream(&self.reader)!;
			switch
			{
				case symbol < 256:
					inflater_write_byte(self, (char)symbol);
				case symbol == 256:
					self.state = self.final ? DONE : START_BLOCK;
				case symbol <= 285:
					uint len_idx = symbol - 257;
					self.match_len = LENGTH_BASE[len_idx] + self.reader.read_bits(LENGTH_EXTRA[len_idx])!;
					self.state = READ_DIST_SYM;
				default:
					return CORRUPTED_DATA~;
			}

		case READ_DIST_SYM:
			ushort dist_sym = self.dist_ptr.decode_stream(&self.reader)!;
			self.match_dist = DIST_BASE[dist_sym] + self.reader.read_bits(DIST_EXTRA[dist_sym])!;
			self.state = COPY_MATCH;

		case COPY_MATCH:
			if (self.match_dist > self.pos) return CORRUPTED_DATA~;
			char c = self.window[(usz)((self.pos - self.match_dist) & 0xFFFF)];
			inflater_write_byte(self, c);
			self.match_len--;
			if (self.match_len == 0)
			{
				self.state = DECODE_SYMBOL;
			}

		case DONE:
			self.done = true;
			self.reader.close();
	}
	return;
}

fn void inflater_write_byte(Inflater* self, char c) @private @inline
{
	self.window[(usz)(self.pos & 0xFFFF)] = c;
	self.pos++;
}

const ushort[31] LENGTH_BASE @private = {
	3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
	35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0
};

const char[31] LENGTH_EXTRA @private = {
	0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
	3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 0, 0
};

const ushort[32] DIST_BASE @private = {
	1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
	257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
	8193, 12289, 16385, 24577, 0, 0
};

const char[32] DIST_EXTRA @private = {
	0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
	7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 0, 0
};

const uint[19] ORDER @private = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };