stdlib: std::compression::zip and std::compression::deflate (#2930)

* stdlib: implement `std::compression::zip` and `std::compression::deflate`

- C3 implementation of DEFLATE (RFC 1951) and ZIP archive handling.
- Support for reading and writing archives using STORE and DEFLATE
methods.
- Decompression supports both fixed and dynamic Huffman blocks.
- Compression using greedy LZ77 matching.
- Zero dependencies on libc.
- Stream-based entry reading and writing.
- Full unit test coverage.

NOTE: This is an initial implementation. Future improvements could be:

- Optimization of the LZ77 matching (lazy matching).
- Support for dynamic Huffman blocks in compression.
- ZIP64 support for large files/archives.
- Support for encryption and additional compression methods.

* optimizations+refactoring

deflate:
- replace linear search with hash-based match finding.
- implement support for dynamic Huffman blocks using the Package-Merge
algorithm.
- add streaming decompression.
- add buffered StreamBitReader.

zip:
- add ZIP64 support.
- add CP437 and UTF-8 filename encoding detection.
- add DOS date/time conversion and timestamp preservation.
- add ZipEntryReader for streaming entry reads.
- implement ZipArchive.extract and ZipArchive.recover helpers.

other:
- Add `set_modified_time` to std::io.
- Add benchmarks and a few more unit tests.

* zip: add archive comment support

add tests

* forgot to rename the benchmark :(

* detect utf8 names on weird zips

fix method not passed to open_writer

* another edge case where directory doesn't end with /

* testing utilities

- detect encrypted zip
- `ZipArchive.open_writer` default to DEFLATE

* fix zip64 creation, add tests

* fix ZIP header endianness for big-endian compatibility

Update ZipLFH, ZipCDH, ZipEOCD, Zip64EOCD, and Zip64Locator structs to
use little-endian bitstruct types from std::core::bitorder

* fix ZipEntryReader position tracking and seek logic for ZIP_METHOD_STORE

added a test to track this

* add package-merge algorithm attribution

Thanks @konimarti

* standalone deflate_benchmark.c3 against `miniz`

* fix integer overflows, leaks and improve safety

* a few safety fixes for 32-bit systems and tests

* deflate compress optimization

* improve match finding, hash updates, and buffer usage

* use ulong for zip offsets

* style changes (#18)

* style changes

* update tests

* style changes in `deflate.c3`

* fix typo

* Allocator first. Some changes to deflate to use `copy_to`

* Fix missing conversion on 32 bits.

* Fix deflate stream. Formatting. Prefer switch over if-elseif

* - Stream functions now use long/ulong rather than isz/usz for seek/available.
- `instream.seek` is replaced by `set_cursor` and `cursor`.
- `instream.available`, `cursor` etc are long/ulong rather than isz/usz to be correct on 32-bit.

* Update to constdef

* Fix test

---------

Co-authored-by: Book-reader <thevoid@outlook.co.nz>
Co-authored-by: Christoffer Lerno <christoffer@aegik.com>
This commit is contained in:
Manu Linares
2026-02-20 16:41:34 -03:00
committed by GitHub
parent 5055e86518
commit eae7d0c4a1
26 changed files with 3745 additions and 96 deletions

File diff suppressed because it is too large Load Diff

1215
lib/std/compression/zip.c3 Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -658,9 +658,10 @@ fn usz? DString.read_from_stream(&self, InStream reader)
if (&reader.available)
{
usz total_read = 0;
while (usz available = reader.available()!)
while (ulong available = reader.available()!)
{
self.reserve(available);
if (available > isz.max) available = (ulong)isz.max;
self.reserve((usz)available);
StringData* data = self.data();
usz len = reader.read(data.chars[data.len..(data.capacity - 1)])!;
total_read += len;

View File

@@ -126,6 +126,7 @@ const bool ARCH_64_BIT = $$REGISTER_SIZE == 64;
const bool LIBC = $$COMPILER_LIBC_AVAILABLE;
const bool NO_LIBC = !LIBC && !CUSTOM_LIBC;
const bool CUSTOM_LIBC = $$CUSTOM_LIBC;
const bool OLD_IO = $feature(OLD_IO);
const CompilerOptLevel COMPILER_OPT_LEVEL = CompilerOptLevel.from_ordinal($$COMPILER_OPT_LEVEL);
const bool BIG_ENDIAN = $$PLATFORM_BIG_ENDIAN;
const bool I128_NATIVE_SUPPORT = $$PLATFORM_I128_SUPPORTED;

View File

@@ -125,10 +125,11 @@ fn bool run_benchmarks(BenchmarkUnit[] benchmarks)
char[] perc_str = { [0..19] = ' ', [20] = 0 };
int perc = 0;
uint print_step = current_benchmark_iterations / 100;
if (print_step == 0) print_step = 1;
for (this_iteration = 0; this_iteration < current_benchmark_iterations; ++this_iteration, benchmark_nano_seconds = {})
{
if (0 == this_iteration % print_step) // only print right about when the % will update
if (this_iteration % print_step == 0) // only print right about when the % will update
{
perc_str[0..(uint)math::floor((this_iteration / (float)current_benchmark_iterations) * 20)] = '#';
perc = (uint)math::ceil(100 * (this_iteration / (float)current_benchmark_iterations));

View File

@@ -142,7 +142,7 @@ fn void mute_output() @local
File* stderr = io::stderr();
*stderr = test_context.fake_stdout;
*stdout = test_context.fake_stdout;
(void)test_context.fake_stdout.seek(0, Seek.SET)!!;
(void)test_context.fake_stdout.set_cursor(0)!!;
}
fn void unmute_output(bool has_error) @local
@@ -155,7 +155,7 @@ fn void unmute_output(bool has_error) @local
*stderr = test_context.stored.stderr;
*stdout = test_context.stored.stdout;
usz log_size = test_context.fake_stdout.seek(0, Seek.CURSOR)!!;
ulong log_size = test_context.fake_stdout.cursor()!!;
if (has_error)
{
io::printn(test_context.has_ansi_codes ? "[\e[0;31mFAIL\e[0m]" : "[FAIL]");
@@ -165,7 +165,7 @@ fn void unmute_output(bool has_error) @local
{
test_context.fake_stdout.write_byte('\n')!!;
test_context.fake_stdout.write_byte('\0')!!;
(void)test_context.fake_stdout.seek(0, Seek.SET)!!;
test_context.fake_stdout.set_cursor(0)!!;
io::printfn("\n========== TEST LOG ============");
io::printfn("%s\n", test_context.current_test_name);

View File

@@ -39,11 +39,16 @@ fn bool is_dir(String path)
return os::native_is_dir(path);
}
fn usz? get_size(String path)
fn ulong? get_size(String path)
{
return os::native_file_size(path);
}
// Sets the modification time of the file at `path` to `time`.
// Pure delegation to the platform layer (`os::native_set_modified_time`);
// whatever fault that call reports is returned to the caller unchanged.
fn void? set_modified_time(String path, Time_t time)
{
return os::native_set_modified_time(path, time);
}
fn void? delete(String filename)
{
return os::native_remove(filename) @inline;
@@ -63,10 +68,25 @@ fn void? File.reopen(&self, String filename, String mode)
*>
fn usz? File.seek(&self, isz offset, Seek seek_mode = Seek.SET) @dynamic
{
os::native_fseek(self.file, offset, seek_mode)!;
return os::native_ftell(self.file);
os::native_fseek(self.file, offset, (SeekOrigin)seek_mode.ordinal)!;
return (usz)os::native_ftell(self.file);
}
// Moves the file position to `offset`, interpreted relative to `whence`
// (the start of the file when `whence` is omitted).
// Thin wrapper over `os::native_fseek`; its fault, if any, is returned as-is.
<*
@require self.file != null
*>
fn void? File.set_cursor(&self, long offset, SeekOrigin whence = FROM_START) @dynamic
{
return os::native_fseek(self.file, offset, whence);
}
// Reports the current file position as returned by `os::native_ftell`.
// Any fault from the native call is returned to the caller unchanged.
<*
@require self.file != null
*>
fn long? File.cursor(&self) @dynamic
{
return os::native_ftell(self.file);
}
/*
Implement later
@@ -118,6 +138,14 @@ fn void? File.close(&self) @inline @dynamic
self.file = null;
}
<*
 Returns the size of the file, measured as the cursor position at the end
 of the file. The cursor is put back where it was before the call.

 @require self.file != null
*>
fn ulong? File.size(&self) @dynamic
{
	// Remember where the caller was so the probe below is not observable.
	long curr = self.cursor()!;
	// Restore on every exit path, including a fault from the calls below.
	// Best effort: a failure to restore is deliberately ignored.
	defer (void)self.set_cursor(curr);
	// Jump to the end; the resulting cursor value is the size.
	self.set_cursor(0, FROM_END)!;
	return self.cursor()!;
}
<*
@require self.file != null
*>
@@ -171,9 +199,8 @@ fn char[]? load_buffer(String filename, char[] buffer)
{
File file = open(filename, "rb")!;
defer (void)file.close();
usz len = file.seek(0, END)!;
long len = file.size()!;
if (len > buffer.len) return io::OVERFLOW~;
file.seek(0, SET)!;
usz read = 0;
while (read < len)
{
@@ -187,16 +214,16 @@ fn char[]? load(Allocator allocator, String filename)
{
File file = open(filename, "rb")!;
defer (void)file.close();
usz len = file.seek(0, END)!;
file.seek(0, SET)!;
char* data = allocator::malloc_try(allocator, len)!;
ulong len = file.size()!;
if (len > usz.max) return io::OUT_OF_SPACE~;
char* data = allocator::malloc_try(allocator, (usz)len)!;
defer catch allocator::free(allocator, data);
usz read = 0;
while (read < len)
while (read < (usz)len)
{
read += file.read(data[read:len - read])!;
read += file.read(data[read:(usz)len - read])!;
}
return data[:len];
return data[:(usz)len];
}
fn char[]? load_path(Allocator allocator, Path path) => load(allocator, path.str_view());

View File

@@ -45,10 +45,9 @@ fn FileMmap? mmap_file(File file, usz offset = 0, usz len = 0, VirtualMemoryAcce
{
if (len == 0)
{
usz cur = file.seek(0, CURSOR)!;
defer file.seek(cur, SET)!!;
usz file_size = file.seek(0, END)!;
len = file_size - offset;
ulong new_len = file.size()! - offset;
if (new_len > (ulong)isz.max) return mem::OUT_OF_MEMORY~;
len = (usz)new_len;
}
// get the page size

View File

@@ -11,6 +11,14 @@ enum Seek
END
}
// Reference point for the offset given to `set_cursor`.
// Declaration order is significant: the libc backend passes `ordinal` directly
// to fseek as its whence argument, and the legacy `seek` wrappers convert a
// `Seek` value to this enum by ordinal.
enum SeekOrigin
{
// Offset is taken from the beginning of the stream.
FROM_START,
// Offset is taken relative to the current cursor position.
FROM_CURSOR,
// Offset is taken relative to the end of the stream
// (sign convention is decided by the implementing stream).
FROM_END
}
faultdef
ALREADY_EXISTS,
BUSY,

View File

@@ -49,16 +49,16 @@ fn void*? native_freopen(void* file, String filename, String mode) @inline => @
return file ?: file_open_errno()~;
}
fn void? native_fseek(void* file, isz offset, Seek seek_mode) @inline
fn void? native_fseek(void* file, long offset, SeekOrigin seek_mode) @inline
{
if (libc::fseek(file, (SeekIndex)offset, seek_mode.ordinal)) return file_seek_errno()~;
}
fn usz? native_ftell(CFile file) @inline
fn long? native_ftell(CFile file) @inline
{
long index = libc::ftell(file);
return index >= 0 ? (usz)index : file_seek_errno()~;
return index >= 0 ? index : file_seek_errno()~;
}
fn usz? native_fwrite(CFile file, char[] buffer) @inline
@@ -123,3 +123,22 @@ macro fault file_seek_errno() @local
}
}
// Pair of timestamps handed to `utime` / `_wutime` by native_set_modified_time.
// NOTE(review): layout is presumably meant to mirror the C `struct utimbuf` — confirm per platform.
struct Utimbuf
{
Time_t actime; // access time to apply
Time_t modtime; // modification time to apply
}
// External timestamp setter, declared only on non-Windows targets.
// `times` is untyped here; the caller in this file passes a pointer to a Utimbuf.
extern fn int utime(char* filename, void* times) @if(!env::WIN32);
// Wide-character counterpart, declared only when env::WIN32 is set.
extern fn int _wutime(WChar* filename, void* times) @if(env::WIN32);
// Applies `time` to the file named `filename`.
// Note that BOTH the access time and the modification time are set to `time`,
// since the same value fills both Utimbuf fields.
// The temporary zero-terminated / wide copy of the name comes from the
// @stack_mem(256) scratch allocator `smem`.
// Returns io::GENERAL_ERROR when the underlying call returns non-zero; on
// Windows a failed wide-string conversion is propagated as well.
fn void? native_set_modified_time(String filename, libc::Time_t time) => @stack_mem(256; Allocator smem)
{
Utimbuf times = { time, time };
// Compile-time selection of the platform call; only one branch is compiled in.
$if env::WIN32:
if (_wutime(filename.to_wstring(smem)!, &times)) return io::GENERAL_ERROR~;
$else
if (utime(filename.zstr_copy(smem), &times)) return io::GENERAL_ERROR~;
$endif
}

View File

@@ -4,12 +4,13 @@ import libc;
alias FopenFn = fn void*?(String, String);
alias FreopenFn = fn void*?(void*, String, String);
alias FcloseFn = fn void?(void*);
alias FseekFn = fn void?(void*, isz, Seek);
alias FtellFn = fn usz?(void*);
alias FseekFn = fn void?(void*, long, SeekOrigin);
alias FtellFn = fn long?(void*);
alias FwriteFn = fn usz?(void*, char[] buffer);
alias FreadFn = fn usz?(void*, char[] buffer);
alias RemoveFn = fn void?(String);
alias FputcFn = fn void?(int, void*);
alias SetModifiedTimeFn = fn void?(String, libc::Time_t);
FopenFn native_fopen_fn @weak @if(!$defined(native_fopen_fn));
FcloseFn native_fclose_fn @weak @if(!$defined(native_fclose_fn));
@@ -20,6 +21,7 @@ FwriteFn native_fwrite_fn @weak @if(!$defined(native_fwrite_fn));
FreadFn native_fread_fn @weak @if(!$defined(native_fread_fn));
RemoveFn native_remove_fn @weak @if(!$defined(native_remove_fn));
FputcFn native_fputc_fn @weak @if(!$defined(native_fputc_fn));
SetModifiedTimeFn native_set_modified_time_fn @weak @if(!$defined(native_set_modified_time_fn));
<*
@require mode.len > 0
@@ -52,13 +54,13 @@ fn void*? native_freopen(void* file, String filename, String mode) @inline
return io::UNSUPPORTED_OPERATION~;
}
fn void? native_fseek(void* file, isz offset, Seek seek_mode) @inline
fn void? native_fseek(void* file, long offset, SeekOrigin whence) @inline
{
if (native_fseek_fn) return native_fseek_fn(file, offset, seek_mode);
if (native_fseek_fn) return native_fseek_fn(file, offset, whence);
return io::UNSUPPORTED_OPERATION~;
}
fn usz? native_ftell(CFile file) @inline
fn ulong? native_ftell(CFile file) @inline
{
if (native_ftell_fn) return native_ftell_fn(file);
return io::UNSUPPORTED_OPERATION~;
@@ -81,3 +83,9 @@ fn void? native_fputc(CInt c, CFile stream) @inline
if (native_fputc_fn) return native_fputc_fn(c, stream);
return io::UNSUPPORTED_OPERATION~;
}
// Hook-based variant: forwards to `native_set_modified_time_fn` when that
// function pointer has been provided, and otherwise reports
// io::UNSUPPORTED_OPERATION.
fn void? native_set_modified_time(String filename, libc::Time_t time) @inline
{
if (native_set_modified_time_fn) return native_set_modified_time_fn(filename, time);
return io::UNSUPPORTED_OPERATION~;
}

View File

@@ -47,14 +47,15 @@ fn usz? native_file_size(String path) @if(env::WIN32) => @pool()
return (usz)size.quadPart;
}
fn usz? native_file_size(String path) @if(!env::WIN32 && !env::DARWIN && !env::LINUX && !env::ANDROID && !env::BSD_FAMILY)
fn ulong? native_file_size(String path) @if(!env::WIN32 && !env::DARWIN && !env::LINUX && !env::ANDROID && !env::BSD_FAMILY)
{
File f = file::open(path, "r")!;
defer (void)f.close();
return f.seek(0, Seek.END)!;
f.set_cursor(0, FROM_END)!;
return f.cursor();
}
fn usz? native_file_size(String path) @if(env::DARWIN || env::LINUX || env::ANDROID || env::BSD_FAMILY)
fn ulong? native_file_size(String path) @if(env::DARWIN || env::LINUX || env::ANDROID || env::BSD_FAMILY)
{
Stat stat;
native_stat(&stat, path)!;

View File

@@ -36,7 +36,7 @@ fn Path? cwd(Allocator allocator)
fn bool is_dir(Path path) => os::native_is_dir(path.str_view());
fn bool is_file(Path path) => os::native_is_file(path.str_view());
fn usz? file_size(Path path) => os::native_file_size(path.str_view());
fn ulong? file_size(Path path) => os::native_file_size(path.str_view());
fn bool exists(Path path) => os::native_file_or_dir_exists(path.str_view());
fn Path? tcwd() => cwd(tmem) @inline;

View File

@@ -1,12 +1,20 @@
module std::io;
import std::math;
// Function-pointer shape matching `set_cursor`: receiver, offset, and the
// origin the offset is relative to (start of the stream by default, the same
// default InStream.set_cursor uses).
alias SetCursorFn = fn void?(void*, long offset, SeekOrigin whence = FROM_START);
interface InStream
{
fn void? close() @optional;
fn long? cursor() @optional;
fn void? set_cursor(long offset, SeekOrigin whence = FROM_START) @optional;
fn usz? seek(isz offset, Seek seek) @optional;
fn usz len() @optional;
fn usz? available() @optional;
fn ulong? size() @optional;
fn ulong? available() @optional;
fn usz? read(char[] buffer);
fn char? read_byte();
fn usz? write_to(OutStream out) @optional;
@@ -24,15 +32,23 @@ interface OutStream
fn usz? read_to(InStream in) @optional;
}
fn usz? available(InStream s)
fn ulong? available(InStream s)
{
if (&s.available) return s.available();
if (&s.set_cursor && &s.cursor)
{
long curr = s.cursor()!;
s.set_cursor(0, FROM_END)!;
ulong len = s.cursor()!;
s.set_cursor(curr)!;
return len - curr;
}
if (&s.seek)
{
usz curr = s.seek(0, Seek.CURSOR)!;
usz len = s.seek(0, Seek.END)!;
s.seek(curr, Seek.SET)!;
return len - curr;
return (ulong)len - (ulong)curr;
}
return io::UNSUPPORTED_OPERATION~;
}
@@ -177,6 +193,11 @@ macro usz? write_using_write_byte(s, char[] bytes)
// Steps the stream `s` back by exactly one byte.
// The cursor API is preferred when the stream provides `set_cursor`;
// streams that only offer the legacy `seek` take the fallback branch.
macro void? pushback_using_seek(s)
{
	if (!&s.set_cursor)
	{
		// Legacy path: no cursor API on this stream.
		s.seek(-1, CURSOR)!;
		return;
	}
	s.set_cursor(-1, FROM_CURSOR)!;
}
@@ -407,11 +428,11 @@ macro ulong? read_le_ulong(stream)
{
ulong val = (ulong)stream.read_byte()!;
val += (ulong)stream.read_byte()! << 8;
val += (ulong)stream.read_byte()! << 16;
val += (ulong)stream.read_byte()! << 16;
val += (ulong)stream.read_byte()! << 24;
val += (ulong)stream.read_byte()! << 32;
val += (ulong)stream.read_byte()! << 40;
val += (ulong)stream.read_byte()! << 48;
val += (ulong)stream.read_byte()! << 48;
return val + (ulong)stream.read_byte()! << 56;
}
@@ -621,24 +642,30 @@ macro void? skip(stream, usz bytes)
{
if (!bytes) return;
$switch:
$case !$defined(stream.seek):
for (usz i = 0; i < bytes; i++)
{
stream.read()!;
}
return;
$case $typeof(stream) == InStream:
if (!&stream.seek)
{
for (usz i = 0; i < bytes; i++)
{
stream.read()!;
}
return;
}
if (!&stream.seek && !&stream.set_cursor)
{
for (usz i = 0; i < bytes; i++)
{
stream.read()!;
}
return;
}
if (!&stream.set_cursor)
{
stream.seek(bytes, CURSOR)!;
return;
}
stream.set_cursor(bytes, FROM_CURSOR)!;
$case $defined(stream.set_cursor):
stream.set_cursor(bytes, FROM_CURSOR)!;
$case $defined(stream.seek):
stream.seek(bytes, CURSOR)!;
$default:
stream.seek(bytes, CURSOR)!;
for (usz i = 0; i < bytes; i++)
{
stream.read()!;
}
$endswitch
}

View File

@@ -104,28 +104,37 @@ fn void? ByteBuffer.pushback_byte(&self) @dynamic
self.has_last = false;
}
fn usz? ByteBuffer.seek(&self, isz offset, Seek seek) @dynamic
fn long? ByteBuffer.cursor(&self) @dynamic
{
switch (seek)
{
case SET:
if (offset < 0 || offset > self.write_idx) return INVALID_POSITION~;
self.read_idx = offset;
return offset;
case CURSOR:
if ((offset < 0 && self.read_idx < -offset) ||
(offset > 0 && self.read_idx + offset > self.write_idx)) return INVALID_POSITION~;
self.read_idx += offset;
case END:
if (offset < 0 || offset > self.write_idx) return INVALID_POSITION~;
self.read_idx = self.write_idx - offset;
}
return self.read_idx;
}
fn usz? ByteBuffer.available(&self) @inline @dynamic
fn void? ByteBuffer.set_cursor(&self, long offset, SeekOrigin whence = FROM_START) @dynamic
{
return self.write_idx - self.read_idx;
switch (whence)
{
case FROM_START:
if (offset < 0 || offset > self.write_idx) return INVALID_POSITION~;
self.read_idx = (usz)offset;
case FROM_CURSOR:
if ((offset < 0 && self.read_idx < -offset) ||
(offset > 0 && self.read_idx + offset > self.write_idx)) return INVALID_POSITION~;
self.read_idx += (usz)offset;
case FROM_END:
if (offset < 0 || offset > self.write_idx) return INVALID_POSITION~;
self.read_idx = self.write_idx - (usz)offset;
}
}
// Legacy seek entry point, implemented on top of the cursor API.
// `seek` is converted to the SeekOrigin with the same ordinal, the move is
// performed by set_cursor (its fault is propagated), and the value of
// cursor() is returned, cast to usz.
fn usz? ByteBuffer.seek(&self, isz offset, Seek seek) @dynamic
{
self.set_cursor(offset, (SeekOrigin)seek.ordinal)!;
return (usz)self.cursor();
}
// Returns the distance between the write index and the read index.
// write_idx is widened to ulong before the subtraction, so the result is a ulong.
fn ulong? ByteBuffer.available(&self) @inline @dynamic
{
return (ulong)self.write_idx - self.read_idx;
}
fn void ByteBuffer.grow(&self, usz n)

View File

@@ -41,16 +41,26 @@ fn void? ByteReader.pushback_byte(&self) @dynamic
fn usz? ByteReader.seek(&self, isz offset, Seek seek) @dynamic
{
isz new_index;
switch (seek)
self.set_cursor((long)offset, (SeekOrigin)seek.ordinal)!;
return (usz)self.cursor();
}
// Returns the reader's current index, i.e. the position that set_cursor updates.
fn long? ByteReader.cursor(&self) @dynamic
{
return self.index;
}
fn void? ByteReader.set_cursor(&self, long offset, SeekOrigin whence = FROM_START) @dynamic
{
long new_index;
switch (whence)
{
case SET: new_index = offset;
case CURSOR: new_index = self.index + offset;
case END: new_index = self.bytes.len + offset;
case FROM_START: new_index = offset;
case FROM_CURSOR: new_index = self.index + offset;
case FROM_END: new_index = self.bytes.len + offset;
}
if (new_index < 0) return INVALID_POSITION~;
self.index = new_index;
return new_index;
if (new_index < 0 || new_index > self.bytes.len) return INVALID_POSITION~;
self.index = (usz)new_index;
}
fn usz? ByteReader.write_to(&self, OutStream writer) @dynamic
@@ -62,7 +72,7 @@ fn usz? ByteReader.write_to(&self, OutStream writer) @dynamic
return written;
}
fn usz? ByteReader.available(&self) @inline @dynamic
fn ulong? ByteReader.available(&self) @inline @dynamic
{
return max(0, self.bytes.len - self.index);
}
}

View File

@@ -86,9 +86,10 @@ fn usz? ByteWriter.read_from(&self, InStream reader) @dynamic
usz start_index = self.index;
if (&reader.available)
{
while (usz available = reader.available()!)
while (ulong available = reader.available()!)
{
self.ensure_capacity(self.index + available)!;
if (available > usz.max) return OUT_OF_SPACE~;
self.ensure_capacity(self.index + (usz)available)!;
usz read = reader.read(self.bytes[self.index..])!;
self.index += read;
}

View File

@@ -38,7 +38,7 @@ fn char? LimitReader.read_byte(&self) @dynamic
return self.wrapped_stream.read_byte();
}
fn usz? LimitReader.available(&self) @inline @dynamic
fn ulong? LimitReader.available(&self) @inline @dynamic
{
return self.limit;
}

View File

@@ -186,7 +186,7 @@ fn ulong? elf_module_image_base(String path) @local
bool is_little_endian = file.read_byte()! == 1;
// Actually, not supported.
if (!is_little_endian) return backtrace::IMAGE_NOT_FOUND~;
file.seek(0)!;
file.set_cursor(0)!;
if (is_64)
{
Elf64_Ehdr file_header;
@@ -195,7 +195,7 @@ fn ulong? elf_module_image_base(String path) @local
for (isz i = 0; i < file_header.e_phnum; i++)
{
Elf64_Phdr header;
file.seek((usz)file_header.e_phoff + (usz)file_header.e_phentsize * i)!;
file.set_cursor(file_header.e_phoff + (long)file_header.e_phentsize * i)!;
io::read_any(&file, &header)!;
if (header.p_type == PT_PHDR) return header.p_vaddr - header.p_offset;
}
@@ -207,7 +207,7 @@ fn ulong? elf_module_image_base(String path) @local
for (isz i = 0; i < file_header.e_phnum; i++)
{
Elf32_Phdr header;
file.seek(file_header.e_phoff + (usz)file_header.e_phentsize * i)!;
file.set_cursor(file_header.e_phoff + (long)file_header.e_phentsize * i)!;
io::read_any(&file, &header)!;
if (header.p_type == PT_PHDR) return (ulong)header.p_vaddr - header.p_offset;
}