mirror of
https://github.com/c3lang/c3c.git
synced 2026-02-27 12:01:16 +00:00
stdlib: std::compression::zip and std::compression::deflate (#2930)
* stdlib: implement `std::compression::zip` and `std::compression::deflate` - C3 implementation of DEFLATE (RFC 1951) and ZIP archive handling. - Support for reading and writing archives using STORE and DEFLATE methods. - Decompression supports both fixed and dynamic Huffman blocks. - Compression using greedy LZ77 matching. - Zero dependencies on libc. - Stream-based entry reading and writing. - Full unit test coverage. NOTE: This is an initial implementation. Future improvements could be: - Optimization of the LZ77 matching (lazy matching). - Support for dynamic Huffman blocks in compression. - ZIP64 support for large files/archives. - Support for encryption and additional compression methods. * optimizations+refactoring deflate: - replace linear search with hash-based match finding. - implement support for dynamic Huffman blocks using the Package-Merge algorithm. - add streaming decompression. - add buffered StreamBitReader. zip: - add ZIP64 support. - add CP437 and UTF-8 filename encoding detection. - add DOS date/time conversion and timestamp preservation. - add ZipEntryReader for streaming entry reads. - implement ZipArchive.extract and ZipArchive.recover helpers. other: - Add `set_modified_time` to std::io; - Add benchmarks and a few more unit tests. 
* zip: add archive comment support add tests * forgot to rename the benchmark :( * detect utf8 names on weird zips fix method not passed to open_writer * another edge case where directory doesn't end with / * testing utilities - detect encrypted zip - `ZipArchive.open_writer` default to DEFLATE * fix zip64 creation, add tests * fix ZIP header endianness for big-endian compatibility Update ZipLFH, ZipCDH, ZipEOCD, Zip64EOCD, and Zip64Locator structs to use little-endian bitstruct types from std::core::bitorder * fix ZipEntryReader position tracking and seek logic ZIP_METHOD_STORE added a test to track this * add package-merge algorithm attribution Thanks @konimarti * standalone deflate_benchmark.c3 against `miniz` * fix integer overflows, leaks and improve safety * a few safety for 32-bit systems and tests * deflate compress optimization * improve match finding, hash updates, and buffer usage * use ulong for zip offsets * style changes (#18) * style changes * update tests * style changes in `deflate.c3` * fix typo * Allocator first. Some changes to deflate to use `copy_to` * Fix missing conversion on 32 bits. * Fix deflate stream. Formatting. Prefer switch over if-elseif * - Stream functions now use long/ulong rather than isz/usz for seek/available. - `instream.seek` is replaced by `set_cursor` and `cursor`. - `instream.available`, `cursor` etc are long/ulong rather than isz/usz to be correct on 32-bit. * Update to constdef * Fix test --------- Co-authored-by: Book-reader <thevoid@outlook.co.nz> Co-authored-by: Christoffer Lerno <christoffer@aegik.com>
This commit is contained in:
207
test/compression/deflate_benchmark.c3
Normal file
207
test/compression/deflate_benchmark.c3
Normal file
@@ -0,0 +1,207 @@
|
||||
// 1. `gcc -O3 -c dependencies/miniz/miniz.c -o build/miniz.o`
|
||||
// 2. `build/c3c -O3 compile-run test/compression/deflate_benchmark.c3 build/miniz.o`
|
||||
|
||||
module deflate_benchmark;
|
||||
import std, std::time::clock;
|
||||
|
||||
const int AMOUNT_OF_WORK = 10; // Increase this to scale test data sizes
|
||||
|
||||
// Entry point of the benchmark.
//
// Builds a series of synthetic and file-based data sets, runs every one of
// them through `run_bench` (C3 deflate vs. miniz) and finally prints the
// average throughput of both compressors.
//
// args: an optional args[1] names an extra file to benchmark.
// Returns 0 in all cases.
fn int main(String[] args)
{
	io::printf("\n%s%s DEFLATE BENCHMARK %s", Ansi.BOLD, Ansi.BG_CYAN, Ansi.RESET);
	io::printfn(" Comparing C3 std::compression::deflate with miniz (in-process)\n");
	io::printfn(" Work Scale: %dx\n", AMOUNT_OF_WORK);

	io::printfn("%-26s | %7s | %7s | %7s | %7s | %-10s", "Test Case", "C3 Rat.", "Miz Rat.", "C3 MB/s", "Miz MB/s", "Winner");
	io::printfn("---------------------------+---------+---------+---------+---------+-----------");

	// Test 1: Redundant data
	usz redundant_size = 10_000_000 * (usz)AMOUNT_OF_WORK;
	char[] redundant = allocator::alloc_array(tmem, char, redundant_size);
	mem::set(redundant.ptr, 'A', redundant_size);
	run_bench(string::tformat("Redundant (%dMB 'A')", (int)(redundant_size / 1_000_000)), redundant);

	// Test 2: Large Source Project (All .c files in src/compiler)
	// Paths are relative to the working directory; when the directory cannot
	// be listed the data set stays empty.
	DString project_src;
	Path src_dir = path::new(tmem, "src/compiler")!!;
	PathList? compiler_files = path::ls(tmem, src_dir);
	if (try files = compiler_files) {
		for (int i = 0; i < AMOUNT_OF_WORK; i++) {
			foreach (p : files) {
				if (p.basename().ends_with(".c")) {
					Path full_p = src_dir.tappend(p.str_view())!!;
					if (try data = file::load_path(tmem, full_p)) {
						project_src.append(data);
					}
				}
			}
		}
	}
	run_bench("Compiler Source (Bulk)", project_src.str_view());

	// Test 3: Standard Library (All .c3 files in lib/std)
	DString std_src;
	for (int i = 0; i < AMOUNT_OF_WORK; i++) {
		collect_files(path::new(tmem, "lib/std")!!, ".c3", &std_src);
	}
	run_bench("Stdlib Source (Bulk)", std_src.str_view());

	// Test 4: Log Files (Simulated)
	DString log_data;
	for (int i = 0; i < 50_000 * AMOUNT_OF_WORK; i++) {
		log_data.appendf("2024-02-13 21:30:%02d.%03d [INFO] Connection established from 192.168.1.%d\n", i % 60, i % 1000, i % 255);
		log_data.appendf("2024-02-13 21:30:%02d.%03d [DEBUG] Buffer size: %d bytes\n", i % 60, i % 1000, (i * 123) % 4096);
	}
	run_bench("Log Files (Simulated)", log_data.str_view());

	// Test 5: Web Content (Simulated HTML/CSS)
	DString web_data;
	web_data.append("<!DOCTYPE html><html><head><style>.item { color: red; margin: 10px; }</style></head><body>");
	for (int i = 0; i < 1000 * AMOUNT_OF_WORK; i++) {
		web_data.appendf("<div class='item' id='obj%d'>", i);
		web_data.append("<h1>Title of the item</h1><p>This is some repetitive descriptive text that might appear on a web page.</p>");
		web_data.append("<ul><li>Feature 1</li><li>Feature 2</li><li>Feature 3</li></ul></div>");
	}
	web_data.append("</body></html>");
	run_bench("Web Content (Simulated)", web_data.str_view());

	// Test 6: CSV Data (Simulated)
	DString csv_data;
	csv_data.append("id,name,value1,value2,status,category\n");
	for (int i = 0; i < 20_000 * AMOUNT_OF_WORK; i++) {
		csv_data.appendf("%d,Product_%d,%d.5,%d,\"%s\",\"%s\"\n",
			i, i % 100, i * 10, i % 500,
			i % 3 == 0 ? "Active" : "Inactive",
			i % 5 == 0 ? "Electronics" : "Home");
	}
	run_bench("CSV Data (Simulated)", csv_data.str_view());

	// Test 7: Binary Data (Structured)
	// The size is a multiple of 4, so every 4-byte store stays in bounds.
	usz binary_size = 2_000_000 * (usz)AMOUNT_OF_WORK;
	char[] binary = allocator::alloc_array(tmem, char, binary_size);
	for (usz i = 0; i < binary.len; i += 4) {
		uint val = (uint)i ^ 0xDEADBEEF;
		mem::copy(binary.ptr + i, &val, 4);
	}
	run_bench("Binary Data (Structured)", binary);

	// Test 8: Random Noise (random lowercase letters, 1 MB per work unit)
	usz noise_size = 1_000_000 * (usz)AMOUNT_OF_WORK;
	DString noise;
	for (usz i = 0; i < noise_size; i++) {
		noise.append((char)rand('z' - 'a' + 1) + 'a');
	}
	run_bench("Random Noise (Scaled)", noise.str_view());

	// Test 9: Tiny File (Check overhead)
	run_bench("Tiny File (asd.c3)", "module asd; fn void main() {}\n");

	// Test 10: Natural Language (Repetitive)
	String text = "The quick brown fox jumps over the lazy dog. ";
	DString long_text;
	for (int i = 0; i < 50_000 * AMOUNT_OF_WORK; i++) long_text.append(text);
	run_bench("Natural Text (Scaled)", long_text.str_view());

	// Optional: benchmark a user supplied file.
	if (args.len > 1) {
		Path custom_p = path::new(tmem, args[1])!!;
		if (try custom_data = file::load_path(tmem, custom_p)) {
			run_bench(string::tformat("Custom: %s", custom_p.basename()), custom_data);
		} else {
			// Tell the user instead of silently dropping the requested file.
			io::printfn("Could not load custom file: %s", args[1]);
		}
	}

	// Final Summary
	if (totals.count == 0) {
		io::printfn("\nNo benchmarks were recorded.");
		return 0;
	}
	double avg_c3 = totals.c3_speed_sum / totals.count;
	double avg_miniz = totals.miniz_speed_sum / totals.count;
	double total_factor = avg_c3 / avg_miniz;

	io::printfn("\n%sOVERALL SUMMARY%s", Ansi.BOLD, Ansi.RESET);
	io::printfn(" Average Throughput C3: %8.1f MB/s", avg_c3);
	io::printfn(" Average Throughput Miniz: %8.1f MB/s", avg_miniz);
	// Name the actual winner (same rule as the per-row output of run_bench)
	// rather than always claiming that C3 is the faster one.
	io::printfn(" %s%s is %.1fx faster on average!%s\n",
		Ansi.BOLD,
		total_factor > 1.0 ? "C3" : "Miniz",
		total_factor > 1.0 ? total_factor : 1.0 / total_factor,
		Ansi.RESET);

	return 0;
}
|
||||
|
||||
// Measurements for a single compressor run, as filled in by `calculate_metrics`.
struct BenchResult
{
	// Elapsed time of the compression call, in nanoseconds.
	long time_ns;
	// Length of the compressed output, in bytes.
	usz size;
	// Compressed length as a percentage of the original length.
	double ratio;
	// Input bytes processed per second, divided by 1024 * 1024.
	double throughput_mbs;
}
|
||||
|
||||
// Running sums over all benchmark rows; used for the final averages.
struct BenchTotal
{
	// Sum of the C3 throughput values of every recorded row.
	double c3_speed_sum;
	// Sum of the miniz throughput values of every recorded row.
	double miniz_speed_sum;
	// Number of rows that were added to the sums.
	int count;
}

// Global accumulator: updated by `run_bench`, read by `main` for the summary.
BenchTotal totals;
|
||||
|
||||
// Compresses `data` once with C3's deflate and once with miniz, prints one
// table row comparing ratio and throughput, and adds both throughputs to the
// global `totals`.
//
// title: row label; only the first 26 characters are printed.
// data:  uncompressed input handed to both compressors.
//
// Rows that cannot produce a meaningful measurement (empty input, miniz
// returning NULL) are reported as skipped and are NOT added to `totals`.
fn void run_bench(String title, char[] data)
{
	String label = title[:(min(title.len, 26))];

	// An empty input (for example a source directory that could not be read)
	// has no defined ratio or throughput and would feed NaN into the totals.
	if (data.len == 0) {
		io::printfn("%-26s | skipped (no input data)", label);
		return;
	}

	// C3 Bench
	Clock start = clock::now();
	char[] c3_compressed = deflate::compress(data, tmem)!!;
	Clock end = clock::now();
	BenchResult c3 = calculate_metrics(data.len, (long)(end - start), c3_compressed.len);

	// Miniz Bench
	usz miniz_size;
	start = clock::now();
	void* miniz_ptr = tdefl_compress_mem_to_heap(data.ptr, data.len, &miniz_size, MINIZ_FLAGS);
	end = clock::now();

	// A NULL result means miniz produced no output; do not report it as a
	// valid (0 byte) compression result.
	if (!miniz_ptr) {
		io::printfn("%-26s | skipped (miniz failed to compress the input)", label);
		return;
	}
	BenchResult miniz = calculate_metrics(data.len, (long)(end - start), miniz_size);
	mz_free(miniz_ptr);

	// Performance Delta: > 1.0 means C3 was faster than miniz.
	double speed_factor = c3.throughput_mbs / miniz.throughput_mbs;

	io::printf("%-26s | %6.2f%% | %6.2f%% | %7.1f | %7.1f | %s%s (%.1fx)%s\n",
		label,
		c3.ratio, miniz.ratio,
		c3.throughput_mbs, miniz.throughput_mbs,
		speed_factor > 1.0 ? Ansi.CYAN : Ansi.WHITE,
		speed_factor > 1.0 ? "C3" : "Miniz",
		// Always print the factor from the winner's point of view.
		speed_factor > 1.0 ? speed_factor : 1.0 / speed_factor,
		Ansi.RESET);

	totals.c3_speed_sum += c3.throughput_mbs;
	totals.miniz_speed_sum += miniz.throughput_mbs;
	totals.count++;
}
|
||||
|
||||
// Recursively walks `dir` and appends the contents of every file whose name
// ends with `suffix` to `out`.
//
// dir:    directory to scan; if it cannot be listed the call does nothing.
// suffix: file name suffix to match, e.g. ".c3".
// out:    destination buffer; files that fail to load are skipped.
fn void collect_files(Path dir, String suffix, DString* out)
{
	PathList? items = path::ls(tmem, dir);
	if (catch items) return;

	foreach (entry : items) {
		Path full = dir.tappend(entry.str_view())!!;
		String base = entry.basename();

		if (path::is_dir(full)) {
			// Never descend into the self / parent entries.
			if (base == "." || base == "..") continue;
			collect_files(full, suffix, out);
			continue;
		}

		if (!base.ends_with(suffix)) continue;
		if (try data = file::load_path(tmem, full)) out.append(data);
	}
}
|
||||
|
||||
// Derives the compression ratio and throughput of a single compressor run.
//
// original_len:   length of the uncompressed input in bytes.
// time_ns:        elapsed time of the compression call in nanoseconds.
// compressed_len: length of the compressed output in bytes.
//
// Returns a BenchResult where
//   ratio          = compressed_len / original_len * 100 (percent), and
//   throughput_mbs = input bytes / (1024 * 1024) per second.
// A value that cannot be computed (empty input, or a non-positive elapsed
// time) is left at 0.0 instead of becoming NaN or infinity.
fn BenchResult calculate_metrics(usz original_len, long time_ns, usz compressed_len)
{
	BenchResult res;
	res.time_ns = time_ns;
	res.size = compressed_len;

	// Avoid 0 / 0 for an empty input.
	if (original_len > 0) {
		res.ratio = (double)compressed_len / (double)original_len * 100.0;
	}

	// A zero (or negative) interval cannot be turned into a rate.
	if (time_ns > 0) {
		res.throughput_mbs = (double)original_len / (1024.0 * 1024.0) / ((double)time_ns / 1_000_000_000.0);
	}
	return res;
}
|
||||
|
||||
// External Miniz bindings
|
||||
// Compresses `src_buf_len` bytes at `pSrc_buf` and stores the output length in
// `*pOut_len`. The caller checks the returned pointer for NULL and releases it
// with `mz_free`.
extern fn void* tdefl_compress_mem_to_heap(void* pSrc_buf, usz src_buf_len, usz* pOut_len, int flags);

// Releases a buffer returned by `tdefl_compress_mem_to_heap`.
extern fn void mz_free(void* p);
|
||||
|
||||
// C3 uses MAX_CHAIN = 16 as default (this should be exposed)
const int C3_EQUIVALENT_PROBES = 16;

const int TDEFL_GREEDY_PARSING_FLAG = 0x4000;

// Fastest init for miniz for a fair comparison
const int TDEFL_NONDETERMINISTIC_PARSING_FLAG = 0x8000;

// Flags handed to miniz by the benchmark: probe count plus both parsing flags.
const int MINIZ_FLAGS = C3_EQUIVALENT_PROBES
	| TDEFL_GREEDY_PARSING_FLAG
	| TDEFL_NONDETERMINISTIC_PARSING_FLAG;
|
||||
194
test/compression/zip_compare_7z.c3
Normal file
194
test/compression/zip_compare_7z.c3
Normal file
@@ -0,0 +1,194 @@
|
||||
<*
|
||||
Compare `C3 zip` vs `7z` extraction
|
||||
External dependencies: 7z, diff
|
||||
*>
|
||||
module verify_zip;
|
||||
import std;
|
||||
import libc;
|
||||
|
||||
// Command line entry point.
//
// Usage: <prog> [-r|--recursive] [-o|--output <dir>] <zip_dir>
//
// Parses the options, then hands the zip directory to `process_dir`.
// Returns 1 on any usage error, otherwise the result of `process_dir`.
fn int main(String[] args)
{
	if (args.len < 2)
	{
		io::printfn("Usage: %s [-r|--recursive] [-o|--output <dir>] <zip_dir>", args[0]);
		return 1;
	}

	String zip_dir;
	String output_dir;
	bool recursive = false;

	// `next` always indexes the first argument that has not been consumed yet.
	int next = 1;
	while (next < args.len)
	{
		String arg = args[next++];
		switch (arg)
		{
			case "-r":
			case "--recursive":
				recursive = true;
			case "-o":
			case "--output":
				// The option needs a value: the following argument.
				if (next >= args.len)
				{
					io::printfn("Error: %s requires a directory path", arg);
					return 1;
				}
				output_dir = args[next++];
			default:
				if (arg.starts_with("-"))
				{
					io::printfn("Error: unknown option %s", arg);
					return 1;
				}
				// Only a single positional argument is accepted.
				if (zip_dir)
				{
					io::printfn("Error: multiple zip directories specified ('%s' and '%s')", zip_dir, arg);
					return 1;
				}
				zip_dir = arg;
		}
	}

	if (!zip_dir)
	{
		io::printfn("Error: no zip directory specified.");
		return 1;
	}

	return process_dir(zip_dir, recursive, output_dir);
}
|
||||
|
||||
// Verifies every "*.zip" file found in `dir` with `verify_one`.
//
// dir:        directory to scan.
// recursive:  when true, sub-directories are processed as well.
// output_dir: passed through to `verify_one` unchanged.
//
// Returns 1 if the directory cannot be listed or as soon as one archive
// fails verification, otherwise 0.
fn int process_dir(String dir, bool recursive, String output_dir)
{
	PathList? listing = path::ls(tmem, path::temp(dir)!!);
	if (catch excuse = listing)
	{
		io::printfn("Could not open directory: %s (Excuse: %s)", dir, excuse);
		return 1;
	}

	foreach (entry : listing)
	{
		String name = entry.basename();
		if (name == "." || name == "..") continue;

		String zip_path = path::temp(dir)!!.tappend(name)!!.str_view();

		if (file::is_dir(zip_path))
		{
			// Stop at the first failure reported from a sub-directory.
			if (recursive && process_dir(zip_path, recursive, output_dir) != 0) return 1;
			continue;
		}

		if (!name.ends_with(".zip")) continue;

		// Size is informational only; it stays 0 when the file cannot be opened.
		ulong size = 0;
		if (try fh = file::open(zip_path, "rb"))
		{
			(void)fh.seek(0, Seek.END);
			size = fh.seek(0, Seek.CURSOR) ?? 0;
			fh.close()!!;
		}
		io::printf("Verifying %-40s [%7d KB] ", name[:(min(name.len, 40))], size / 1024);

		// verify_one: 0 = failed, 1 = passed, anything else = skipped.
		int verdict = verify_one(zip_path, output_dir);
		switch (verdict)
		{
			case 0:
				io::printfn("%sFAILED%s ❌", Ansi.RED, Ansi.RESET);
				return 1;
			case 1:
				io::printfn("%sPASSED%s ✅", Ansi.GREEN, Ansi.RESET);
			default:
				io::printn();
		}
	}

	return 0;
}
|
||||
|
||||
// Extracts one archive with the C3 zip implementation and with `7z`, then
// compares both results with `diff -r`.
//
// zip_path:   archive to verify.
// output_dir: root for the two scratch directories "<zip name>_c3" and
//             "<zip name>_7z"; when empty the system temp directory is used.
//
// Returns 0 on failure (scratch directories are kept for inspection),
// 1 on success and 2 when the archive was skipped because it is encrypted
// (scratch directories are removed in both of these cases).
fn int verify_one(String zip_path, String output_dir)
{
	Path extract_root;
	if (output_dir)
	{
		extract_root = path::temp(output_dir)!!;
	}
	else
	{
		extract_root = path::temp_directory(tmem)!!;
	}

	String name = (String)path::temp(zip_path)!!.basename();

	Path temp_c3 = extract_root.tappend(name.tconcat("_c3"))!!;
	Path temp_7z = extract_root.tappend(name.tconcat("_7z"))!!;

	// Directories kept by an earlier failed run must not leak stale files
	// into this comparison, so start from empty scratch directories.
	(void)path::rmtree(temp_c3);
	(void)path::rmtree(temp_7z);
	(void)path::mkdir(temp_c3, true);
	(void)path::mkdir(temp_7z, true);

	ZipArchive? archive = zip::open(zip_path, "r");
	if (catch excuse = archive)
	{
		io::printfn("%sFAIL%s (open: %s)", Ansi.RED, Ansi.RESET, excuse);
		return 0;
	}
	defer (void)archive.close();

	Time start = time::now();
	if (catch excuse = archive.extract(temp_c3.str_view()))
	{
		if (excuse == zip::ENCRYPTED_FILE)
		{
			io::printf("%sSKIPPED%s (Encrypted)", Ansi.YELLOW, Ansi.RESET);
			// A skip is not an error: nothing is worth keeping for inspection.
			(void)path::rmtree(temp_c3);
			(void)path::rmtree(temp_7z);
			return 2;
		}
		io::printfn("%sFAIL%s (extract: %s)", Ansi.RED, Ansi.RESET, excuse);
		return 0;
	}
	Duration c3_time = time::now() - start;

	start = time::now();
	if (!extract_7z(zip_path, temp_7z.str_view()))
	{
		io::printfn("%sFAIL%s (7z extract)", Ansi.RED, Ansi.RESET);
		return 0;
	}
	Duration p7_time = time::now() - start;

	io::printf(" [C3: %5d ms, 7z: %5d ms]", (long)c3_time / 1000, (long)p7_time / 1000);

	io::print(" Comparing... ");
	if (!compare_dirs(temp_c3.str_view(), temp_7z.str_view()))
	{
		io::printfn("%sFAIL%s (Differences found)", Ansi.RED, Ansi.RESET);
		return 0;
	}

	// keep files on error for manual verification
	(void)path::rmtree(temp_c3);
	(void)path::rmtree(temp_7z);

	return 1;
}
|
||||
|
||||
// Extracts `zip_path` into `output_dir` by running the external `7z` tool
// ("7z x <zip> -o<dir> -y -bb0"), looked up on the user's PATH.
//
// Returns true only if the process could be started, could be waited for and
// exited with status 0.
fn bool extract_7z(String zip_path, String output_dir)
{
	String out_opt = "-o".tconcat(output_dir);
	String[] cmd = { "7z", "x", zip_path, out_opt, "-y", "-bb0" };

	SubProcess? proc = process::create(cmd, { .search_user_path = true });
	if (catch proc) return false;
	// Release the handles owned by the subprocess on every exit path.
	defer proc.destroy();

	// A failed wait is reported as a failed extraction instead of a panic.
	CInt? status = proc.join();
	if (catch status) return false;
	return (int)status == 0;
}
|
||||
|
||||
// Compares two directory trees by running the external "diff -r dir1 dir2",
// looked up on the user's PATH. The child inherits stdio, so any differences
// it reports are printed directly.
//
// Returns true only if the process could be started, could be waited for and
// exited with status 0.
fn bool compare_dirs(String dir1, String dir2)
{
	String[] cmd = { "diff", "-r", dir1, dir2 };

	SubProcess? proc = process::create(cmd, { .search_user_path = true, .inherit_stdio = true });
	if (catch proc) return false;
	// Release the handles owned by the subprocess on every exit path.
	defer proc.destroy();

	// A failed wait is reported as "not equal" instead of a panic.
	CInt? status = proc.join();
	if (catch status) return false;
	return (int)status == 0;
}
|
||||
Reference in New Issue
Block a user