From df030ac51c05045af88f95f9b453d14c835c925a Mon Sep 17 00:00:00 2001 From: Manuel Barrio Linares Date: Sun, 15 Feb 2026 22:10:53 -0300 Subject: [PATCH] optimize `blake3` using a runtime for loop instruction count went from 60k to 9k no difference in speed for -O2 or higher --- benchmarks/stdlib/hash/blake3.c3 | 57 ++++++++++++++++++++++++++++++++ lib/std/hash/blake3.c3 | 12 +++---- 2 files changed, 62 insertions(+), 7 deletions(-) create mode 100644 benchmarks/stdlib/hash/blake3.c3 diff --git a/benchmarks/stdlib/hash/blake3.c3 b/benchmarks/stdlib/hash/blake3.c3 new file mode 100644 index 000000000..b1bfdd2c6 --- /dev/null +++ b/benchmarks/stdlib/hash/blake3.c3 @@ -0,0 +1,57 @@ +module blake3_bench; + +fn void initialize_bench() @init +{ + set_benchmark_warmup_iterations(3); + set_benchmark_max_iterations(128); + + input = mem::alloc_array(char, BUFSZ); + input[:BUFSZ] = (char[]){ [0..BUFSZ-1] = 0xA5 }[..]; + input_slice = input[:BUFSZ]; +} + +fn void teardown_bench() @finalizer +{ + mem::free(input); + input = null; +} + +char* input; +char[] input_slice; +const usz BUFSZ = 1024 * 1024; + +module blake3_bench @benchmark; + +import std::hash; + +fn void blake3_hash() +{ + runtime::@start_benchmark(); + char[*] myset = blake3::hash(input_slice); + runtime::@end_benchmark(); + mem::zero_volatile(myset[..]); +} + +fn void compared_with_sha256() +{ + runtime::@start_benchmark(); + char[*] myset = sha256::hash(input_slice); + runtime::@end_benchmark(); + mem::zero_volatile(myset[..]); +} + +fn void compared_with_sha512() +{ + runtime::@start_benchmark(); + char[*] myset = sha512::hash(input_slice); + runtime::@end_benchmark(); + mem::zero_volatile(myset[..]); +} + +fn void compared_with_whirlpool() +{ + runtime::@start_benchmark(); + char[*] myset = whirlpool::hash(input_slice); + runtime::@end_benchmark(); + mem::zero_volatile(myset[..]); +} diff --git a/lib/std/hash/blake3.c3 b/lib/std/hash/blake3.c3 index 29946680e..3c3986ccd 100644 --- a/lib/std/hash/blake3.c3 +++ b/lib/std/hash/blake3.c3 @@ -606,13 +606,11 @@ fn void compress_pre(uint[] state, uint[] cv, char[BLOCK_SIZE] block, usz block_ state[13] = (uint)(counter >> 32); state[14] = (uint)block_len; state[15] = (uint)flags; - @round(state, &block_words[0], 0); - @round(state, &block_words[0], 1); - @round(state, &block_words[0], 2); - @round(state, &block_words[0], 3); - @round(state, &block_words[0], 4); - @round(state, &block_words[0], 5); - @round(state, &block_words[0], 6); + + for (int i = 0; i < 7; i++) + { + @round(state, &block_words[0], (usz)i); + } } macro compress_in_place(uint[] cv, char[BLOCK_SIZE] block, usz block_len, ulong counter, char flags) @local