Optimize blake3: replace the seven unrolled @round calls with a runtime for loop

Instruction count went from 60k to 9k.
No difference in speed at -O2 or higher.
This commit is contained in:
Manuel Barrio Linares
2026-02-15 22:10:53 -03:00
committed by Christoffer Lerno
parent 4b03a84b00
commit df030ac51c
2 changed files with 62 additions and 7 deletions

View File

@@ -606,13 +606,11 @@ fn void compress_pre(uint[] state, uint[] cv, char[BLOCK_SIZE] block, usz block_
state[13] = (uint)(counter >> 32);
state[14] = (uint)block_len;
state[15] = (uint)flags;
@round(state, &block_words[0], 0);
@round(state, &block_words[0], 1);
@round(state, &block_words[0], 2);
@round(state, &block_words[0], 3);
@round(state, &block_words[0], 4);
@round(state, &block_words[0], 5);
@round(state, &block_words[0], 6);
for (int i = 0; i < 7; i++)
{
@round(state, &block_words[0], (usz)i);
}
}
macro compress_in_place(uint[] cv, char[BLOCK_SIZE] block, usz block_len, ulong counter, char flags) @local