From 97a9cab218ea7dfd7df5a0aaeb69fed6f0855550 Mon Sep 17 00:00:00 2001 From: Zack Puhl Date: Wed, 17 Dec 2025 22:57:34 +0000 Subject: [PATCH] Fix ChaCha20 Alignment Issues --- benchmarks/stdlib/crypto/chacha20.c3 | 18 ++++++++- lib/std/crypto/chacha20.c3 | 37 +++++++------------ src/compiler/sema_expr.c | 2 +- .../vector/vector_pointer_errors.c3 | 2 +- test/unit/stdlib/crypto/chacha20.c3 | 31 +++++++++------- 5 files changed, 48 insertions(+), 42 deletions(-) diff --git a/benchmarks/stdlib/crypto/chacha20.c3 b/benchmarks/stdlib/crypto/chacha20.c3 index ae4a77e3b..ea9003a44 100644 --- a/benchmarks/stdlib/crypto/chacha20.c3 +++ b/benchmarks/stdlib/crypto/chacha20.c3 @@ -2,6 +2,7 @@ // Use of this source code is governed by the MIT license // a copy of which can be found in the LICENSE_STDLIB file. module chacha20_benchmarks; + import std::crypto::chacha20; @@ -15,11 +16,24 @@ const char[] KEY = x'98bef1469be7269837a45bfbc92a5a6ac762507cf96443bf33b96b1bd4c const char[] NONCE = x'44e792d63335abb1582e9253'; const uint COUNTER = 42; -char[] one_megabyte = { [0..1024*1024] = 0xA5 }; +char[] one_mb @align(ulong.sizeof) = { [0..1024*1024] = 0xA5 }; // This doesn't test both encryption + decryption, because it's a symmetric operation that shares // a single common data transformation. Testing one limb is enough. fn void gogo_chacha20() @benchmark { - chacha20::encrypt_mut(one_megabyte[..], KEY, NONCE, COUNTER); + chacha20::encrypt_mut(one_mb[..], KEY, NONCE, COUNTER); +} + +// Check what the speed of an unaligned buffer looks like. +fn void gogo_chacha20_unaligned() @benchmark => @pool() +{ + char[] copy = mem::talloc_array(char, one_mb.len + 3); + char[] im_off_slightly = copy[3..]; + copy[3..] 
= one_mb[..]; + assert((usz)im_off_slightly.ptr % usz.sizeof > 0); + + runtime::@start_benchmark(); + chacha20::encrypt_mut(im_off_slightly, KEY, NONCE, COUNTER); + runtime::@end_benchmark(); } diff --git a/lib/std/crypto/chacha20.c3 b/lib/std/crypto/chacha20.c3 index b112ec1b0..57438b2e5 100644 --- a/lib/std/crypto/chacha20.c3 +++ b/lib/std/crypto/chacha20.c3 @@ -62,17 +62,6 @@ macro quarter_round(uint* x, int a, int b, int c, int d) @local x[c] += x[d]; x[b] = (x[b] ^ x[c]).rotl(7); } - -<* Check the position of the keystream/input buffer usage, and mutate it when necessary. *> -macro ChaCha20.check_position(&self) @local -{ - if (self.position >= BLOCK_SIZE) - { - self.mutate_keystream(); - self.position = 0; - } -} - <* Process the next (or final) chunk of ingested data. *> fn void ChaCha20.mutate_keystream(&self) @local @inline { @@ -136,33 +125,33 @@ fn void ChaCha20.transform(&self, char[] data) if (self.position < BLOCK_SIZE) { usz len = data.len < (BLOCK_SIZE - self.position) ? data.len : (BLOCK_SIZE - self.position); - for (usz i = 0; i < len; i++) - { - data[i] ^= key_stream[self.position + i]; - } + for (usz i = 0; i < len; i++) data[i] ^= key_stream[self.position + i]; self.position += len; data = data[len..]; } - // 2. Process full blocks at a time, word by word according to the system's architecture. - for (; data.len >= BLOCK_SIZE; data = data[BLOCK_SIZE..]) + // 2. Get the amount of bytes offset from the nearest alignment boundary. + // Process full blocks at a time, word by word according to the system's architecture. + // Any extra bytes on each side are dynamically processed byte-by-byte. 
+ usz offset = usz.sizeof - (((usz)data.ptr % usz.sizeof) ?: usz.sizeof); + + for (usz x = offset; data.len >= BLOCK_SIZE; data = data[BLOCK_SIZE..], x = offset) { self.mutate_keystream(); - for (usz i = 0; i < BLOCK_SIZE / usz.sizeof; i++) + if (offset) foreach (i, &b : data[:offset]) *b ^= key_stream[i]; + char[] aligned_data = data[offset..]; + for (; x <= (BLOCK_SIZE - usz.sizeof); x += usz.sizeof) { - usz* data_ref = (usz*)data.ptr + i; - @unaligned_store(*data_ref, @unaligned_load(*data_ref, 1) ^ ((usz*)&self.key_stream)[i], 1); + ((usz*)aligned_data.ptr)[x / usz.sizeof] ^= @unaligned_load(*(usz*)(&key_stream[x]), 1); } + for (; x < BLOCK_SIZE; x++) data[x] ^= key_stream[x]; } // 3. Process any remaining bytes. if (data.len > 0) { self.mutate_keystream(); - for (usz i = 0; i < data.len; i++) - { - data[i] ^= key_stream[i]; - } + for (usz i = 0; i < data.len; i++) data[i] ^= key_stream[i]; self.position = data.len; } diff --git a/src/compiler/sema_expr.c b/src/compiler/sema_expr.c index e1794ae03..d6c949403 100644 --- a/src/compiler/sema_expr.c +++ b/src/compiler/sema_expr.c @@ -8122,7 +8122,7 @@ static bool sema_expr_analyse_mod(SemaContext *context, Expr *expr, Expr *left, { // 1. 
Analyse both sides and promote to a common type OperatorOverload overload = OVERLOAD_REMINDER; - if (!sema_binary_analyse_arithmetic_subexpr(context, expr, "Cannot calculate the reminder %s %% %s", + if (!sema_binary_analyse_arithmetic_subexpr(context, expr, "Cannot calculate the remainder %s %% %s", false, &overload, failed_ref)) return false; if (!overload) return true; diff --git a/test/test_suite/vector/vector_pointer_errors.c3 b/test/test_suite/vector/vector_pointer_errors.c3 index 8dc8cc440..8196f32b8 100644 --- a/test/test_suite/vector/vector_pointer_errors.c3 +++ b/test/test_suite/vector/vector_pointer_errors.c3 @@ -6,7 +6,7 @@ fn void pointer_add_sub_diff() int*[<2>] y; double*[<2>] z = y; // #error: 'int*[<2>]' to 'double*[<2>]' y / y; // #error: Cannot divide - y % y; // #error: Cannot calculate the reminder + y % y; // #error: Cannot calculate the remainder y * y; // #error: multiply y ^ y; // #error: not defined iptr[<2>] g = (iptr[<2>])y; diff --git a/test/unit/stdlib/crypto/chacha20.c3 b/test/unit/stdlib/crypto/chacha20.c3 index 7e2d68233..1fa34e27d 100644 --- a/test/unit/stdlib/crypto/chacha20.c3 +++ b/test/unit/stdlib/crypto/chacha20.c3 @@ -412,28 +412,31 @@ fn void scrolling_input_unaligned_permutations_with_random_chunks() { // Paranoia, honestly... Use a known test vector a couple blocks long, and - no matter the alignment started from - ensure the same result. Lcg64Random rand; - random::seed(&rand, 0x1337_83fb_c1ac_1a20); + random::seed(&rand, 0x1337_83fb_c1ac_eeee); char[*] key = sha256::hash("dance with me"); char[*] nonce = "123456789abc"; for (usz i = 1; i < ulong.sizeof + 1; i++) { - for (usz j = 1; j < LARGE_INPUT.len; j++) @pool() - { - char[] unaligned @align(ulong.sizeof) = mem::talloc_array(char, j + ulong.sizeof); - char[] encrypt_me = unaligned[i:j]; - encrypt_me[..] 
= LARGE_INPUT[:j]; + for (usz j = 1; j < LARGE_INPUT.len; j++) + { + for (usz k = 1; k < 128; k++) @pool() + { + char[] unaligned @align(ulong.sizeof) = mem::talloc_array(char, 1 + j + ulong.sizeof); + unaligned[i:j] = LARGE_INPUT[:j]; + test::@check(chacha20::tencrypt(unaligned[i:j], key, nonce) == LARGE_EXPECTED[:j], "Mismatched permutation of hash on index (%d, %d).", i, j); - ChaCha20 c @noinit; - defer c.destroy(); - c.init(key, nonce); - for (usz x = 1; encrypt_me.len; encrypt_me = encrypt_me[x..], x = (rand.next_byte() % min(20, encrypt_me.len ?: 1)) ?: 1) c.transform(encrypt_me[:x]); - test::@check(unaligned[i:j] == LARGE_EXPECTED[:j], "Mismatched permutation of hash on index (%d, %d).", i, j); + char[] encrypt_me = unaligned[i:j]; + ChaCha20 c @noinit; + defer c.destroy(); + c.init(key, nonce); + for (usz x = 1; encrypt_me.len; encrypt_me = encrypt_me[x..], x = (rand.next_byte() % min(k, encrypt_me.len ?: 1)) ?: 1) c.transform(encrypt_me[:x]); + test::@check(unaligned[i:j] == LARGE_EXPECTED[:j], "Mismatched permutation of hash on index (%d, %d; %d).", i, j, k); - // test::@check(chacha20::tencrypt(unaligned[i:j], key, nonce) == LARGE_EXPECTED[:j], "Mismatched permutation of hash on index (%d, %d).", i, j); - test::@check(chacha20::tencrypt(LARGE_INPUT[:j], key, nonce) == LARGE_EXPECTED[:j], "Mismatched permutation of hash on index (%d, %d).", i, j); - }; + test::@check(chacha20::tencrypt(LARGE_INPUT[:j], key, nonce) == LARGE_EXPECTED[:j], "Mismatched permutation of hash on index (%d, %d).", i, j); + }; + } } }