[stdlib] Improve SHA-256 Performance (#2671)

* [stdlib] Improve SHA-256 Performance

Cleaned up the code a bit. Seems to have improved performance anywhere from ~10-25%.

* trade-offs, trade-offs... reduce codegen

* Fix formatting

---------

Co-authored-by: Christoffer Lerno <christoffer@aegik.com>
This commit is contained in:
Zack Puhl
2025-12-29 11:07:03 -05:00
committed by GitHub
parent d96624c578
commit 9b318ec233
2 changed files with 164 additions and 153 deletions

View File

@@ -1,177 +1,175 @@
module std::hash::sha256;
import std::hash::hmac;
import std::bits, std::hash::hmac;
const BLOCK_SIZE = 64;
const HASH_SIZE = 32;
const uint[64] K @local = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
// Right rotate function
macro uint @rotr(uint x, uint n) @local => (((x) >> (n)) | ((x) << (32 - (n))));
// SHA-256 functions
macro uint @ch(uint x, uint y, uint z) @local => (x & y) ^ (~x & z);
macro uint @maj(uint x, uint y, uint z) @local => (x & y) ^ (x & z) ^ (y & z);
macro uint @_sigma0(uint x) @local => @rotr(x, 2) ^ @rotr(x, 13) ^ @rotr(x, 22);
macro uint @_sigma1(uint x) @local => @rotr(x, 6) ^ @rotr(x, 11) ^ @rotr(x, 25);
macro uint @sigma0(uint x) @local => @rotr(x, 7) ^ @rotr(x, 18) ^ (x >> 3);
macro uint @sigma1(uint x) @local => @rotr(x, 17) ^ @rotr(x, 19) ^ (x >> 10);
struct Sha256
{
uint[8] state;
ulong bitcount;
char[BLOCK_SIZE] buffer;
}
alias HmacSha256 = Hmac{Sha256, HASH_SIZE, BLOCK_SIZE};
alias hmac = hmac::hash{Sha256, HASH_SIZE, BLOCK_SIZE};
alias pbkdf2 = hmac::pbkdf2{Sha256, HASH_SIZE, BLOCK_SIZE};
fn char[HASH_SIZE] hash(char[] data)
// Per-round additive constants for the compression loop: round `i` adds `K[i]`
// together with the message-schedule word `m[i]` when computing `t1`.
// 64 entries, one per round; module-private via `@local`.
// NOTE(review): values look like the standard SHA-256 round constants — confirm against FIPS 180-4 before editing.
const uint[64] K @local = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
// "Choose": every result bit is taken from `y` where `x` has a 1 and from `z` where `x` has a 0.
// `z ^ (x & (y ^ z))` is bit-for-bit identical to `(x & y) ^ (~x & z)`.
macro uint @ch(uint x, uint y, uint z) @local => z ^ (x & (y ^ z));
// "Majority": every result bit is the value held by at least two of the three inputs.
// `(x & y) | (z & (x | y))` is bit-for-bit identical to `(x & y) ^ (x & z) ^ (y & z)`.
macro uint @maj(uint x, uint y, uint z) @local => (x & y) | (z & (x | y));
// Rotation mixes applied to the working variables `a` and `e` in each compression round.
macro uint @_sigma0(uint x) @local => x.rotr(22) ^ x.rotr(13) ^ x.rotr(2);
macro uint @_sigma1(uint x) @local => x.rotr(25) ^ x.rotr(11) ^ x.rotr(6);
// Rotation/shift mixes used when expanding the message schedule from 16 to 64 words.
macro uint @sigma0(uint x) @local => (x >> 3) ^ x.rotr(18) ^ x.rotr(7);
macro uint @sigma1(uint x) @local => (x >> 10) ^ x.rotr(19) ^ x.rotr(17);
struct Sha256
{
Sha256 sha256 @noinit;
sha256.init();
sha256.update(data);
return sha256.final();
uint[8] state @align(usz.sizeof);
char[BLOCK_SIZE] buffer @align(ulong.sizeof); // must align along bitcount sizeof - see `final`
ulong bitcount;
}
fn void Sha256.init(&self)
<*
Compute and return a hash value.
@param [in] data : "The input data to hash."
*>
fn char[HASH_SIZE] hash(char[] data)
{
// Sha256 initialization constants
*self = {
.state = {
0x6A09E667,
0xBB67AE85,
0x3C6EF372,
0xA54FF53A,
0x510E527F,
0x9B05688C,
0x1F83D9AB,
0x5BE0CD19
}
};
Sha256 sha256 @noinit;
sha256.init();
sha256.update(data);
return sha256.final();
}
// Reset the hasher so a new message can be fed through `update`.
// Only `state` is given explicit values; the remaining fields take the
// initializer's defaults because the whole struct is overwritten.
fn void Sha256.init(&self)
{
	*self = {
		.state = {
			0x6a09e667,
			0xbb67ae85,
			0x3c6ef372,
			0xa54ff53a,
			0x510e527f,
			0x9b05688c,
			0x1f83d9ab,
			0x5be0cd19
		},
	};
}
<*
@param [in] data
@require data.len <= uint.max
*>
fn void Sha256.update(&self, char[] data) {
uint i = 0;
uint len = data.len;
uint buffer_pos = (uint)(self.bitcount / 8) % BLOCK_SIZE;
self.bitcount += ((ulong)len * 8);
fn void Sha256.update(&self, char[] data)
{
uint buffer_pos = (uint)(self.bitcount >> 3) % BLOCK_SIZE;
self.bitcount += (ulong)data.len << 3; // always record ingested bits count immediately
while (len--) {
self.buffer[buffer_pos++] = data[i++];
if (buffer_pos == BLOCK_SIZE) {
sha256_transform(&self.state, &self.buffer);
buffer_pos = 0; // Reset buffer position
}
}
// Get the buffer position back to 0 if we're midway through consuming some data.
if (buffer_pos > 0 && buffer_pos < BLOCK_SIZE)
{
usz len = min(BLOCK_SIZE - buffer_pos, data.len);
self.buffer[buffer_pos:len] = data[:len];
data = data[len..];
if (buffer_pos + len == BLOCK_SIZE) self.transform();
}
// When the data pointer is aligned, we can disregard unaligned loading in the `transform` macro.
// We do this here from the outer call to reduce the expense of checking alignment on every single block.
if (0 == (usz)data.ptr % usz.sizeof)
{
for (; data.len >= BLOCK_SIZE; data = data[BLOCK_SIZE..]) self.transform((uint*)data.ptr);
}
else
{
for (; data.len >= BLOCK_SIZE; data = data[BLOCK_SIZE..]) self.transform_unaligned((uint*)data.ptr);
}
// Leftover data just gets stored away for the next update or final.
if (data.len)
{
self.buffer[..] = 0;
self.buffer[:data.len] = data[..];
}
}
fn char[HASH_SIZE] Sha256.final(&self) {
char[HASH_SIZE] hash;
ulong i = (self.bitcount / 8) % BLOCK_SIZE;
fn char[HASH_SIZE] Sha256.final(&self)
{
char[HASH_SIZE] hash @align(uint.sizeof);
ulong i = (self.bitcount / 8) % BLOCK_SIZE;
// Append 0x80 to the buffer
self.buffer[i++] = 0x80;
// Append 0x80 to the buffer
self.buffer[i++] = 0x80;
// Pad the buffer with zeros
if (i > BLOCK_SIZE - 8) {
while (i < BLOCK_SIZE) {
self.buffer[i++] = 0x00;
}
sha256_transform(&self.state, &self.buffer);
i = 0; // Reset buffer index after transformation
}
while (i < BLOCK_SIZE - 8) {
self.buffer[i++] = 0x00;
}
// Pad the buffer with zeros
if (i > BLOCK_SIZE - 8)
{
self.buffer[i..] = 0x00;
self.transform();
i = 0; // Reset buffer index after transformation
}
// Append the bitcount in big-endian format
for (int j = 0; j < 8; ++j) {
self.buffer[BLOCK_SIZE - 8 + j] = (char)((self.bitcount >> (56 - j * 8)) & 0xFF);
}
self.buffer[i..(BLOCK_SIZE - 8)] = 0x00;
sha256_transform(&self.state, &self.buffer);
// Append the bitcount in big-endian format
*(ulong*)(&self.buffer[BLOCK_SIZE - 8]) = env::BIG_ENDIAN ??? self.bitcount : bswap(self.bitcount);
// Convert state to the final hash
for (i = 0; i < 8; ++i) {
hash[i * 4] = (char)((self.state[i] >> 24) & 0xFF);
hash[i * 4 + 1] = (char)((self.state[i] >> 16) & 0xFF);
hash[i * 4 + 2] = (char)((self.state[i] >> 8) & 0xFF);
hash[i * 4 + 3] = (char)(self.state[i] & 0xFF);
}
return hash;
self.transform();
// Convert state to the final hash
foreach (x, s : self.state) *(uint*)(&hash[x * uint.sizeof]) = env::BIG_ENDIAN ??? s : bswap(s);
return hash;
}
<*
@param [&inout] state
@param [&in] buffer
*>
fn void sha256_transform(uint* state, char* buffer) @local {
uint a, b, c, d, e, f, g, h, t1, t2;
uint[64] m;
int i;
// `do_transform` is a macro, so every use site would expand the full compression body.
// Routing calls through this `@noinline` function keeps that expansion to one copy.
// This variant selects the aligned-load path; with `incoming == null` the macro reads
// from `self.buffer`, which is expected to be aligned.
fn void Sha256.transform(&self, uint* incoming = null) @local @noinline
{
	self.do_transform(incoming, true);
}
// Same compression step as the aligned wrapper, but selects the unaligned-load path of
// `do_transform` for input pointers that are not known to be aligned.
// Kept `@noinline` so the macro body is expanded only once for this variant.
fn void Sha256.transform_unaligned(&self, uint* incoming = null) @local @noinline
{
	self.do_transform(incoming, false);
}
// Prepare the message schedule
for (i = 0; i < 16; ++i) {
m[i] = ((uint)buffer[i * 4] << 24) | ((uint)buffer[i * 4 + 1] << 16) |
((uint)buffer[i * 4 + 2] << 8) | ((uint)buffer[i * 4 + 3]); // Ensure values are cast to uint for correct shifts
}
for (i = 16; i < 64; ++i) {
m[i] = @sigma1(m[i - 2]) + m[i - 7] + @sigma0(m[i - 15]) + m[i - 16];
}
macro Sha256.do_transform(&self, uint* incoming = null, bool $aligned = true) @local
{
uint a, b, c, d, e, f, g, h, t1, t2 @noinit;
uint[64] m @noinit;
int i @noinit;
// Initialize working variables
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
f = state[5];
g = state[6];
h = state[7];
if (!incoming) incoming = (uint*)&self.buffer;
// Perform the main SHA-256 compression function
for (i = 0; i < 64; ++i) {
t1 = h + @_sigma1(e) + @ch(e, f, g) + K[i] + m[i];
t2 = @_sigma0(a) + @maj(a, b, c);
h = g;
g = f;
f = e;
e = d + t1;
d = c;
c = b;
b = a;
a = t1 + t2;
}
$if env::BIG_ENDIAN:
@as_char_view(m)[:BLOCK_SIZE] = @as_char_view(incoming)[:BLOCK_SIZE];
$else
// Unrolling this seems to make the hash slower.
for (i = 0; i < 16; ++i) m[i] = bswap($aligned ??? incoming[i] : @unaligned_load(incoming[i], 1));
$endif
// Update the state
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
a = b = c = d = e = f = g = h = t1 = t2 = i = 0;
m[:64] = buffer[:64] = 0;
for (i = 16; i < 64; i++) m[i] = @sigma1(m[i - 2]) + m[i - 7] + @sigma0(m[i - 15]) + m[i - 16];
a = self.state[0];
b = self.state[1];
c = self.state[2];
d = self.state[3];
e = self.state[4];
f = self.state[5];
g = self.state[6];
h = self.state[7];
$for usz $i = 0; $i < 64; $i++:
t1 = h + @_sigma1(e) + @ch(e, f, g) + K[$i] + m[$i];
t2 = @_sigma0(a) + @maj(a, b, c);
h = g;
g = f;
f = e;
e = d + t1;
d = c;
c = b;
b = a;
a = t1 + t2;
$endfor
self.state[0] += a;
self.state[1] += b;
self.state[2] += c;
self.state[3] += d;
self.state[4] += e;
self.state[5] += f;
self.state[6] += g;
self.state[7] += h;
}

View File

@@ -7,7 +7,7 @@ fn void test_sha256_empty()
sha.init();
sha.update("");
assert(sha.final() == x"E3B0C442 98FC1C14 9AFBF4C8 996FB924 27AE41E4 649B934C A495991B 7852B855");
test::@check(sha.final() == x"E3B0C442 98FC1C14 9AFBF4C8 996FB924 27AE41E4 649B934C A495991B 7852B855");
}
fn void test_sha256_abc()
@@ -16,7 +16,7 @@ fn void test_sha256_abc()
sha.init();
sha.update("abc");
assert(sha.final() == x"BA7816BF 8F01CFEA 414140DE 5DAE2223 B00361A3 96177A9C B410FF61 F20015AD");
test::@check(sha.final() == x"BA7816BF 8F01CFEA 414140DE 5DAE2223 B00361A3 96177A9C B410FF61 F20015AD");
}
fn void test_sha256_longer()
@@ -24,7 +24,21 @@ fn void test_sha256_longer()
Sha256 sha;
sha.init();
sha.update("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopqabcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq");
assert(sha.final() == x"59F109D9 533B2B70 E7C3B814 A2BD218F 78EA5D37 14455BC6 7987CF0D 664399CF");
test::@check(sha.final() == x"59F109D9 533B2B70 E7C3B814 A2BD218F 78EA5D37 14455BC6 7987CF0D 664399CF");
}
// Hash the same message once per chunk size from 1 up to (but excluding) the message length,
// feeding it through `update` in pieces of that size. Every chunking must produce the same
// digest, so this exercises the buffering paths of `update` across block boundaries.
fn void test_sha256_multi_update_permute()
{
	char[] input = "a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string a really long string";
	for (usz step = 1; step < input.len; step++)
	{
		Sha256 sha;
		sha.init();
		usz offset = 0;
		// Feed every complete `step`-sized chunk.
		while (input.len - offset >= step)
		{
			sha.update(input[offset : step]);
			offset += step;
		}
		// Feed whatever is left when the length is not a multiple of `step`.
		if (offset < input.len) sha.update(input[offset..]);
		test::@check(sha.final() == x"b527293dfb70dcce37e593f4c43e1b81909615722bad041b90b8df22bebd00a0", "Mismatch for step %d", step);
	}
}
/*
@@ -37,7 +51,7 @@ fn void gigahash_sha256()
Sha256 sha;
sha.init();
sha.update(c);
assert(sha.final() == x"053EADFD EC682CF1 6F3F8704 C7609C57 868DD757 65E08DC5 A7491F5D 06BCB74D");
test::@check(sha.final() == x"053EADFD EC682CF1 6F3F8704 C7609C57 868DD757 65E08DC5 A7491F5D 06BCB74D");
}
*/
fn void test_pbkdf2()
@@ -46,11 +60,11 @@ fn void test_pbkdf2()
char[] s = "salt";
char[32] out;
sha256::pbkdf2(pw, s, 1, &out);
assert(out == x'120FB6CF FCF8B32C 43E72252 56C4F837 A86548C9 2CCC3548 0805987C B70BE17B');
test::@check(out == x'120FB6CF FCF8B32C 43E72252 56C4F837 A86548C9 2CCC3548 0805987C B70BE17B');
sha256::pbkdf2(pw, s, 2, &out);
assert(out == x'AE4D0C95 AF6B46D3 2D0ADFF9 28F06DD0 2A303F8E F3C251DF D6E2D85A 95474C43');
test::@check(out == x'AE4D0C95 AF6B46D3 2D0ADFF9 28F06DD0 2A303F8E F3C251DF D6E2D85A 95474C43');
sha256::pbkdf2(pw, s, 4096, &out);
assert(out == x'C5E478D5 9288C841 AA530DB6 845C4C8D 962893A0 01CE4E11 A4963873 AA98134A');
test::@check(out == x'C5E478D5 9288C841 AA530DB6 845C4C8D 962893A0 01CE4E11 A4963873 AA98134A');
}
fn void test_pbkdf2_2()
@@ -59,7 +73,7 @@ fn void test_pbkdf2_2()
char[] s = "saltSALTsaltSALTsaltSALTsaltSALTsalt";
char[32] out;
sha256::pbkdf2(pw, s, 4096, &out);
assert(out == x'348C89DB CBD32B2F 32D814B8 116E84CF 2B17347E BC180018 1C4E2A1F B8DD53E1');
test::@check(out == x'348C89DB CBD32B2F 32D814B8 116E84CF 2B17347E BC180018 1C4E2A1F B8DD53E1');
}
@@ -70,7 +84,7 @@ fn void test_pbkdf2_3()
char[32] out;
sha256::pbkdf2(pw, salt, 4096, &out);
assert(out == x'89B69D05 16F82989 3C696226 650A8687 8C029AC1 3EE27650 9D5AE58B 6466A724');
test::@check(out == x'89B69D05 16F82989 3C696226 650A8687 8C029AC1 3EE27650 9D5AE58B 6466A724');
}
fn void test_sha256_million_a()
@@ -82,7 +96,6 @@ fn void test_sha256_million_a()
{
sha.update("aaaaaaaaaa");
}
assert(sha.final() == x"CDC76E5C 9914FB92 81A1C7E2 84D73E67 F1809A48 A497200E 046D39CC C7112CD0");
test::@check(sha.final() == x"CDC76E5C 9914FB92 81A1C7E2 84D73E67 F1809A48 A497200E 046D39CC C7112CD0");
}