// mirror of https://github.com/c3lang/c3c.git — synced 2026-02-27 12:01:16 +00:00
// 748 lines · 24 KiB
// Copyright (c) 2025-2026 Zack Puhl <github@xmit.xyz>. All rights reserved.
|
|
// Use of this source code is governed by the MIT license
|
|
// a copy of which can be found in the LICENSE_STDLIB file.
|
|
//
|
|
// This is based on the original BLAKE3 reference implementation:
|
|
// https://github.com/BLAKE3-team/BLAKE3/blob/master
|
|
//
|
|
module std::hash::blake3;
|
|
|
|
|
|
// --- BLAKE3 geometry constants (fixed by the BLAKE3 specification) ---
const BLOCK_SIZE = 64;    // bytes per compression-function block
const CHUNK_SIZE = 1024;  // bytes per chunk (16 blocks)
const KEY_SIZE = 32;      // keyed-hash key length in bytes
const KEY_SIZE_WORDS = KEY_SIZE / uint.sizeof; // key length in 32-bit words (8)
const OUT_SIZE = 32;      // chaining value / default output length in bytes
const MAX_DEPTH = 54;     // maximum supported height of the hash tree

// Initialization vector: the same constants as the SHA-256 IV.
const uint[8] IV = {
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};

// Message-word selection for each of the 7 rounds. Row 0 is the identity;
// each subsequent row is the previous row run through BLAKE3's fixed
// permutation {2,6,3,10,7,0,4,13,1,11,12,5,9,14,15,8}.
const char[16][7] MESSAGE_SCHEDULE = {
	x'000102030405060708090a0b0c0d0e0f',
	x'0206030a0700040d010b0c05090e0f08',
	x'03040a0c0d02070e060509000b0f0801',
	x'0a070c090e030d0f04000b0205080106',
	x'0c0d090b0f0a0e080702050300010604',
	x'090e0b05080c0f010d03000a02060407',
	x'0b0f0500010908060e0a020c0304070d',
};
|
|
|
|
|
|
// Get feature-based optimization options.
// For now, none of these are used until there's a chance to explore BLAKE3's (necessary) vectorization optimizations.
//
// NOTE: `&&&`, `|||` and `??? :` below are C3's lazily-evaluated compile-time
// forms of `&&`, `||` and `? :` (see the C3 operator documentation).

<* When true, force the use of slow-but-portable BLAKE3 functions. Do not vectorize the hash function. *>
const FORCE_PORTABLE = true; //$feature(BLAKE3_FORCE_PORTABLE); // this is statically set to TRUE for now

<* AARCH64: When not big-endian, use Neon. *>
const USE_NEON = !FORCE_PORTABLE &&& (env::AARCH64 &&& !env::BIG_ENDIAN);

<* Bundling some architecture booleans into one. *>
const IS_X86 = !FORCE_PORTABLE &&& (env::X86_64 ||| env::X86);

<*
 The maximum possible degree of parallelization based on the current architecture.
 This doesn't represent the ACTUAL degree available.
*>
const MAX_SIMD_DEGREE = IS_X86 ??? 16 : (USE_NEON ??? 4 : 1);

<* There are cases in BLAKE3 where, at compile-time, it's necessary to easily get the max degree, or a minimum of 2. *>
const MAX_SIMD_DEGREE_OR_2 = @max(MAX_SIMD_DEGREE, 2);
|
|
|
|
|
|
<* Always set to true once BLAKE3 caches some initial CPU details. *>
bool cpuinfo_initd @local = false;

<*
 Cache some information at runtime about the current processor and platform, as needed for optimizations.
*>
fn void init_blake3() @local @init
{
	// Only compiled in for x86 targets with vectorization enabled; currently
	// dead code because FORCE_PORTABLE is statically true (so IS_X86 is false).
$if IS_X86:
	cpudetect::x86_initialize_cpu_features(); // query all x86 feature flags, one time
$endif
	cpuinfo_initd = true;
}
|
|
|
|
<* Check whether a given CPU flag is set (x86/x86_64 only). *>
// NOTE(review): this ANDs the feature bitset with the enum's *ordinal*, not a
// single bit (e.g. `1 << f.ordinal`). If `cpudetect::x86_features` is a plain
// bitmask, a feature with ordinal 0 can never test true and larger ordinals
// alias multiple bits — confirm against the cpudetect module. Currently dead
// code, since FORCE_PORTABLE is statically true.
macro bool @check_cpu_flag(X86Feature f) @local @if(IS_X86)
	=> !!(cpudetect::x86_features & f.ordinal);
|
|
|
|
<*
 Return the actual SIMD degree of the processor at runtime.
*>
macro @simd_degree() @local
{
	// Lazily run the one-time CPU query in case the @init hook hasn't fired.
	if (!cpuinfo_initd) init_blake3();
	assert(cpuinfo_initd == true, "Failed to run required BLAKE3 initializations.");

	// Compile-time dispatch: only the branch for the compiled target exists.
	$switch:
	$case IS_X86:
		// Widest available vector unit wins: AVX-512 -> 16 lanes, AVX2 -> 8, SSE -> 4.
		if (@check_cpu_flag(AVX512F) && @check_cpu_flag(AVX512VL)) return 16;
		if (@check_cpu_flag(AVX2)) return 8;
		if (@check_cpu_flag(SSE4_1) || @check_cpu_flag(SSE2)) return 4;
	$case USE_NEON:
		return 4;
	$endswitch

	// Portable fallback: no vectorization.
	return 1;
}
|
|
|
|
<* Flags used during hash computation based on its state. *>
enum Blake3Flags : const inline char
{
	CHUNK_START         = 1 << 0, // first block of a chunk
	CHUNK_END           = 1 << 1, // last block of a chunk
	PARENT              = 1 << 2, // block is a parent node of the hash tree
	ROOT                = 1 << 3, // block is the root (enables XOF output)
	KEYED_HASH          = 1 << 4, // hashing with a caller-supplied 32-byte key
	DERIVE_KEY_CONTEXT  = 1 << 5, // hashing the key-derivation context string
	DERIVE_KEY_MATERIAL = 1 << 6, // hashing with a context-derived key
}
|
|
|
|
struct Blake3ChunkState @local
{
	uint[8] cv;             // current chaining value for this chunk
	ulong chunk_counter;    // index of this chunk within the input stream
	char[BLOCK_SIZE] buf;   // staging buffer for a partial block
	char buf_len;           // bytes currently held in `buf`
	char blocks_compressed; // full blocks already compressed for this chunk
	char flags;             // base flags (keyed / derive-key mode) for every block
}
|
|
|
|
struct Blake3Output @local
{
	uint[KEY_SIZE_WORDS] input_cv; // chaining value fed into the compression
	ulong counter;                 // block counter (always 0 for parent nodes)
	char[BLOCK_SIZE] block;        // the final block of input
	char block_len;                // meaningful bytes in `block`
	char flags;                    // flags to compress with (CHUNK_END/PARENT/...)
}
|
|
|
|
struct Blake3
{
	uint[KEY_SIZE_WORDS] key; // key words (the IV when unkeyed)
	Blake3ChunkState chunk;   // state of the chunk currently being filled
	char cv_stack_len;        // number of chaining values held on `cv_stack`
	// Stack of subtree chaining values awaiting a sibling to merge with:
	// one 32-byte entry per tree level, plus one temporary extra slot.
	char[(MAX_DEPTH + 1) * OUT_SIZE] cv_stack;
}
|
|
|
|
|
|
<*
 Compute a BLAKE3 hash (optionally keyed) over `data`, exposing the XOF.

 Because BLAKE3 is an extendable-output function, the hash is conceptually an
 unbounded byte stream; `seek` selects where in that stream the result starts,
 and `$out_size` selects how many bytes of it to return.

 For example, with `seek = 0` and `$out_size = 41` the output begins:
 ```
 2cc39783c223154fea8dfb7c1b1660f2ac2dcbd1c1de8277b0b0dd39b7e50d7d905630c8be290dfcf3
 ```
 With the same input `key` and `data`, but `seek = 3` and `$out_size = 8`:
 ```
 83c223154fea8dfb
 ```
 which is exactly a slice cut out of the stream above:
 ```
 2cc397 [83c223154fea8dfb] 7c1b1660f2ac2dcbd1c1de8277b0b0dd39b7e50d7d905630c8be290dfcf3
 ```

 @param [in] data : "The data to hash."
 @param [in] key : "An optional 32-byte key to turn the result into a keyed hash."
 @param seek : "An optional value specifying the offset into the XOF's yield where the resultant hash should begin."
 @param $out_size : "An optional value specifying the desired length to slice from the XOF's yield."

 @return "The hash as a character array of `$out_size` bytes."

 @require !key.len || key.len == KEY_SIZE : "Key value must be empty or exactly 32 bytes."
 @require $out_size > 0 : "You cannot use a zero $out_size."
*>
macro char[*] hash(char[] data, char[] key = {}, usz seek = 0, usz $out_size = 32)
{
	char[$out_size] result;

	// One-shot convenience wrapper: build a context, run it, wipe it on exit.
	Blake3 hasher @noinit;
	hasher.init(key);
	defer hasher.destroy();

	hasher.update(data);
	hasher.final(result[..], $out_size, seek);
	return result;
}
|
|
|
|
<*
 Hash `data` using a key derived from a context string. The `context` acts as
 a variable-length key: it is first condensed into 32 bytes of key material,
 so callers are not limited to fixed 32-byte keys. ("Context" is BLAKE3's own
 terminology for this mode, not this module's naming.)

 @param [in] data : "The data to hash."
 @param [in] context : "An optional key to turn the result into a keyed hash."
 @param seek : "An optional value specifying the offset into the XOF's yield where the resultant hash should begin."
 @param $out_size : "An optional value specifying the desired length to slice from the XOF's yield."

 @return "The context-based hash as a character array of `$out_size` bytes."

 @require $out_size > 0 : "You cannot use a zero $out_size."
*>
macro char[*] ctx_hash(char[] data, char[] context, usz seek = 0, usz $out_size = 32)
{
	char[$out_size] result;

	// The context seeds a derive-key hashing state; wipe it once done.
	Blake3 hasher = new_from_context(context);
	defer hasher.destroy();

	hasher.update(data);
	hasher.final(result[..], $out_size, seek);
	return result;
}
|
|
|
|
<*
 Generate a new Blake3 hashing structure from the given context string. The context string
 acts as a variable-length key to seed the new hash structure, and makes it ready to ingest
 incoming data with `update`.

 @param [in] context : "The context byte array used to seed the returned Blake3 context."
*>
macro Blake3 new_from_context(char[] context)
{
	char[KEY_SIZE] context_based_key;
	defer mem::zero_volatile(context_based_key[..]); // scrub derived key material on exit

	// Pass 1: hash the context string itself (DERIVE_KEY_CONTEXT mode)
	// down to 32 bytes of key material.
	Blake3 key_from_ctx @noinit;
	defer key_from_ctx.destroy();
	key_from_ctx.init(explicit_flags: Blake3Flags.DERIVE_KEY_CONTEXT);
	key_from_ctx.update(context);
	key_from_ctx.final(context_based_key[..], KEY_SIZE);

	// Pass 2: key a fresh state with that material (DERIVE_KEY_MATERIAL mode).
	// NOTE: the defers above (including the key scrub) run only after `b`
	// has been produced for return.
	Blake3 b @noinit;
	b.init(key: context_based_key[..], explicit_flags: Blake3Flags.DERIVE_KEY_MATERIAL);
	return b;
}
|
|
|
|
|
|
<*
 Initialize a BLAKE3 context.

 @param [in] key : "An optional key initializer to use."

 @require !key.len || key.len == KEY_SIZE : "An explicit initialization key must be of KEY_SIZE (32 bytes)."
*>
fn void Blake3.init(&self, char[] key = {}, char explicit_flags = 0)
{
	// Start from a fully scrubbed state (also clears any prior key material).
	mem::zero_volatile(@as_char_view(*self));

	if (key.len)
	{
		// Load the 32-byte key as 8 unaligned 32-bit words.
		// NOTE(review): byte order here depends on mem::load semantics; the
		// BLAKE3 spec requires little-endian word loads — confirm behavior
		// on big-endian targets.
		foreach (i, &w : self.key) *w = mem::load((uint*)&key[i * $sizeof(self.key[0])], 1);
		// A caller-provided key implies keyed hashing, unless a derive-key
		// mode was explicitly requested.
		if (!explicit_flags) explicit_flags = Blake3Flags.KEYED_HASH;
	}
	else
	{
		// Unkeyed hashing: the standard IV serves as the key words.
		self.key[..] = IV[..];
	}

	self.chunk.init(self.key[..], explicit_flags);
}
|
|
|
|
<*
 Return the hashing context to a freshly-initialized state while keeping the
 currently-loaded key, so the same context can hash a new message.
*>
fn void Blake3.reset(&self) @local @inline
{
	// Drop all pending subtree chaining values, then restart at chunk 0.
	self.cv_stack_len = 0;
	self.chunk.reset(self.key[..], 0);
}
|
|
|
|
<*
 Private function to merge tree results.
*>
fn void Blake3.merge_cv_stack(&self, ulong total_len) @local @inline
{
	// After `total_len` chunks, the number of incomplete subtrees equals the
	// number of 1-bits in that count; any stack entries beyond it are pairs
	// ready to be merged into their parent.
	usz post_merge_stack_len = (usz)@popcnt(total_len);
	for (; self.cv_stack_len > post_merge_stack_len; self.cv_stack_len--)
	{
		// Compress the top two chaining values (64 bytes) into one parent
		// CV, written in place over the lower of the two slots.
		char* parent_node = &self.cv_stack[(self.cv_stack_len - 2) * OUT_SIZE];
		Blake3Output o = parent_output(parent_node, self.key[..], self.chunk.flags);
		o.chaining_value(parent_node);
	}
}
|
|
|
|
<*
 Private function to add a new tree onto the stack.
*>
fn void Blake3.push_cv(&self, char* new_cv, ulong chunk_counter) @local @inline
{
	// Fold completed sibling pairs first, so the stack height matches the
	// popcount of the chunk counter, then push this subtree's chaining value.
	self.merge_cv_stack(chunk_counter);
	self.cv_stack[self.cv_stack_len * OUT_SIZE : OUT_SIZE] = new_cv[:OUT_SIZE];
	self.cv_stack_len++;
}
|
|
|
|
<*
 Update the hash context by consuming incoming data.

 @param [in] input : "The slice of new data to digest."
 @param use_tbb : "Should remain `false` until other BLAKE3 optimizations are set up."
*>
fn void Blake3.update(&self, char[] input, bool use_tbb = false)
{
	if (!input.len) return;

	// Step 1: if a chunk is partially filled, top it up first.
	if (self.chunk.len() > 0)
	{
		usz take = min(CHUNK_SIZE - self.chunk.len(), input.len);
		self.chunk.update(input[:take]);
		input = input[take..];

		if (!input.len) return;

		// More input follows, so this chunk is complete: finalize it, push
		// its chaining value onto the stack, and start a fresh chunk.
		char[KEY_SIZE] chunk_cv;
		Blake3Output o = self.chunk.output();
		o.chaining_value(&chunk_cv);
		self.push_cv(&chunk_cv, self.chunk.chunk_counter);
		self.chunk.reset(self.key[..], self.chunk.chunk_counter + 1);
	}

	// Step 2: hash whole subtrees directly while more than one chunk remains.
	while (input.len > CHUNK_SIZE)
	{
		// Start from the largest power-of-2 length we could take...
		usz subtree_len = @round_down_to_power_of_2(input.len);
		ulong count_so_far = self.chunk.chunk_counter * CHUNK_SIZE;

		// ...then shrink until the subtree is aligned to the current stream
		// position (its start offset must be a multiple of its length).
		while ((((ulong)(subtree_len - 1)) & count_so_far) != 0) subtree_len /= 2;

		ulong subtree_chunks = subtree_len / CHUNK_SIZE;
		if (subtree_len <= CHUNK_SIZE)
		{
			// Degenerate one-chunk subtree: hash it with a throwaway chunk
			// state and push its chaining value.
			Blake3ChunkState chunk_state;
			chunk_state.init(self.key[..], self.chunk.flags);
			chunk_state.chunk_counter = self.chunk.chunk_counter;
			chunk_state.update(input[:subtree_len]);
			char[OUT_SIZE] cv;
			Blake3Output o = chunk_state.output();
			o.chaining_value(&cv);
			self.push_cv(&cv, chunk_state.chunk_counter);
		}
		else
		{
			// Compress the whole subtree down to two child chaining values
			// and push each under its own chunk index.
			char[2 * OUT_SIZE] cv_pair;
			compress_subtree_to_parent_node(input[:subtree_len], self.key[..], self.chunk.chunk_counter, self.chunk.flags, cv_pair[..], use_tbb);
			self.push_cv(&cv_pair[0], self.chunk.chunk_counter);
			self.push_cv(&cv_pair[OUT_SIZE], self.chunk.chunk_counter + (subtree_chunks / 2));
		}
		self.chunk.chunk_counter += subtree_chunks;
		input = input[subtree_len..];
	}

	// Step 3: buffer whatever is left (at most one chunk) and tidy the stack.
	if (input.len > 0)
	{
		self.chunk.update(input);
		self.merge_cv_stack(self.chunk.chunk_counter);
	}
}
|
|
|
|
<*
 Yield the results of the hash into a specified output buffer, at the specified length.
 Note that the `into` slice does not need to be properly cut to receive hash results; it
 just needs to be wide enough to accommodate `into_len` yielded bytes from the XOF.

 @param [in] into : "The storage buffer for the output hash value. Must be >= `into_len` bytes."
 @param into_len : "How many bytes to receive from the XOF/hash output."
 @param seek : "How far into the XOF's yield to begin the stored byte sequence."

 @require into.len >= into_len : "The requested output size must be equal to or less than the size of the output slice."
*>
fn void Blake3.final(&self, char[] into, usz into_len, usz seek = 0)
{
	if (!into_len) return;

	// Single-chunk message: the chunk's own output is already the root.
	if (!self.cv_stack_len)
	{
		Blake3Output o = self.chunk.output();
		o.root_bytes(seek, into[:into_len]);
		return;
	}

	// Otherwise walk the CV stack from the top down, folding pairs into
	// parent nodes until only the root output remains.
	// (Consistency fix: the stack is addressed with OUT_SIZE everywhere;
	// this previously mixed in KEY_SIZE and a bare `32` — same value, but
	// OUT_SIZE is the semantically correct constant.)
	Blake3Output o @noinit;
	usz cvs_remaining;
	if (self.chunk.len() > 0)
	{
		// A partial trailing chunk is the right-most leaf of the tree.
		cvs_remaining = self.cv_stack_len;
		o = self.chunk.output();
	}
	else
	{
		// No trailing bytes: the top two stack entries form the first
		// parent node (an empty chunk implies at least two stacked CVs).
		cvs_remaining = (usz)self.cv_stack_len - 2;
		o = parent_output(&self.cv_stack[cvs_remaining * OUT_SIZE], self.key[..], self.chunk.flags);
	}

	while (cvs_remaining > 0)
	{
		// Pair the next stacked CV (left child) with the current output's
		// chaining value (right child) into a new parent block.
		char[BLOCK_SIZE] parent_block;
		cvs_remaining--;
		parent_block[:OUT_SIZE] = self.cv_stack[cvs_remaining * OUT_SIZE : OUT_SIZE];
		o.chaining_value(&parent_block[OUT_SIZE]);
		o = parent_output(&parent_block, self.key[..], self.chunk.flags);
	}

	// The surviving output is the root; stream the requested XOF bytes.
	o.root_bytes(seek, into[:into_len]);
}
|
|
|
|
<*
 Destroy a BLAKE3 hashing context by scrubbing all of its state.
 The volatile zeroing ensures key material does not linger in memory.
*>
fn void Blake3.destroy(&self) @inline => mem::zero_volatile(@as_char_view(*self));
|
|
|
|
|
|
<*
 Initialize a BLAKE3 chunk state.

 @param [in] key
 @param flags
*>
fn void Blake3ChunkState.init(&self, uint[] key, char flags) @local @inline
{
	// Scrub everything (buffer, counters) first, then seed the chaining
	// value from the key words and record the mode flags.
	mem::zero_volatile(@as_char_view(*self));
	self.cv[..] = key[..];
	self.flags = flags;
}
|
|
|
|
<*
 Reset a BLAKE3 chunk state.

 @param [in] key
 @param chunk_counter
*>
fn void Blake3ChunkState.reset(&self, uint[] key, ulong chunk_counter) @local @inline
{
	// init() zeroes the whole state, so the counter must be set afterwards.
	self.init(key, self.flags); // maintain its own flags
	self.chunk_counter = chunk_counter; // update chunk counter
}
|
|
|
|
<*
 Total number of input bytes this chunk has consumed so far: every block
 already compressed, plus whatever is staged in the partial-block buffer.
*>
fn usz Blake3ChunkState.len(&self) @operator(len) @local @inline
{
	usz compressed_bytes = BLOCK_SIZE * (usz)self.blocks_compressed;
	return compressed_bytes + (usz)self.buf_len;
}
|
|
|
|
<*
 Ingest an amount of bytes into the chunk's buffer. NOTE: Doesn't check for underflow.

 @param [in] data : "Data to ingest."
 @return "How many bytes were actually taken from `data`."
*>
fn usz Blake3ChunkState.fill_buf(&self, char[] data) @local @inline
{
	// Take as much as still fits in the block-sized staging buffer.
	usz take = min(BLOCK_SIZE - (usz)self.buf_len, data.len);
	self.buf[self.buf_len:take] = data[:take];
	self.buf_len += (char)take;
	return take;
}
|
|
|
|
<*
 Determine whether to set the CHUNK_START flag: it belongs on the very first
 block compressed for a chunk, and on no other block.
*>
fn char Blake3ChunkState.maybe_start_flag(&self) @local @inline
{
	if (self.blocks_compressed) return 0;
	return Blake3Flags.CHUNK_START;
}
|
|
|
|
<*
 Update the chunk with the provided input bytes.

 @param [in] input : "Incoming bytes to update with."
*>
fn void Blake3ChunkState.update(&self, char[] input) @local
{
	// Stage 1: complete a previously-buffered partial block, if any.
	if (self.buf_len)
	{
		usz take = self.fill_buf(input);
		input = input[take..];
		// Only compress the buffer if more input follows — the final block
		// of a chunk must be held back for output()/CHUNK_END handling.
		if (input.len)
		{
			compress_in_place(self.cv[..], self.buf[..], BLOCK_SIZE, self.chunk_counter, self.flags | self.maybe_start_flag());
			self.blocks_compressed++;
			self.buf_len = 0;
			self.buf[..] = {};
		}
	}
	// Stage 2: compress full blocks straight out of the input, again keeping
	// at least one trailing block uncompressed (note the strict `>`).
	for (; input.len > BLOCK_SIZE; self.blocks_compressed++, input = input[BLOCK_SIZE..])
	{
		compress_in_place(self.cv[..], input[:BLOCK_SIZE], BLOCK_SIZE, self.chunk_counter, self.flags | self.maybe_start_flag());
	}
	// Stage 3: buffer the remainder (at most BLOCK_SIZE bytes).
	self.fill_buf(input);
}
|
|
|
|
<*
 Convert the chunk state to an "output" type with the right flags.
*>
// The buffered (possibly partial) final block is emitted with CHUNK_END set,
// plus CHUNK_START when it is also the chunk's only block.
fn Blake3Output Blake3ChunkState.output(&self) @local @inline
	=> make_output(self.cv[..], &self.buf, self.buf_len, self.chunk_counter, self.flags | self.maybe_start_flag() | Blake3Flags.CHUNK_END);
|
|
|
|
<*
 Generate and initialize an output structure with the provided parameters.

 @param [in] key
 @param [&in] in_block
 @param block_len
 @param counter
 @param flags
*>
fn Blake3Output make_output(uint[] key, char* in_block, usz block_len, ulong counter, char flags) @local @noinline
{
	// Snapshot everything needed to (re)compress the final block later.
	// `in_block` must point at a full BLOCK_SIZE bytes of storage even when
	// only `block_len` of them are meaningful.
	Blake3Output o;
	o.input_cv[..] = key[..];
	o.block[..] = in_block[:BLOCK_SIZE];
	o.block_len = (char)block_len;
	o.counter = counter;
	o.flags = flags;
	return o;
}
|
|
|
|
<*
 Auto-generate a parent output structure, pre-initialized with some constant identifiers.

 @param [&in] block
 @param [in] key
 @param flags
*>
// Parent nodes always compress a full 64-byte block (two 32-byte child CVs)
// with the block counter pinned at 0 and the PARENT flag set.
macro Blake3Output parent_output(char* block, uint[] key, char flags) @local
	=> make_output(key, block, BLOCK_SIZE, 0, flags | Blake3Flags.PARENT);
|
|
|
|
<*
 Compress then store the chaining value of the output structure.

 @param [&inout] cv : "Destination for the 32-byte chaining value."
*>
macro void Blake3Output.chaining_value(&self, char* cv) @local
{
	// Compress into a scratch copy so self.input_cv is left untouched and
	// the output structure can still be reused (e.g. by root_bytes).
	uint[KEY_SIZE_WORDS] cv_words;
	cv_words[..] = self.input_cv[..];
	compress_in_place(cv_words[..], self.block, self.block_len, self.counter, self.flags);
	cv[:KEY_SIZE] = @as_char_view(cv_words)[:KEY_SIZE];
}
|
|
|
|
<*
 Store the result of the output into the designated slice, streaming the XOF:
 64-byte blocks are generated under an incrementing block counter, and `seek`
 selects an arbitrary byte offset into that stream.

 @param seek : "Byte offset into the XOF stream at which output begins."
 @param [inout] into : "Destination slice; filled completely."
*>
fn void Blake3Output.root_bytes(&self, usz seek, char[] into) @local
{
	if (!into.len) return;

	ulong output_block_counter = seek / BLOCK_SIZE;
	usz offset_within_block = seek % BLOCK_SIZE;
	char[BLOCK_SIZE] wide_buf;

	// Leading partial block: when the seek lands mid-block, generate that
	// whole block and copy only its tail.
	if (offset_within_block)
	{
		compress_xof(self.input_cv[..], self.block, self.block_len, output_block_counter, self.flags | Blake3Flags.ROOT, wide_buf[..]);
		usz avail = BLOCK_SIZE - offset_within_block;
		usz bytes = min(into.len, avail);
		into[:bytes] = wide_buf[offset_within_block:bytes];
		into = into[bytes..];
		output_block_counter++;
	}

	// Whole blocks, written straight into the destination.
	// (Replaces the previous hard-coded `64` / `& -64ll` masking with the
	// module's BLOCK_SIZE constant — identical arithmetic, clearer intent.)
	usz full_blocks = into.len / BLOCK_SIZE;
	if (full_blocks)
	{
		@xof_many(self.input_cv[..], self.block, self.block_len, output_block_counter, self.flags | Blake3Flags.ROOT, into, full_blocks);
	}
	output_block_counter += full_blocks;
	into = into[full_blocks * BLOCK_SIZE ..];

	// Trailing partial block: generate one more block, copy the head.
	if (into.len)
	{
		compress_xof(self.input_cv[..], self.block, self.block_len, output_block_counter, self.flags | Blake3Flags.ROOT, wide_buf[..]);
		into[..] = wide_buf[:into.len];
	}
}
|
|
|
|
|
|
// =================================================================================================
|
|
// =================================================================================================
|
|
// =================================================================================================
|
|
// WELCOME TO THE COMPUTATION GARDEN...
|
|
//
|
|
// You wanna understand BLAKE3? You gotta get through us.
|
|
// ______________________________
|
|
// ༼ ºل͟º ༼ ºل͟º ༼ ºل͟º ༽ ºل͟º ༽ ºل͟º ༽
|
|
//
|
|
//
|
|
macro uint @popcnt(#x) @local => (uint)#x.popcount(); // number of set bits
macro uint @highest_one(#x) @local => 63 ^ (uint)#x.clz(); // index of the highest set bit (63 ^ clz assumes a 64-bit operand — TODO confirm)
macro usz @round_down_to_power_of_2(#x) @local => (usz)1 << @highest_one(#x | 1); // largest power of 2 <= x (yields 1 for x == 0, via the `| 1`)

// Byte length of the left subtree for an input of `input_len` bytes: the
// largest power-of-2 count of whole chunks strictly smaller than the total,
// scaled back up to bytes.
macro left_subtree_len(usz input_len) @local
	=> @round_down_to_power_of_2((input_len - 1) / CHUNK_SIZE) * CHUNK_SIZE;
|
|
|
|
|
|
// The BLAKE3 quarter-round ("G") function: mixes two message words x, y into
// the four state words a, b, c, d using rotation amounts 16/12/8/7.
macro @g(#state, a, b, c, d, x, y) @local
{
	#state[a] += #state[b] + x;
	#state[d] = (#state[d] ^ #state[a]).rotr(16);
	#state[c] += #state[d];
	#state[b] = (#state[b] ^ #state[c]).rotr(12);
	#state[a] += #state[b] + y;
	#state[d] = (#state[d] ^ #state[a]).rotr(8);
	#state[c] += #state[d];
	#state[b] = (#state[b] ^ #state[c]).rotr(7);
}
|
|
|
|
// One full BLAKE3 round: four "column" G applications followed by four
// "diagonal" ones, with message words selected by this round's schedule row.
macro @round(uint[] state, uint* msg, usz round) @local
{
	char* schedule = &MESSAGE_SCHEDULE[round];
	// Columns.
	@g(state, 0, 4, 8, 12, msg[schedule[0] ], msg[schedule[1] ]);
	@g(state, 1, 5, 9, 13, msg[schedule[2] ], msg[schedule[3] ]);
	@g(state, 2, 6, 10, 14, msg[schedule[4] ], msg[schedule[5] ]);
	@g(state, 3, 7, 11, 15, msg[schedule[6] ], msg[schedule[7] ]);
	// Diagonals.
	@g(state, 0, 5, 10, 15, msg[schedule[8] ], msg[schedule[9] ]);
	@g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
	@g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
	@g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}
|
|
|
|
// Shared core of the compression function: loads the block into 16 message
// words, assembles the 16-word initial state, and runs all 7 rounds. Callers
// then fold `state` into a chaining value (compress_in_place) or into 64
// bytes of XOF output (compress_xof).
fn void compress_pre(uint[] state, uint[] cv, char[BLOCK_SIZE] block, usz block_len, ulong counter, char flags) @local @noinline
{
	uint[16] block_words @noinit;
	// Unaligned 4-byte loads of the block as message words.
	// NOTE(review): the spec requires little-endian word loads; confirm
	// mem::load's behavior on big-endian targets.
	foreach (i, &b : block_words) *b = mem::load((uint*)&block[i * 4], 1);
	// State layout: cv (8 words) | IV[0..3] | counter lo,hi | block_len | flags.
	state[0:8] = cv[0:8];
	state[8:4] = IV[0:4];
	state[12] = (uint)counter;
	state[13] = (uint)(counter >> 32);
	state[14] = (uint)block_len;
	state[15] = (uint)flags;
	@round(state, &block_words[0], 0);
	@round(state, &block_words[0], 1);
	@round(state, &block_words[0], 2);
	@round(state, &block_words[0], 3);
	@round(state, &block_words[0], 4);
	@round(state, &block_words[0], 5);
	@round(state, &block_words[0], 6);
}
|
|
|
|
// Compress one block and fold the result back into `cv` as the truncated,
// feed-forward output: cv[i] = state[i] ^ state[i + 8].
macro compress_in_place(uint[] cv, char[BLOCK_SIZE] block, usz block_len, ulong counter, char flags) @local
{
	uint[16] state @noinit;
	compress_pre(state[..], cv, block, block_len, counter, flags);
	for (usz i = 0; i < 8; i++) cv[i] = state[i] ^ state[i + 8];
}
|
|
|
|
// Compress one block and emit the full 64-byte XOF output: the first 8 words
// are state[i] ^ state[i+8], the last 8 are state[i+8] ^ cv[i].
macro compress_xof(uint[] cv, char[BLOCK_SIZE] block, usz block_len, ulong counter, char flags, char[] out) @local
{
	uint[16] state @noinit;
	compress_pre(state[..], cv, block, block_len, counter, flags);
	// Unrolled at compile time; unaligned 4-byte word stores.
	$for usz $i = 0; $i < 8; $i++: mem::store((uint*)&out[4 * $i], state[$i] ^ state[$i + 8], 1); $endfor
	$for usz $i = 0; $i < 8; $i++: mem::store((uint*)&out[4 * (8 + $i)], state[$i + 8] ^ cv[$i], 1); $endfor
}
|
|
|
|
// Emit `out_blocks` consecutive 64-byte XOF blocks into `out`, incrementing
// the block counter for each one.
macro @xof_many(uint[] cv, char[BLOCK_SIZE] block, usz block_len, ulong counter, char flags, char[] out, usz out_blocks) @local
{
	for (usz i = 0; i < out_blocks; i++, out = out[BLOCK_SIZE..]) compress_xof(cv, block, block_len, counter + i, flags, out);
}
|
|
|
|
// Hash `blocks` consecutive blocks of `input` as one leaf: `flags_start` is
// applied only to the first block and `flags_end` only to the last, and the
// resulting 8-word chaining value is serialized into `out`.
macro hash_one(char* input, usz blocks, uint[] key, ulong counter, char flags, char flags_start, char flags_end, char[] out) @local
{
	uint[8] cv;
	cv[..] = key[..];
	char block_flags = flags | flags_start;
	// block_flags resets to the base flags after the first iteration.
	for (; blocks > 0; input += BLOCK_SIZE, blocks--, block_flags = flags)
	{
		if (blocks == 1) block_flags |= flags_end;
		compress_in_place(cv[..], input[:BLOCK_SIZE], BLOCK_SIZE, counter, block_flags);
	}
	// Serialize the chaining value as 8 unaligned 4-byte words.
	foreach (i, c : cv) mem::store((uint*)&out[i * 4], c, 1);
}
|
|
|
|
// Hash `num_inputs` equal-length inputs serially, writing one 32-byte
// chaining value per input to `out`. $increment_counter distinguishes leaf
// hashing (counter advances per chunk) from parent hashing (counter fixed).
macro hash_many(char*[] inputs, usz num_inputs, usz blocks, uint[] key, ulong counter, bool $increment_counter, char flags, char flags_start, char flags_end, char* out) @local
{
	for (; num_inputs > 0; num_inputs--, inputs = inputs[1..], out += OUT_SIZE)
	{
		hash_one(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out[:OUT_SIZE]);
		$if $increment_counter: counter++; $endif
	}
}
|
|
|
|
|
|
// Compress a subtree (> 1 chunk) down to exactly two child chaining values —
// the two halves of a would-be parent node — stored into `out` (64 bytes).
fn void compress_subtree_to_parent_node(char[] input, uint[] key, ulong chunk_counter, char flags, char[] out, bool use_tbb) @local @noinline
{
	char[MAX_SIMD_DEGREE_OR_2 * OUT_SIZE] cv_array;

	usz num_cvs = compress_subtree_wide(input, key, chunk_counter, flags, cv_array[..], use_tbb);
	assert(num_cvs <= 2);

	// Wide-SIMD builds may produce more than two CVs, folded pairwise here.
	// Compiled out while MAX_SIMD_DEGREE_OR_2 == 2 (the current portable
	// build). NOTE(review): if this branch is ever enabled, revisit it — the
	// loop never copies out_array back into cv_array between iterations, and
	// the assert above (<= 2) would fire before the loop could run.
	$if MAX_SIMD_DEGREE_OR_2 > 2:
	char[MAX_SIMD_DEGREE_OR_2 * OUT_SIZE / 2] out_array;
	while (num_cvs > 2) num_cvs = compress_parents_parallel(cv_array[..], num_cvs, key, flags, &out_array);
	$endif

	out[..] = cv_array[:2 * OUT_SIZE];
}
|
|
|
|
// Recursively compress a subtree into as many chaining values as the SIMD
// degree allows, writing them to `out` and returning how many were produced.
fn usz compress_subtree_wide(char[] input, uint[] key, ulong chunk_counter, char flags, char* out, bool use_tbb) @local @noinline
{
	// Base case: few enough chunks to hash in a single parallel batch.
	if (input.len <= @simd_degree() * CHUNK_SIZE) return compress_chunks_parallel(input, key, chunk_counter, flags, out);

	// Split at the left-subtree boundary (largest aligned power-of-2 chunks).
	usz left_input_len = left_subtree_len(input.len);
	usz right_input_len = input.len - left_input_len;
	char* right_input = &input[left_input_len];
	ulong right_chunk_counter = chunk_counter + (ulong)(left_input_len / CHUNK_SIZE);

	char[2 * MAX_SIMD_DEGREE_OR_2 * OUT_SIZE] cv_array;
	usz degree = @simd_degree();
	// At degree 1, reserve room for two CVs so the left half can still
	// deliver a pair that we combine below.
	if (left_input_len > CHUNK_SIZE && degree == 1) degree = 2;
	char* right_cvs = &cv_array[degree * OUT_SIZE];

	usz left_n = compress_subtree_wide(input[:left_input_len], key, chunk_counter, flags, &cv_array, use_tbb);
	usz right_n = compress_subtree_wide(right_input[:right_input_len], key, right_chunk_counter, flags, right_cvs, use_tbb);

	// One CV per side cannot be reduced further at this level: hand the pair
	// straight back to the caller.
	if (left_n == 1)
	{
		out[:2 * OUT_SIZE] = cv_array[:2 * OUT_SIZE];
		return 2;
	}

	// Otherwise condense all child CVs into parent nodes.
	return compress_parents_parallel(cv_array[..], left_n + right_n, key, flags, out);
}
|
|
|
|
// Pair up child chaining values into parent nodes and hash each pair, writing
// one parent CV per pair to `out`. An odd trailing child CV is copied through
// unchanged. Returns the number of CVs written (pairs + possible leftover).
fn usz compress_parents_parallel(char[] child_chaining_values, usz num_chaining_values, uint[] key, char flags, char* out) @local @noinline
{
	char*[MAX_SIMD_DEGREE_OR_2] parents_array;
	usz parents_array_len = 0;

	// Collect a pointer to each complete pair of child CVs (64 bytes per
	// pair). The index must use the PRE-increment value of the length;
	// keeping the increment on its own line (instead of the previous
	// `parents_array[parents_array_len++] = ...[2 * parents_array_len * ...]`)
	// removes any evaluation-order ambiguity between the subscripts.
	while (num_chaining_values - (2 * parents_array_len) >= 2)
	{
		parents_array[parents_array_len] = &child_chaining_values[2 * parents_array_len * OUT_SIZE];
		parents_array_len++;
	}

	// One 64-byte block per pair, counter pinned at 0, PARENT flag set.
	hash_many(parents_array[:parents_array_len], parents_array_len, 1, key, 0, false, flags | Blake3Flags.PARENT, 0, 0, out);

	// Odd child out: promote it unchanged to the next tree level.
	if (num_chaining_values > 2 * parents_array_len)
	{
		out[parents_array_len * OUT_SIZE : OUT_SIZE] = child_chaining_values[2 * parents_array_len * OUT_SIZE : OUT_SIZE];
		return parents_array_len + 1;
	}

	return parents_array_len;
}
|
|
|
|
// Hash up to @simd_degree() whole chunks from `input` in one hash_many batch,
// then handle any partial trailing chunk with a scalar chunk state. Writes one
// 32-byte CV per chunk to `out` and returns the number of CVs produced.
fn usz compress_chunks_parallel(char[] input, uint[] key, ulong chunk_counter, char flags, char* out) @local @noinline
{
	char*[MAX_SIMD_DEGREE] chunks_array;
	usz input_position = 0;
	usz chunks_array_len = 0;

	// Collect a pointer to the start of every complete chunk.
	for (; input.len - input_position >= CHUNK_SIZE; input_position += CHUNK_SIZE)
	{
		chunks_array[chunks_array_len++] = &input[input_position];
	}

	// Leaf hashing: the counter advances per chunk; START/END flags are
	// applied per block inside each chunk.
	hash_many(chunks_array[:chunks_array_len], chunks_array_len, CHUNK_SIZE / BLOCK_SIZE, key, chunk_counter, true, flags, Blake3Flags.CHUNK_START, Blake3Flags.CHUNK_END, out);

	// No leftover bytes: done.
	if (input.len <= input_position) return chunks_array_len;

	// Partial trailing chunk: hash it through an ordinary chunk state.
	ulong counter = chunk_counter + (ulong)chunks_array_len;
	Blake3ChunkState chunk_state;
	chunk_state.init(key, flags);
	chunk_state.chunk_counter = counter;
	chunk_state.update(input[input_position : input.len - input_position]);
	Blake3Output o = chunk_state.output();
	o.chaining_value(&out[chunks_array_len * OUT_SIZE]);

	return chunks_array_len + 1;
}
|