From 9ad25b2aac5b86439575ccfaf257bd271bcde28d Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Tue, 7 Mar 2023 11:06:49 -0800 Subject: [PATCH 01/23] optimizations --- .../kernel/asm/hash/blake2b/compression.asm | 29 +-- .../kernel/asm/hash/blake2b/g_functions.asm | 205 +++++++++++++----- 2 files changed, 154 insertions(+), 80 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm index a25158d9..840d8c54 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm @@ -181,30 +181,15 @@ compression_loop: POP POP // stack: cur_block, retdest - %blake2b_internal_state_addr - // stack: start, cur_block, retdest - PUSH 0 - // stack: round=0, start, cur_block, retdest // Run 12 rounds of G functions. - %rep 12 - // stack: round, start, cur_block, retdest - %call_blake2b_g_function(0, 4, 8, 12, 0, 1) - %call_blake2b_g_function(1, 5, 9, 13, 2, 3) - %call_blake2b_g_function(2, 6, 10, 14, 4, 5) - %call_blake2b_g_function(3, 7, 11, 15, 6, 7) - %call_blake2b_g_function(0, 5, 10, 15, 8, 9) - %call_blake2b_g_function(1, 6, 11, 12, 10, 11) - %call_blake2b_g_function(2, 7, 8, 13, 12, 13) - %call_blake2b_g_function(3, 4, 9, 14, 14, 15) - // stack: round, start, cur_block, retdest - %increment - // stack: round + 1, start, cur_block, retdest - %endrep - // stack: 12, start, cur_block, retdest - POP - POP - + PUSH g_functions_return + // stack: g_functions_return, cur_block, retdest + %blake2b_internal_state_addr + // stack: start, g_functions_return, cur_block, retdest + %jump(run_12_rounds_g_function) +g_functions_return: + // Finalize hash value. 
// stack: cur_block, retdest %blake2b_generate_new_hash_value(7) diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/g_functions.asm b/evm/src/cpu/kernel/asm/hash/blake2b/g_functions.asm index 11e879fc..831841c5 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/g_functions.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/g_functions.asm @@ -1,89 +1,89 @@ -%macro blake2b_g_function +blake2b_g_function: // Function to mix two input words, x and y, into the four words indexed by a, b, c, d (which // are in the range 0..16) in the internal state. // The internal state is stored in memory starting at the address start. - // stack: a, b, c, d, x, y, start + // stack: a, b, c, d, x, y, start, retdest %stack (indices: 4) -> (indices, indices) - // stack: a, b, c, d, a, b, c, d, x, y, start + // stack: a, b, c, d, a, b, c, d, x, y, start, retdest DUP11 - // stack: start, a, b, c, d, a, b, c, d, x, y, start + // stack: start, a, b, c, d, a, b, c, d, x, y, start, retdest %stack (start, a, b, c, d) -> (d, start, c, start, b, start, a, start) - // stack: d, start, c, start, b, start, a, start, a, b, c, d, x, y, start + // stack: d, start, c, start, b, start, a, start, a, b, c, d, x, y, start, retdest ADD %mload_kernel_general - // stack: v[d], c, start, b, start, a, start, a, b, c, d, x, y, start + // stack: v[d], c, start, b, start, a, start, a, b, c, d, x, y, start, retdest %stack (vd, remaining: 6) -> (remaining, vd) - // stack: c, start, b, start, a, start, v[d], a, b, c, d, x, y, start + // stack: c, start, b, start, a, start, v[d], a, b, c, d, x, y, start, retdest ADD %mload_kernel_general %stack (vc, remaining: 4) -> (remaining, vc) - // stack: b, start, a, start, v[c], v[d], a, b, c, d, x, y, start + // stack: b, start, a, start, v[c], v[d], a, b, c, d, x, y, start, retdest ADD %mload_kernel_general - // stack: v[b], a, start, v[c], v[d], a, b, c, d, x, y, start + // stack: v[b], a, start, v[c], v[d], a, b, c, d, x, y, start, retdest %stack (vb, remaining: 2) -> (remaining, 
vb) - // stack: a, start, v[b], v[c], v[d], a, b, c, d, x, y, start + // stack: a, start, v[b], v[c], v[d], a, b, c, d, x, y, start, retdest ADD %mload_kernel_general - // stack: v[a], v[b], v[c], v[d], a, b, c, d, x, y, start + // stack: v[a], v[b], v[c], v[d], a, b, c, d, x, y, start, retdest DUP2 - // stack: v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, y, start + // stack: v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, y, start, retdest DUP10 - // stack: x, v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, y, start + // stack: x, v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, y, start, retdest ADD ADD %as_u64 - // stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[b], v[c], v[d], a, b, c, d, x, y, start + // stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[b], v[c], v[d], a, b, c, d, x, y, start, retdest %stack (a, b, c, d) -> (a, d, a, b, c, d) - // stack: v[a]', v[d], v[a]', v[b], v[c], v[d], a, b, c, d, x, y, start + // stack: v[a]', v[d], v[a]', v[b], v[c], v[d], a, b, c, d, x, y, start, retdest XOR %rotr_64(32) - // stack: v[d]' = (v[d] ^ v[a]') >>> 32, v[a]', v[b], v[c], v[d], a, b, c, d, x, y, start + // stack: v[d]' = (v[d] ^ v[a]') >>> 32, v[a]', v[b], v[c], v[d], a, b, c, d, x, y, start, retdest %stack (top: 4, vd) -> (top) - // stack: v[d]', v[a]', v[b], v[c], a, b, c, d, x, y, start + // stack: v[d]', v[a]', v[b], v[c], a, b, c, d, x, y, start, retdest %stack (d, a, b, c) -> (c, d, a, b, d) - // stack: v[c], v[d]', v[a]', v[b], v[d]', a, b, c, d, x, y, start + // stack: v[c], v[d]', v[a]', v[b], v[d]', a, b, c, d, x, y, start, retdest ADD %as_u64 - // stack: v[c]' = (v[c] + v[d]') % 2^64, v[a]', v[b], v[d]', a, b, c, d, x, y, start + // stack: v[c]' = (v[c] + v[d]') % 2^64, v[a]', v[b], v[d]', a, b, c, d, x, y, start, retdest %stack (c, a, b, d) -> (b, c, a, c, d) - // stack: v[b], v[c]', v[a]', v[c]', v[d]', a, b, c, d, x, y, start + // stack: v[b], v[c]', v[a]', v[c]', v[d]', a, b, c, d, x, y, start, retdest XOR %rotr_64(24) - // stack: v[b]' = (v[b] ^ v[c]') >>> 24, 
v[a]', v[c]', v[d]', a, b, c, d, x, y, start + // stack: v[b]' = (v[b] ^ v[c]') >>> 24, v[a]', v[c]', v[d]', a, b, c, d, x, y, start, retdest SWAP1 - // stack: v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start + // stack: v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start, retdest DUP2 - // stack: v[b]', v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start + // stack: v[b]', v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start, retdest DUP11 - // stack: y, v[b]', v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start + // stack: y, v[b]', v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start, retdest ADD ADD %as_u64 - // stack: v[a]'' = (v[a]' + v[b]' + y) % 2^64, v[b]', v[c]', v[d]', a, b, c, d, x, y, start + // stack: v[a]'' = (v[a]' + v[b]' + y) % 2^64, v[b]', v[c]', v[d]', a, b, c, d, x, y, start, retdest SWAP3 - // stack: v[d]', v[b]', v[c]', v[a]'', a, b, c, d, x, y, start + // stack: v[d]', v[b]', v[c]', v[a]'', a, b, c, d, x, y, start, retdest DUP4 - // stack: v[a]'', v[d]', v[b]', v[c]', v[a]'', a, b, c, d, x, y, start + // stack: v[a]'', v[d]', v[b]', v[c]', v[a]'', a, b, c, d, x, y, start, retdest XOR %rotr_64(16) - // stack: v[d]'' = (v[a]'' ^ v[d]') >>> 8, v[b]', v[c]', v[a]'', a, b, c, d, x, y, start + // stack: v[d]'' = (v[a]'' ^ v[d]') >>> 8, v[b]', v[c]', v[a]'', a, b, c, d, x, y, start, retdest SWAP2 - // stack: v[c]', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start + // stack: v[c]', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start, retdest DUP3 - // stack: v[d]'', v[c]', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start + // stack: v[d]'', v[c]', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start, retdest ADD %as_u64 - // stack: v[c]'' = (v[c]' + v[d]'') % 2^64, v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start + // stack: v[c]'' = (v[c]' + v[d]'') % 2^64, v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start, retdest DUP1 - // stack: v[c]'', v[c]'', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start + // stack: v[c]'', v[c]'', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, 
start, retdest SWAP2 - // stack: v[b]', v[c]'', v[c]'', v[d]'', v[a]'', a, b, c, d, x, y, start + // stack: v[b]', v[c]'', v[c]'', v[d]'', v[a]'', a, b, c, d, x, y, start, retdest XOR %rotr_64(63) - // stack: v[b]'' = (v[b]' ^ v[c]'') >>> 7, v[c]'', v[d]'', v[a]'', a, b, c, d, x, y, start + // stack: v[b]'' = (v[b]' ^ v[c]'') >>> 7, v[c]'', v[d]'', v[a]'', a, b, c, d, x, y, start, retdest %stack (vb, vc, vd, va, a, b, c, d, x, y, start) -> (start, a, va, start, b, vb, start, c, vc, start, d, vd) - // stack: start, a, v[a]'', start, b, v[b]'', start, c, v[c]'', start, d, v[d]'' + // stack: start, a, v[a]'', start, b, v[b]'', start, c, v[c]'', start, d, v[d]'', retdest ADD %mstore_kernel_general ADD @@ -92,35 +92,124 @@ %mstore_kernel_general ADD %mstore_kernel_general -%endmacro + // stack: retdest + JUMP -%macro call_blake2b_g_function(a, b, c, d, x_idx, y_idx) - // stack: round, start - PUSH $y_idx - DUP2 - // stack: round, y_idx, round, start +call_blake2b_g_function: + // stack: a, b, c, d, x_idx, y_idx, round, start, retdest + DUP6 + // stack: y_idx, a, b, c, d, x_idx, y_idx, round, start, retdest + DUP8 + // stack: round, y_idx, a, b, c, d, x_idx, y_idx, round, start, retdest %blake2b_permutation - // stack: s[y_idx], round, start + // stack: s[y_idx], a, b, c, d, x_idx, y_idx, round, start, retdest %blake2b_message_addr ADD %mload_kernel_general - // stack: m[s[y_idx]], round, start - PUSH $x_idx - DUP3 - // stack: round, 2, m[s[y_idx]], round, start + // stack: m[s[y_idx]], a, b, c, d, x_idx, y_idx, round, start, retdest + DUP6 + // stack: x_idx, m[s[y_idx]], a, b, c, d, x_idx, y_idx, round, start, retdest + DUP9 + // stack: round, x_idx, m[s[y_idx]], a, b, c, d, x_idx, y_idx, round, start, retdest %blake2b_permutation - // stack: s[x_idx], m[s[y_idx]], round, start + // stack: s[x_idx], m[s[y_idx]], a, b, c, d, x_idx, y_idx, round, start, retdest %blake2b_message_addr ADD %mload_kernel_general - // stack: m[s[x_idx]], m[s[y_idx]], round, start - %stack (ss: 
2, r, s) -> (ss, s, r, s) - // stack: m[s[x_idx]], m[s[y_idx]], start, round, start - PUSH $d - PUSH $c - PUSH $b - PUSH $a - // stack: a, b, c, d, m[s[x_idx]], m[s[y_idx]], start, round, start - %blake2b_g_function - // stack: round, start -%endmacro + // stack: m[s[x_idx]], m[s[y_idx]], a, b, c, d, x_idx, y_idx, round, start, retdest + %stack (mm: 2, abcd: 4, xy: 2, r, s) -> (abcd, mm, s, r, s) + // stack: a, b, c, d, m[s[x_idx]], m[s[y_idx]], start, round, start, retdest + %jump(blake2b_g_function) + +global run_g_function_round: + // stack: round, start, retdest + PUSH g_function_return_1 + // stack: g_function_return_1, round, start, retdest + %stack (ret, r, s) -> (0, 4, 8, 12, 0, 1, r, s, ret, r, s) + // stack: a=0, b=4, c=8, d=12, x_idx=0, y_idx=1, round, start, g_function_return_1, round, start, retdest + %jump(call_blake2b_g_function) +g_function_return_1: + // stack: round, start, retdest + PUSH g_function_return_2 + // stack: g_function_return_2, round, start, retdest + %stack (ret, r, s) -> (1, 5, 9, 13, 2, 3, r, s, ret, r, s) + // stack: a=1, b=5, c=9, d=13, x_idx=2, y_idx=3, round, start, g_function_return_2, round, start, retdest + %jump(call_blake2b_g_function) +g_function_return_2: + // stack: round, start, retdest + PUSH g_function_return_3 + // stack: g_function_return_3, round, start, retdest + %stack (ret, r, s) -> (2, 6, 10, 14, 4, 5, r, s, ret, r, s) + // stack: a=2, b=6, c=10, d=14, x_idx=4, y_idx=5, round, start, g_function_return_3, round, start, retdest + %jump(call_blake2b_g_function) +g_function_return_3: + // stack: round, start, retdest + PUSH g_function_return_4 + // stack: g_function_return_4, round, start, retdest + %stack (ret, r, s) -> (3, 7, 11, 15, 6, 7, r, s, ret, r, s) + // stack: a=3, b=7, c=11, d=15, x_idx=6, y_idx=7, round, start, g_function_return_4, round, start, retdest + %jump(call_blake2b_g_function) +g_function_return_4: + // stack: round, start, retdest + PUSH g_function_return_5 + // stack: g_function_return_5, 
round, start, retdest + %stack (ret, r, s) -> (0, 5, 10, 15, 8, 9, r, s, ret, r, s) + // stack: a=0, b=5, c=10, d=15, x_idx=8, y_idx=9, round, start, g_function_return_5, round, start, retdest + %jump(call_blake2b_g_function) +g_function_return_5: + // stack: round, start, retdest + PUSH g_function_return_6 + // stack: g_function_return_6, round, start, retdest + %stack (ret, r, s) -> (1, 6, 11, 12, 10, 11, r, s, ret, r, s) + // stack: a=1, b=6, c=11, d=12, x_idx=10, y_idx=11, round, start, g_function_return_6, round, start, retdest + %jump(call_blake2b_g_function) +g_function_return_6: + // stack: round, start, retdest + PUSH g_function_return_7 + // stack: g_function_return_7, round, start, retdest + %stack (ret, r, s) -> (2, 7, 8, 13, 12, 13, r, s, ret, r, s) + // stack: a=2, b=7, c=8, d=13, x_idx=12, y_idx=13, round, start, g_function_return_7, round, start, retdest + %jump(call_blake2b_g_function) +g_function_return_7: + // stack: round, start, retdest + PUSH g_function_return_8 + // stack: g_function_return_8, round, start, retdest + %stack (ret, r, s) -> (3, 4, 9, 14, 14, 15, r, s, ret, r, s) + // stack: a=3, b=4, c=9, d=14, x_idx=14, y_idx=15, round, start, g_function_return_8, round, start, retdest + %jump(call_blake2b_g_function) +g_function_return_8: + // stack: round, start, retdest + SWAP1 + // stack: start, round, retdest + SWAP2 + // stack: retdest, round, start + JUMP + + +global run_12_rounds_g_function: + // stack: start, retdest + PUSH 0 + // stack: round=0, start, retdest +run_next_round_g_function: + // stack: round, start, retdest + PUSH run_g_function_round_return + // stack: run_g_function_round_return, round, start, retdest + SWAP2 + // stack: start, round, run_g_function_round_return, retdest + SWAP1 + // stack: round, start, run_g_function_round_return, retdest + %jump(run_g_function_round) +run_next_round_g_function_return: + // stack: round, start, retdest + %increment + // stack: round+1, start, retdest + DUP1 + // stack: round+1, 
round+1, start, retdest + %lt_const(12) + // stack: round+1 < 12, round+1, start, retdest + %jumpi(run_next_round_g_function) + // stack: round+1, start, retdest + %pop2 + // stack: retdest + JUMP + From 4e8af821aefb3042008aefcfe906e520619f949b Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Tue, 7 Mar 2023 12:25:34 -0800 Subject: [PATCH 02/23] fixes --- evm/src/cpu/kernel/asm/hash/blake2b/g_functions.asm | 12 ++++++------ evm/src/cpu/kernel/tests/hash.rs | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/g_functions.asm b/evm/src/cpu/kernel/asm/hash/blake2b/g_functions.asm index 831841c5..ff3ee43a 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/g_functions.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/g_functions.asm @@ -117,8 +117,8 @@ call_blake2b_g_function: ADD %mload_kernel_general // stack: m[s[x_idx]], m[s[y_idx]], a, b, c, d, x_idx, y_idx, round, start, retdest - %stack (mm: 2, abcd: 4, xy: 2, r, s) -> (abcd, mm, s, r, s) - // stack: a, b, c, d, m[s[x_idx]], m[s[y_idx]], start, round, start, retdest + %stack (mm: 2, abcd: 4, xy: 2, r, s) -> (abcd, mm, s) + // stack: a, b, c, d, m[s[x_idx]], m[s[y_idx]], start, retdest %jump(blake2b_g_function) global run_g_function_round: @@ -192,12 +192,12 @@ global run_12_rounds_g_function: // stack: round=0, start, retdest run_next_round_g_function: // stack: round, start, retdest - PUSH run_g_function_round_return - // stack: run_g_function_round_return, round, start, retdest + PUSH run_next_round_g_function_return + // stack: run_next_round_g_function_return, round, start, retdest SWAP2 - // stack: start, round, run_g_function_round_return, retdest + // stack: start, round, run_next_round_g_function_return, retdest SWAP1 - // stack: round, start, run_g_function_round_return, retdest + // stack: round, start, run_next_round_g_function_return, retdest %jump(run_g_function_round) run_next_round_g_function_return: // stack: round, start, retdest diff --git 
a/evm/src/cpu/kernel/tests/hash.rs b/evm/src/cpu/kernel/tests/hash.rs index bc73ecd5..cc2c96eb 100644 --- a/evm/src/cpu/kernel/tests/hash.rs +++ b/evm/src/cpu/kernel/tests/hash.rs @@ -79,6 +79,8 @@ fn prepare_test( // Run the interpeter let result = run_interpreter_with_memory(interpreter_setup).unwrap(); + dbg!(result.stack().to_vec()); + Ok((expected, result.stack().to_vec())) } From ef377c0b4f612a3e65ffa3beb2fe95fa23f4cbe5 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Tue, 7 Mar 2023 12:27:08 -0800 Subject: [PATCH 03/23] cleanup --- evm/src/cpu/kernel/tests/hash.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/evm/src/cpu/kernel/tests/hash.rs b/evm/src/cpu/kernel/tests/hash.rs index cc2c96eb..bc73ecd5 100644 --- a/evm/src/cpu/kernel/tests/hash.rs +++ b/evm/src/cpu/kernel/tests/hash.rs @@ -79,8 +79,6 @@ fn prepare_test( // Run the interpeter let result = run_interpreter_with_memory(interpreter_setup).unwrap(); - dbg!(result.stack().to_vec()); - Ok((expected, result.stack().to_vec())) } From 3a0d86e26268404857eee9e71ba4874437408d49 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Tue, 7 Mar 2023 15:45:20 -0800 Subject: [PATCH 04/23] hash function optimization --- .../kernel/asm/hash/blake2b/compression.asm | 12 +-- evm/src/cpu/kernel/asm/hash/blake2b/hash.asm | 91 +++++++++++++++++-- 2 files changed, 86 insertions(+), 17 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm index 840d8c54..fdf02d69 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm @@ -192,14 +192,10 @@ g_functions_return: // Finalize hash value. 
// stack: cur_block, retdest - %blake2b_generate_new_hash_value(7) - %blake2b_generate_new_hash_value(6) - %blake2b_generate_new_hash_value(5) - %blake2b_generate_new_hash_value(4) - %blake2b_generate_new_hash_value(3) - %blake2b_generate_new_hash_value(2) - %blake2b_generate_new_hash_value(1) - %blake2b_generate_new_hash_value(0) + PUSH hash_generate_return + // stack: hash_generate_return, cur_block, retdest + %jump(blake2b_generate_all_hash_values) +hash_generate_return: // stack: h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7', cur_block, retdest DUP9 // stack: cur_block, h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7', cur_block, retdest diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/hash.asm b/evm/src/cpu/kernel/asm/hash/blake2b/hash.asm index 712a97c0..91a5530e 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/hash.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/hash.asm @@ -1,18 +1,91 @@ -%macro blake2b_generate_new_hash_value(i) +blake2b_generate_new_hash_value: + // stack: i, retdest %blake2b_hash_value_addr - %add_const($i) + // stack: addr, i, retdest + DUP2 + ADD %mload_kernel_general - // stack: h_i, ... + // stack: h_i, i, retdest %blake2b_internal_state_addr - %add_const($i) + // stack: addr, h_i, i, retdest + DUP3 + ADD %mload_kernel_general - // stack: v_i, h_i, ... + // stack: v_i, h_i, i, retdest %blake2b_internal_state_addr - %add_const($i) + // stack: addr, v_i, h_i, i, retdest + DUP4 + ADD %add_const(8) %mload_kernel_general - // stack: v_(i+8), v_i, h_i, ... + // stack: v_(i+8), v_i, h_i, i, retdest XOR XOR - // stack: h_i' = v_(i+8) ^ v_i ^ h_i, ... 
-%endmacro + // stack: h_i' = v_(i+8) ^ v_i ^ h_i, i, retdest + SWAP1 + POP + // stack: h_i', retdest + SWAP1 + JUMP + +global blake2b_generate_all_hash_values: + // stack: retdest + PUSH blake2b_generate_hash_return_7 + // stack: blake2b_generate_hash_return_7, retdest + PUSH 7 + // stack: 7, blake2b_generate_hash_return_7, retdest + %jump(blake2b_generate_new_hash_value) +blake2b_generate_hash_return_7: + // stack: h_7', retdest + PUSH blake2b_generate_hash_return_6 + // stack: blake2b_generate_hash_return_6, h_7', retdest + PUSH 6 + // stack: 6, blake2b_generate_hash_return_6, h_7', retdest + %jump(blake2b_generate_new_hash_value) +blake2b_generate_hash_return_6: + // stack: h_6', h_7', retdest + PUSH blake2b_generate_hash_return_5 + // stack: blake2b_generate_hash_return_5, h_6', h_7', retdest + PUSH 5 + // stack: 5, blake2b_generate_hash_return_5, h_6', h_7', retdest + %jump(blake2b_generate_new_hash_value) +blake2b_generate_hash_return_5: + // stack: h_5', h_6', h_7', retdest + PUSH blake2b_generate_hash_return_4 + // stack: blake2b_generate_hash_return_4, h_5', h_6', h_7', retdest + PUSH 4 + // stack: 4, blake2b_generate_hash_return_4, h_5', h_6', h_7', retdest + %jump(blake2b_generate_new_hash_value) +blake2b_generate_hash_return_4: + // stack: h_4', h_5', h_6', h_7', retdest + PUSH blake2b_generate_hash_return_3 + // stack: blake2b_generate_hash_return_3, h_4', h_5', h_6', h_7', retdest + PUSH 3 + // stack: 3, blake2b_generate_hash_return_3, h_4', h_5', h_6', h_7', retdest + %jump(blake2b_generate_new_hash_value) +blake2b_generate_hash_return_3: + // stack: h_3', h_4', h_5', h_6', h_7', retdest + PUSH blake2b_generate_hash_return_2 + // stack: blake2b_generate_hash_return_2, h_3', h_4', h_5', h_6', h_7', retdest + PUSH 2 + // stack: 2, blake2b_generate_hash_return_2, h_3', h_4', h_5', h_6', h_7', retdest + %jump(blake2b_generate_new_hash_value) +blake2b_generate_hash_return_2: + // stack: h_2', h_3', h_4', h_5', h_6', h_7', retdest + PUSH 
blake2b_generate_hash_return_1 + // stack: blake2b_generate_hash_return_1, h_2', h_3', h_4', h_5', h_6', h_7', retdest + PUSH 1 + // stack: 1, blake2b_generate_hash_return_1, h_2', h_3', h_4', h_5', h_6', h_7', retdest + %jump(blake2b_generate_new_hash_value) +blake2b_generate_hash_return_1: + // stack: h_1', h_2', h_3', h_4', h_5', h_6', h_7', retdest + PUSH blake2b_generate_hash_return_0 + // stack: blake2b_generate_hash_return_0, h_1', h_2', h_3', h_4', h_5', h_6', h_7', retdest + PUSH 0 + // stack: 0, blake2b_generate_hash_return_0, h_1', h_2', h_3', h_4', h_5', h_6', h_7', retdest + %jump(blake2b_generate_new_hash_value) +blake2b_generate_hash_return_0: + // stack: h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7', retdest + %stack (h: 8, ret) -> (ret, h) + // stack: retdest, h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7' + JUMP \ No newline at end of file From df7ea93ab330ec444801d8022ed151f25209cb10 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Tue, 7 Mar 2023 15:45:52 -0800 Subject: [PATCH 05/23] optimize hash generation further --- evm/src/cpu/kernel/asm/hash/blake2b/hash.asm | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/hash.asm b/evm/src/cpu/kernel/asm/hash/blake2b/hash.asm index 91a5530e..945c7fb0 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/hash.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/hash.asm @@ -14,17 +14,17 @@ blake2b_generate_new_hash_value: // stack: v_i, h_i, i, retdest %blake2b_internal_state_addr // stack: addr, v_i, h_i, i, retdest - DUP4 + SWAP1 + // stack: v_i, addr, h_i, i, retdest + SWAP3 + // stack: i, addr, h_i, v_i, retdest ADD %add_const(8) %mload_kernel_general - // stack: v_(i+8), v_i, h_i, i, retdest + // stack: v_(i+8), h_i, v_i, retdest XOR XOR - // stack: h_i' = v_(i+8) ^ v_i ^ h_i, i, retdest - SWAP1 - POP - // stack: h_i', retdest + // stack: h_i' = v_(i+8) ^ v_i ^ h_i, retdest SWAP1 JUMP @@ -88,4 +88,4 @@ blake2b_generate_hash_return_0: // 
stack: h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7', retdest %stack (h: 8, ret) -> (ret, h) // stack: retdest, h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7' - JUMP \ No newline at end of file + JUMP From 2020202e507595da62867f8d2760c8918379daac Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Tue, 7 Mar 2023 16:19:15 -0800 Subject: [PATCH 06/23] optimize hash generation further further --- evm/src/cpu/kernel/asm/hash/blake2b/hash.asm | 77 +++++--------------- evm/src/cpu/kernel/tests/hash.rs | 2 + 2 files changed, 22 insertions(+), 57 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/hash.asm b/evm/src/cpu/kernel/asm/hash/blake2b/hash.asm index 945c7fb0..654b51b3 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/hash.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/hash.asm @@ -30,62 +30,25 @@ blake2b_generate_new_hash_value: global blake2b_generate_all_hash_values: // stack: retdest - PUSH blake2b_generate_hash_return_7 - // stack: blake2b_generate_hash_return_7, retdest - PUSH 7 - // stack: 7, blake2b_generate_hash_return_7, retdest + PUSH 8 + // stack: i=8, retdest +blake2b_generate_hash_loop: + // stack: i, h_i', ..., h_7', retdest + %decrement + // stack: i-1, h_i', ..., h_7', retdest + PUSH blake2b_generate_hash_return + // stack: blake2b_generate_hash_return, i-1, h_i', ..., h_7', retdest + DUP2 + // stack: i-1, blake2b_generate_hash_return, i-1, h_i', ..., h_7', retdest %jump(blake2b_generate_new_hash_value) -blake2b_generate_hash_return_7: - // stack: h_7', retdest - PUSH blake2b_generate_hash_return_6 - // stack: blake2b_generate_hash_return_6, h_7', retdest - PUSH 6 - // stack: 6, blake2b_generate_hash_return_6, h_7', retdest - %jump(blake2b_generate_new_hash_value) -blake2b_generate_hash_return_6: - // stack: h_6', h_7', retdest - PUSH blake2b_generate_hash_return_5 - // stack: blake2b_generate_hash_return_5, h_6', h_7', retdest - PUSH 5 - // stack: 5, blake2b_generate_hash_return_5, h_6', h_7', retdest - 
%jump(blake2b_generate_new_hash_value) -blake2b_generate_hash_return_5: - // stack: h_5', h_6', h_7', retdest - PUSH blake2b_generate_hash_return_4 - // stack: blake2b_generate_hash_return_4, h_5', h_6', h_7', retdest - PUSH 4 - // stack: 4, blake2b_generate_hash_return_4, h_5', h_6', h_7', retdest - %jump(blake2b_generate_new_hash_value) -blake2b_generate_hash_return_4: - // stack: h_4', h_5', h_6', h_7', retdest - PUSH blake2b_generate_hash_return_3 - // stack: blake2b_generate_hash_return_3, h_4', h_5', h_6', h_7', retdest - PUSH 3 - // stack: 3, blake2b_generate_hash_return_3, h_4', h_5', h_6', h_7', retdest - %jump(blake2b_generate_new_hash_value) -blake2b_generate_hash_return_3: - // stack: h_3', h_4', h_5', h_6', h_7', retdest - PUSH blake2b_generate_hash_return_2 - // stack: blake2b_generate_hash_return_2, h_3', h_4', h_5', h_6', h_7', retdest - PUSH 2 - // stack: 2, blake2b_generate_hash_return_2, h_3', h_4', h_5', h_6', h_7', retdest - %jump(blake2b_generate_new_hash_value) -blake2b_generate_hash_return_2: - // stack: h_2', h_3', h_4', h_5', h_6', h_7', retdest - PUSH blake2b_generate_hash_return_1 - // stack: blake2b_generate_hash_return_1, h_2', h_3', h_4', h_5', h_6', h_7', retdest - PUSH 1 - // stack: 1, blake2b_generate_hash_return_1, h_2', h_3', h_4', h_5', h_6', h_7', retdest - %jump(blake2b_generate_new_hash_value) -blake2b_generate_hash_return_1: - // stack: h_1', h_2', h_3', h_4', h_5', h_6', h_7', retdest - PUSH blake2b_generate_hash_return_0 - // stack: blake2b_generate_hash_return_0, h_1', h_2', h_3', h_4', h_5', h_6', h_7', retdest - PUSH 0 - // stack: 0, blake2b_generate_hash_return_0, h_1', h_2', h_3', h_4', h_5', h_6', h_7', retdest - %jump(blake2b_generate_new_hash_value) -blake2b_generate_hash_return_0: - // stack: h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7', retdest - %stack (h: 8, ret) -> (ret, h) - // stack: retdest, h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7' +blake2b_generate_hash_return: + // stack: h_(i-1)', i-1, h_i', ..., 
h_7', retdest + SWAP1 + // stack: i-1, h_(i-1)', h_i', ..., h_7', retdest + DUP1 + // stack: i-1, i-1, h_(i-1)', ..., h_7', retdest + %jumpi(blake2b_generate_hash_loop) + // stack: i-1=0, h_0', ..., h_7', retdest + %stack (i, h: 8, ret) -> (ret, h) + // stack: retdest, h_0'...h_7' JUMP diff --git a/evm/src/cpu/kernel/tests/hash.rs b/evm/src/cpu/kernel/tests/hash.rs index bc73ecd5..cc2c96eb 100644 --- a/evm/src/cpu/kernel/tests/hash.rs +++ b/evm/src/cpu/kernel/tests/hash.rs @@ -79,6 +79,8 @@ fn prepare_test( // Run the interpeter let result = run_interpreter_with_memory(interpreter_setup).unwrap(); + dbg!(result.stack().to_vec()); + Ok((expected, result.stack().to_vec())) } From 7c8026e893952fb721d422b36c512e7caa9fd8be Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Tue, 7 Mar 2023 16:49:28 -0800 Subject: [PATCH 07/23] cleanup --- evm/src/cpu/kernel/asm/hash/blake2b/compression.asm | 1 - 1 file changed, 1 deletion(-) diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm index fdf02d69..11e5389b 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm @@ -189,7 +189,6 @@ compression_loop: // stack: start, g_functions_return, cur_block, retdest %jump(run_12_rounds_g_function) g_functions_return: - // Finalize hash value. 
// stack: cur_block, retdest PUSH hash_generate_return From 213ba8ff50835d99039b418dcaf31cd6f7698137 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Tue, 7 Mar 2023 17:17:23 -0800 Subject: [PATCH 08/23] optimized initial hash value generation --- .../cpu/kernel/asm/hash/blake2b/addresses.asm | 16 ----- .../kernel/asm/hash/blake2b/compression.asm | 4 +- evm/src/cpu/kernel/asm/hash/blake2b/iv.asm | 61 ++++++++++++++----- evm/src/cpu/kernel/tests/hash.rs | 2 - 4 files changed, 50 insertions(+), 33 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/addresses.asm b/evm/src/cpu/kernel/asm/hash/blake2b/addresses.asm index 9d65b9ed..f1d7c3e9 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/addresses.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/addresses.asm @@ -1,19 +1,3 @@ -// Load the initial hash value (the IV, but with params XOR'd into the first word). -%macro blake2b_initial_hash_value - %blake2b_iv_i(7) - %blake2b_iv_i(6) - %blake2b_iv_i(5) - %blake2b_iv_i(4) - %blake2b_iv_i(3) - %blake2b_iv_i(2) - %blake2b_iv_i(1) - // stack: IV_1, IV_2, IV_3, IV_4, IV_5, IV_6, IV_7 - PUSH 0x01010040 // params: key = 00, digest_size = 64 = 0x40 - %blake2b_iv_i(0) - XOR - // stack: IV_0 ^ params, IV_1, IV_2, IV_3, IV_4, IV_5, IV_6, IV_7 -%endmacro - // Address where the working version of the hash value is stored. 
%macro blake2b_hash_value_addr PUSH 0 diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm index 11e5389b..cd1f6a80 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm @@ -2,7 +2,9 @@ global blake2b_compression: // stack: retdest PUSH 0 // stack: cur_block = 0, retdest - %blake2b_initial_hash_value + PUSH compression_loop + // stack: compression_loop, cur_block, retdest + %jump(blake2b_initial_hash_value) compression_loop: // stack: h_0, ..., h_7, cur_block, retdest diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/iv.asm b/evm/src/cpu/kernel/asm/hash/blake2b/iv.asm index 94e9ba27..48df86a3 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/iv.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/iv.asm @@ -33,30 +33,63 @@ global blake2b_iv_const: BYTES 91, 224, 205, 25 BYTES 19, 126, 33, 121 -%macro blake2b_iv - // stack: i, ... +global blake2b_iv: + // stack: i, retdest PUSH blake2b_iv_const - // stack: blake2b_iv_const, i, ... + // stack: blake2b_iv_const, i, retdest SWAP1 - // stack: i, blake2b_iv_const, ... + // stack: i, blake2b_iv_const, retdest %mul_const(8) ADD - // stack: blake2b_iv_const + 2 * i, ... + // stack: blake2b_iv_const + 2 * i, retdest DUP1 - // stack: blake2b_iv_const + 2 * i, blake2b_iv_const + 2 * i, ... + // stack: blake2b_iv_const + 2 * i, blake2b_iv_const + 2 * i, retdest %add_const(4) - // stack: blake2b_iv_const + 2 * i + 1, blake2b_iv_const + 2 * i, ... + // stack: blake2b_iv_const + 2 * i + 1, blake2b_iv_const + 2 * i, retdest %mload_kernel_code_u32 SWAP1 %mload_kernel_code_u32 - // stack: IV_i[32:], IV_i[:32], ... + // stack: IV_i[32:], IV_i[:32], retdest %shl_const(32) - // stack: IV_i[32:] << 32, IV_i[:32], ... + // stack: IV_i[32:] << 32, IV_i[:32], retdest ADD // OR - // stack: IV_i, ... 
+ // stack: IV_i, retdest + SWAP1 + JUMP + +%macro blake2b_iv + %stack (i) -> (i, %%after) + %jump(blake2b_iv) +%%after: %endmacro -%macro blake2b_iv_i(i) - PUSH $i - %blake2b_iv -%endmacro +// Load the initial hash value (the IV, but with params XOR'd into the first word). +global blake2b_initial_hash_value: + // stack: retdest + PUSH 8 + // stack: i=8, retdest +blake2b_initial_hash_loop: + // stack: i, IV_i, ..., IV_7, retdest + %decrement + // stack: i-1, IV_i, ..., IV_7, retdest + PUSH blake2b_initial_hash_return + // stack: blake2b_initial_hash_return, i-1, IV_i, ..., IV_7, retdest + DUP2 + // stack: i-1, blake2b_initial_hash_return, i-1, IV_i, ..., IV_7, retdest + %jump(blake2b_iv) +blake2b_initial_hash_return: + // stack: IV_(i-1), i-1, IV_i, ..., IV_7, retdest + SWAP1 + // stack: i-1, IV_(i-1), IV_i, ..., IV_7, retdest + DUP1 + // stack: i-1, i-1, IV_(i-1), ..., IV_7, retdest + %jumpi(blake2b_initial_hash_loop) + // stack: i-1=0, IV_0, ..., IV_7, retdest + POP + // stack: IV_0, ..., IV_7, retdest + PUSH 0x01010040 // params: key = 00, digest_size = 64 = 0x40 + XOR + // stack: IV_0 ^ params, IV_1, IV_2, IV_3, IV_4, IV_5, IV_6, IV_7, retdest + %stack(iv: 8, ret) -> (ret, iv) + JUMP + diff --git a/evm/src/cpu/kernel/tests/hash.rs b/evm/src/cpu/kernel/tests/hash.rs index cc2c96eb..bc73ecd5 100644 --- a/evm/src/cpu/kernel/tests/hash.rs +++ b/evm/src/cpu/kernel/tests/hash.rs @@ -79,8 +79,6 @@ fn prepare_test( // Run the interpeter let result = run_interpreter_with_memory(interpreter_setup).unwrap(); - dbg!(result.stack().to_vec()); - Ok((expected, result.stack().to_vec())) } From e5f8632b5e1fdda1476ac978d7b3390d3d92bb9e Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Wed, 22 Feb 2023 15:44:31 -0800 Subject: [PATCH 09/23] small optimizations --- evm/src/cpu/kernel/asm/hash/blake2b/compression.asm | 7 +++---- evm/src/cpu/kernel/asm/hash/sha2/compression.asm | 4 +--- .../cpu/kernel/asm/hash/sha2/message_schedule.asm | 12 +++--------- 3 files changed, 7 
insertions(+), 16 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm index cd1f6a80..bf3dd1c4 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm @@ -209,10 +209,9 @@ hash_generate_return: PUSH 0 %mload_kernel_general // stack: num_blocks, cur_block + 1, h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7', cur_block + 1, retdest - EQ - // stack: last_block, h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7', cur_block + 1, retdest - %jumpi(compression_end) - %jump(compression_loop) + GT + // stack: not_last_block, h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7', cur_block + 1, retdest + %jumpi(compression_loop) compression_end: // stack: h_0', h_1', h_2', h_3', h_4', h_5', h_6', h_7', cur_block + 1, retdest diff --git a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm index 8c219ebb..fa58dce8 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm @@ -180,9 +180,7 @@ compression_loop: // stack: a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, (i+1)%64, retdest DUP12 // stack: (i+1)%64, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, (i+1)%64, retdest - ISZERO - %jumpi(compression_end_block) - %jump(compression_loop) + %jumpi(compression_loop) compression_end_block: // Add the initial values of the eight working variables (from the start of this block's compression) back into them. 
// stack: a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest diff --git a/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm b/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm index 78d98634..0d7c2637 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm @@ -55,9 +55,7 @@ gen_message_schedule_from_block_0_loop: // stack: counter, output_addr - 4, block[0] >> 32, block[1], retdest %decrement DUP1 - ISZERO - %jumpi(gen_message_schedule_from_block_0_end) - %jump(gen_message_schedule_from_block_0_loop) + %jumpi(gen_message_schedule_from_block_0_loop) gen_message_schedule_from_block_0_end: // stack: old counter=0, output_addr, block[0], block[1], retdest POP @@ -185,9 +183,7 @@ gen_message_schedule_remaining_loop: %decrement // stack: counter - 1, output_addr + 4, block[0], block[1], retdest DUP1 - ISZERO - %jumpi(gen_message_schedule_remaining_end) - %jump(gen_message_schedule_remaining_loop) + %jumpi(gen_message_schedule_remaining_loop) gen_message_schedule_remaining_end: // stack: counter=0, output_addr, block[0], block[1], retdest %pop4 @@ -230,9 +226,7 @@ gen_all_message_schedules_loop_end: // stack: cur_addr + 64, counter - 1, cur_output_addr + 256, output_addr, retdest DUP2 // stack: counter - 1, cur_addr + 64, counter - 1, cur_output_addr + 256, output_addr, retdest - ISZERO - %jumpi(gen_all_message_schedules_end) - %jump(gen_all_message_schedules_loop) + %jumpi(gen_all_message_schedules_loop) gen_all_message_schedules_end: // stack: cur_addr + 64, counter - 1, cur_output_addr + 256, output_addr, retdest %pop3 From 2236f30ae557e82901b24f5114e3fb06c033e640 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Tue, 14 Mar 2023 15:52:50 -0700 Subject: [PATCH 10/23] more small optimizations --- .../kernel/asm/hash/blake2b/compression.asm | 54 +++++++++---------- .../cpu/kernel/asm/hash/sha2/compression.asm | 21 ++------ 
2 files changed, 32 insertions(+), 43 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm index bf3dd1c4..9cf8aeef 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm @@ -85,8 +85,7 @@ compression_loop: // stack: cur_message_addr + 1, cur_block_byte + 8, ... %endrep // stack: end_message_addr, end_block_start_byte, t, cur_block, is_last_block, retdest - POP - POP + %pop2 // stack: t, cur_block, is_last_block, retdest SWAP1 // stack: cur_block, t, is_last_block, retdest @@ -128,15 +127,14 @@ compression_loop: // stack: 0, start + 8, invert_if_last_block, t, cur_block, retdest %rep 4 // stack: i, loc, ... - DUP2 - DUP2 - // stack: i, loc, i, loc,... + DUP1 + // stack: i, i, loc, ... %blake2b_iv - // stack: IV_i, loc, i, loc,... - SWAP1 - // stack: loc, IV_i, i, loc,... + // stack: IV_i, i, loc, ... + DUP2 + // stack: loc, IV_i, i, loc, ... %mstore_kernel_general - // stack: i, loc,... + // stack: i, loc, ... 
%increment SWAP1 %increment @@ -147,15 +145,11 @@ compression_loop: %stack (i, loc, inv, last, t) -> (t, t, i, loc, inv, last) // stack: t, t, 4, start + 12, invert_if_last_block, cur_block, retdest %shr_const(64) - // stack: t >> 64, t, 4, start + 12, invert_if_last_block, cur_block, retdest + // stack: t_hi = t >> 64, t, 4, start + 12, invert_if_last_block, cur_block, retdest SWAP1 - // stack: t, t >> 64, 4, start + 12, invert_if_last_block, cur_block, retdest - PUSH 1 - %shl_const(64) - // stack: 1 << 64, t, t >> 64, 4, start + 12, invert_if_last_block, cur_block, retdest - SWAP1 - MOD - // stack: t_lo = t % (1 << 64), t_hi = t >> 64, 4, start + 12, invert_if_last_block, cur_block, retdest + // stack: t, t_hi, 4, start + 12, invert_if_last_block, cur_block, retdest + %mod_const(PUSH 0x10000000000000000) + // stack: t_lo = t % (1 << 64), t_hi, 4, start + 12, invert_if_last_block, cur_block, retdest %stack (t_lo, t_hi, i, loc, inv) -> (i, loc, t_lo, t_hi, inv, 0) // stack: 4, start + 12, t_lo, t_hi, invert_if_last_block, 0, cur_block, retdest @@ -163,25 +157,31 @@ compression_loop: // the values (t % 2**64, t >> 64, invert_if, 0). %rep 4 // stack: i, loc, val, next_val,... - %stack (i, loc, val) -> (i, val, loc, i, loc) - // stack: i, val, loc, i, loc, next_val,... + DUP1 + // stack: i, i, loc, val, next_val,... %blake2b_iv - // stack: IV_i, val, loc, i, loc, next_val,... + // stack: IV_i, i, loc, val, next_val,... + DUP4 + // stack: val, IV_i, i, loc, val, next_val,... XOR - // stack: val ^ IV_i, loc, i, loc, next_val,... - SWAP1 - // stack: loc, val ^ IV_i, i, loc, next_val,... + // stack: val ^ IV_i, i, loc, val, next_val,... + DUP3 + // stack: loc, val ^ IV_i, i, loc, val, next_val,... %mstore_kernel_general - // stack: i, loc, next_val,... + // stack: i, loc, val, next_val,... %increment - SWAP1 + // stack: i + 1, loc, val, next_val,... + SWAP2 + // stack: val, loc, i + 1, next_val,... + POP + // stack: loc, i + 1, next_val,... 
%increment + // stack: loc + 1, i + 1, next_val,... SWAP1 // stack: i + 1, loc + 1, next_val,... %endrep // stack: 8, loc + 16, cur_block, retdest - POP - POP + %pop2 // stack: cur_block, retdest // Run 12 rounds of G functions. diff --git a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm index fa58dce8..c28a19a2 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm @@ -161,8 +161,7 @@ compression_loop: // stack: message_schedule_addr, i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, retdest SWAP1 // stack: i+1==64, message_schedule_addr, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, retdest - PUSH 256 - MUL + %mul_const(256) // stack: (i+1==64)*256, message_schedule_addr, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, retdest ADD // stack: message_schedule_addr new, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, retdest @@ -260,20 +259,10 @@ compression_end: // stack: num_blocks, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], scratch_space_addr, message_schedule_addr, i, retdest POP // stack: a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], scratch_space_addr, message_schedule_addr, i, retdest - %shl_const(32) - ADD // OR - %shl_const(32) - ADD // OR - %shl_const(32) - ADD // OR - %shl_const(32) - ADD // OR - %shl_const(32) - ADD // OR - %shl_const(32) - ADD // OR - %shl_const(32) - ADD // OR + %rep 7 + %shl_const(32) + ADD // OR + %endrep // stack: concat(a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64]), 
scratch_space_addr, message_schedule_addr, i, retdest SWAP3 // stack: i, scratch_space_addr, message_schedule_addr, concat(a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64]), retdest From 63301d6b8cab3e08e5668cd819497d723ca7ae70 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Fri, 24 Feb 2023 11:59:22 -0800 Subject: [PATCH 11/23] refactor sha2 compression --- .../cpu/kernel/asm/hash/sha2/compression.asm | 291 +++++++----------- 1 file changed, 110 insertions(+), 181 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm index c28a19a2..8b361ead 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm @@ -8,22 +8,7 @@ global sha2_compression: // stack: message_schedule_addr, retdest - PUSH 0 - // stack: i=0, message_schedule_addr, retdest - SWAP1 - // stack: message_schedule_addr, i=0, retdest - PUSH 0 - // stack: 0, message_schedule_addr, i=0, retdest - %mload_kernel_general - // stack: num_blocks, message_schedule_addr, i=0, retdest - DUP1 - // stack: num_blocks, num_blocks, message_schedule_addr, i=0, retdest - %scratch_space_addr_from_num_blocks - // stack: scratch_space_addr, num_blocks, message_schedule_addr, i=0, retdest - SWAP1 - // stack: num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest // Push the initial hash values; these constants are called H^(0) in the spec. 
- PUSH 0x5be0cd19 // H^(0)_7 PUSH 0x1f83d9ab // H^(0)_6 PUSH 0x9b05688c // H^(0)_5 PUSH 0x510e527f // H^(0)_4 @@ -31,229 +16,173 @@ global sha2_compression: PUSH 0x3c6ef372 // H^(0)_2 PUSH 0xbb67ae85 // H^(0)_1 PUSH 0x6a09e667 // H^(0)_0 - // stack: a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest + PUSH 0x5be0cd19 // H^(0)_7 + // stack: h[0], a[0], b[0], c[0], d[0], e[0], f[0], g[0], message_schedule_addr, retdest + SWAP8 + // stack: message_schedule_addr, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], retdest + PUSH 0 + // stack: i=0, message_schedule_addr, a[0]..h[0], retdest + SWAP1 + // stack: message_schedule_addr, i=0, a[0]..h[0], retdest + PUSH 0 + // stack: 0, message_schedule_addr, i=0, a[0]..h[0], retdest + %mload_kernel_general + // stack: num_blocks, message_schedule_addr, i=0, a[0]..h[0], retdest + DUP1 + // stack: num_blocks, num_blocks, message_schedule_addr, i=0, a[0]..h[0], retdest + %scratch_space_addr_from_num_blocks + // stack: scratch_space_addr, num_blocks, message_schedule_addr, i=0, a[0]..h[0], retdest + SWAP1 + // stack: num_blocks, scratch_space_addr, message_schedule_addr, i=0, a[0]..h[0], retdest +compression_start_block: + // stack: num_blocks, scratch_space_addr, message_schedule_addr, i=0, a[0]..h[0], retdest + %rep 8 + DUP12 + %endrep + // stack: a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, a[0]..h[0], retdest compression_start_block: - // Store the current values of the working variables, as the "initial values" to be added back in at the end of this block. 
- DUP10 - // stack: scratch_space_addr, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - - DUP2 - DUP2 - // stack: scratch_space_addr, a[0], scratch_space_addr, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %mstore_kernel_general_u32 - // stack: scratch_space_addr, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %add_const(4) - // stack: scratch_space_addr+4, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - - DUP3 - DUP2 - // stack: scratch_space_addr+4, b[0], scratch_space_addr+4, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %mstore_kernel_general_u32 - // stack: scratch_space_addr+4, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %add_const(4) - // stack: scratch_space_addr+8, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - DUP4 - DUP2 - // stack: scratch_space_addr+8, c[0], scratch_space_addr+8, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %mstore_kernel_general_u32 - // stack: scratch_space_addr+8, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %add_const(4) - // stack: scratch_space_addr+12, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - - DUP5 - DUP2 - // stack: scratch_space_addr+12, c[0], scratch_space_addr+8, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %mstore_kernel_general_u32 - // 
stack: scratch_space_addr+12, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %add_const(4) - // stack: scratch_space_addr+16, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - - DUP6 - DUP2 - // stack: scratch_space_addr+16, c[0], scratch_space_addr+8, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %mstore_kernel_general_u32 - // stack: scratch_space_addr+16, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %add_const(4) - // stack: scratch_space_addr+20, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - - DUP7 - DUP2 - // stack: scratch_space_addr+20, c[0], scratch_space_addr+8, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %mstore_kernel_general_u32 - // stack: scratch_space_addr+20, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %add_const(4) - // stack: scratch_space_addr+24, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - - DUP8 - DUP2 - // stack: scratch_space_addr+24, c[0], scratch_space_addr+8, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %mstore_kernel_general_u32 - // stack: scratch_space_addr+24, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %add_const(4) - // stack: scratch_space_addr+28, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - - DUP9 - DUP2 - // stack: scratch_space_addr+28, c[0], 
scratch_space_addr+8, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - %mstore_kernel_general_u32 - // stack: scratch_space_addr+28, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest - POP - // stack: a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest compression_loop: // Update the eight working variables, using the next constant K[i] and the next message schedule chunk W[i]. - // stack: a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest DUP11 - // stack: message_schedule_addr, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: message_schedule_addr, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest DUP13 - // stack: i, message_schedule_addr, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: i, message_schedule_addr, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %mul_const(4) - // stack: 4*i, message_schedule_addr, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: 4*i, message_schedule_addr, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest ADD - // stack: message_schedule_addr + 4*i, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: 
message_schedule_addr + 4*i, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %mload_kernel_general_u32 - // stack: W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest PUSH sha2_constants_k - // stack: sha2_constants_k, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: sha2_constants_k, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest DUP14 - // stack: i, sha2_constants_k, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: i, sha2_constants_k, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %mul_const(4) - // stack: 4*i, sha2_constants_k, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: 4*i, sha2_constants_k, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest ADD - // stack: sha2_constants_k + 4*i, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: sha2_constants_k + 4*i, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %mload_kernel_code_u32 - // stack: K[i], W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: K[i], W[i], a[i], b[i], c[i], d[i], e[i], f[i], 
g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %stack (start: 6, e, f, g, h) -> (e, f, g, h, start, e, f, g, h) - // stack: e[i], f[i], g[i], h[i], K[i], W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: e[i], f[i], g[i], h[i], K[i], W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %sha2_temp_word1 - // stack: T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %stack (t, a, b, c) -> (a, b, c, t, a, b, c) - // stack: a[i], b[i], c[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: a[i], b[i], c[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %sha2_temp_word2 - // stack: T2[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: T2[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest DUP6 - // stack: d[i], T2[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: d[i], T2[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest DUP3 - // stack: T1[i], d[i], T2[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: T1[i], d[i], T2[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], 
g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %add_u32 - // stack: e[i+1]=T1[i]+d[i], T2[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: e[i+1]=T1[i]+d[i], T2[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest SWAP2 - // stack: T2[i], T1[i], e[i+1], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: T2[i], T1[i], e[i+1], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %add_u32 - // stack: a[i+1]=T1[i]+T2[i], e[i+1], b[i+1]=a[i], c[i+1]=b[i], d[i+1]=c[i], d[i], f[i+1]=e[i], g[i+1]=f[i], h[i+1]=g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: a[i+1]=T1[i]+T2[i], e[i+1], b[i+1]=a[i], c[i+1]=b[i], d[i+1]=c[i], d[i], f[i+1]=e[i], g[i+1]=f[i], h[i+1]=g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %stack (a, e, b, c, d, old_d, f, g, h, old_h) -> (a, b, c, d, e, f, g, h) - // stack: a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest DUP12 - // stack: i, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: i, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %increment - // stack: i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // 
stack: i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest DUP1 - // stack: i+1, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: i+1, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %eq_const(64) - // stack: i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest DUP1 - // stack: i+1==64, i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: i+1==64, i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest DUP12 - // stack: num_blocks, i+1==64, i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: num_blocks, i+1==64, i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest SUB - // stack: num_blocks new, i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: num_blocks new, i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest SWAP13 - // stack: message_schedule_addr, i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], 
g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, retdest + // stack: message_schedule_addr, i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, a[0]..h[0], retdest SWAP1 - // stack: i+1==64, message_schedule_addr, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, retdest + // stack: i+1==64, message_schedule_addr, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, a[0]..h[0], retdest %mul_const(256) - // stack: (i+1==64)*256, message_schedule_addr, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, retdest + // stack: (i+1==64)*256, message_schedule_addr, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, a[0]..h[0], retdest ADD - // stack: message_schedule_addr new, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, retdest + // stack: message_schedule_addr new, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, a[0]..h[0], retdest SWAP12 - // stack: num_blocks new, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr new, i, retdest + // stack: num_blocks new, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr new, i, a[0]..h[0], retdest SWAP10 - // stack: num_blocks, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, i, new_retdest + // stack: num_blocks, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, 
scratch_space_addr, message_schedule_addr new, i, new_a[0]..h[0], retdest POP - // stack: i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, i, new_retdest + // stack: i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, i, new_a[0]..h[0], retdest %and_const(63) - // stack: (i+1)%64, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, i, retdest + // stack: (i+1)%64, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, i, a[0]..h[0], retdest SWAP12 - // stack: i, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, (i+1)%64, retdest + // stack: i, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, (i+1)%64, a[0]..h[0], retdest POP - // stack: a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, (i+1)%64, retdest + // stack: a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, (i+1)%64, a[0]..h[0], retdest DUP12 - // stack: (i+1)%64, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, (i+1)%64, retdest + // stack: (i+1)%64, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, (i+1)%64, a[0]..h[0], retdest %jumpi(compression_loop) compression_end_block: // Add the initial values of the eight working variables (from the start of this block's compression) back into them. 
- // stack: a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - DUP10 - // stack: scratch_space_addr, a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - %mload_kernel_general_u32 - // stack: a[0], a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], retdest + PUSH 0 + // stack: 0, a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], retdest + SWAP13 + // stack: a[0], a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, 0, b[0], c[0], d[0], e[0], f[0], g[0], h[0], retdest %add_u32 - // stack: a[0]+a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - SWAP1 - // stack: b[64], a[0]+a[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - DUP10 - %add_const(4) - %mload_kernel_general_u32 - // stack: b[0], b[64], a[0]+a[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: a[0]+a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, 0, b[0], c[0], d[0], e[0], f[0], g[0], h[0], retdest + SWAP12 + // stack: 0, b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0], c[0], d[0], e[0], f[0], g[0], h[0], retdest + SWAP13 + // stack: b[0], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, 
scratch_space_addr, message_schedule_addr, i, a[0]+a[64], 0, c[0], d[0], e[0], f[0], g[0], h[0], retdest %add_u32 - // stack: b[0]+b[64], a[0]+a[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - SWAP2 - // stack: c[64], a[0]+a[64], b[0]+b[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - DUP10 - %add_const(8) - %mload_kernel_general_u32 - // stack: c[0], c[64], a[0]+a[64], b[0]+b[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: b[0]+b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], 0, c[0], d[0], e[0], f[0], g[0], h[0], retdest + SWAP12 + // stack: 0, c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0], d[0], e[0], f[0], g[0], h[0], retdest + SWAP13 + // stack: c[0], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], 0, d[0], e[0], f[0], g[0], h[0], retdest %add_u32 - // stack: c[0]+c[64], a[0]+a[64], b[0]+b[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - SWAP3 - // stack: d[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - DUP10 - %add_const(12) - %mload_kernel_general_u32 - // stack: d[0], d[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: c[0]+c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], 0, d[0], e[0], f[0], g[0], h[0], retdest + SWAP12 + // stack: 0, d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, 
i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0], e[0], f[0], g[0], h[0], retdest + SWAP13 + // stack: d[0], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], 0, e[0], f[0], g[0], h[0], retdest %add_u32 - // stack: d[0]+d[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - SWAP4 - // stack: e[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - DUP10 - %add_const(16) - %mload_kernel_general_u32 - // stack: e[0], e[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: d[0]+d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], 0, e[0], f[0], g[0], h[0], retdest + SWAP12 + // stack: 0, e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0], f[0], g[0], h[0], retdest + SWAP13 + // stack: e[0], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], 0, f[0], g[0], h[0], retdest %add_u32 - // stack: e[0]+e[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - SWAP5 - // stack: f[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - DUP10 - %add_const(20) - %mload_kernel_general_u32 - // stack: f[0], f[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: e[0]+e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, 
message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], 0, f[0], g[0], h[0], retdest + SWAP12 + // stack: 0, f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0], g[0], h[0], retdest + SWAP13 + // stack: f[0], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], 0, g[0], h[0], retdest %add_u32 - // stack: f[0]+f[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - SWAP6 - // stack: g[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - DUP10 - %add_const(24) - %mload_kernel_general_u32 - // stack: g[0], g[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: f[0]+f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], 0, g[0], h[0], retdest + SWAP12 + // stack: 0, g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0], h[0], retdest + SWAP13 + // stack: g[0], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], 0, h[0], retdest %add_u32 - // stack: g[0]+g[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - SWAP7 - // stack: h[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - DUP10 - %add_const(28) - 
%mload_kernel_general_u32 - // stack: h[0], h[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest + // stack: g[0]+g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], 0, h[0], retdest + SWAP12 + // stack: 0, h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0], retdest + SWAP13 + // stack: h[0], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], 0, retdest %add_u32 - // stack: h[0]+h[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest - SWAP8 - // stack: num_blocks, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], scratch_space_addr, message_schedule_addr, i, retdest + // stack: h[0]+h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], 0, retdest + SWAP12 + // stack: 0, num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], retdest + POP + // stack: num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], retdest DUP1 - // stack: num_blocks, num_blocks, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], scratch_space_addr, message_schedule_addr, i, retdest + // stack: num_blocks, num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], 
f[0]+f[64], g[0]+g[64], h[0]+h[64], retdest ISZERO // In this case, we've finished all the blocks. %jumpi(compression_end) - // stack: num_blocks, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], scratch_space_addr, message_schedule_addr, i, retdest - %stack (num_blocks, working: 8) -> (working, num_blocks) + // stack: num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], retdest %jump(compression_start_block) compression_end: // stack: num_blocks, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], scratch_space_addr, message_schedule_addr, i, retdest From 684b668b60c4d462410aa99e4883f20d390e6cb1 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Fri, 24 Feb 2023 11:59:26 -0800 Subject: [PATCH 12/23] fix --- evm/src/cpu/kernel/asm/hash/blake2b/compression.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm index 9cf8aeef..498cc9d3 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm @@ -148,7 +148,7 @@ compression_loop: // stack: t_hi = t >> 64, t, 4, start + 12, invert_if_last_block, cur_block, retdest SWAP1 // stack: t, t_hi, 4, start + 12, invert_if_last_block, cur_block, retdest - %mod_const(PUSH 0x10000000000000000) + %mod_const(0x10000000000000000) // stack: t_lo = t % (1 << 64), t_hi, 4, start + 12, invert_if_last_block, cur_block, retdest %stack (t_lo, t_hi, i, loc, inv) -> (i, loc, t_lo, t_hi, inv, 0) // stack: 4, start + 12, t_lo, t_hi, invert_if_last_block, 0, cur_block, retdest From 7351a1661e49e0faf0b4a17ede804691d2d48015 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Fri, 24 Feb 2023 12:02:19 -0800 Subject: [PATCH 13/23] fix --- 
evm/src/cpu/kernel/asm/hash/sha2/compression.asm | 2 -- 1 file changed, 2 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm index 8b361ead..800c2010 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm @@ -40,8 +40,6 @@ compression_start_block: DUP12 %endrep // stack: a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, a[0]..h[0], retdest -compression_start_block: - compression_loop: // Update the eight working variables, using the next constant K[i] and the next message schedule chunk W[i]. // stack: a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest From 85411ac4755f17baf5e23a1e48d45f351626a272 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Fri, 24 Feb 2023 13:59:13 -0800 Subject: [PATCH 14/23] fixes --- evm/src/cpu/kernel/asm/hash/sha2/compression.asm | 12 +++++------- evm/src/cpu/kernel/tests/hash.rs | 1 + 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm index 800c2010..107d20b1 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm @@ -35,6 +35,8 @@ global sha2_compression: SWAP1 // stack: num_blocks, scratch_space_addr, message_schedule_addr, i=0, a[0]..h[0], retdest compression_start_block: + // We keep the current values of the working variables saved at the end of the stack. + // These are the "initial values" to be added back in at the end of this block. 
// stack: num_blocks, scratch_space_addr, message_schedule_addr, i=0, a[0]..h[0], retdest %rep 8 DUP12 @@ -183,17 +185,13 @@ compression_end_block: // stack: num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], retdest %jump(compression_start_block) compression_end: - // stack: num_blocks, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], scratch_space_addr, message_schedule_addr, i, retdest - POP - // stack: a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], scratch_space_addr, message_schedule_addr, i, retdest + // stack: num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], retdest + %pop4 + // stack: a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], retdest %rep 7 %shl_const(32) ADD // OR %endrep - // stack: concat(a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64]), scratch_space_addr, message_schedule_addr, i, retdest - SWAP3 - // stack: i, scratch_space_addr, message_schedule_addr, concat(a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64]), retdest - %pop3 // stack: sha2_result = concat(a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64]), retdest SWAP1 JUMP diff --git a/evm/src/cpu/kernel/tests/hash.rs b/evm/src/cpu/kernel/tests/hash.rs index bc73ecd5..432fcf39 100644 --- a/evm/src/cpu/kernel/tests/hash.rs +++ b/evm/src/cpu/kernel/tests/hash.rs @@ -90,6 +90,7 @@ fn test_hash_256( let (expected, result_stack) = prepare_test(hash_fn_label, hash_input_virt, standard_implementation).unwrap(); + // Extract the final output. 
let actual = result_stack[0]; From 265d39a5a711ee5d975b55ca8bdcd451d5bf5460 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Fri, 24 Feb 2023 16:15:14 -0800 Subject: [PATCH 15/23] cleanup --- evm/src/cpu/kernel/tests/hash.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/evm/src/cpu/kernel/tests/hash.rs b/evm/src/cpu/kernel/tests/hash.rs index 432fcf39..bc73ecd5 100644 --- a/evm/src/cpu/kernel/tests/hash.rs +++ b/evm/src/cpu/kernel/tests/hash.rs @@ -90,7 +90,6 @@ fn test_hash_256( let (expected, result_stack) = prepare_test(hash_fn_label, hash_input_virt, standard_implementation).unwrap(); - // Extract the final output. let actual = result_stack[0]; From 8f231bd03d5211e7179642d8db29ed49bda5d92a Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Fri, 24 Feb 2023 16:21:45 -0800 Subject: [PATCH 16/23] optimization --- evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm b/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm index 0d7c2637..c8bfae7e 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm @@ -94,9 +94,7 @@ gen_message_schedule_from_block_1_loop: // stack: counter, output_addr - 4, block[1] >> 32, block[0], retdest %decrement DUP1 - ISZERO - %jumpi(gen_message_schedule_from_block_1_end) - %jump(gen_message_schedule_from_block_1_loop) + %jumpi(gen_message_schedule_from_block_1_loop) gen_message_schedule_from_block_1_end: // stack: old counter=0, output_addr, block[1], block[0], retdest POP From 97cb5c75b67a152f095d5325f42ebc5dcd7a1b2f Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Mon, 27 Feb 2023 11:48:49 -0800 Subject: [PATCH 17/23] bug fix --- evm/src/cpu/kernel/asm/hash/blake2b/compression.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm 
b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm index 498cc9d3..6e8cdb0a 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/compression.asm @@ -131,7 +131,7 @@ compression_loop: // stack: i, i, loc, ... %blake2b_iv // stack: IV_i, i, loc, ... - DUP2 + DUP3 // stack: loc, IV_i, i, loc, ... %mstore_kernel_general // stack: i, loc, ... From 29df451d9d71321eac1334cbef25a7f3b23d6df0 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Mon, 27 Feb 2023 15:24:46 -0800 Subject: [PATCH 18/23] optimizations --- evm/src/cpu/kernel/asm/hash/sha2/main.asm | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/sha2/main.asm b/evm/src/cpu/kernel/asm/hash/sha2/main.asm index 058224f6..e47d1838 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/main.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/main.asm @@ -19,9 +19,7 @@ global sha2_pad: // STEP 1: append 1 // insert 128 (= 1 << 7) at x[num_bytes+1] // stack: num_bytes, retdest - PUSH 1 - PUSH 7 - SHL + PUSH 0x80 // stack: 128, num_bytes, retdest DUP2 // stack: num_bytes, 128, num_bytes, retdest @@ -33,21 +31,19 @@ global sha2_pad: DUP1 // stack: num_bytes, num_bytes, retdest %add_const(8) - %div_const(64) + %shr_const(6) %increment // stack: num_blocks = (num_bytes+8)//64 + 1, num_bytes, retdest // STEP 3: calculate length := num_bytes*8 SWAP1 // stack: num_bytes, num_blocks, retdest - PUSH 8 - MUL + %mul_const(8) // stack: length = num_bytes*8, num_blocks, retdest // STEP 4: write length to x[num_blocks*64-7..num_blocks*64] DUP2 // stack: num_blocks, length, num_blocks, retdest - PUSH 64 - MUL + %mul_const(64) // stack: last_addr = num_blocks*64, length, num_blocks, retdest %sha2_write_length // stack: num_blocks, retdest From 424d8d221642ab91ea341adb5b04fdf80c280b36 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Mon, 27 Feb 2023 15:29:23 -0800 Subject: [PATCH 19/23] more optimizations --- 
.../kernel/asm/hash/sha2/message_schedule.asm | 31 +++++-------------- 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm b/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm index c8bfae7e..d8f0500d 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm @@ -59,10 +59,9 @@ gen_message_schedule_from_block_0_loop: gen_message_schedule_from_block_0_end: // stack: old counter=0, output_addr, block[0], block[1], retdest POP - PUSH 8 - // stack: counter=8, output_addr, block[0], block[1], retdest - %stack (counter, out, b0, b1) -> (out, counter, b1, b0) - // stack: output_addr, counter, block[1], block[0], retdest + // stack: output_addr, block[0], block[1], retdest + %stack (out, b0, b1) -> (out, 8, b1, b0) + // stack: output_addr, counter=8, block[1], block[0], retdest %add_const(64) // stack: output_addr + 64, counter, block[1], block[0], retdest SWAP1 @@ -114,11 +113,7 @@ gen_message_schedule_remaining_loop: // stack: output_addr, counter, block[0], block[1], retdest DUP1 // stack: output_addr, output_addr, counter, block[0], block[1], retdest - PUSH 2 - PUSH 4 - MUL - SWAP1 - SUB + %sub_const(8) // stack: output_addr - 2*4, output_addr, counter, block[0], block[1], retdest %mload_kernel_general_u32 // stack: x[output_addr - 2*4], output_addr, counter, block[0], block[1], retdest @@ -128,11 +123,7 @@ gen_message_schedule_remaining_loop: // stack: output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest DUP1 // stack: output_addr, output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - PUSH 7 - PUSH 4 - MUL - SWAP1 - SUB + %sub_const(28) // stack: output_addr - 7*4, output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest %mload_kernel_general_u32 // stack: x[output_addr - 7*4], output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], 
retdest @@ -140,11 +131,7 @@ gen_message_schedule_remaining_loop: // stack: output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest DUP1 // stack: output_addr, output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - PUSH 15 - PUSH 4 - MUL - SWAP1 - SUB + %sub_const(60) // stack: output_addr - 15*4, output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest %mload_kernel_general_u32 // stack: x[output_addr - 15*4], output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest @@ -154,11 +141,7 @@ gen_message_schedule_remaining_loop: // stack: output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest DUP1 // stack: output_addr, output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - PUSH 16 - PUSH 4 - MUL - SWAP1 - SUB + %sub_const(64) // stack: output_addr - 16*4, output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest %mload_kernel_general_u32 // stack: x[output_addr - 16*4], output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest From de6f01f4b6aeedd7a046264049083645cb173bf4 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Wed, 8 Mar 2023 13:50:31 -0800 Subject: [PATCH 20/23] small optimizations --- .../cpu/kernel/asm/hash/sha2/compression.asm | 9 +++- evm/src/cpu/kernel/asm/hash/sha2/ops.asm | 51 ++++++++++++------- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm index 107d20b1..0bceb715 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm +++ 
b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm @@ -65,11 +65,16 @@ compression_loop: // stack: sha2_constants_k + 4*i, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %mload_kernel_code_u32 // stack: K[i], W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest - %stack (start: 6, e, f, g, h) -> (e, f, g, h, start, e, f, g, h) + DUP10 + DUP10 + DUP10 + DUP10 // stack: e[i], f[i], g[i], h[i], K[i], W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %sha2_temp_word1 // stack: T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest - %stack (t, a, b, c) -> (a, b, c, t, a, b, c) + DUP4 + DUP4 + DUP4 // stack: a[i], b[i], c[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest %sha2_temp_word2 // stack: T2[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest diff --git a/evm/src/cpu/kernel/asm/hash/sha2/ops.asm b/evm/src/cpu/kernel/asm/hash/sha2/ops.asm index 7d8054ca..d50e5c9a 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/ops.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/ops.asm @@ -26,14 +26,15 @@ // stack: x, x %rotr(7) // stack: rotr(x, 7), x - %stack (rotated, x) -> (x, x, rotated) + SWAP1 + // stack: x, rotr(x, 7) + DUP1 // stack: x, x, rotr(x, 7) %rotr(18) // stack: rotr(x, 18), x, rotr(x, 7) SWAP1 // stack: x, rotr(x, 18), rotr(x, 7) - PUSH 3 - SHR + %shr_const(3) // stack: shr(x, 3), rotr(x, 18), rotr(x, 7) XOR XOR @@ -45,7 +46,9 @@ // stack: x, x %rotr(17) // stack: rotr(x, 17), x - %stack (rotated, x) -> (x, x, rotated) + SWAP1 + // stack: x, rotr(x, 17) + DUP1 // stack: x, x, 
rotr(x, 17) %rotr(19) // stack: rotr(x, 19), x, rotr(x, 17) @@ -64,7 +67,9 @@ // stack: x, x %rotr(2) // stack: rotr(x, 2), x - %stack (rotated, x) -> (x, x, rotated) + SWAP1 + // stack: x, rotr(x, 2) + DUP1 // stack: x, x, rotr(x, 2) %rotr(13) // stack: rotr(x, 13), x, rotr(x, 2) @@ -82,7 +87,9 @@ // stack: x, x %rotr(6) // stack: rotr(x, 6), x - %stack (rotated, x) -> (x, x, rotated) + SWAP1 + // stack: x, rotr(x, 6) + DUP1 // stack: x, x, rotr(x, 6) %rotr(11) // stack: rotr(x, 11), x, rotr(x, 6) @@ -100,11 +107,13 @@ // stack: x, x, y, z NOT // stack: not x, x, y, z - %stack (notx, x, y, z) -> (notx, z, x, y) - // stack: not x, z, x, y + SWAP1 + // stack: x, not x, y, z + SWAP3 + // stack: z, not x, y, x AND - // stack: (not x) and z, x, y - %stack (nxz, x, y) -> (x, y, nxz) + // stack: (not x) and z, y, x + SWAP2 // stack: x, y, (not x) and z AND // stack: x and y, (not x) and z @@ -113,18 +122,22 @@ %macro sha2_majority // stack: x, y, z - %stack (xyz: 3) -> (xyz, xyz) - // stack: x, y, z, x, y, z + DUP1 + // stack: x, x, y, z + DUP3 + // stack: y, x, x, y, z + DUP5 + // stack: z, y, x, x, y, z AND - // stack: x and y, z, x, y, z + // stack: z and y, x, x, y, z + SWAP4 + // stack: z, x, x, y, z and y + AND + // stack: z and x, x, y, z and y SWAP2 - // stack: x, z, x and y, y, z + // stack: y, x, z and x, z and y AND - // stack: x and z, x and y, y, z - %stack (a: 2, b: 2) -> (b, a) - // stack: y, z, x and z, x and y - AND - // stack: y and z, x and z, x and y + // stack: y and x, z and x, z and y OR OR %endmacro From 12e6527b5701992198da4245934d90c0733694d4 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Wed, 15 Mar 2023 18:46:49 -0700 Subject: [PATCH 21/23] fixed messed up merge --- .../kernel/asm/hash/blake2b/g_functions.asm | 77 +++++++++---------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/blake2b/g_functions.asm b/evm/src/cpu/kernel/asm/hash/blake2b/g_functions.asm index f62d309c..f2d3b1d2 100644 --- 
a/evm/src/cpu/kernel/asm/hash/blake2b/g_functions.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2b/g_functions.asm @@ -1,4 +1,4 @@ -blake2b_g_function: +%macro blake2b_g_function // Function to mix two input words, x and y, into the four words indexed by a, b, c, d (which // are in the range 0..16) in the internal state. // The internal state is stored in memory starting at the address start. @@ -37,63 +37,63 @@ blake2b_g_function: %stack (vd, vs: 3) -> (vs, vd) // stack: v[a], v[b], v[c], v[d], a, b, c, d, x, y, start DUP2 - // stack: v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, y, start, retdest + // stack: v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, y, start DUP10 - // stack: x, v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, y, start, retdest + // stack: x, v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, y, start ADD ADD %as_u64 - // stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[b], v[c], v[d], a, b, c, d, x, y, start, retdest + // stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[b], v[c], v[d], a, b, c, d, x, y, start %stack (a, b, c, d) -> (a, d, a, b, c, d) - // stack: v[a]', v[d], v[a]', v[b], v[c], v[d], a, b, c, d, x, y, start, retdest + // stack: v[a]', v[d], v[a]', v[b], v[c], v[d], a, b, c, d, x, y, start XOR %rotr_64(32) - // stack: v[d]' = (v[d] ^ v[a]') >>> 32, v[a]', v[b], v[c], v[d], a, b, c, d, x, y, start, retdest + // stack: v[d]' = (v[d] ^ v[a]') >>> 32, v[a]', v[b], v[c], v[d], a, b, c, d, x, y, start %stack (top: 4, vd) -> (top) - // stack: v[d]', v[a]', v[b], v[c], a, b, c, d, x, y, start, retdest + // stack: v[d]', v[a]', v[b], v[c], a, b, c, d, x, y, start %stack (d, a, b, c) -> (c, d, a, b, d) - // stack: v[c], v[d]', v[a]', v[b], v[d]', a, b, c, d, x, y, start, retdest + // stack: v[c], v[d]', v[a]', v[b], v[d]', a, b, c, d, x, y, start ADD %as_u64 - // stack: v[c]' = (v[c] + v[d]') % 2^64, v[a]', v[b], v[d]', a, b, c, d, x, y, start, retdest + // stack: v[c]' = (v[c] + v[d]') % 2^64, v[a]', v[b], v[d]', a, b, c, d, x, y, start %stack (c, a, b, d) -> 
(b, c, a, c, d) - // stack: v[b], v[c]', v[a]', v[c]', v[d]', a, b, c, d, x, y, start, retdest + // stack: v[b], v[c]', v[a]', v[c]', v[d]', a, b, c, d, x, y, start XOR %rotr_64(24) - // stack: v[b]' = (v[b] ^ v[c]') >>> 24, v[a]', v[c]', v[d]', a, b, c, d, x, y, start, retdest + // stack: v[b]' = (v[b] ^ v[c]') >>> 24, v[a]', v[c]', v[d]', a, b, c, d, x, y, start SWAP1 - // stack: v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start, retdest + // stack: v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start DUP2 - // stack: v[b]', v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start, retdest + // stack: v[b]', v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start DUP11 - // stack: y, v[b]', v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start, retdest + // stack: y, v[b]', v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start ADD ADD %as_u64 - // stack: v[a]'' = (v[a]' + v[b]' + y) % 2^64, v[b]', v[c]', v[d]', a, b, c, d, x, y, start, retdest + // stack: v[a]'' = (v[a]' + v[b]' + y) % 2^64, v[b]', v[c]', v[d]', a, b, c, d, x, y, start SWAP3 - // stack: v[d]', v[b]', v[c]', v[a]'', a, b, c, d, x, y, start, retdest + // stack: v[d]', v[b]', v[c]', v[a]'', a, b, c, d, x, y, start DUP4 - // stack: v[a]'', v[d]', v[b]', v[c]', v[a]'', a, b, c, d, x, y, start, retdest + // stack: v[a]'', v[d]', v[b]', v[c]', v[a]'', a, b, c, d, x, y, start XOR %rotr_64(16) - // stack: v[d]'' = (v[a]'' ^ v[d]') >>> 8, v[b]', v[c]', v[a]'', a, b, c, d, x, y, start, retdest + // stack: v[d]'' = (v[a]'' ^ v[d]') >>> 16, v[b]', v[c]', v[a]'', a, b, c, d, x, y, start SWAP2 - // stack: v[c]', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start, retdest + // stack: v[c]', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start DUP3 - // stack: v[d]'', v[c]', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start, retdest + // stack: v[d]'', v[c]', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start ADD %as_u64 - // stack: v[c]'' = (v[c]' + v[d]'') % 2^64, v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start, retdest + // stack: v[c]'' =
(v[c]' + v[d]'') % 2^64, v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start DUP1 - // stack: v[c]'', v[c]'', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start, retdest + // stack: v[c]'', v[c]'', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start SWAP2 - // stack: v[b]', v[c]'', v[c]'', v[d]'', v[a]'', a, b, c, d, x, y, start, retdest + // stack: v[b]', v[c]'', v[c]'', v[d]'', v[a]'', a, b, c, d, x, y, start XOR %rotr_64(63) - // stack: v[b]'' = (v[b]' ^ v[c]'') >>> 7, v[c]'', v[d]'', v[a]'', a, b, c, d, x, y, start, retdest + // stack: v[b]'' = (v[b]' ^ v[c]'') >>> 63, v[c]'', v[d]'', v[a]'', a, b, c, d, x, y, start %stack (vb, vc, vd, va, a, b, c, d, x, y, start) -> (start, a, va, start, b, vb, start, c, vc, start, d, vd) - // stack: start, a, v[a]'', start, b, v[b]'', start, c, v[c]'', start, d, v[d]'', retdest + // stack: start, a, v[a]'', start, b, v[b]'', start, c, v[c]'', start, d, v[d]'' ADD %mstore_kernel_general ADD @@ -102,27 +102,24 @@ blake2b_g_function: %mstore_kernel_general ADD %mstore_kernel_general - // stack: retdest - JUMP +%endmacro -call_blake2b_g_function: - // stack: a, b, c, d, x_idx, y_idx, round, start, retdest - DUP6 - // stack: y_idx, a, b, c, d, x_idx, y_idx, round, start, retdest - DUP8 - // stack: round, y_idx, a, b, c, d, x_idx, y_idx, round, start, retdest +%macro call_blake2b_g_function(a, b, c, d, x_idx, y_idx) + // stack: round, start + PUSH $y_idx + DUP2 + // stack: round, y_idx, round, start %blake2b_permutation - // stack: s[y_idx], a, b, c, d, x_idx, y_idx, round, start, retdest + // stack: s[y_idx], round, start %blake2b_message_addr ADD %mload_kernel_general - // stack: m[s[y_idx]], a, b, c, d, x_idx, y_idx, round, start, retdest - DUP6 - // stack: x_idx, m[s[y_idx]], a, b, c, d, x_idx, y_idx, round, start, retdest - DUP9 - // stack: round, x_idx, m[s[y_idx]], a, b, c, d, x_idx, y_idx, round, start, retdest + // stack: m[s[y_idx]], round, start + PUSH $x_idx + DUP3 + // stack: round, x_idx, m[s[y_idx]], round, start %blake2b_permutation -
// stack: s[x_idx], m[s[y_idx]], a, b, c, d, x_idx, y_idx, round, start, retdest + // stack: s[x_idx], m[s[y_idx]], round, start %blake2b_message_addr ADD %mload_kernel_general From 9534762179f17a30bcd7cbc0c9030190b273ddb4 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Thu, 16 Mar 2023 11:56:39 -0700 Subject: [PATCH 22/23] div instead of shr --- evm/src/cpu/kernel/asm/hash/sha2/main.asm | 2 +- evm/src/cpu/kernel/asm/hash/sha2/ops.asm | 2 +- evm/src/cpu/kernel/asm/hash/sha2/write_length.asm | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/sha2/main.asm b/evm/src/cpu/kernel/asm/hash/sha2/main.asm index e47d1838..1deab294 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/main.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/main.asm @@ -31,7 +31,7 @@ global sha2_pad: DUP1 // stack: num_bytes, num_bytes, retdest %add_const(8) - %shr_const(6) + %div_const(64) %increment // stack: num_blocks = (num_bytes+8)//64 + 1, num_bytes, retdest diff --git a/evm/src/cpu/kernel/asm/hash/sha2/ops.asm b/evm/src/cpu/kernel/asm/hash/sha2/ops.asm index d50e5c9a..6a4c5e3b 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/ops.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/ops.asm @@ -34,7 +34,7 @@ // stack: rotr(x, 18), x, rotr(x, 7) SWAP1 // stack: x, rotr(x, 18), rotr(x, 7) - %shr_const(3) + %div_const(8) // equivalent to %shr_const(3) // stack: shr(x, 3), rotr(x, 18), rotr(x, 7) XOR XOR diff --git a/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm b/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm index 5727498c..bb7a01a8 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm @@ -15,7 +15,7 @@ %decrement SWAP1 // stack: length, last_addr - 1 - %shr_const(8) + %div_const(256) // equivalent to %shr_const(8) // stack: length >> 8, last_addr - 1 DUP1 // stack: length >> 8, length >> 8, last_addr - 1 @@ -30,7 +30,7 @@ %decrement SWAP1 // stack: length >> 8, last_addr - 2 - 
%shr_const(8) + %div_const(256) // equivalent to %shr_const(8) // stack: length >> 16, last_addr - 2 DUP1 // stack: length >> 16, length >> 16, last_addr - 2 @@ -45,7 +45,7 @@ %decrement SWAP1 // stack: length >> 16, last_addr - 3 - %shr_const(8) + %div_const(256) // equivalent to %shr_const(8) // stack: length >> 24, last_addr - 3 DUP1 // stack: length >> 24, length >> 24, last_addr - 3 @@ -60,7 +60,7 @@ %decrement SWAP1 // stack: length >> 24, last_addr - 4 - %shr_const(8) + %div_const(256) // equivalent to %shr_const(8) // stack: length >> 32, last_addr - 4 DUP1 // stack: length >> 32, length >> 32, last_addr - 4 @@ -75,7 +75,7 @@ %decrement SWAP1 // stack: length >> 32, last_addr - 5 - %shr_const(8) + %div_const(256) // equivalent to %shr_const(8) // stack: length >> 40, last_addr - 5 DUP1 // stack: length >> 40, length >> 40, last_addr - 5 @@ -90,7 +90,7 @@ %decrement SWAP1 // stack: length >> 40, last_addr - 6 - %shr_const(8) + %div_const(256) // equivalent to %shr_const(8) // stack: length >> 48, last_addr - 6 DUP1 // stack: length >> 48, length >> 48, last_addr - 6 @@ -105,7 +105,7 @@ %decrement SWAP1 // stack: length >> 48, last_addr - 7 - %shr_const(8) + %div_const(256) // equivalent to %shr_const(8) // stack: length >> 56, last_addr - 7 DUP1 // stack: length >> 56, length >> 56, last_addr - 7 From 38f79e4991ef69cefac987722ed40e244e2568f6 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Thu, 16 Mar 2023 14:58:31 -0700 Subject: [PATCH 23/23] optimizations with rep --- .../cpu/kernel/asm/hash/sha2/compression.asm | 52 +------- .../cpu/kernel/asm/hash/sha2/write_length.asm | 120 +++--------------- 2 files changed, 22 insertions(+), 150 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm index 0bceb715..8f7d942c 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm @@ -132,53 +132,11 @@ compression_end_block: // stack: 
a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], retdest PUSH 0 // stack: 0, a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], retdest - SWAP13 - // stack: a[0], a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, 0, b[0], c[0], d[0], e[0], f[0], g[0], h[0], retdest - %add_u32 - // stack: a[0]+a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, 0, b[0], c[0], d[0], e[0], f[0], g[0], h[0], retdest - SWAP12 - // stack: 0, b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0], c[0], d[0], e[0], f[0], g[0], h[0], retdest - SWAP13 - // stack: b[0], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], 0, c[0], d[0], e[0], f[0], g[0], h[0], retdest - %add_u32 - // stack: b[0]+b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], 0, c[0], d[0], e[0], f[0], g[0], h[0], retdest - SWAP12 - // stack: 0, c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0], d[0], e[0], f[0], g[0], h[0], retdest - SWAP13 - // stack: c[0], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], 0, d[0], e[0], f[0], g[0], h[0], retdest - %add_u32 - // stack: c[0]+c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], 0, d[0], e[0], f[0], g[0], h[0], retdest - SWAP12 - // stack: 0, d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, 
message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0], e[0], f[0], g[0], h[0], retdest - SWAP13 - // stack: d[0], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], 0, e[0], f[0], g[0], h[0], retdest - %add_u32 - // stack: d[0]+d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], 0, e[0], f[0], g[0], h[0], retdest - SWAP12 - // stack: 0, e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0], f[0], g[0], h[0], retdest - SWAP13 - // stack: e[0], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], 0, f[0], g[0], h[0], retdest - %add_u32 - // stack: e[0]+e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], 0, f[0], g[0], h[0], retdest - SWAP12 - // stack: 0, f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0], g[0], h[0], retdest - SWAP13 - // stack: f[0], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], 0, g[0], h[0], retdest - %add_u32 - // stack: f[0]+f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], 0, g[0], h[0], retdest - SWAP12 - // stack: 0, g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0], h[0], retdest - SWAP13 - // stack: g[0], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], 0, h[0], retdest - 
%add_u32 - // stack: g[0]+g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], 0, h[0], retdest - SWAP12 - // stack: 0, h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0], retdest - SWAP13 - // stack: h[0], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], 0, retdest - %add_u32 - // stack: h[0]+h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], 0, retdest - SWAP12 + %rep 8 + SWAP13 + %add_u32 + SWAP12 + %endrep // stack: 0, num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], retdest POP // stack: num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], retdest diff --git a/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm b/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm index bb7a01a8..4f73fa79 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm @@ -10,110 +10,24 @@ // stack: last_addr, length % (1 << 8), length, last_addr %mstore_kernel_general - // stack: length, last_addr - SWAP1 - %decrement - SWAP1 - // stack: length, last_addr - 1 - %div_const(256) // equivalent to %shr_const(8) - // stack: length >> 8, last_addr - 1 - DUP1 - // stack: length >> 8, length >> 8, last_addr - 1 - %and_const(0xff) - // stack: (length >> 8) % (1 << 8), length >> 8, last_addr - 1 - DUP3 - // stack: last_addr - 1, (length >> 8) % (1 << 8), length >> 8, last_addr - 1 - %mstore_kernel_general - - // stack: length >> 8, last_addr - 1 - 
SWAP1 - %decrement - SWAP1 - // stack: length >> 8, last_addr - 2 - %div_const(256) // equivalent to %shr_const(8) - // stack: length >> 16, last_addr - 2 - DUP1 - // stack: length >> 16, length >> 16, last_addr - 2 - %and_const(0xff) - // stack: (length >> 16) % (1 << 8), length >> 16, last_addr - 2 - DUP3 - // stack: last_addr - 2, (length >> 16) % (1 << 8), length >> 16, last_addr - 2 - %mstore_kernel_general + %rep 7 + // For i = 0 to 6 + // stack: length >> (8 * i), last_addr - i + SWAP1 + %decrement + SWAP1 + // stack: length >> (8 * i), last_addr - i - 1 + %div_const(256) // equivalent to %shr_const(8) + // stack: length >> (8 * (i + 1)), last_addr - i - 1 + DUP1 + // stack: length >> (8 * (i + 1)), length >> (8 * (i + 1)), last_addr - i - 1 + %mod_const(256) + // stack: (length >> (8 * (i + 1))) % (1 << 8), length >> (8 * (i + 1)), last_addr - i - 1 + DUP3 + // stack: last_addr - i - 1, (length >> (8 * (i + 1))) % (1 << 8), length >> (8 * (i + 1)), last_addr - i - 1 + %mstore_kernel_general + %endrep - // stack: length >> 16, last_addr - 2 - SWAP1 - %decrement - SWAP1 - // stack: length >> 16, last_addr - 3 - %div_const(256) // equivalent to %shr_const(8) - // stack: length >> 24, last_addr - 3 - DUP1 - // stack: length >> 24, length >> 24, last_addr - 3 - %and_const(0xff) - // stack: (length >> 24) % (1 << 8), length >> 24, last_addr - 3 - DUP3 - // stack: last_addr - 3, (length >> 24) % (1 << 8), length >> 24, last_addr - 3 - %mstore_kernel_general - - // stack: length >> 24, last_addr - 3 - SWAP1 - %decrement - SWAP1 - // stack: length >> 24, last_addr - 4 - %div_const(256) // equivalent to %shr_const(8) - // stack: length >> 32, last_addr - 4 - DUP1 - // stack: length >> 32, length >> 32, last_addr - 4 - %and_const(0xff) - // stack: (length >> 32) % (1 << 8), length >> 32, last_addr - 4 - DUP3 - // stack: last_addr - 4, (length >> 32) % (1 << 8), length >> 32, last_addr - 4 - %mstore_kernel_general - - // stack: length >> 32, last_addr - 4 - SWAP1 -
%decrement - SWAP1 - // stack: length >> 32, last_addr - 5 - %div_const(256) // equivalent to %shr_const(8) - // stack: length >> 40, last_addr - 5 - DUP1 - // stack: length >> 40, length >> 40, last_addr - 5 - %and_const(0xff) - // stack: (length >> 40) % (1 << 8), length >> 40, last_addr - 5 - DUP3 - // stack: last_addr - 5, (length >> 40) % (1 << 8), length >> 40, last_addr - 5 - %mstore_kernel_general - - // stack: length >> 40, last_addr - 5 - SWAP1 - %decrement - SWAP1 - // stack: length >> 40, last_addr - 6 - %div_const(256) // equivalent to %shr_const(8) - // stack: length >> 48, last_addr - 6 - DUP1 - // stack: length >> 48, length >> 48, last_addr - 6 - %and_const(0xff) - // stack: (length >> 48) % (1 << 8), length >> 48, last_addr - 6 - DUP3 - // stack: last_addr - 6, (length >> 48) % (1 << 8), length >> 48, last_addr - 6 - %mstore_kernel_general - - // stack: length >> 48, last_addr - 6 - SWAP1 - %decrement - SWAP1 - // stack: length >> 48, last_addr - 7 - %div_const(256) // equivalent to %shr_const(8) - // stack: length >> 56, last_addr - 7 - DUP1 - // stack: length >> 56, length >> 56, last_addr - 7 - %and_const(0xff) - // stack: (length >> 56) % (1 << 8), length >> 56, last_addr - 7 - DUP3 - // stack: last_addr - 7, (length >> 56) % (1 << 8), length >> 56, last_addr - 7 - %mstore_kernel_general %pop2 // stack: (empty) %endmacro