diff --git a/evm/Cargo.toml b/evm/Cargo.toml
index 6db81902..5ee3b1ff 100644
--- a/evm/Cargo.toml
+++ b/evm/Cargo.toml
@@ -8,14 +8,15 @@ edition = "2021"
 plonky2 = { path = "../plonky2", default-features = false, features = ["rand", "timing"] }
 plonky2_util = { path = "../util" }
 eth-trie-utils = { git = "https://github.com/mir-protocol/eth-trie-utils.git", rev = "dd3595b4ba7923f8d465450d210f17a2b4e20f96" }
-maybe_rayon = { path = "../maybe_rayon" }
 anyhow = "1.0.40"
 env_logger = "0.9.0"
 ethereum-types = "0.14.0"
 hex = { version = "0.4.3", optional = true }
 hex-literal = "0.3.4"
 itertools = "0.10.3"
+keccak-hash = "0.9.0"
 log = "0.4.14"
+maybe_rayon = { path = "../maybe_rayon" }
 once_cell = "1.13.0"
 pest = "2.1.3"
 pest_derive = "2.1.0"
@@ -23,7 +24,7 @@ rand = "0.8.5"
 rand_chacha = "0.3.1"
 rlp = "0.5.1"
 serde = { version = "1.0.144", features = ["derive"] }
-keccak-hash = "0.9.0"
+sha2 = "0.10.2"
 tiny-keccak = "2.0.2"
 
 [dev-dependencies]
diff --git a/evm/src/cpu/kernel/aggregator.rs b/evm/src/cpu/kernel/aggregator.rs
index 002a84fb..0c3015f2 100644
--- a/evm/src/cpu/kernel/aggregator.rs
+++ b/evm/src/cpu/kernel/aggregator.rs
@@ -39,9 +39,6 @@ pub(crate) fn combined_kernel() -> Kernel {
         include_str!("asm/memory/metadata.asm"),
         include_str!("asm/memory/packing.asm"),
         include_str!("asm/memory/txn_fields.asm"),
-        include_str!("asm/rlp/encode.asm"),
-        include_str!("asm/rlp/decode.asm"),
-        include_str!("asm/rlp/read_to_memory.asm"),
         include_str!("asm/mpt/hash.asm"),
         include_str!("asm/mpt/hash_trie_specific.asm"),
         include_str!("asm/mpt/hex_prefix.asm"),
@@ -51,6 +48,16 @@ pub(crate) fn combined_kernel() -> Kernel {
         include_str!("asm/mpt/storage_write.asm"),
         include_str!("asm/mpt/util.asm"),
         include_str!("asm/mpt/write.asm"),
+        include_str!("asm/rlp/encode.asm"),
+        include_str!("asm/rlp/decode.asm"),
+        include_str!("asm/rlp/read_to_memory.asm"),
+        include_str!("asm/sha2/compression.asm"),
+        include_str!("asm/sha2/constants.asm"),
+        include_str!("asm/sha2/message_schedule.asm"),
+        include_str!("asm/sha2/ops.asm"),
+        include_str!("asm/sha2/store_pad.asm"),
+        include_str!("asm/sha2/temp_words.asm"),
+        include_str!("asm/sha2/write_length.asm"),
         include_str!("asm/transactions/router.asm"),
         include_str!("asm/transactions/type_0.asm"),
         include_str!("asm/transactions/type_1.asm"),
diff --git a/evm/src/cpu/kernel/asm/memory/core.asm b/evm/src/cpu/kernel/asm/memory/core.asm
index 6722b0ca..26196df5 100644
--- a/evm/src/cpu/kernel/asm/memory/core.asm
+++ b/evm/src/cpu/kernel/asm/memory/core.asm
@@ -55,6 +55,148 @@
     // stack: (empty)
 %endmacro
 
+// Load a big-endian u32, consisting of 4 bytes (c_3, c_2, c_1, c_0),
+// from the given kernel memory segment, starting at `offset`.
+%macro mload_kernel_u32(segment)
+    // stack: offset
+    DUP1
+    %mload_kernel($segment)
+    // stack: c_3, offset
+    %shl_const(8)
+    // stack: c_3 << 8, offset
+    DUP2
+    %add_const(1)
+    %mload_kernel($segment)
+    OR
+    // stack: (c_3 << 8) | c_2, offset
+    %shl_const(8)
+    // stack: ((c_3 << 8) | c_2) << 8, offset
+    DUP2
+    %add_const(2)
+    %mload_kernel($segment)
+    OR
+    // stack: (((c_3 << 8) | c_2) << 8) | c_1, offset
+    %shl_const(8)
+    // stack: ((((c_3 << 8) | c_2) << 8) | c_1) << 8, offset
+    SWAP1
+    %add_const(3)
+    %mload_kernel($segment)
+    OR
+    // stack: (((((c_3 << 8) | c_2) << 8) | c_1) << 8) | c_0
+%endmacro
+
+// Load a big-endian u256, read as 8 u32 words (c_7, ..., c_0), from the given kernel segment.
+%macro mload_kernel_u256(segment)
+    // stack: offset
+    DUP1
+    %mload_kernel_u32($segment)
+    // stack: c_7, offset
+    %shl_const(32)
+    // stack: c_7 << 32, offset
+    DUP2
+    %add_const(4)
+    %mload_kernel_u32($segment)
+    OR
+    // stack: (c_7 << 32) | c_6, offset
+    %shl_const(32)
+    // stack: ((c_7 << 32) | c_6) << 32, offset
+    DUP2
+    %add_const(8)
+    %mload_kernel_u32($segment)
+    OR
+    // stack: (c_7 << 64) | (c_6 << 32) | c_5, offset
+    %shl_const(32)
+    // stack: ((c_7 << 64) | (c_6 << 32) | c_5) << 32, offset
+    DUP2
+    %add_const(12)
+    %mload_kernel_u32($segment)
+    OR
+    // stack: (c_7 << 96) | (c_6 << 64) | (c_5 << 32) | c_4, offset
+    %shl_const(32)
+    // stack: ((c_7 << 96) | (c_6 << 64) | (c_5 << 32) | c_4) << 32, offset
+    DUP2
+    %add_const(16)
+    %mload_kernel_u32($segment)
+    OR
+    // stack: (c_7 << 128) | (c_6 << 96) | (c_5 << 64) | (c_4 << 32) | c_3, offset
+    %shl_const(32)
+    // stack: ((c_7 << 128) | (c_6 << 96) | (c_5 << 64) | (c_4 << 32) | c_3) << 32, offset
+    DUP2
+    %add_const(20)
+    %mload_kernel_u32($segment)
+    OR
+    // stack: (c_7 << 160) | (c_6 << 128) | (c_5 << 96) | (c_4 << 64) | (c_3 << 32) | c_2, offset
+    %shl_const(32)
+    // stack: ((c_7 << 160) | (c_6 << 128) | (c_5 << 96) | (c_4 << 64) | (c_3 << 32) | c_2) << 32, offset
+    DUP2
+    %add_const(24)
+    %mload_kernel_u32($segment)
+    OR
+    // stack: (c_7 << 192) | (c_6 << 160) | (c_5 << 128) | (c_4 << 96) | (c_3 << 64) | (c_2 << 32) | c_1, offset
+    %shl_const(32)
+    // stack: ((c_7 << 192) | (c_6 << 160) | (c_5 << 128) | (c_4 << 96) | (c_3 << 64) | (c_2 << 32) | c_1) << 32, offset
+    DUP2
+    %add_const(28)
+    %mload_kernel_u32($segment)
+    OR
+    // stack: (c_7 << 224) | (c_6 << 192) | (c_5 << 160) | (c_4 << 128) | (c_3 << 96) | (c_2 << 64) | (c_1 << 32) | c_0, offset
+    SWAP1
+    POP
+    // stack: (c_7 << 224) | (c_6 << 192) | (c_5 << 160) | (c_4 << 128) | (c_3 << 96) | (c_2 << 64) | (c_1 << 32) | c_0
+%endmacro
+
+// Store the low 32 bits of value as a big-endian u32, i.e. 4 bytes (c_3, c_2, c_1, c_0),
+// to the given kernel memory segment at `offset`; both stack inputs are consumed.
+%macro mstore_kernel_u32(segment)
+    // stack: offset, value
+    SWAP1
+    // stack: value, offset
+    DUP1
+    // stack: value, value, offset
+    %and_const(0xff)
+    // stack: c_0 = value % (1 << 8), value, offset
+    SWAP1
+    // stack: value, c_0, offset
+    %shr_const(8)
+    // stack: value >> 8, c_0, offset
+    DUP1
+    // stack: value >> 8, value >> 8, c_0, offset
+    %and_const(0xff)
+    // stack: c_1 = (value >> 8) % (1 << 8), value >> 8, c_0, offset
+    SWAP1
+    // stack: value >> 8, c_1, c_0, offset
+    %shr_const(8)
+    // stack: value >> 16, c_1, c_0, offset
+    DUP1
+    // stack: value >> 16, value >> 16, c_1, c_0, offset
+    %and_const(0xff)
+    // stack: c_2 = (value >> 16) % (1 << 8), value >> 16, c_1, c_0, offset
+    SWAP1
+    // stack: value >> 16, c_2, c_1, c_0, offset
+    %shr_const(8)
+    // stack: value >> 24, c_2, c_1, c_0, offset
+    %and_const(0xff)
+    // stack: c_3 = (value >> 24) % (1 << 8), c_2, c_1, c_0, offset
+    DUP5
+    // stack: offset, c_3, c_2, c_1, c_0, offset
+    %mstore_kernel($segment)
+    // stack: c_2, c_1, c_0, offset
+    DUP4
+    // stack: offset, c_2, c_1, c_0, offset
+    %add_const(1)
+    %mstore_kernel($segment)
+    // stack: c_1, c_0, offset
+    DUP3
+    // stack: offset, c_1, c_0, offset
+    %add_const(2)
+    %mstore_kernel($segment)
+    // stack: c_0, offset
+    SWAP1
+    // stack: offset, c_0
+    %add_const(3)
+    %mstore_kernel($segment)
+%endmacro
+
 // Load a single byte from kernel code.
 %macro mload_kernel_code
     // stack: offset
@@ -62,34 +204,41 @@
     // stack: value
 %endmacro
 
+// Load a single byte from kernel general memory (@SEGMENT_KERNEL_GENERAL).
+%macro mload_kernel_general
+    // stack: offset
+    %mload_kernel(@SEGMENT_KERNEL_GENERAL)
+    // stack: value
+%endmacro
+
 // Load a big-endian u32, consisting of 4 bytes (c_3, c_2, c_1, c_0),
 // from kernel code.
 %macro mload_kernel_code_u32
     // stack: offset
-    DUP1
-    %mload_kernel_code
-    // stack: c_3, offset
-    %shl_const(8)
-    // stack: c_3 << 8, offset
-    DUP2
-    %add_const(1)
-    %mload_kernel_code
-    OR
-    // stack: (c_3 << 8) | c_2, offset
-    %shl_const(8)
-    // stack: ((c_3 << 8) | c_2) << 8, offset
-    DUP2
-    %add_const(2)
-    %mload_kernel_code
-    OR
-    // stack: (((c_3 << 8) | c_2) << 8) | c_1, offset
-    %shl_const(8)
-    // stack: ((((c_3 << 8) | c_2) << 8) | c_1) << 8, offset
-    SWAP1
-    %add_const(3)
-    %mload_kernel_code
-    OR
-    // stack: (((((c_3 << 8) | c_2) << 8) | c_1) << 8) | c_0
+    %mload_kernel_u32(@SEGMENT_CODE)
+    // stack: value
+%endmacro
+
+// Load a big-endian u32, consisting of 4 bytes (c_3, c_2, c_1, c_0),
+// from kernel general memory (@SEGMENT_KERNEL_GENERAL).
+%macro mload_kernel_general_u32
+    // stack: offset
+    %mload_kernel_u32(@SEGMENT_KERNEL_GENERAL)
+    // stack: value
+%endmacro
+
+// Load a big-endian u256 (8 consecutive u32 words) from kernel code (@SEGMENT_CODE).
+%macro mload_kernel_code_u256
+    // stack: offset
+    %mload_kernel_u256(@SEGMENT_CODE)
+    // stack: value
+%endmacro
+
+// Load a big-endian u256 (8 consecutive u32 words) from kernel general memory (@SEGMENT_KERNEL_GENERAL).
+%macro mload_kernel_general_u256
+    // stack: offset
+    %mload_kernel_u256(@SEGMENT_KERNEL_GENERAL)
+    // stack: value
 %endmacro
 
 // Store a single byte to kernel code.
@@ -99,6 +248,27 @@
     // stack: (empty)
 %endmacro
 
+// Store a single byte to kernel general memory (@SEGMENT_KERNEL_GENERAL).
+%macro mstore_kernel_general
+    // stack: offset, value
+    %mstore_kernel(@SEGMENT_KERNEL_GENERAL)
+    // stack: (empty)
+%endmacro
+
+// Store a big-endian u32, consisting of 4 bytes (c_3, c_2, c_1, c_0),
+// to kernel code (@SEGMENT_CODE), consuming both offset and value.
+%macro mstore_kernel_code_u32
+    // stack: offset, value
+    %mstore_kernel_u32(@SEGMENT_CODE)
+%endmacro
+
+// Store a big-endian u32, consisting of 4 bytes (c_3, c_2, c_1, c_0),
+// to kernel general memory (@SEGMENT_KERNEL_GENERAL), consuming both offset and value.
+%macro mstore_kernel_general_u32
+    // stack: offset, value
+    %mstore_kernel_u32(@SEGMENT_KERNEL_GENERAL)
+%endmacro
+
 // Store a single byte to @SEGMENT_RLP_RAW.
 %macro mstore_rlp
     // stack: offset, value
diff --git a/evm/src/cpu/kernel/asm/sha2/compression.asm b/evm/src/cpu/kernel/asm/sha2/compression.asm
new file mode 100644
index 00000000..eb9b73b8
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/sha2/compression.asm
@@ -0,0 +1,306 @@
+// Compute the scratch space address, 320 * num_blocks + 2 (just past the message
+// schedule space), where sha2_compression saves each block's initial working variables.
+%macro scratch_space_addr_from_num_blocks
+    // stack: num_blocks
+    %mul_const(320)
+    %add_const(2)
+%endmacro
+
+global sha2_compression:
+    // stack: message_schedule_addr, retdest
+    PUSH 0
+    // stack: i=0, message_schedule_addr, retdest
+    SWAP1
+    // stack: message_schedule_addr, i=0, retdest
+    PUSH 0
+    // stack: 0 (address of num_blocks in kernel general memory), message_schedule_addr, i=0, retdest
+    %mload_kernel_general
+    // stack: num_blocks, message_schedule_addr, i=0, retdest
+    DUP1
+    // stack: num_blocks, num_blocks, message_schedule_addr, i=0, retdest
+    %scratch_space_addr_from_num_blocks
+    // stack: scratch_space_addr, num_blocks, message_schedule_addr, i=0, retdest
+    SWAP1
+    // stack: num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    PUSH sha2_constants_h
+    %add_const(28)
+    %mload_kernel_code_u32
+    // stack: h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    PUSH sha2_constants_h
+    %add_const(24)
+    %mload_kernel_code_u32
+    // stack: g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    PUSH sha2_constants_h
+    %add_const(20)
+    %mload_kernel_code_u32
+    // stack: f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    PUSH sha2_constants_h
+    %add_const(16)
+    %mload_kernel_code_u32
+    // stack: e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    PUSH sha2_constants_h
+    %add_const(12)
+    %mload_kernel_code_u32
+    // stack: d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    PUSH sha2_constants_h
+    %add_const(8)
+    %mload_kernel_code_u32
+    // stack: c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    PUSH sha2_constants_h
+    %add_const(4)
+    %mload_kernel_code_u32
+    // stack: b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    PUSH sha2_constants_h
+    %mload_kernel_code_u32
+    // stack: a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+compression_start_block:
+    // Store the current values of the working variables, as the "initial values" to be added back in at the end of this block.
+    DUP10
+    // stack: scratch_space_addr, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+
+    DUP2
+    DUP2
+    // stack: scratch_space_addr, a[0], scratch_space_addr, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %mstore_kernel_general_u32
+    // stack: scratch_space_addr, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %add_const(4)
+    // stack: scratch_space_addr+4, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+
+    DUP3
+    DUP2
+    // stack: scratch_space_addr+4, b[0], scratch_space_addr+4, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %mstore_kernel_general_u32
+    // stack: scratch_space_addr+4, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %add_const(4)
+    // stack: scratch_space_addr+8, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    
+    DUP4
+    DUP2
+    // stack: scratch_space_addr+8, c[0], scratch_space_addr+8, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %mstore_kernel_general_u32
+    // stack: scratch_space_addr+8, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %add_const(4)
+    // stack: scratch_space_addr+12, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    
+    DUP5
+    DUP2
+    // stack: scratch_space_addr+12, d[0], scratch_space_addr+12, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %mstore_kernel_general_u32
+    // stack: scratch_space_addr+12, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %add_const(4)
+    // stack: scratch_space_addr+16, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    
+    DUP6
+    DUP2
+    // stack: scratch_space_addr+16, e[0], scratch_space_addr+16, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %mstore_kernel_general_u32
+    // stack: scratch_space_addr+16, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %add_const(4)
+    // stack: scratch_space_addr+20, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    
+    DUP7
+    DUP2
+    // stack: scratch_space_addr+20, f[0], scratch_space_addr+20, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %mstore_kernel_general_u32
+    // stack: scratch_space_addr+20, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %add_const(4)
+    // stack: scratch_space_addr+24, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    
+    DUP8
+    DUP2
+    // stack: scratch_space_addr+24, g[0], scratch_space_addr+24, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %mstore_kernel_general_u32
+    // stack: scratch_space_addr+24, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %add_const(4)
+    // stack: scratch_space_addr+28, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+
+    DUP9
+    DUP2
+    // stack: scratch_space_addr+28, h[0], scratch_space_addr+28, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    %mstore_kernel_general_u32
+    // stack: scratch_space_addr+28, a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+    POP
+    // stack: a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0], num_blocks, scratch_space_addr, message_schedule_addr, i=0, retdest
+compression_loop:
+    // Update the eight working variables, using the next constant K[i] and the next message schedule chunk W[i].
+    // stack: a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP11
+    // stack: message_schedule_addr, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP13
+    // stack: i, message_schedule_addr, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %mul_const(4)
+    // stack: 4*i, message_schedule_addr, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    ADD
+    // stack: message_schedule_addr + 4*i, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %mload_kernel_general_u32
+    // stack: W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    PUSH sha2_constants_k
+    // stack: sha2_constants_k, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP14
+    // stack: i, sha2_constants_k, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %mul_const(4)
+    // stack: 4*i, sha2_constants_k, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    ADD
+    // stack: sha2_constants_k + 4*i, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %mload_kernel_code_u32
+    // stack: K[i], W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %stack (start: 6, e, f, g, h) -> (e, f, g, h, start, e, f, g, h)
+    // stack: e[i], f[i], g[i], h[i], K[i], W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %sha2_temp_word1
+    // stack: T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %stack (t, a, b, c) -> (a, b, c, t, a, b, c)
+    // stack: a[i], b[i], c[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %sha2_temp_word2
+    // stack: T2[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP6
+    // stack: d[i], T2[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP3
+    // stack: T1[i], d[i], T2[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %add_u32
+    // stack: e[i+1]=T1[i]+d[i], T2[i], T1[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    SWAP2
+    // stack: T1[i], T2[i], e[i+1], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %add_u32
+    // stack: a[i+1]=T1[i]+T2[i], e[i+1], b[i+1]=a[i], c[i+1]=b[i], d[i+1]=c[i], d[i], f[i+1]=e[i], g[i+1]=f[i], h[i+1]=g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %stack (a, e, b, c, d, old_d, f, g, h, old_h) -> (a, b, c, d, e, f, g, h)
+    // stack: a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP12
+    // stack: i, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %increment
+    // stack: i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP1
+    // stack: i+1, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %eq_const(64)
+    // stack: i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP1
+    // stack: i+1==64, i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP12
+    // stack: num_blocks, i+1==64, i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    SUB
+    // stack: num_blocks new, i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    SWAP13
+    // stack: message_schedule_addr, i+1==64, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, retdest
+    SWAP1
+    // stack: i+1==64, message_schedule_addr, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, retdest
+    PUSH 256
+    MUL
+    // stack: (i+1==64)*256, message_schedule_addr, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, retdest
+    ADD
+    // stack: message_schedule_addr new, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, num_blocks new, i, retdest
+    SWAP12
+    // stack: num_blocks new, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks, scratch_space_addr, message_schedule_addr new, i, retdest
+    SWAP10
+    // stack: num_blocks, i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, i, retdest
+    POP
+    // stack: i+1, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, i, retdest
+    %and_const(63)
+    // stack: (i+1)%64, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, i, retdest
+    SWAP12
+    // stack: i, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, (i+1)%64, retdest
+    POP
+    // stack: a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, (i+1)%64, retdest
+    DUP12
+    // stack: (i+1)%64, a[i+1], b[i+1], c[i+1], d[i+1], e[i+1], f[i+1], g[i+1], h[i+1], num_blocks new, scratch_space_addr, message_schedule_addr new, (i+1)%64, retdest
+    ISZERO
+    %jumpi(compression_end_block)
+    %jump(compression_loop)
+compression_end_block:
+    // Add the initial values of the eight working variables (saved to scratch space by compression_start_block) back into them.
+    // stack: a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP10
+    // stack: scratch_space_addr, a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %mload_kernel_general_u32
+    // stack: a[0], a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %add_u32
+    // stack: a[0]+a[64], b[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    SWAP1
+    // stack: b[64], a[0]+a[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP10
+    %add_const(4)
+    %mload_kernel_general_u32
+    // stack: b[0], b[64], a[0]+a[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %add_u32
+    // stack: b[0]+b[64], a[0]+a[64], c[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    SWAP2
+    // stack: c[64], a[0]+a[64], b[0]+b[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP10
+    %add_const(8)
+    %mload_kernel_general_u32
+    // stack: c[0], c[64], a[0]+a[64], b[0]+b[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %add_u32
+    // stack: c[0]+c[64], a[0]+a[64], b[0]+b[64], d[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    SWAP3
+    // stack: d[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP10
+    %add_const(12)
+    %mload_kernel_general_u32
+    // stack: d[0], d[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %add_u32
+    // stack: d[0]+d[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], e[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    SWAP4
+    // stack: e[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP10
+    %add_const(16)
+    %mload_kernel_general_u32
+    // stack: e[0], e[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %add_u32
+    // stack: e[0]+e[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], f[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    SWAP5
+    // stack: f[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP10
+    %add_const(20)
+    %mload_kernel_general_u32
+    // stack: f[0], f[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %add_u32
+    // stack: f[0]+f[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], g[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    SWAP6
+    // stack: g[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP10
+    %add_const(24)
+    %mload_kernel_general_u32
+    // stack: g[0], g[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %add_u32
+    // stack: g[0]+g[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], h[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    SWAP7
+    // stack: h[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    DUP10
+    %add_const(28)
+    %mload_kernel_general_u32
+    // stack: h[0], h[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    %add_u32
+    // stack: h[0]+h[64], a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], num_blocks, scratch_space_addr, message_schedule_addr, i, retdest
+    SWAP8
+    // stack: num_blocks, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], scratch_space_addr, message_schedule_addr, i, retdest
+    DUP1
+    // stack: num_blocks, num_blocks, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], scratch_space_addr, message_schedule_addr, i, retdest
+    ISZERO
+    // If num_blocks == 0, we've finished all the blocks; otherwise fall through and start the next block.
+    %jumpi(compression_end)
+    // stack: num_blocks, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], scratch_space_addr, message_schedule_addr, i, retdest
+    %stack (num_blocks, working: 8) -> (working, num_blocks)
+    %jump(compression_start_block)
+compression_end:
+    // stack: num_blocks, a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], scratch_space_addr, message_schedule_addr, i, retdest
+    POP
+    // stack: a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64], scratch_space_addr, message_schedule_addr, i, retdest (a[0]+a[64] is shifted seven times below, ending in the top 32 bits)
+    %shl_const(32)
+    OR
+    %shl_const(32)
+    OR
+    %shl_const(32)
+    OR
+    %shl_const(32)
+    OR
+    %shl_const(32)
+    OR
+    %shl_const(32)
+    OR
+    %shl_const(32)
+    OR
+    // stack: concat(a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64]), scratch_space_addr, message_schedule_addr, i, retdest
+    SWAP3
+    // stack: i, scratch_space_addr, message_schedule_addr, concat(a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64]), retdest
+    %pop3
+    // stack: sha2_result = concat(a[0]+a[64], b[0]+b[64], c[0]+c[64], d[0]+d[64], e[0]+e[64], f[0]+f[64], g[0]+g[64], h[0]+h[64]), retdest
+    SWAP1
+    JUMP
diff --git a/evm/src/cpu/kernel/asm/sha2/constants.asm b/evm/src/cpu/kernel/asm/sha2/constants.asm
new file mode 100644
index 00000000..d39661f8
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/sha2/constants.asm
@@ -0,0 +1,75 @@
+global sha2_constants_k:
+    BYTES 66, 138, 47, 152
+    BYTES 113, 55, 68, 145
+    BYTES 181, 192, 251, 207
+    BYTES 233, 181, 219, 165
+    BYTES 57, 86, 194, 91
+    BYTES 89, 241, 17, 241
+    BYTES 146, 63, 130, 164
+    BYTES 171, 28, 94, 213
+    BYTES 216, 7, 170, 152
+    BYTES 18, 131, 91, 1
+    BYTES 36, 49, 133, 190
+    BYTES 85, 12, 125, 195
+    BYTES 114, 190, 93, 116
+    BYTES 128, 222, 177, 254
+    BYTES 155, 220, 6, 167
+    BYTES 193, 155, 241, 116
+    BYTES 228, 155, 105, 193
+    BYTES 239, 190, 71, 134
+    BYTES 15, 193, 157, 198
+    BYTES 36, 12, 161, 204
+    BYTES 45, 233, 44, 111
+    BYTES 74, 116, 132, 170
+    BYTES 92, 176, 169, 220
+    BYTES 118, 249, 136, 218
+    BYTES 152, 62, 81, 82
+    BYTES 168, 49, 198, 109
+    BYTES 176, 3, 39, 200
+    BYTES 191, 89, 127, 199
+    BYTES 198, 224, 11, 243
+    BYTES 213, 167, 145, 71
+    BYTES 6, 202, 99, 81
+    BYTES 20, 41, 41, 103
+    BYTES 39, 183, 10, 133
+    BYTES 46, 27, 33, 56
+    BYTES 77, 44, 109, 252
+    BYTES 83, 56, 13, 19
+    BYTES 101, 10, 115, 84
+    BYTES 118, 106, 10, 187
+    BYTES 129, 194, 201, 46
+    BYTES 146, 114, 44, 133
+    BYTES 162, 191, 232, 161
+    BYTES 168, 26, 102, 75
+    BYTES 194, 75, 139, 112
+    BYTES 199, 108, 81, 163
+    BYTES 209, 146, 232, 25
+    BYTES 214, 153, 6, 36
+    BYTES 244, 14, 53, 133
+    BYTES 16, 106, 160, 112
+    BYTES 25, 164, 193, 22
+    BYTES 30, 55, 108, 8
+    BYTES 39, 72, 119, 76
+    BYTES 52, 176, 188, 181
+    BYTES 57, 28, 12, 179
+    BYTES 78, 216, 170, 74
+    BYTES 91, 156, 202, 79
+    BYTES 104, 46, 111, 243
+    BYTES 116, 143, 130, 238
+    BYTES 120, 165, 99, 111
+    BYTES 132, 200, 120, 20
+    BYTES 140, 199, 2, 8
+    BYTES 144, 190, 255, 250
+    BYTES 164, 80, 108, 235
+    BYTES 190, 249, 163, 247
+    BYTES 198, 113, 120, 242
+
+global sha2_constants_h:
+    BYTES 106, 9, 230, 103
+    BYTES 187, 103, 174, 133
+    BYTES 60, 110, 243, 114
+    BYTES 165, 79, 245, 58
+    BYTES 81, 14, 82, 127
+    BYTES 155, 5, 104, 140
+    BYTES 31, 131, 217, 171
+    BYTES 91, 224, 205, 25
diff --git a/evm/src/cpu/kernel/asm/sha2/message_schedule.asm b/evm/src/cpu/kernel/asm/sha2/message_schedule.asm
new file mode 100644
index 00000000..78d98634
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/sha2/message_schedule.asm
@@ -0,0 +1,240 @@
+// Computes where the message schedules are stored: address 64 * num_blocks + 2 (the padded input, num_blocks followed by the blocks, occupies addresses 0..=64 * num_blocks).
+%macro message_schedule_addr_from_num_blocks
+    // stack: num_blocks  (replaced by 64 * num_blocks + 2 once the macro finishes)
+    %mul_const(64)
+    %add_const(2)
+%endmacro
+
+// Precondition: stack contains address of one message block, followed by output address
+// Postcondition: 256 bytes starting at given output address contain the 64 32-bit chunks
+//                of message schedule (in four-byte increments)
+gen_message_schedule_from_block:
+    // stack: block_addr, output_addr, retdest
+    DUP1
+    // stack: block_addr, block_addr, output_addr, retdest
+    %add_const(32)
+    // stack: block_addr + 32, block_addr, output_addr, retdest
+    SWAP1
+    // stack: block_addr, block_addr + 32, output_addr, retdest
+    %mload_kernel_general_u256
+    // stack: block[0], block_addr + 32, output_addr, retdest
+    SWAP1
+    // stack: block_addr + 32, block[0], output_addr, retdest
+    %mload_kernel_general_u256
+    // stack: block[1], block[0], output_addr, retdest
+    SWAP2
+    // stack: output_addr, block[0], block[1], retdest
+    %add_const(28)
+    PUSH 8
+    // stack: counter=8, output_addr + 28, block[0], block[1], retdest
+    %jump(gen_message_schedule_from_block_0_loop)
+gen_message_schedule_from_block_0_loop:
+    // Split the first half (256 bits) of the block into the first eight (32-bit) chunks of the message schedule.
+    // stack: counter, output_addr, block[0], block[1], retdest
+    SWAP2
+    // stack: block[0], output_addr, counter, block[1], retdest
+    DUP1
+    // stack: block[0], block[0], output_addr, counter, block[1], retdest
+    %shr_const(32)
+    // stack: block[0] >> 32, block[0], output_addr, counter, block[1], retdest
+    SWAP1
+    // stack: block[0], block[0] >> 32, output_addr, counter, block[1], retdest
+    %as_u32
+    // stack: block[0] % (1 << 32), block[0] >> 32, output_addr, counter, block[1], retdest
+    DUP3
+    // stack: output_addr, block[0] % (1 << 32), block[0] >> 32, output_addr, counter, block[1], retdest
+    %mstore_kernel_general_u32
+    // stack: block[0] >> 32, output_addr, counter, block[1], retdest
+    SWAP1
+    // stack: output_addr, block[0] >> 32, counter, block[1], retdest
+    %sub_const(4)
+    // stack: output_addr - 4, block[0] >> 32, counter, block[1], retdest
+    SWAP1
+    // stack: block[0] >> 32, output_addr - 4, counter, block[1], retdest
+    SWAP2
+    // stack: counter, output_addr - 4, block[0] >> 32, block[1], retdest
+    %decrement
+    DUP1
+    ISZERO
+    %jumpi(gen_message_schedule_from_block_0_end)
+    %jump(gen_message_schedule_from_block_0_loop)
+gen_message_schedule_from_block_0_end:
+    // stack: old counter=0, output_addr, block[0], block[1], retdest
+    POP
+    PUSH 8
+    // stack: counter=8, output_addr, block[0], block[1], retdest
+    %stack (counter, out, b0, b1) -> (out, counter, b1, b0)
+    // stack: output_addr, counter, block[1], block[0], retdest
+    %add_const(64)
+    // stack: output_addr + 64, counter, block[1], block[0], retdest
+    SWAP1
+    // stack: counter, output_addr + 64, block[1], block[0], retdest
+gen_message_schedule_from_block_1_loop:
+    // Split the second half (256 bits) of the block into the next eight (32-bit) chunks of the message schedule.
+    // stack: counter, output_addr, block[1], block[0], retdest
+    SWAP2
+    // stack: block[1], output_addr, counter, block[0], retdest
+    DUP1
+    // stack: block[1], block[1], output_addr, counter, block[0], retdest
+    %shr_const(32)
+    // stack: block[1] >> 32, block[1], output_addr, counter, block[0], retdest
+    SWAP1
+    // stack: block[1], block[1] >> 32, output_addr, counter, block[0], retdest
+    %as_u32
+    // stack: block[1] % (1 << 32), block[1] >> 32, output_addr, counter, block[0], retdest
+    DUP3
+    // stack: output_addr, block[1] % (1 << 32), block[1] >> 32, output_addr, counter, block[0], retdest
+    %mstore_kernel_general_u32
+    // stack: block[1] >> 32, output_addr, counter, block[0], retdest
+    SWAP1
+    // stack: output_addr, block[1] >> 32, counter, block[0], retdest
+    %sub_const(4)
+    // stack: output_addr - 4, block[1] >> 32, counter, block[0], retdest
+    SWAP1
+    // stack: block[1] >> 32, output_addr - 4, counter, block[0], retdest
+    SWAP2
+    // stack: counter, output_addr - 4, block[1] >> 32, block[0], retdest
+    %decrement
+    DUP1
+    ISZERO
+    %jumpi(gen_message_schedule_from_block_1_end)
+    %jump(gen_message_schedule_from_block_1_loop)
+gen_message_schedule_from_block_1_end:
+    // stack: old counter=0, output_addr, block[1], block[0], retdest
+    POP
+    // stack: output_addr, block[0], block[1], retdest
+    PUSH 48
+    // stack: counter=48, output_addr, block[0], block[1], retdest
+    SWAP1
+    // stack: output_addr, counter, block[0], block[1], retdest
+    %add_const(36)
+    // stack: output_addr + 36, counter, block[0], block[1], retdest
+    SWAP1
+    // stack: counter, output_addr + 36, block[0], block[1], retdest
+gen_message_schedule_remaining_loop:
+    // Generate the next 48 chunks of the message schedule, one at a time, from prior chunks.
+    // stack: counter, output_addr, block[0], block[1], retdest
+    SWAP1
+    // stack: output_addr, counter, block[0], block[1], retdest
+    DUP1
+    // stack: output_addr, output_addr, counter, block[0], block[1], retdest
+    PUSH 2
+    PUSH 4
+    MUL
+    SWAP1
+    SUB
+    // stack: output_addr - 2*4, output_addr, counter, block[0], block[1], retdest
+    %mload_kernel_general_u32
+    // stack: x[output_addr - 2*4], output_addr, counter, block[0], block[1], retdest
+    %sha2_sigma_1
+    // stack: sigma_1(x[output_addr - 2*4]), output_addr, counter, block[0], block[1], retdest
+    SWAP1
+    // stack: output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest
+    DUP1
+    // stack: output_addr, output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest
+    PUSH 7
+    PUSH 4
+    MUL
+    SWAP1
+    SUB
+    // stack: output_addr - 7*4, output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest
+    %mload_kernel_general_u32
+    // stack: x[output_addr - 7*4], output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest
+    SWAP1
+    // stack: output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest
+    DUP1
+    // stack: output_addr, output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest
+    PUSH 15
+    PUSH 4
+    MUL
+    SWAP1
+    SUB
+    // stack: output_addr - 15*4, output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest
+    %mload_kernel_general_u32
+    // stack: x[output_addr - 15*4], output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest
+    %sha2_sigma_0
+    // stack: sigma_0(x[output_addr - 15*4]), output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest
+    SWAP1
+    // stack: output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest
+    DUP1
+    // stack: output_addr, output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest
+    PUSH 16
+    PUSH 4
+    MUL
+    SWAP1
+    SUB
+    // stack: output_addr - 16*4, output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest
+    %mload_kernel_general_u32
+    // stack: x[output_addr - 16*4], output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest
+    SWAP1
+    // stack: output_addr, x[output_addr - 16*4], sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest
+    SWAP4
+    // stack: sigma_1(x[output_addr - 2*4]), x[output_addr - 16*4], sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], output_addr, counter, block[0], block[1], retdest
+    %add_u32
+    %add_u32
+    %add_u32
+    // stack: sigma_1(x[output_addr - 2*4]) + x[output_addr - 16*4] + sigma_0(x[output_addr - 15*4]) + x[output_addr - 7*4], output_addr, counter, block[0], block[1], retdest
+    DUP2
+    // stack: output_addr, sigma_1(x[output_addr - 2*4]) + x[output_addr - 16*4] + sigma_0(x[output_addr - 15*4]) + x[output_addr - 7*4], output_addr, counter, block[0], block[1], retdest
+    %mstore_kernel_general_u32
+    // stack: output_addr, counter, block[0], block[1], retdest
+    %add_const(4)
+    // stack: output_addr + 4, counter, block[0], block[1], retdest
+    SWAP1
+    // stack: counter, output_addr + 4, block[0], block[1], retdest
+    %decrement
+    // stack: counter - 1, output_addr + 4, block[0], block[1], retdest
+    DUP1
+    ISZERO
+    %jumpi(gen_message_schedule_remaining_end)
+    %jump(gen_message_schedule_remaining_loop)
+gen_message_schedule_remaining_end:
+    // stack: counter=0, output_addr, block[0], block[1], retdest
+    %pop4
+    JUMP
+
+// Precondition: memory, starting at 0, contains num_blocks, block0[0], ..., block0[63], block1[0], ..., blocklast[63];
+//               stack contains output_addr, retdest
+// Postcondition: memory starting at output_addr holds one 256-byte region per block;
+//                each region contains the 64 32-bit chunks of the message schedule for that block (in four-byte increments)
+global sha2_gen_all_message_schedules: 
+    // stack: output_addr, retdest
+    DUP1
+    // stack: output_addr, output_addr, retdest
+    PUSH 0
+    // stack: 0, output_addr, output_addr, retdest
+    %mload_kernel_general
+    // stack: num_blocks, output_addr, output_addr, retdest
+    PUSH 1
+    // stack: cur_addr = 1, counter = num_blocks, output_addr, output_addr, retdest  (block data starts at address 1; address 0 holds num_blocks)
+gen_all_message_schedules_loop:
+    // stack: cur_addr, counter, cur_output_addr, output_addr, retdest
+    PUSH gen_all_message_schedules_loop_end
+    // stack: new_retdest = gen_all_message_schedules_loop_end, cur_addr, counter, cur_output_addr, output_addr, retdest
+    DUP4
+    // stack: cur_output_addr, new_retdest, cur_addr, counter, cur_output_addr, output_addr, retdest
+    DUP3
+    // stack: cur_addr, cur_output_addr, new_retdest, cur_addr, counter, cur_output_addr, output_addr, retdest
+    %jump(gen_message_schedule_from_block)
+gen_all_message_schedules_loop_end:
+    // stack: cur_addr, counter, cur_output_addr, output_addr, retdest  (the callee consumed its block address, output address and return address)
+    %add_const(64)
+    // stack: cur_addr + 64, counter, cur_output_addr, output_addr, retdest
+    SWAP1
+    %decrement
+    SWAP1
+    // stack: cur_addr + 64, counter - 1, cur_output_addr, output_addr, retdest
+    SWAP2
+    %add_const(256)
+    SWAP2
+    // stack: cur_addr + 64, counter - 1, cur_output_addr + 256, output_addr, retdest
+    DUP2
+    // stack: counter - 1, cur_addr + 64, counter - 1, cur_output_addr + 256, output_addr, retdest
+    ISZERO
+    %jumpi(gen_all_message_schedules_end)
+    %jump(gen_all_message_schedules_loop)
+gen_all_message_schedules_end:
+    // stack: cur_addr + 64, counter - 1, cur_output_addr + 256, output_addr, retdest
+    %pop3
+    // stack: output_addr, retdest  (handed unchanged to sha2_compression)
+    %jump(sha2_compression)
diff --git a/evm/src/cpu/kernel/asm/sha2/ops.asm b/evm/src/cpu/kernel/asm/sha2/ops.asm
new file mode 100644
index 00000000..7d8054ca
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/sha2/ops.asm
@@ -0,0 +1,130 @@
+// 32-bit right rotation: replaces `value` on top of the stack with rotr(value, $rot). Only the left-shifted half is masked, so the result fits in 32 bits only if value < 2^32.
+%macro rotr(rot)
+    // stack: value
+    PUSH $rot
+    // stack: rot, value
+    DUP2
+    DUP2
+    // stack: rot, value, rot, value
+    SHR
+    // stack: value >> rot, rot, value
+    %stack (shifted, rot, value) -> (rot, value, shifted)
+    // stack: rot, value, value >> rot
+    PUSH 32
+    SUB
+    // stack: 32 - rot, value, value >> rot
+    SHL
+    // stack: value << (32 - rot), value >> rot
+    %as_u32
+    // stack: (value << (32 - rot)) % (1 << 32), value >> rot  (for value < 2^32 the two halves occupy disjoint bits, so the ADD below acts as a bitwise OR)
+    ADD
+%endmacro
+
+%macro sha2_sigma_0
+    // stack: x  (replaced by sigma_0(x) = rotr(x, 7) xor rotr(x, 18) xor shr(x, 3))
+    DUP1
+    // stack: x, x
+    %rotr(7)
+    // stack: rotr(x, 7), x
+    %stack (rotated, x) -> (x, x, rotated)
+    // stack: x, x, rotr(x, 7)
+    %rotr(18)
+    // stack: rotr(x, 18), x, rotr(x, 7)
+    SWAP1
+    // stack: x, rotr(x, 18), rotr(x, 7)
+    PUSH 3
+    SHR
+    // stack: shr(x, 3), rotr(x, 18), rotr(x, 7)  (the two XORs below fold these three words into one)
+    XOR
+    XOR
+%endmacro
+
+%macro sha2_sigma_1
+    // stack: x  (replaced by sigma_1(x) = rotr(x, 17) xor rotr(x, 19) xor shr(x, 10))
+    DUP1
+    // stack: x, x
+    %rotr(17)
+    // stack: rotr(x, 17), x
+    %stack (rotated, x) -> (x, x, rotated)
+    // stack: x, x, rotr(x, 17)
+    %rotr(19)
+    // stack: rotr(x, 19), x, rotr(x, 17)
+    SWAP1
+    // stack: x, rotr(x, 19), rotr(x, 17)
+    PUSH 10
+    SHR
+    // stack: shr(x, 10), rotr(x, 19), rotr(x, 17)  (the two XORs below fold these three words into one)
+    XOR
+    XOR
+%endmacro
+
+%macro sha2_bigsigma_0
+    // stack: x  (replaced by Sigma_0(x) = rotr(x, 2) xor rotr(x, 13) xor rotr(x, 22))
+    DUP1
+    // stack: x, x
+    %rotr(2)
+    // stack: rotr(x, 2), x
+    %stack (rotated, x) -> (x, x, rotated)
+    // stack: x, x, rotr(x, 2)
+    %rotr(13)
+    // stack: rotr(x, 13), x, rotr(x, 2)
+    SWAP1
+    // stack: x, rotr(x, 13), rotr(x, 2)
+    %rotr(22)
+    // stack: rotr(x, 22), rotr(x, 13), rotr(x, 2)  (the two XORs below fold these three words into one)
+    XOR
+    XOR
+%endmacro
+
+%macro sha2_bigsigma_1
+    // stack: x  (replaced by Sigma_1(x) = rotr(x, 6) xor rotr(x, 11) xor rotr(x, 25))
+    DUP1
+    // stack: x, x
+    %rotr(6)
+    // stack: rotr(x, 6), x
+    %stack (rotated, x) -> (x, x, rotated)
+    // stack: x, x, rotr(x, 6)
+    %rotr(11)
+    // stack: rotr(x, 11), x, rotr(x, 6)
+    SWAP1
+    // stack: x, rotr(x, 11), rotr(x, 6)
+    %rotr(25)
+    // stack: rotr(x, 25), rotr(x, 11), rotr(x, 6)  (the two XORs below fold these three words into one)
+    XOR
+    XOR
+%endmacro
+
+%macro sha2_choice
+    // stack: x, y, z  (replaced by Ch(x, y, z) = (x and y) or ((not x) and z))
+    DUP1
+    // stack: x, x, y, z
+    NOT
+    // stack: not x, x, y, z  (NOTE(review): NOT presumably inverts the whole stack word, not only the low 32 bits; the AND with z below keeps only bits that are set in z)
+    %stack (notx, x, y, z) -> (notx, z, x, y)
+    // stack: not x, z, x, y
+    AND
+    // stack: (not x) and z, x, y
+    %stack (nxz, x, y) -> (x, y, nxz)
+    // stack: x, y, (not x) and z
+    AND
+    // stack: x and y, (not x) and z
+    OR
+%endmacro
+
+%macro sha2_majority
+    // stack: x, y, z  (replaced by Maj(x, y, z) = (x and y) or (x and z) or (y and z))
+    %stack (xyz: 3) -> (xyz, xyz)
+    // stack: x, y, z, x, y, z
+    AND
+    // stack: x and y, z, x, y, z
+    SWAP2
+    // stack: x, z, x and y, y, z
+    AND
+    // stack: x and z, x and y, y, z
+    %stack (a: 2, b: 2) -> (b, a)
+    // stack: y, z, x and z, x and y
+    AND
+    // stack: y and z, x and z, x and y  (the two ORs below combine the three pairwise ANDs)
+    OR
+    OR
+%endmacro
diff --git a/evm/src/cpu/kernel/asm/sha2/store_pad.asm b/evm/src/cpu/kernel/asm/sha2/store_pad.asm
new file mode 100644
index 00000000..7594eb81
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/sha2/store_pad.asm
@@ -0,0 +1,89 @@
+global sha2:
+    %jump(sha2_store)
+
+global sha2_store:
+    // stack: num_bytes, x[0], x[1], ..., x[num_bytes - 1], retdest  (also reached via `sha2`; copies the input from the stack into memory, then continues at sha2_pad)
+    DUP1
+    // stack: num_bytes, num_bytes, x[0], x[1], ..., x[num_bytes - 1], retdest
+    PUSH 0
+    // stack: addr=0, num_bytes, num_bytes, x[0], x[1], ..., x[num_bytes - 1], retdest  (address 0 receives num_bytes)
+    %mstore_kernel_general
+    // stack: num_bytes, x[0], x[1], ..., x[num_bytes - 1], retdest
+    PUSH 1
+    // stack: addr=1, counter=num_bytes, x[0], x[1], x[2], ... , x[num_bytes-1], retdest  (message bytes are written from address 1 upwards)
+store_loop:
+    // stack: addr, counter, x[num_bytes-counter], ... , x[num_bytes-1], retdest
+    DUP2
+    // stack: counter, addr, counter, x[num_bytes-counter], ... , x[num_bytes-1], retdest
+    ISZERO
+    %jumpi(store_end)
+    // stack: addr, counter, x[num_bytes-counter], ... , x[num_bytes-1], retdest
+    %stack (addr, counter, val) -> (addr, val, counter, addr)
+    // stack: addr, x[num_bytes-counter], counter, addr,  ... , x[num_bytes-1], retdest
+    %mstore_kernel_general
+    // stack: counter, addr,  ... , x[num_bytes-1], retdest
+    %decrement
+    // stack: counter-1, addr,  ... , x[num_bytes-1], retdest
+    SWAP1
+    // stack: addr, counter-1,  ... , x[num_bytes-1], retdest
+    %increment
+    // stack: addr+1, counter-1,  ... , x[num_bytes-1], retdest
+    %jump(store_loop)
+store_end:
+    // stack: addr, counter=0, retdest  (every input byte has been consumed from the stack)
+    %pop2
+    // stack: retdest  (memory now holds num_bytes, x[0], ..., x[num_bytes - 1] starting at address 0)
+    %jump(sha2_pad)
+
+// Precondition: input is in memory, starting at 0 of kernel general segment, of the form
+//               num_bytes, x[0], x[1], ..., x[num_bytes - 1]
+// Postcondition: output is in memory, starting at 0, of the form
+//                num_blocks, block0[0], ..., block0[63], block1[0], ..., blocklast[63]
+global sha2_pad:
+    // stack: retdest
+    PUSH 0
+    %mload_kernel_general
+    // stack: num_bytes, retdest
+    // STEP 1: append a single 1 bit (as the byte 0x80)
+    // store 128 (= 1 << 7) at address num_bytes + 1, i.e. directly after the last message byte x[num_bytes - 1]
+    // stack: num_bytes, retdest
+    PUSH 1
+    PUSH 7
+    SHL
+    // stack: 128, num_bytes, retdest
+    DUP2
+    // stack: num_bytes, 128, num_bytes, retdest
+    %increment
+    // stack: num_bytes+1, 128, num_bytes, retdest
+    %mstore_kernel_general
+    // stack: num_bytes, retdest
+    // STEP 2: calculate num_blocks := (num_bytes+8)//64 + 1  (enough 64-byte blocks for the message, the 0x80 byte and the 8-byte length)
+    DUP1
+    // stack: num_bytes, num_bytes, retdest
+    %add_const(8)
+    %div_const(64)
+    
+    %increment
+    // stack: num_blocks = (num_bytes+8)//64 + 1, num_bytes, retdest
+    // STEP 3: calculate length := num_bytes*8  (the message length in bits)
+    SWAP1
+    // stack: num_bytes, num_blocks, retdest
+    PUSH 8
+    MUL
+    // stack: length = num_bytes*8, num_blocks, retdest
+    // STEP 4: write length to addresses num_blocks*64-7..num_blocks*64 (the last 8 bytes of the final block)
+    DUP2
+    // stack: num_blocks, length, num_blocks, retdest
+    PUSH 64
+    MUL
+    // stack: last_addr = num_blocks*64, length, num_blocks, retdest
+    %sha2_write_length
+    // stack: num_blocks, retdest
+    DUP1
+    // stack: num_blocks, num_blocks, retdest
+    // STEP 5: write num_blocks to address 0 (overwriting num_bytes)
+    PUSH 0
+    %mstore_kernel_general
+    // stack: num_blocks, retdest
+    %message_schedule_addr_from_num_blocks
+    %jump(sha2_gen_all_message_schedules)
diff --git a/evm/src/cpu/kernel/asm/sha2/temp_words.asm b/evm/src/cpu/kernel/asm/sha2/temp_words.asm
new file mode 100644
index 00000000..ed610947
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/sha2/temp_words.asm
@@ -0,0 +1,32 @@
+// "T_1" in the SHA-256 spec: h + Sigma_1(e) + Ch(e, f, g) + K[i] + W[i], with every addition reduced mod 2^32 by %add_u32.
+%macro sha2_temp_word1
+    // stack: e, f, g, h, K[i], W[i]
+    DUP1
+    // stack: e, e, f, g, h, K[i], W[i]
+    %sha2_bigsigma_1
+    // stack: Sigma_1(e), e, f, g, h, K[i], W[i]
+    %stack (sig, e, f, g) -> (e, f, g, sig)
+    // stack: e, f, g, Sigma_1(e), h, K[i], W[i]
+    %sha2_choice
+    // stack: Ch(e, f, g), Sigma_1(e), h, K[i], W[i]  (five terms, hence the four additions below)
+    %add_u32
+    %add_u32
+    %add_u32
+    %add_u32
+    // stack: Ch(e, f, g) + Sigma_1(e) + h + K[i] + W[i]
+%endmacro
+
+// "T_2" in the SHA-256 spec: Sigma_0(a) + Maj(a, b, c), reduced mod 2^32 by %add_u32.
+%macro sha2_temp_word2
+    // stack: a, b, c
+    DUP1
+    // stack: a, a, b, c
+    %sha2_bigsigma_0
+    // stack: Sigma_0(a), a, b, c
+    SWAP3
+    // stack: c, a, b, Sigma_0(a)
+    %sha2_majority
+    // stack: Maj(c, a, b), Sigma_0(a)  (Maj is symmetric in its arguments, so the order c, a, b gives the same value as Maj(a, b, c))
+    %add_u32
+    // stack: Maj(c, a, b) + Sigma_0(a)
+%endmacro
diff --git a/evm/src/cpu/kernel/asm/sha2/write_length.asm b/evm/src/cpu/kernel/asm/sha2/write_length.asm
new file mode 100644
index 00000000..5727498c
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/sha2/write_length.asm
@@ -0,0 +1,119 @@
+%macro sha2_write_length
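+    // stack: last_addr, length  (stores the low 8 bytes of length big-endian at addresses last_addr - 7 ..= last_addr, then pops both operands)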
+    SWAP1
+    // stack: length, last_addr
+    DUP1
+    // stack: length, length, last_addr
+    %and_const(0xff)
+    // stack: length % (1 << 8), length, last_addr
+    DUP3
+    // stack: last_addr, length % (1 << 8), length, last_addr
+    %mstore_kernel_general
+    
+    // stack: length, last_addr
+    SWAP1
+    %decrement
+    SWAP1
+    // stack: length, last_addr - 1
+    %shr_const(8)
+    // stack: length >> 8, last_addr - 1
+    DUP1
+    // stack: length >> 8, length >> 8, last_addr - 1
+    %and_const(0xff)
+    // stack: (length >> 8) % (1 << 8), length >> 8, last_addr - 1
+    DUP3
+    // stack: last_addr - 1, (length >> 8) % (1 << 8), length >> 8, last_addr - 1
+    %mstore_kernel_general
+    
+    // stack: length >> 8, last_addr - 1
+    SWAP1
+    %decrement
+    SWAP1
+    // stack: length >> 8, last_addr - 2
+    %shr_const(8)
+    // stack: length >> 16, last_addr - 2
+    DUP1
+    // stack: length >> 16, length >> 16, last_addr - 2
+    %and_const(0xff)
+    // stack: (length >> 16) % (1 << 8), length >> 16, last_addr - 2
+    DUP3
+    // stack: last_addr - 2, (length >> 16) % (1 << 8), length >> 16, last_addr - 2
+    %mstore_kernel_general
+
+    // stack: length >> 16, last_addr - 2
+    SWAP1
+    %decrement
+    SWAP1
+    // stack: length >> 16, last_addr - 3
+    %shr_const(8)
+    // stack: length >> 24, last_addr - 3
+    DUP1
+    // stack: length >> 24, length >> 24, last_addr - 3
+    %and_const(0xff)
+    // stack: (length >> 24) % (1 << 8), length >> 24, last_addr - 3
+    DUP3
+    // stack: last_addr - 3, (length >> 24) % (1 << 8), length >> 24, last_addr - 3
+    %mstore_kernel_general
+
+    // stack: length >> 24, last_addr - 3
+    SWAP1
+    %decrement
+    SWAP1
+    // stack: length >> 24, last_addr - 4
+    %shr_const(8)
+    // stack: length >> 32, last_addr - 4
+    DUP1
+    // stack: length >> 32, length >> 32, last_addr - 4
+    %and_const(0xff)
+    // stack: (length >> 32) % (1 << 8), length >> 32, last_addr - 4
+    DUP3
+    // stack: last_addr - 4, (length >> 32) % (1 << 8), length >> 32, last_addr - 4
+    %mstore_kernel_general
+
+    // stack: length >> 32, last_addr - 4
+    SWAP1
+    %decrement
+    SWAP1
+    // stack: length >> 32, last_addr - 5
+    %shr_const(8)
+    // stack: length >> 40, last_addr - 5
+    DUP1
+    // stack: length >> 40, length >> 40, last_addr - 5
+    %and_const(0xff)
+    // stack: (length >> 40) % (1 << 8), length >> 40, last_addr - 5
+    DUP3
+    // stack: last_addr - 5, (length >> 40) % (1 << 8), length >> 40, last_addr - 5
+    %mstore_kernel_general
+
+    // stack: length >> 40, last_addr - 5
+    SWAP1
+    %decrement
+    SWAP1
+    // stack: length >> 40, last_addr - 6
+    %shr_const(8)
+    // stack: length >> 48, last_addr - 6
+    DUP1
+    // stack: length >> 48, length >> 48, last_addr - 6
+    %and_const(0xff)
+    // stack: (length >> 48) % (1 << 8), length >> 48, last_addr - 6
+    DUP3
+    // stack: last_addr - 6, (length >> 48) % (1 << 8), length >> 48, last_addr - 6
+    %mstore_kernel_general
+
+    // stack: length >> 48, last_addr - 6
+    SWAP1
+    %decrement
+    SWAP1
+    // stack: length >> 48, last_addr - 7
+    %shr_const(8)
+    // stack: length >> 56, last_addr - 7
+    DUP1
+    // stack: length >> 56, length >> 56, last_addr - 7
+    %and_const(0xff)
+    // stack: (length >> 56) % (1 << 8), length >> 56, last_addr - 7
+    DUP3
+    // stack: last_addr - 7, (length >> 56) % (1 << 8), length >> 56, last_addr - 7
+    %mstore_kernel_general
+    %pop2
+    // stack: (empty)
+%endmacro
diff --git a/evm/src/cpu/kernel/asm/util/basic_macros.asm b/evm/src/cpu/kernel/asm/util/basic_macros.asm
index 13965e39..8ac92258 100644
--- a/evm/src/cpu/kernel/asm/util/basic_macros.asm
+++ b/evm/src/cpu/kernel/asm/util/basic_macros.asm
@@ -1,46 +1,46 @@
 %macro jump(dst)
-    push $dst
+    PUSH $dst
     jump
 %endmacro
 
 %macro jumpi(dst)
-    push $dst
+    PUSH $dst
     jumpi
 %endmacro
 
 %macro pop2
     %rep 2
-        pop
+        POP
     %endrep
 %endmacro
 
 %macro pop3
     %rep 3
-        pop
+        POP
     %endrep
 %endmacro
 
 %macro pop4
     %rep 4
-        pop
+        POP
     %endrep
 %endmacro
 
 %macro pop5
     %rep 5
-        pop
+        POP
     %endrep
 %endmacro
 
 %macro pop6
     %rep 6
-        pop
+        POP
     %endrep
 %endmacro
 
 %macro pop7
     %rep 7
-        pop
+        POP
     %endrep
 %endmacro
 
@@ -162,21 +162,21 @@
 // If pred is zero, yields z; otherwise, yields nz
 %macro select
     // stack: pred, nz, z
-    iszero
+    ISZERO
     // stack: pred == 0, nz, z
-    dup1
+    DUP1
     // stack: pred == 0, pred == 0, nz, z
-    iszero
+    ISZERO
     // stack: pred != 0, pred == 0, nz, z
-    swap3
+    SWAP3
     // stack: z, pred == 0, nz, pred != 0
-    mul
+    MUL
     // stack: (pred == 0) * z, nz, pred != 0
-    swap2
+    SWAP2
     // stack: pred != 0, nz, (pred == 0) * z
-    mul
+    MUL
     // stack: (pred != 0) * nz, (pred == 0) * z
-    add
+    ADD
     // stack: (pred != 0) * nz + (pred == 0) * z
 %endmacro
 
@@ -184,27 +184,27 @@
 // Assumes pred is boolean (either 0 or 1).
 %macro select_bool
     // stack: pred, nz, z
-    dup1
+    DUP1
     // stack: pred, pred, nz, z
-    iszero
+    ISZERO
     // stack: notpred, pred, nz, z
-    swap3
+    SWAP3
     // stack: z, pred, nz, notpred
-    mul
+    MUL
     // stack: pred * z, nz, notpred
-    swap2
+    SWAP2
     // stack: notpred, nz, pred * z
-    mul
+    MUL
     // stack: notpred * nz, pred * z
-    add
+    ADD
     // stack: notpred * nz + pred * z
 %endmacro
 
 %macro square
     // stack: x
-    dup1
+    DUP1
     // stack: x, x
-    mul
+    MUL
     // stack: x^2
 %endmacro
 
@@ -229,3 +229,33 @@
     %select_bool
     // stack: max
 %endmacro
+
+%macro increment
+    %add_const(1)
+%endmacro
+
+%macro decrement
+    %sub_const(1)
+%endmacro
+
+%macro div2
+    %div_const(2)
+%endmacro
+
+%macro iseven
+    %mod_const(2)
+    ISZERO
+%endmacro
+
+%macro as_u32
+    %and_const(0xFFFFFFFF)
+%endmacro
+
+// u32 addition: leaves (x + y) masked to its low 32 bits, so for x, y < 2^32 the carry into bit 32 is discarded
+%macro add_u32
+    // stack: x, y
+    ADD
+    // stack: x + y
+    %as_u32
+    // stack: (x + y) & u32::MAX
+%endmacro
diff --git a/evm/src/cpu/kernel/assembler.rs b/evm/src/cpu/kernel/assembler.rs
index ede60a29..5980e460 100644
--- a/evm/src/cpu/kernel/assembler.rs
+++ b/evm/src/cpu/kernel/assembler.rs
@@ -579,7 +579,7 @@ mod tests {
         );
 
         let kernel = parse_and_assemble(&["%stack (a) -> (a)"]);
-        assert_eq!(kernel.code, vec![]);
+        assert_eq!(kernel.code, vec![] as Vec<u8>);
 
         let kernel = parse_and_assemble(&["%stack (a, b, c) -> (c, b, a)"]);
         assert_eq!(kernel.code, vec![swap2]);
diff --git a/evm/src/cpu/kernel/ast.rs b/evm/src/cpu/kernel/ast.rs
index 3728aa35..6180b1c8 100644
--- a/evm/src/cpu/kernel/ast.rs
+++ b/evm/src/cpu/kernel/ast.rs
@@ -38,10 +38,7 @@ pub(crate) enum Item {
 
 /// The left hand side of a %stack stack-manipulation macro.
 #[derive(Eq, PartialEq, Clone, Debug)]
-pub(crate) enum StackPlaceholder {
-    Identifier(String),
-    Block(String, usize),
-}
+pub(crate) struct StackPlaceholder(pub String, pub usize);
 
 /// The right hand side of a %stack stack-manipulation macro.
 #[derive(Eq, PartialEq, Clone, Debug)]
diff --git a/evm/src/cpu/kernel/interpreter.rs b/evm/src/cpu/kernel/interpreter.rs
index 343f9773..589ba6b3 100644
--- a/evm/src/cpu/kernel/interpreter.rs
+++ b/evm/src/cpu/kernel/interpreter.rs
@@ -20,7 +20,7 @@ type F = GoldilocksField;
 /// Halt interpreter execution whenever a jump to this offset is done.
 const DEFAULT_HALT_OFFSET: usize = 0xdeadbeef;
 
-#[derive(Debug)]
+#[derive(Clone, Debug)]
 pub(crate) struct InterpreterMemory {
     pub(crate) context_memory: Vec<MemoryContextState>,
 }
@@ -435,14 +435,14 @@ impl<'a> Interpreter<'a> {
 
     fn run_shl(&mut self) {
         let shift = self.pop();
-        let x = self.pop();
-        self.push(x << shift);
+        let value = self.pop();
+        self.push(value << shift);
     }
 
     fn run_shr(&mut self) {
         let shift = self.pop();
-        let x = self.pop();
-        self.push(x >> shift);
+        let value = self.pop();
+        self.push(value >> shift);
     }
 
     fn run_keccak256(&mut self) {
@@ -591,6 +591,7 @@ impl<'a> Interpreter<'a> {
         let segment = Segment::all()[self.pop().as_usize()];
         let offset = self.pop().as_usize();
         let value = self.memory.mload_general(context, segment, offset);
+        assert!(value.bits() <= segment.bit_range());
         self.push(value);
     }
 
@@ -599,6 +600,7 @@ impl<'a> Interpreter<'a> {
         let segment = Segment::all()[self.pop().as_usize()];
         let offset = self.pop().as_usize();
         let value = self.pop();
+        assert!(value.bits() <= segment.bit_range());
         self.memory.mstore_general(context, segment, offset, value);
     }
 }
diff --git a/evm/src/cpu/kernel/parser.rs b/evm/src/cpu/kernel/parser.rs
index fd762eae..b7a8124b 100644
--- a/evm/src/cpu/kernel/parser.rs
+++ b/evm/src/cpu/kernel/parser.rs
@@ -119,12 +119,12 @@ fn parse_stack_placeholder(target: Pair<Rule>) -> StackPlaceholder {
     assert_eq!(target.as_rule(), Rule::stack_placeholder);
     let inner = target.into_inner().next().unwrap();
     match inner.as_rule() {
-        Rule::identifier => StackPlaceholder::Identifier(inner.as_str().into()),
+        Rule::identifier => StackPlaceholder(inner.as_str().into(), 1),
         Rule::stack_block => {
             let mut block = inner.into_inner();
             let identifier = block.next().unwrap().as_str();
             let length = block.next().unwrap().as_str().parse().unwrap();
-            StackPlaceholder::Block(identifier.to_string(), length)
+            StackPlaceholder(identifier.to_string(), length)
         }
         _ => panic!("Unexpected {:?}", inner.as_rule()),
     }
diff --git a/evm/src/cpu/kernel/stack/stack_manipulation.rs b/evm/src/cpu/kernel/stack/stack_manipulation.rs
index ebc54af1..36e4b83a 100644
--- a/evm/src/cpu/kernel/stack/stack_manipulation.rs
+++ b/evm/src/cpu/kernel/stack/stack_manipulation.rs
@@ -1,6 +1,6 @@
 use std::cmp::Ordering;
 use std::collections::hash_map::Entry::{Occupied, Vacant};
-use std::collections::{BinaryHeap, HashMap, HashSet};
+use std::collections::{BinaryHeap, HashMap};
 use std::hash::Hash;
 
 use itertools::Itertools;
@@ -27,25 +27,18 @@ pub(crate) fn expand_stack_manipulation(body: Vec<Item>) -> Vec<Item> {
 
 fn expand(names: Vec<StackPlaceholder>, replacements: Vec<StackReplacement>) -> Vec<Item> {
     let mut stack_blocks = HashMap::new();
-    let mut stack_names = HashSet::new();
 
     let mut src = names
         .iter()
         .cloned()
-        .flat_map(|item| match item {
-            StackPlaceholder::Identifier(name) => {
-                stack_names.insert(name.clone());
-                vec![StackItem::NamedItem(name)]
-            }
-            StackPlaceholder::Block(name, n) => {
-                stack_blocks.insert(name.clone(), n);
-                (0..n)
-                    .map(|i| {
-                        let literal_name = format!("block_{}_{}", name, i);
-                        StackItem::NamedItem(literal_name)
-                    })
-                    .collect_vec()
-            }
+        .flat_map(|StackPlaceholder(name, n)| {
+            stack_blocks.insert(name.clone(), n);
+            (0..n)
+                .map(|i| {
+                    let literal_name = format!("@{}.{}", name, i);
+                    StackItem::NamedItem(literal_name)
+                })
+                .collect_vec()
         })
         .collect_vec();
 
@@ -59,12 +52,10 @@ fn expand(names: Vec<StackPlaceholder>, replacements: Vec<StackReplacement>) ->
                     let n = *stack_blocks.get(&name).unwrap();
                     (0..n)
                         .map(|i| {
-                            let literal_name = format!("block_{}_{}", name, i);
+                            let literal_name = format!("@{}.{}", name, i);
                             StackItem::NamedItem(literal_name)
                         })
                         .collect_vec()
-                } else if stack_names.contains(&name) {
-                    vec![StackItem::NamedItem(name)]
                 } else {
                     vec![StackItem::PushTarget(PushTarget::Label(name))]
                 }
diff --git a/evm/src/cpu/kernel/tests/hash.rs b/evm/src/cpu/kernel/tests/hash.rs
new file mode 100644
index 00000000..3acdce2b
--- /dev/null
+++ b/evm/src/cpu/kernel/tests/hash.rs
@@ -0,0 +1,50 @@
+use std::str::FromStr;
+
+use anyhow::Result;
+use ethereum_types::U256;
+use rand::{thread_rng, Rng};
+use sha2::{Digest, Sha256};
+
+use crate::cpu::kernel::aggregator::combined_kernel;
+use crate::cpu::kernel::interpreter::run_with_kernel;
+
+/// Reference SHA-256 implementation, backed by the `sha2` crate.
+fn sha2(input: Vec<u8>) -> U256 {
+    let mut hasher = Sha256::new();
+    hasher.update(input);
+    U256::from(&hasher.finalize()[..])
+}
+
+fn test_hash(hash_fn_label: &str, standard_implementation: &dyn Fn(Vec<u8>) -> U256) -> Result<()> {
+    let kernel = combined_kernel();
+    let mut rng = thread_rng();
+
+    // Generate a random message whose length is between 0 and 9999 bytes, inclusive.
+    let num_bytes = rng.gen_range(0..10000);
+    let message: Vec<u8> = (0..num_bytes).map(|_| rng.gen()).collect();
+
+    // Hash the message using a standard implementation.
+    let expected = standard_implementation(message.clone());
+
+    // Build the initial stack (length, message bytes, then 0xdeadbeef, presumably a return address) and reverse it.
+    let mut initial_stack = vec![U256::from(num_bytes)];
+    let bytes: Vec<U256> = message.iter().map(|&x| U256::from(x as u32)).collect();
+    initial_stack.extend(bytes);
+    initial_stack.push(U256::from_str("0xdeadbeef").unwrap());
+    initial_stack.reverse();
+
+    // Run the kernel code.
+    let kernel_function = kernel.global_labels[hash_fn_label];
+    let result = run_with_kernel(&kernel, kernel_function, initial_stack)?;
+    let actual = result.stack()[0];
+
+    // Check that the result is correct.
+    assert_eq!(expected, actual);
+
+    Ok(())
+}
+
+#[test]
+fn test_sha2() -> Result<()> {
+    test_hash("sha2", &sha2)
+}
diff --git a/evm/src/cpu/kernel/tests/mod.rs b/evm/src/cpu/kernel/tests/mod.rs
index a9c8c08c..45feb238 100644
--- a/evm/src/cpu/kernel/tests/mod.rs
+++ b/evm/src/cpu/kernel/tests/mod.rs
@@ -2,6 +2,7 @@ mod core;
 mod curve_ops;
 mod ecrecover;
 mod exp;
+mod hash;
 mod mpt;
 mod packing;
 mod rlp;
diff --git a/evm/src/generation/memory.rs b/evm/src/generation/memory.rs
index 5e2919a4..944b42a6 100644
--- a/evm/src/generation/memory.rs
+++ b/evm/src/generation/memory.rs
@@ -22,13 +22,13 @@ impl Default for MemoryState {
     }
 }
 
-#[derive(Default, Debug)]
+#[derive(Clone, Default, Debug)]
 pub(crate) struct MemoryContextState {
     /// The content of each memory segment.
     pub segments: [MemorySegmentState; Segment::COUNT],
 }
 
-#[derive(Default, Debug)]
+#[derive(Clone, Default, Debug)]
 pub(crate) struct MemorySegmentState {
     pub content: Vec<U256>,
 }