diff --git a/Cargo.toml b/Cargo.toml
index 8d14c3d0..a78d0a96 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
 [workspace]
-members = ["field", "insertion", "plonky2", "starky", "system_zero", "util", "waksman", "ecdsa", "u32", "evm"]
+members = ["field", "insertion", "plonky2", "starky", "system_zero", "util", "waksman", "ecdsa", "u32", "evm", "maybe_rayon"]
 
 [profile.release]
 opt-level = 3
diff --git a/evm/Cargo.toml b/evm/Cargo.toml
index c10ab104..48ef12d7 100644
--- a/evm/Cargo.toml
+++ b/evm/Cargo.toml
@@ -17,7 +17,7 @@ log = "0.4.14"
 once_cell = "1.13.0"
 pest = "2.1.3"
 pest_derive = "2.1.0"
-rayon = "1.5.1"
+maybe_rayon = { path = "../maybe_rayon" }
 rand = "0.8.5"
 rand_chacha = "0.3.1"
 rlp = "0.5.1"
@@ -28,7 +28,9 @@ keccak-hash = "0.9.0"
 hex = "0.4.3"
 
 [features]
+default = ["parallel"]
 asmtools = ["hex"]
+parallel = ["maybe_rayon/parallel"]
 
 [[bin]]
 name = "assemble"
diff --git a/evm/src/all_stark.rs b/evm/src/all_stark.rs
index ba157fc0..fbcd2115 100644
--- a/evm/src/all_stark.rs
+++ b/evm/src/all_stark.rs
@@ -130,7 +130,7 @@ mod tests {
 
     use anyhow::Result;
     use ethereum_types::U256;
-    use itertools::{izip, Itertools};
+    use itertools::Itertools;
     use plonky2::field::polynomial::PolynomialValues;
     use plonky2::field::types::{Field, PrimeField64};
     use plonky2::iop::witness::PartialWitness;
@@ -143,6 +143,7 @@ mod tests {
     use crate::all_stark::AllStark;
     use crate::config::StarkConfig;
     use crate::cpu::cpu_stark::CpuStark;
+    use crate::cpu::kernel::aggregator::KERNEL;
     use crate::cross_table_lookup::testutils::check_ctls;
     use crate::keccak::keccak_stark::{KeccakStark, NUM_INPUTS, NUM_ROUNDS};
     use crate::logic::{self, LogicStark, Operation};
@@ -246,13 +247,10 @@ mod tests {
             let mut row: cpu::columns::CpuColumnsView<F> =
                 [F::ZERO; CpuStark::<F, D>::COLUMNS].into();
             row.is_keccak = F::ONE;
-            for (j, input, output) in izip!(
-                0..2 * NUM_INPUTS,
-                row.keccak_input_limbs.iter_mut(),
-                row.keccak_output_limbs.iter_mut()
-            ) {
-                *input = keccak_input_limbs[i][j];
-                *output = keccak_output_limbs[i][j];
+            let keccak = row.general.keccak_mut();
+            for j in 0..2 * NUM_INPUTS {
+                keccak.input_limbs[j] = keccak_input_limbs[i][j];
+                keccak.output_limbs[j] = keccak_output_limbs[i][j];
             }
             cpu_stark.generate(row.borrow_mut());
             cpu_trace_rows.push(row.into());
@@ -262,6 +260,7 @@ mod tests {
             let mut row: cpu::columns::CpuColumnsView<F> =
                 [F::ZERO; CpuStark::<F, D>::COLUMNS].into();
             row.is_cpu_cycle = F::ONE;
+            row.program_counter = F::from_canonical_usize(i);
             row.opcode = [
                 (logic::columns::IS_AND, 0x16),
                 (logic::columns::IS_OR, 0x17),
@@ -270,21 +269,22 @@ mod tests {
             .into_iter()
             .map(|(col, opcode)| logic_trace[col].values[i] * F::from_canonical_u64(opcode))
             .sum();
-            for (cols_cpu, cols_logic) in [
-                (&mut row.logic_input0, logic::columns::INPUT0),
-                (&mut row.logic_input1, logic::columns::INPUT1),
-            ] {
-                for (col_cpu, limb_cols_logic) in cols_cpu
-                    .iter_mut()
-                    .zip(logic::columns::limb_bit_cols_for_input(cols_logic))
-                {
-                    *col_cpu =
-                        limb_from_bits_le(limb_cols_logic.map(|col| logic_trace[col].values[i]));
-                }
+            let logic = row.general.logic_mut();
+
+            let input0_bit_cols = logic::columns::limb_bit_cols_for_input(logic::columns::INPUT0);
+            for (col_cpu, limb_cols_logic) in logic.input0.iter_mut().zip(input0_bit_cols) {
+                *col_cpu = limb_from_bits_le(limb_cols_logic.map(|col| logic_trace[col].values[i]));
             }
-            for (col_cpu, col_logic) in row.logic_output.iter_mut().zip(logic::columns::RESULT) {
+
+            let input1_bit_cols = logic::columns::limb_bit_cols_for_input(logic::columns::INPUT1);
+            for (col_cpu, limb_cols_logic) in logic.input1.iter_mut().zip(input1_bit_cols) {
+                *col_cpu = limb_from_bits_le(limb_cols_logic.map(|col| logic_trace[col].values[i]));
+            }
+
+            for (col_cpu, col_logic) in logic.output.iter_mut().zip(logic::columns::RESULT) {
                 *col_cpu = logic_trace[col_logic].values[i];
             }
+
             cpu_stark.generate(row.borrow_mut());
             cpu_trace_rows.push(row.into());
         }
@@ -320,9 +320,25 @@ mod tests {
         }
 
         // Pad to a power of two.
-        for _ in cpu_trace_rows.len()..cpu_trace_rows.len().next_power_of_two() {
-            cpu_trace_rows.push([F::ZERO; CpuStark::<F, D>::COLUMNS]);
+        for i in 0..cpu_trace_rows.len().next_power_of_two() - cpu_trace_rows.len() {
+            let mut row: cpu::columns::CpuColumnsView<F> =
+                [F::ZERO; CpuStark::<F, D>::COLUMNS].into();
+            row.is_cpu_cycle = F::ONE;
+            row.program_counter = F::from_canonical_usize(i + num_logic_rows);
+            cpu_stark.generate(row.borrow_mut());
+            cpu_trace_rows.push(row.into());
         }
+
+        // Ensure we finish in a halted state.
+        {
+            let num_rows = cpu_trace_rows.len();
+            let halt_label = F::from_canonical_usize(KERNEL.global_labels["halt_pc0"]);
+
+            let last_row: &mut cpu::columns::CpuColumnsView<F> =
+                cpu_trace_rows[num_rows - 1].borrow_mut();
+            last_row.program_counter = halt_label;
+        }
+
         trace_rows_to_poly_values(cpu_trace_rows)
     }
 
diff --git a/evm/src/cpu/bootstrap_kernel.rs b/evm/src/cpu/bootstrap_kernel.rs
index af307a28..2c6afb51 100644
--- a/evm/src/cpu/bootstrap_kernel.rs
+++ b/evm/src/cpu/bootstrap_kernel.rs
@@ -56,7 +56,8 @@ pub(crate) fn generate_bootstrap_kernel<F: Field>(state: &mut GenerationState<F>
         }
 
         sponge_state[sponge_input_pos] = packed_bytes;
-        state.current_cpu_row.keccak_input_limbs = sponge_state.map(F::from_canonical_u32);
+        let keccak = state.current_cpu_row.general.keccak_mut();
+        keccak.input_limbs = sponge_state.map(F::from_canonical_u32);
         state.commit_cpu_row();
 
         sponge_input_pos = (sponge_input_pos + 1) % KECCAK_RATE_LIMBS;
@@ -65,7 +66,8 @@ pub(crate) fn generate_bootstrap_kernel<F: Field>(state: &mut GenerationState<F>
         if sponge_input_pos == 0 {
             state.current_cpu_row.is_keccak = F::ONE;
             keccakf_u32s(&mut sponge_state);
-            state.current_cpu_row.keccak_output_limbs = sponge_state.map(F::from_canonical_u32);
+            let keccak = state.current_cpu_row.general.keccak_mut();
+            keccak.output_limbs = sponge_state.map(F::from_canonical_u32);
         }
     }
 }
@@ -97,7 +99,7 @@ pub(crate) fn eval_bootstrap_kernel<F: Field, P: PackedField<Scalar = F>>(
     for (&expected, actual) in KERNEL
         .code_hash
         .iter()
-        .zip(local_values.keccak_output_limbs)
+        .zip(local_values.general.keccak().output_limbs)
     {
         let expected = P::from(F::from_canonical_u32(expected));
         let diff = expected - actual;
@@ -137,7 +139,7 @@ pub(crate) fn eval_bootstrap_kernel_circuit<F: RichField + Extendable<D>, const
     for (&expected, actual) in KERNEL
         .code_hash
         .iter()
-        .zip(local_values.keccak_output_limbs)
+        .zip(local_values.general.keccak().output_limbs)
     {
         let expected = builder.constant_extension(F::Extension::from_canonical_u32(expected));
         let diff = builder.sub_extension(expected, actual);
diff --git a/evm/src/cpu/columns/general.rs b/evm/src/cpu/columns/general.rs
new file mode 100644
index 00000000..600dda87
--- /dev/null
+++ b/evm/src/cpu/columns/general.rs
@@ -0,0 +1,118 @@
+use std::borrow::{Borrow, BorrowMut};
+use std::fmt::{Debug, Formatter};
+use std::mem::{size_of, transmute};
+
+/// General purpose columns, which can have different meanings depending on what CTL or other
+/// operation is occurring at this row.
+///
+/// `#[repr(C)]` guarantees that every view starts at offset 0 of the union, which the accessors
+/// and the array `Borrow`/`BorrowMut` impls below rely on.
+#[repr(C)]
+pub(crate) union CpuGeneralColumnsView<T: Copy> {
+    keccak: CpuKeccakView<T>,
+    arithmetic: CpuArithmeticView<T>,
+    logic: CpuLogicView<T>,
+}
+
+impl<T: Copy> CpuGeneralColumnsView<T> {
+    /// Reads the shared columns through the Keccak view.
+    pub(crate) fn keccak(&self) -> &CpuKeccakView<T> {
+        // SAFETY: Each view is a valid interpretation of the underlying array.
+        unsafe { &self.keccak }
+    }
+
+    /// Mutably accesses the shared columns through the Keccak view.
+    pub(crate) fn keccak_mut(&mut self) -> &mut CpuKeccakView<T> {
+        // SAFETY: Each view is a valid interpretation of the underlying array.
+        unsafe { &mut self.keccak }
+    }
+
+    /// Reads the shared columns through the arithmetic view.
+    pub(crate) fn arithmetic(&self) -> &CpuArithmeticView<T> {
+        // SAFETY: Each view is a valid interpretation of the underlying array.
+        unsafe { &self.arithmetic }
+    }
+
+    /// Mutably accesses the shared columns through the arithmetic view.
+    pub(crate) fn arithmetic_mut(&mut self) -> &mut CpuArithmeticView<T> {
+        // SAFETY: Each view is a valid interpretation of the underlying array.
+        unsafe { &mut self.arithmetic }
+    }
+
+    /// Reads the shared columns through the logic view.
+    pub(crate) fn logic(&self) -> &CpuLogicView<T> {
+        // SAFETY: Each view is a valid interpretation of the underlying array.
+        unsafe { &self.logic }
+    }
+
+    /// Mutably accesses the shared columns through the logic view.
+    pub(crate) fn logic_mut(&mut self) -> &mut CpuLogicView<T> {
+        // SAFETY: Each view is a valid interpretation of the underlying array.
+        unsafe { &mut self.logic }
+    }
+}
+
+impl<T: Copy + PartialEq> PartialEq<Self> for CpuGeneralColumnsView<T> {
+    /// Compares two unions element-wise through their flat array representation.
+    fn eq(&self, other: &Self) -> bool {
+        let self_arr: &[T; NUM_SHARED_COLUMNS] = self.borrow();
+        let other_arr: &[T; NUM_SHARED_COLUMNS] = other.borrow();
+        self_arr == other_arr
+    }
+}
+
+impl<T: Copy + Eq> Eq for CpuGeneralColumnsView<T> {}
+
+impl<T: Copy + Debug> Debug for CpuGeneralColumnsView<T> {
+    /// Formats the union as its flat array of shared columns.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let self_arr: &[T; NUM_SHARED_COLUMNS] = self.borrow();
+        Debug::fmt(self_arr, f)
+    }
+}
+
+impl<T: Copy> Borrow<[T; NUM_SHARED_COLUMNS]> for CpuGeneralColumnsView<T> {
+    fn borrow(&self) -> &[T; NUM_SHARED_COLUMNS] {
+        // SAFETY: With `#[repr(C)]`, the union's largest view consists solely of `T` arrays laid
+        // out back to back from offset 0, so it has the layout of `[T; NUM_SHARED_COLUMNS]`.
+        unsafe { transmute(self) }
+    }
+}
+
+impl<T: Copy> BorrowMut<[T; NUM_SHARED_COLUMNS]> for CpuGeneralColumnsView<T> {
+    fn borrow_mut(&mut self) -> &mut [T; NUM_SHARED_COLUMNS] {
+        // SAFETY: With `#[repr(C)]`, the union's largest view consists solely of `T` arrays laid
+        // out back to back from offset 0, so it has the layout of `[T; NUM_SHARED_COLUMNS]`.
+        unsafe { transmute(self) }
+    }
+}
+
+/// Input and output limbs of a Keccak permutation, as exposed to the Keccak CTL.
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub(crate) struct CpuKeccakView<T: Copy> {
+    pub(crate) input_limbs: [T; 50],
+    pub(crate) output_limbs: [T; 50],
+}
+
+/// Placeholder view for the arithmetic CTL.
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub(crate) struct CpuArithmeticView<T: Copy> {
+    // TODO: Add "looking" columns for the arithmetic CTL.
+    tmp: T, // temporary, to suppress errors
+}
+
+/// Inputs and output of a logic operation, as exposed to the logic CTL.
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub(crate) struct CpuLogicView<T: Copy> {
+    // Assuming a limb size of 16 bits. This can be changed, but it must be <= 28 bits.
+    pub(crate) input0: [T; 16],
+    pub(crate) input1: [T; 16],
+    pub(crate) output: [T; 16],
+}
+
+/// Number of columns shared between the views, i.e. the size of the union measured in `T`s.
+// `u8` is guaranteed to have a `size_of` of 1.
+pub const NUM_SHARED_COLUMNS: usize = size_of::<CpuGeneralColumnsView<u8>>();
diff --git a/evm/src/cpu/columns.rs b/evm/src/cpu/columns/mod.rs
similarity index 87%
rename from evm/src/cpu/columns.rs
rename to evm/src/cpu/columns/mod.rs
index ae6872df..01260ec2 100644
--- a/evm/src/cpu/columns.rs
+++ b/evm/src/cpu/columns/mod.rs
@@ -2,14 +2,18 @@
 #![allow(dead_code)]
 
 use std::borrow::{Borrow, BorrowMut};
+use std::fmt::Debug;
 use std::mem::{size_of, transmute, transmute_copy, ManuallyDrop};
 use std::ops::{Index, IndexMut};
 
+use crate::cpu::columns::general::CpuGeneralColumnsView;
 use crate::memory;
 
+mod general;
+
 #[repr(C)]
 #[derive(Eq, PartialEq, Debug)]
-pub struct CpuColumnsView<T> {
+pub struct CpuColumnsView<T: Copy> {
     /// Filter. 1 if the row is part of bootstrapping the kernel code, 0 otherwise.
     pub is_bootstrap_kernel: T,
 
@@ -17,9 +21,12 @@ pub struct CpuColumnsView<T> {
     pub is_bootstrap_contract: T,
 
     /// Filter. 1 if the row corresponds to a cycle of execution and 0 otherwise.
-    /// Lets us re-use decode columns in non-cycle rows.
+    /// Lets us re-use columns in non-cycle rows.
     pub is_cpu_cycle: T,
 
+    /// If CPU cycle: The program counter for the current instruction.
+    pub program_counter: T,
+
     /// If CPU cycle: The opcode being decoded, in {0, ..., 255}.
     pub opcode: T,
 
@@ -103,7 +110,7 @@ pub struct CpuColumnsView<T> {
     pub is_log2: T,
     pub is_log3: T,
     pub is_log4: T,
-    pub is_panic: T,
+    // PANIC does not get a flag; it fails at the decode stage.
     pub is_create: T,
     pub is_call: T,
     pub is_callcode: T,
@@ -141,14 +148,9 @@ pub struct CpuColumnsView<T> {
 
     /// Filter. 1 iff a Keccak permutation is computed on this row.
     pub is_keccak: T,
-    pub keccak_input_limbs: [T; 50],
-    pub keccak_output_limbs: [T; 50],
 
-    // Assuming a limb size of 16 bits. This can be changed, but it must be <= 28 bits.
-    // TODO: These input/output columns can be shared between the logic operations and others.
-    pub logic_input0: [T; 16],
-    pub logic_input1: [T; 16],
-    pub logic_output: [T; 16],
+    pub(crate) general: CpuGeneralColumnsView<T>,
+
     pub simple_logic_diff: T,
     pub simple_logic_diff_inv: T,
 
@@ -174,43 +176,43 @@ unsafe fn transmute_no_compile_time_size_checks<T, U>(value: T) -> U {
     transmute_copy(&value)
 }
 
-impl<T> From<[T; NUM_CPU_COLUMNS]> for CpuColumnsView<T> {
+impl<T: Copy> From<[T; NUM_CPU_COLUMNS]> for CpuColumnsView<T> {
     fn from(value: [T; NUM_CPU_COLUMNS]) -> Self {
         unsafe { transmute_no_compile_time_size_checks(value) }
     }
 }
 
-impl<T> From<CpuColumnsView<T>> for [T; NUM_CPU_COLUMNS] {
+impl<T: Copy> From<CpuColumnsView<T>> for [T; NUM_CPU_COLUMNS] {
     fn from(value: CpuColumnsView<T>) -> Self {
         unsafe { transmute_no_compile_time_size_checks(value) }
     }
 }
 
-impl<T> Borrow<CpuColumnsView<T>> for [T; NUM_CPU_COLUMNS] {
+impl<T: Copy> Borrow<CpuColumnsView<T>> for [T; NUM_CPU_COLUMNS] {
     fn borrow(&self) -> &CpuColumnsView<T> {
         unsafe { transmute(self) }
     }
 }
 
-impl<T> BorrowMut<CpuColumnsView<T>> for [T; NUM_CPU_COLUMNS] {
+impl<T: Copy> BorrowMut<CpuColumnsView<T>> for [T; NUM_CPU_COLUMNS] {
     fn borrow_mut(&mut self) -> &mut CpuColumnsView<T> {
         unsafe { transmute(self) }
     }
 }
 
-impl<T> Borrow<[T; NUM_CPU_COLUMNS]> for CpuColumnsView<T> {
+impl<T: Copy> Borrow<[T; NUM_CPU_COLUMNS]> for CpuColumnsView<T> {
     fn borrow(&self) -> &[T; NUM_CPU_COLUMNS] {
         unsafe { transmute(self) }
     }
 }
 
-impl<T> BorrowMut<[T; NUM_CPU_COLUMNS]> for CpuColumnsView<T> {
+impl<T: Copy> BorrowMut<[T; NUM_CPU_COLUMNS]> for CpuColumnsView<T> {
     fn borrow_mut(&mut self) -> &mut [T; NUM_CPU_COLUMNS] {
         unsafe { transmute(self) }
     }
 }
 
-impl<T, I> Index<I> for CpuColumnsView<T>
+impl<T: Copy, I> Index<I> for CpuColumnsView<T>
 where
     [T]: Index<I>,
 {
@@ -222,7 +224,7 @@ where
     }
 }
 
-impl<T, I> IndexMut<I> for CpuColumnsView<T>
+impl<T: Copy, I> IndexMut<I> for CpuColumnsView<T>
 where
     [T]: IndexMut<I>,
 {
diff --git a/evm/src/cpu/control_flow.rs b/evm/src/cpu/control_flow.rs
new file mode 100644
index 00000000..90a76d46
--- /dev/null
+++ b/evm/src/cpu/control_flow.rs
@@ -0,0 +1,150 @@
+use plonky2::field::extension::Extendable;
+use plonky2::field::packed::PackedField;
+use plonky2::field::types::Field;
+use plonky2::hash::hash_types::RichField;
+use plonky2::iop::ext_target::ExtensionTarget;
+
+use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
+use crate::cpu::columns::{CpuColumnsView, COL_MAP};
+use crate::cpu::kernel::aggregator::KERNEL;
+
+/// Flag columns of the native (non-microcoded) instructions. TODO: This list is incomplete.
+const NATIVE_INSTRUCTIONS: [usize; 25] = [
+    COL_MAP.is_add,
+    COL_MAP.is_mul,
+    COL_MAP.is_sub,
+    COL_MAP.is_div,
+    COL_MAP.is_sdiv,
+    COL_MAP.is_mod,
+    COL_MAP.is_smod,
+    COL_MAP.is_addmod,
+    COL_MAP.is_mulmod,
+    COL_MAP.is_signextend,
+    COL_MAP.is_lt,
+    COL_MAP.is_gt,
+    COL_MAP.is_slt,
+    COL_MAP.is_sgt,
+    COL_MAP.is_eq,
+    COL_MAP.is_iszero,
+    COL_MAP.is_and,
+    COL_MAP.is_or,
+    COL_MAP.is_xor,
+    COL_MAP.is_not,
+    COL_MAP.is_byte,
+    COL_MAP.is_shl,
+    COL_MAP.is_shr,
+    COL_MAP.is_sar,
+    COL_MAP.is_pop,
+];
+
+/// Returns the kernel's `halt_pc0` and `halt_pc1` global labels as field elements, in that order.
+fn get_halt_pcs<F: Field>() -> (F, F) {
+    let halt_pc0 = KERNEL.global_labels["halt_pc0"];
+    let halt_pc1 = KERNEL.global_labels["halt_pc1"];
+    (
+        F::from_canonical_usize(halt_pc0),
+        F::from_canonical_usize(halt_pc1),
+    )
+}
+
+/// Yields the control-flow constraints linking the current row `lv` to the next row `nv`.
+pub fn eval_packed_generic<P: PackedField>(
+    lv: &CpuColumnsView<P>,
+    nv: &CpuColumnsView<P>,
+    yield_constr: &mut ConstraintConsumer<P>,
+) {
+    // Once we start executing instructions, then we continue until the end of the table.
+    yield_constr.constraint_transition(lv.is_cpu_cycle * (nv.is_cpu_cycle - P::ONES));
+
+    // If a row is a CPU cycle and executing a native instruction (implemented as a table row; not
+    // microcoded) then the program counter is incremented by 1 to obtain the next row's program
+    // counter.
+    let is_native_instruction: P = NATIVE_INSTRUCTIONS.iter().map(|&col_i| lv[col_i]).sum();
+    yield_constr.constraint_transition(
+        lv.is_cpu_cycle
+            * is_native_instruction
+            * (lv.program_counter - nv.program_counter + P::ONES),
+    );
+
+    // If a non-CPU cycle row is followed by a CPU cycle row, then the `program_counter` of the CPU
+    // cycle row is 0.
+    yield_constr
+        .constraint_transition((lv.is_cpu_cycle - P::ONES) * nv.is_cpu_cycle * nv.program_counter);
+
+    // The first row has nowhere to continue execution from, so if it's a cycle row, then its
+    // `program_counter` must be 0. NB: the first few rows will be used for initialization
+    // and will not be CPU cycle rows. Once that's done, this constraint can be
+    // removed. Until then, it ensures that execution starts at 0 and not at an arbitrary offset.
+    yield_constr.constraint_first_row(lv.is_cpu_cycle * lv.program_counter);
+
+    // The last row must be a CPU cycle row.
+    yield_constr.constraint_last_row(lv.is_cpu_cycle - P::ONES);
+    // Also, the last row's `program_counter` must be inside the `halt` infinite loop. That loop
+    // consists of two instructions, so we check against both the `halt_pc0` and `halt_pc1` labels.
+    let (halt_pc0, halt_pc1) = get_halt_pcs::<P::Scalar>();
+    yield_constr
+        .constraint_last_row((lv.program_counter - halt_pc0) * (lv.program_counter - halt_pc1));
+}
+
+/// Circuit version of `eval_packed_generic`: adds the same constraints using `builder`.
+pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
+    lv: &CpuColumnsView<ExtensionTarget<D>>,
+    nv: &CpuColumnsView<ExtensionTarget<D>>,
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+) {
+    // Once we start executing instructions, then we continue until the end of the table.
+    {
+        let constr = builder.mul_sub_extension(lv.is_cpu_cycle, nv.is_cpu_cycle, lv.is_cpu_cycle);
+        yield_constr.constraint_transition(builder, constr);
+    }
+
+    // If a row is a CPU cycle and executing a native instruction (implemented as a table row; not
+    // microcoded) then the program counter is incremented by 1 to obtain the next row's program
+    // counter.
+    {
+        let is_native_instruction =
+            builder.add_many_extension(NATIVE_INSTRUCTIONS.iter().map(|&col_i| lv[col_i]));
+        let filter = builder.mul_extension(lv.is_cpu_cycle, is_native_instruction);
+        let pc_diff = builder.sub_extension(lv.program_counter, nv.program_counter);
+        let constr = builder.mul_add_extension(filter, pc_diff, filter);
+        yield_constr.constraint_transition(builder, constr);
+    }
+
+    // If a non-CPU cycle row is followed by a CPU cycle row, then the `program_counter` of the CPU
+    // cycle row is 0.
+    {
+        let constr = builder.mul_extension(nv.is_cpu_cycle, nv.program_counter);
+        let constr = builder.mul_sub_extension(lv.is_cpu_cycle, constr, constr);
+        yield_constr.constraint_transition(builder, constr);
+    }
+
+    // The first row has nowhere to continue execution from, so if it's a cycle row, then its
+    // `program_counter` must be 0. NB: the first few rows will be used for initialization
+    // and will not be CPU cycle rows. Once that's done, this constraint can be
+    // removed. Until then, it ensures that execution starts at 0 and not at an arbitrary offset.
+    {
+        let constr = builder.mul_extension(lv.is_cpu_cycle, lv.program_counter);
+        yield_constr.constraint_first_row(builder, constr);
+    }
+
+    // The last row must be a CPU cycle row.
+    {
+        let one = builder.one_extension();
+        let constr = builder.sub_extension(lv.is_cpu_cycle, one);
+        yield_constr.constraint_last_row(builder, constr);
+    }
+    // Also, the last row's `program_counter` must be inside the `halt` infinite loop. That loop
+    // consists of two instructions, so we check against both the `halt_pc0` and `halt_pc1` labels.
+    {
+        let (halt_pc0, halt_pc1) = get_halt_pcs();
+        let halt_pc0_target = builder.constant_extension(halt_pc0);
+        let halt_pc1_target = builder.constant_extension(halt_pc1);
+
+        let halt_pc0_offset = builder.sub_extension(lv.program_counter, halt_pc0_target);
+        let halt_pc1_offset = builder.sub_extension(lv.program_counter, halt_pc1_target);
+        let constr = builder.mul_extension(halt_pc0_offset, halt_pc1_offset);
+
+        yield_constr.constraint_last_row(builder, constr);
+    }
+}
diff --git a/evm/src/cpu/cpu_stark.rs b/evm/src/cpu/cpu_stark.rs
index 1e5cc887..6b0bc0fd 100644
--- a/evm/src/cpu/cpu_stark.rs
+++ b/evm/src/cpu/cpu_stark.rs
@@ -9,15 +9,16 @@ use plonky2::hash::hash_types::RichField;
 
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
 use crate::cpu::columns::{CpuColumnsView, COL_MAP, NUM_CPU_COLUMNS};
-use crate::cpu::{bootstrap_kernel, decode, simple_logic};
+use crate::cpu::{bootstrap_kernel, control_flow, decode, simple_logic};
 use crate::cross_table_lookup::Column;
 use crate::memory::NUM_CHANNELS;
 use crate::stark::Stark;
 use crate::vars::{StarkEvaluationTargets, StarkEvaluationVars};
 
 pub fn ctl_data_keccak<F: Field>() -> Vec<Column<F>> {
-    let mut res: Vec<_> = Column::singles(COL_MAP.keccak_input_limbs).collect();
-    res.extend(Column::singles(COL_MAP.keccak_output_limbs));
+    let keccak = COL_MAP.general.keccak();
+    let mut res: Vec<_> = Column::singles(keccak.input_limbs).collect();
+    res.extend(Column::singles(keccak.output_limbs));
     res
 }
 
@@ -27,9 +28,10 @@ pub fn ctl_filter_keccak<F: Field>() -> Column<F> {
 
 pub fn ctl_data_logic<F: Field>() -> Vec<Column<F>> {
     let mut res = Column::singles([COL_MAP.is_and, COL_MAP.is_or, COL_MAP.is_xor]).collect_vec();
-    res.extend(Column::singles(COL_MAP.logic_input0));
-    res.extend(Column::singles(COL_MAP.logic_input1));
-    res.extend(Column::singles(COL_MAP.logic_output));
+    let logic = COL_MAP.general.logic();
+    res.extend(Column::singles(logic.input0));
+    res.extend(Column::singles(logic.input1));
+    res.extend(Column::singles(logic.output));
     res
 }
 
@@ -88,7 +90,9 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for CpuStark<F, D
         P: PackedField<Scalar = FE>,
     {
         let local_values = vars.local_values.borrow();
+        let next_values = vars.next_values.borrow();
         bootstrap_kernel::eval_bootstrap_kernel(vars, yield_constr);
+        control_flow::eval_packed_generic(local_values, next_values, yield_constr);
         decode::eval_packed_generic(local_values, yield_constr);
         simple_logic::eval_packed(local_values, yield_constr);
     }
@@ -100,7 +104,9 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for CpuStark<F, D
         yield_constr: &mut RecursiveConstraintConsumer<F, D>,
     ) {
         let local_values = vars.local_values.borrow();
+        let next_values = vars.next_values.borrow();
         bootstrap_kernel::eval_bootstrap_kernel_circuit(builder, vars, yield_constr);
+        control_flow::eval_ext_circuit(builder, local_values, next_values, yield_constr);
         decode::eval_ext_circuit(builder, local_values, yield_constr);
         simple_logic::eval_ext_circuit(builder, local_values, yield_constr);
     }
diff --git a/evm/src/cpu/decode.rs b/evm/src/cpu/decode.rs
index 233c01c4..cf43f909 100644
--- a/evm/src/cpu/decode.rs
+++ b/evm/src/cpu/decode.rs
@@ -15,7 +15,7 @@ use crate::cpu::columns::{CpuColumnsView, COL_MAP};
 // - its start index is a multiple of its length (it is aligned)
 // These properties permit us to check if an opcode belongs to a block of length 2^n by checking its
 // top 8-n bits.
-const OPCODES: [(u64, usize, usize); 107] = [
+const OPCODES: [(u64, usize, usize); 106] = [
     // (start index of block, number of top bits to check (log2), flag column)
     (0x00, 0, COL_MAP.is_stop),
     (0x01, 0, COL_MAP.is_add),
@@ -102,7 +102,7 @@ const OPCODES: [(u64, usize, usize); 107] = [
     (0xa2, 0, COL_MAP.is_log2),
     (0xa3, 0, COL_MAP.is_log3),
     (0xa4, 0, COL_MAP.is_log4),
-    (0xa5, 0, COL_MAP.is_panic),
+    // Opcode 0xa5 is PANIC. Make the proof unverifiable by giving it no flag to decode to.
     (0xa6, 1, COL_MAP.is_invalid_8),  // 0xa6-0xa7
     (0xa8, 3, COL_MAP.is_invalid_9),  // 0xa8-0xaf
     (0xb0, 4, COL_MAP.is_invalid_10), // 0xb0-0xbf
diff --git a/evm/src/cpu/kernel/aggregator.rs b/evm/src/cpu/kernel/aggregator.rs
index 1f8ba0da..4c8a1173 100644
--- a/evm/src/cpu/kernel/aggregator.rs
+++ b/evm/src/cpu/kernel/aggregator.rs
@@ -1,52 +1,31 @@
 //! Loads each kernel assembly file and concatenates them.
 
-use std::collections::HashMap;
-
-use ethereum_types::U256;
-use hex_literal::hex;
 use itertools::Itertools;
 use once_cell::sync::Lazy;
 
 use super::assembler::{assemble, Kernel};
+use crate::cpu::kernel::constants::evm_constants;
 use crate::cpu::kernel::parser::parse;
-use crate::cpu::kernel::txn_fields::NormalizedTxnField;
-use crate::memory::segments::Segment;
 
 pub static KERNEL: Lazy<Kernel> = Lazy::new(combined_kernel);
 
-pub fn evm_constants() -> HashMap<String, U256> {
-    let mut c = HashMap::new();
-    c.insert(
-        "BN_BASE".into(),
-        U256::from_big_endian(&hex!(
-            "30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47"
-        )),
-    );
-    for segment in Segment::all() {
-        c.insert(segment.var_name().into(), (segment as u32).into());
-    }
-    for txn_field in NormalizedTxnField::all() {
-        c.insert(txn_field.var_name().into(), (txn_field as u32).into());
-    }
-    c
-}
-
-#[allow(dead_code)] // TODO: Should be used once witness generation is done.
 pub(crate) fn combined_kernel() -> Kernel {
     let files = vec![
-        include_str!("asm/assertions.asm"),
-        include_str!("asm/basic_macros.asm"),
+        include_str!("asm/curve/bn254/curve_add.asm"),
+        include_str!("asm/curve/bn254/curve_mul.asm"),
+        include_str!("asm/curve/bn254/moddiv.asm"),
+        include_str!("asm/curve/common.asm"),
+        include_str!("asm/curve/secp256k1/curve_mul.asm"),
+        include_str!("asm/curve/secp256k1/curve_add.asm"),
+        include_str!("asm/curve/secp256k1/ecrecover.asm"),
+        include_str!("asm/curve/secp256k1/inverse_scalar.asm"),
+        include_str!("asm/curve/secp256k1/lift_x.asm"),
+        include_str!("asm/curve/secp256k1/moddiv.asm"),
         include_str!("asm/exp.asm"),
-        include_str!("asm/curve_mul.asm"),
-        include_str!("asm/curve_add.asm"),
-        include_str!("asm/memory.asm"),
-        include_str!("asm/moddiv.asm"),
-        include_str!("asm/secp256k1/curve_mul.asm"),
-        include_str!("asm/secp256k1/curve_add.asm"),
-        include_str!("asm/secp256k1/moddiv.asm"),
-        include_str!("asm/secp256k1/lift_x.asm"),
-        include_str!("asm/secp256k1/inverse_scalar.asm"),
-        include_str!("asm/ecrecover.asm"),
+        include_str!("asm/halt.asm"),
+        include_str!("asm/memory/core.asm"),
+        include_str!("asm/memory/memcpy.asm"),
+        include_str!("asm/memory/txn_fields.asm"),
         include_str!("asm/rlp/encode.asm"),
         include_str!("asm/rlp/decode.asm"),
         include_str!("asm/rlp/read_to_memory.asm"),
@@ -57,23 +36,24 @@ pub(crate) fn combined_kernel() -> Kernel {
         include_str!("asm/transactions/type_0.asm"),
         include_str!("asm/transactions/type_1.asm"),
         include_str!("asm/transactions/type_2.asm"),
+        include_str!("asm/util/assertions.asm"),
+        include_str!("asm/util/basic_macros.asm"),
     ];
 
     let parsed_files = files.iter().map(|f| parse(f)).collect_vec();
-    assemble(parsed_files, evm_constants())
+    assemble(parsed_files, evm_constants(), true)
 }
 
 #[cfg(test)]
 mod tests {
+    use env_logger::{try_init_from_env, Env, DEFAULT_FILTER_ENV};
     use log::debug;
 
     use crate::cpu::kernel::aggregator::combined_kernel;
 
     #[test]
     fn make_kernel() {
-        let _ = env_logger::Builder::from_default_env()
-            .format_timestamp(None)
-            .try_init();
+        let _ = try_init_from_env(Env::default().filter_or(DEFAULT_FILTER_ENV, "debug"));
 
         // Make sure we can parse and assemble the entire kernel.
         let kernel = combined_kernel();
diff --git a/evm/src/cpu/kernel/asm/curve_add.asm b/evm/src/cpu/kernel/asm/curve/bn254/curve_add.asm
similarity index 100%
rename from evm/src/cpu/kernel/asm/curve_add.asm
rename to evm/src/cpu/kernel/asm/curve/bn254/curve_add.asm
diff --git a/evm/src/cpu/kernel/asm/curve_mul.asm b/evm/src/cpu/kernel/asm/curve/bn254/curve_mul.asm
similarity index 92%
rename from evm/src/cpu/kernel/asm/curve_mul.asm
rename to evm/src/cpu/kernel/asm/curve/bn254/curve_mul.asm
index d976d9d5..62cf2235 100644
--- a/evm/src/cpu/kernel/asm/curve_mul.asm
+++ b/evm/src/cpu/kernel/asm/curve/bn254/curve_mul.asm
@@ -99,16 +99,3 @@ odd_scalar:
     JUMPDEST
     // stack: x', y', x, y, retdest
     %jump(ec_add_valid_points)
-
-global ret_zero_ec_mul:
-    JUMPDEST
-    // stack: x, y, s, retdest
-    %pop3
-    // stack: retdest
-    PUSH 0
-    // stack: 0, retdest
-    PUSH 0
-    // stack: 0, 0, retdest
-    SWAP2
-    // stack: retdest, 0, 0
-    JUMP
diff --git a/evm/src/cpu/kernel/asm/curve/bn254/moddiv.asm b/evm/src/cpu/kernel/asm/curve/bn254/moddiv.asm
new file mode 100644
index 00000000..780473b9
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/curve/bn254/moddiv.asm
@@ -0,0 +1,39 @@
+/// Division modulo 0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47, the BN254 base field order
+/// To replace with more efficient method using non-determinism later.
+
+// Returns y * (x^-1) where the inverse is taken modulo N
+%macro moddiv
+    // stack: x, y
+    %inverse
+    // stack: x^-1, y
+    %mulmodn
+%endmacro
+
+%macro mulmodn
+    // stack: x, y
+    %bn_base
+    // stack: N, x, y
+    SWAP2
+    // stack: y, x, N
+    MULMOD
+%endmacro
+
+%macro squaremodn
+    // stack: x
+    DUP1
+    // stack: x, x
+    %mulmodn
+%endmacro
+
+// Non-deterministically provide the inverse modulo N.
+%macro inverse
+    // stack: x
+    PROVER_INPUT(ff::bn254_base::inverse)
+    // stack: x^-1, x
+    %stack (inv, x) -> (inv, x, @BN_BASE, inv)
+    // stack: x^-1, x, N, x^-1
+    MULMOD
+    // stack: x^-1 * x, x^-1
+    %assert_eq_const(1)
+    // stack: x^-1
+%endmacro
diff --git a/evm/src/cpu/kernel/asm/curve/common.asm b/evm/src/cpu/kernel/asm/curve/common.asm
new file mode 100644
index 00000000..107dc63c
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/curve/common.asm
@@ -0,0 +1,12 @@
+global ret_zero_ec_mul:
+    JUMPDEST
+    // stack: x, y, s, retdest
+    %pop3
+    // stack: retdest
+    PUSH 0
+    // stack: 0, retdest
+    PUSH 0
+    // stack: 0, 0, retdest
+    SWAP2
+    // stack: retdest, 0, 0
+    JUMP
diff --git a/evm/src/cpu/kernel/asm/secp256k1/curve_add.asm b/evm/src/cpu/kernel/asm/curve/secp256k1/curve_add.asm
similarity index 100%
rename from evm/src/cpu/kernel/asm/secp256k1/curve_add.asm
rename to evm/src/cpu/kernel/asm/curve/secp256k1/curve_add.asm
diff --git a/evm/src/cpu/kernel/asm/secp256k1/curve_mul.asm b/evm/src/cpu/kernel/asm/curve/secp256k1/curve_mul.asm
similarity index 93%
rename from evm/src/cpu/kernel/asm/secp256k1/curve_mul.asm
rename to evm/src/cpu/kernel/asm/curve/secp256k1/curve_mul.asm
index 7ad2dd71..f0825e88 100644
--- a/evm/src/cpu/kernel/asm/secp256k1/curve_mul.asm
+++ b/evm/src/cpu/kernel/asm/curve/secp256k1/curve_mul.asm
@@ -2,6 +2,10 @@
 global ec_mul_valid_point_secp:
     JUMPDEST
     // stack: x, y, s, retdest
+    %stack (x,y) -> (x,y,x,y)
+    %ec_isidentity
+    // stack: (x,y)==(0,0), x, y, s, retdest
+    %jumpi(ret_zero_ec_mul)
     DUP3
     // stack: s, x, y, s, retdest
     %jumpi(step_case)
diff --git a/evm/src/cpu/kernel/asm/ecrecover.asm b/evm/src/cpu/kernel/asm/curve/secp256k1/ecrecover.asm
similarity index 100%
rename from evm/src/cpu/kernel/asm/ecrecover.asm
rename to evm/src/cpu/kernel/asm/curve/secp256k1/ecrecover.asm
diff --git a/evm/src/cpu/kernel/asm/curve/secp256k1/inverse_scalar.asm b/evm/src/cpu/kernel/asm/curve/secp256k1/inverse_scalar.asm
new file mode 100644
index 00000000..6e1563e2
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/curve/secp256k1/inverse_scalar.asm
@@ -0,0 +1,31 @@
+/// Division modulo 0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141, the Secp256k1 scalar field order
+/// The inverse is supplied non-deterministically by the prover (see %inverse_secp_scalar) and verified with a single MULMOD.
+
+%macro mulmodn_secp_scalar
+    // stack: x, y
+    %secp_scalar
+    // stack: N, x, y
+    SWAP2
+    // stack: y, x, N
+    MULMOD
+%endmacro
+
+%macro squaremodn_secp_scalar
+    // stack: x
+    DUP1
+    // stack: x, x
+    %mulmodn_secp_scalar
+%endmacro
+
+// Non-deterministically provide the inverse modulo N.
+%macro inverse_secp_scalar
+    // stack: x
+    PROVER_INPUT(ff::secp256k1_scalar::inverse)
+    // stack: x^-1, x
+    %stack (inv, x) -> (inv, x, @SECP_SCALAR, inv)
+    // stack: x^-1, x, N, x^-1
+    MULMOD
+    // stack: x^-1 * x, x^-1
+    %assert_eq_const(1)
+    // stack: x^-1
+%endmacro
diff --git a/evm/src/cpu/kernel/asm/curve/secp256k1/lift_x.asm b/evm/src/cpu/kernel/asm/curve/secp256k1/lift_x.asm
new file mode 100644
index 00000000..77e484be
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/curve/secp256k1/lift_x.asm
@@ -0,0 +1,73 @@
+// Returns y such that (x,y) is on Secp256k1 and y&1 = parity,
+// as well as a flag indicating whether such a y exists.
+%macro secp_lift_x
+    // stack: x, parity
+    %cubemodn_secp_base
+    // stack: x^3, parity
+    PUSH 7
+    // stack: 7, x^3, parity
+    %addmodn_secp_base
+    // stack: x^3+7, parity
+    DUP1
+    // stack: x^3+7, x^3+7, parity
+    %sqrt_secp_base_unsafe
+    // stack: y, x^3+7, parity
+    SWAP1
+    // stack: x^3+7, y, parity
+    DUP2
+    // stack: y, x^3+7, y, parity
+    %squaremodn_secp_base
+    // stack: y^2, x^3+7, y, parity
+    EQ
+    // stack: sqrtOk, y, parity
+    SWAP2
+    // stack: parity, y, sqrtOk
+    DUP2
+    // stack: y, parity, y, sqrtOk
+    PUSH 1
+    // stack: 1, y, parity, y, sqrtOk
+    AND
+    // stack: 1 & y, parity, y, sqrtOk
+    EQ
+    // stack: correctParity, y, sqrtOk
+    DUP2
+    // stack: y, correctParity, y, sqrtOk
+    %secp_base
+    // stack: N, y, correctParity, y, sqrtOk
+    SUB
+    // stack: N - y, correctParity, y, sqrtOk
+    SWAP1
+    // stack: correctParity, N - y, y, sqrtOk
+    %select_bool
+    // stack: goody, sqrtOk
+%endmacro
+
+%macro cubemodn_secp_base
+    // stack: x
+    DUP1
+    // stack: x, x
+    %squaremodn_secp_base
+    // stack: x^2, x
+    %mulmodn_secp_base
+%endmacro
+
+%macro addmodn_secp_base
+    // stack: x, y
+    %secp_base
+    // stack: N, x, y
+    SWAP2
+    // stack: y, x, N
+    ADDMOD
+%endmacro
+
+// Non-deterministically provide the square root modulo N.
+// Note: The square root is not checked and the macro doesn't panic if `x` is not a square.
+%macro sqrt_secp_base_unsafe
+    // stack: x
+    PROVER_INPUT(ff::secp256k1_base::sqrt)
+    // stack: √x, x
+    SWAP1
+    // stack: x, √x
+    POP
+    // stack: √x
+%endmacro
\ No newline at end of file
diff --git a/evm/src/cpu/kernel/asm/curve/secp256k1/moddiv.asm b/evm/src/cpu/kernel/asm/curve/secp256k1/moddiv.asm
new file mode 100644
index 00000000..d878dc14
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/curve/secp256k1/moddiv.asm
@@ -0,0 +1,39 @@
+/// Division modulo 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f, the Secp256k1 base field order
+/// The inverse is supplied non-deterministically by the prover (see %inverse_secp_base) and verified with a single MULMOD.
+
+// Returns y * (x^-1) where the inverse is taken modulo N
+%macro moddiv_secp_base
+    // stack: x, y
+    %inverse_secp_base
+    // stack: x^-1, y
+    %mulmodn_secp_base
+%endmacro
+
+%macro mulmodn_secp_base
+    // stack: x, y
+    %secp_base
+    // stack: N, x, y
+    SWAP2
+    // stack: y, x, N
+    MULMOD
+%endmacro
+
+%macro squaremodn_secp_base
+    // stack: x
+    DUP1
+    // stack: x, x
+    %mulmodn_secp_base
+%endmacro
+
+// Non-deterministically provide the inverse modulo N.
+%macro inverse_secp_base
+    // stack: x
+    PROVER_INPUT(ff::secp256k1_base::inverse)
+    // stack: x^-1, x
+    %stack (inv, x) -> (inv, x, @SECP_BASE, inv)
+    // stack: x^-1, x, N, x^-1
+    MULMOD
+    // stack: x^-1 * x, x^-1
+    %assert_eq_const(1)
+    // stack: x^-1
+%endmacro
diff --git a/evm/src/cpu/kernel/asm/halt.asm b/evm/src/cpu/kernel/asm/halt.asm
new file mode 100644
index 00000000..906ce51a
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/halt.asm
@@ -0,0 +1,6 @@
+global halt:
+    PUSH halt_pc0
+global halt_pc0:
+    DUP1
+global halt_pc1:
+    JUMP
diff --git a/evm/src/cpu/kernel/asm/memory.asm b/evm/src/cpu/kernel/asm/memory/core.asm
similarity index 64%
rename from evm/src/cpu/kernel/asm/memory.asm
rename to evm/src/cpu/kernel/asm/memory/core.asm
index 81474d12..2c896345 100644
--- a/evm/src/cpu/kernel/asm/memory.asm
+++ b/evm/src/cpu/kernel/asm/memory/core.asm
@@ -26,10 +26,10 @@
     // stack: (empty)
 %endmacro
 
-// Load a single byte from kernel code.
-%macro mload_kernel_code
+// Load a single value from the given segment of kernel (context 0) memory.
+%macro mload_kernel(segment)
     // stack: offset
-    PUSH @SEGMENT_CODE
+    PUSH $segment
     // stack: segment, offset
     PUSH 0 // kernel has context 0
     // stack: context, segment, offset
@@ -37,6 +37,24 @@
     // stack: value
 %endmacro
 
+// Store a single value to the given segment of kernel (context 0) memory.
+%macro mstore_kernel(segment)
+    // stack: offset, value
+    PUSH $segment
+    // stack: segment, offset, value
+    PUSH 0 // kernel has context 0
+    // stack: context, segment, offset, value
+    MSTORE_GENERAL
+    // stack: (empty)
+%endmacro
+
+// Load a single byte from kernel code.
+%macro mload_kernel_code
+    // stack: offset
+    %mload_kernel(@SEGMENT_CODE)
+    // stack: value
+%endmacro
+
 // Load a big-endian u32, consisting of 4 bytes (c_3, c_2, c_1, c_0),
 // from kernel code.
 %macro mload_kernel_code_u32
@@ -67,54 +85,9 @@
     // stack: (((((c_3 << 8) | c_2) << 8) | c_1) << 8) | c_0
 %endmacro
 
-// Copies `count` values from
-//     SRC = (src_ctx, src_segment, src_addr)
-// to
-//     DST = (dst_ctx, dst_segment, dst_addr).
-// These tuple definitions are used for brevity in the stack comments below.
-global memcpy:
-    JUMPDEST
-    // stack: DST, SRC, count, retdest
-    DUP7
-    // stack: count, DST, SRC, count, retdest
-    ISZERO
-    // stack: count == 0, DST, SRC, count, retdest
-    %jumpi(memcpy_finish)
-    // stack: DST, SRC, count, retdest
-
-    // Copy the next value.
-    DUP6
-    DUP6
-    DUP6
-    // stack: SRC, DST, SRC, count, retdest
-    MLOAD_GENERAL
-    // stack: value, DST, SRC, count, retdest
-    DUP4
-    DUP4
-    DUP4
-    // stack: DST, value, DST, SRC, count, retdest
-    MSTORE_GENERAL
-    // stack: DST, SRC, count, retdest
-
-    // Increment dst_addr.
-    SWAP2
-    %add_const(1)
-    SWAP2
-    // Increment src_addr.
-    SWAP5
-    %add_const(1)
-    SWAP5
-    // Decrement count.
-    SWAP6
-    %sub_const(1)
-    SWAP6
-
-    // Continue the loop.
-    %jump(memcpy)
-
-memcpy_finish:
-    JUMPDEST
-    // stack: DST, SRC, count, retdest
-    %pop7
-    // stack: retdest
-    JUMP
+// Store a single byte to kernel code.
+%macro mstore_kernel_code
+    // stack: offset, value
+    %mstore_kernel(@SEGMENT_CODE)
+    // stack: (empty)
+%endmacro
diff --git a/evm/src/cpu/kernel/asm/memory/memcpy.asm b/evm/src/cpu/kernel/asm/memory/memcpy.asm
new file mode 100644
index 00000000..0a390736
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/memory/memcpy.asm
@@ -0,0 +1,51 @@
+// Copies `count` values from
+//     SRC = (src_ctx, src_segment, src_addr)
+// to
+//     DST = (dst_ctx, dst_segment, dst_addr).
+// These tuple definitions are used for brevity in the stack comments below.
+global memcpy:
+    JUMPDEST
+    // stack: DST, SRC, count, retdest
+    DUP7
+    // stack: count, DST, SRC, count, retdest
+    ISZERO
+    // stack: count == 0, DST, SRC, count, retdest
+    %jumpi(memcpy_finish)
+    // stack: DST, SRC, count, retdest
+
+    // Copy the next value.
+    DUP6
+    DUP6
+    DUP6
+    // stack: SRC, DST, SRC, count, retdest
+    MLOAD_GENERAL
+    // stack: value, DST, SRC, count, retdest
+    DUP4
+    DUP4
+    DUP4
+    // stack: DST, value, DST, SRC, count, retdest
+    MSTORE_GENERAL
+    // stack: DST, SRC, count, retdest
+
+    // Increment dst_addr.
+    SWAP2
+    %add_const(1)
+    SWAP2
+    // Increment src_addr.
+    SWAP5
+    %add_const(1)
+    SWAP5
+    // Decrement count.
+    SWAP6
+    %sub_const(1)
+    SWAP6
+
+    // Continue the loop.
+    %jump(memcpy)
+
+memcpy_finish:
+    JUMPDEST
+    // stack: DST, SRC, count, retdest
+    %pop7
+    // stack: retdest
+    JUMP
diff --git a/evm/src/cpu/kernel/asm/memory/metadata.asm b/evm/src/cpu/kernel/asm/memory/metadata.asm
new file mode 100644
index 00000000..22eb853f
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/memory/metadata.asm
@@ -0,0 +1,35 @@
+// Load the given global metadata field from memory.
+%macro mload_global_metadata(field)
+    // stack: (empty)
+    PUSH $field
+    // stack: offset
+    %mload_kernel(@SEGMENT_GLOBAL_METADATA)
+    // stack: value
+%endmacro
+
+// Store the value on top of the stack to the given global metadata field in kernel (context 0) memory.
+%macro mstore_global_metadata(field)
+    // stack: value
+    PUSH $field
+    // stack: offset, value
+    %mstore_kernel(@SEGMENT_GLOBAL_METADATA)
+    // stack: (empty)
+%endmacro
+
+// Load the given context metadata field from memory.
+%macro mload_context_metadata(field)
+    // stack: (empty)
+    PUSH $field
+    // stack: offset
+    %mload_current(@SEGMENT_CONTEXT_METADATA)
+    // stack: value
+%endmacro
+
+// Store the value on top of the stack to the given context metadata field (store counterpart of mload_context_metadata).
+%macro mstore_context_metadata(field)
+    // stack: value
+    PUSH $field
+    // stack: offset, value
+    %mstore_current(@SEGMENT_CONTEXT_METADATA)
+    // stack: (empty)
+%endmacro
diff --git a/evm/src/cpu/kernel/asm/memory/txn_fields.asm b/evm/src/cpu/kernel/asm/memory/txn_fields.asm
new file mode 100644
index 00000000..d15b7264
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/memory/txn_fields.asm
@@ -0,0 +1,17 @@
+// Load the given normalized transaction field from memory.
+%macro mload_txn_field(field)
+    // stack: (empty)
+    PUSH $field
+    // stack: offset
+    %mload_kernel(@SEGMENT_NORMALIZED_TXN)
+    // stack: value
+%endmacro
+
+// Store the given normalized transaction field to memory.
+%macro mstore_txn_field(field)
+    // stack: value
+    PUSH $field
+    // stack: offset, value
+    %mstore_kernel(@SEGMENT_NORMALIZED_TXN)
+    // stack: (empty)
+%endmacro
diff --git a/evm/src/cpu/kernel/asm/moddiv.asm b/evm/src/cpu/kernel/asm/moddiv.asm
deleted file mode 100644
index 891897e5..00000000
--- a/evm/src/cpu/kernel/asm/moddiv.asm
+++ /dev/null
@@ -1,506 +0,0 @@
-/// Division modulo 0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47, the BN254 base field order
-/// To replace with more efficient method using non-determinism later.
-
-// Returns y * (x^-1) where the inverse is taken modulo N
-%macro moddiv
-    // stack: x, y
-    %inverse
-    // stack: x^-1, y
-    %mulmodn
-%endmacro
-
-%macro mulmodn
-    // stack: x, y
-    PUSH 0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47
-    // stack: N, x, y
-    SWAP2
-    // stack: y, x, N
-    MULMOD
-%endmacro
-
-%macro squaremodn
-    // stack: x
-    DUP1
-    // stack: x, x
-    %mulmodn
-%endmacro
-
-// Computes the inverse modulo N using x^-1 = x^(N-2) mod N and square-and-multiply modular exponentiation.
-%macro inverse
-    DUP1
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    %squaremodn
-    %squaremodn
-    DUP2
-    %mulmodn
-    SWAP1
-    // stack: x, x^-1
-    POP
-    // stack: x^-1
-%endmacro
diff --git a/evm/src/cpu/kernel/asm/ripemd/iterate.asm b/evm/src/cpu/kernel/asm/ripemd/iterate.asm
new file mode 100644
index 00000000..b91624ce
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/ripemd/iterate.asm
@@ -0,0 +1,2 @@
+global R:
+    
\ No newline at end of file
diff --git a/evm/src/cpu/kernel/asm/rlp/decode.asm b/evm/src/cpu/kernel/asm/rlp/decode.asm
index 76daec1a..0388276a 100644
--- a/evm/src/cpu/kernel/asm/rlp/decode.asm
+++ b/evm/src/cpu/kernel/asm/rlp/decode.asm
@@ -32,6 +32,7 @@ global decode_rlp_string_len:
     JUMP
 
 decode_rlp_string_len_medium:
+    JUMPDEST
     // String is 0-55 bytes long. First byte contains the len.
     // stack: first_byte, pos, retdest
     %sub_const(0x80)
@@ -43,6 +44,7 @@ decode_rlp_string_len_medium:
     JUMP
 
 decode_rlp_string_len_large:
+    JUMPDEST
     // String is >55 bytes long. First byte contains the len of the len.
     // stack: first_byte, pos, retdest
     %sub_const(0xb7)
@@ -52,6 +54,13 @@ decode_rlp_string_len_large:
     // stack: pos', len_of_len, retdest
     %jump(decode_int_given_len)
 
+// Convenience macro to call decode_rlp_string_len and return where we left off.
+%macro decode_rlp_string_len
+    %stack (pos) -> (pos, %%after)
+    %jump(decode_rlp_string_len)
+%%after:
+%endmacro
+
 // Parse a scalar from RLP memory.
 // Pre stack: pos, retdest
 // Post stack: pos', scalar
@@ -71,6 +80,13 @@ global decode_rlp_scalar:
     // to decode_int_given_len.
     %jump(decode_rlp_string_len)
 
+// Convenience macro to call decode_rlp_scalar and return where we left off.
+%macro decode_rlp_scalar
+    %stack (pos) -> (pos, %%after)
+    %jump(decode_rlp_scalar)
+%%after:
+%endmacro
+
 // Parse the length of an RLP list from memory.
 // Pre stack: pos, retdest
 // Post stack: pos', len
@@ -109,6 +125,13 @@ decode_rlp_list_len_big:
     // stack: pos', len_of_len, retdest
     %jump(decode_int_given_len)
 
+// Convenience macro to call decode_rlp_list_len and return where we left off.
+%macro decode_rlp_list_len
+    %stack (pos) -> (pos, %%after)
+    %jump(decode_rlp_list_len)
+%%after:
+%endmacro
+
 // Parse an integer of the given length. It is assumed that the integer will
 // fit in a single (256-bit) word on the stack.
 // Pre stack: pos, len, retdest
diff --git a/evm/src/cpu/kernel/asm/rlp/encode.asm b/evm/src/cpu/kernel/asm/rlp/encode.asm
index b2446c37..58cb9230 100644
--- a/evm/src/cpu/kernel/asm/rlp/encode.asm
+++ b/evm/src/cpu/kernel/asm/rlp/encode.asm
@@ -1,17 +1,17 @@
 // RLP-encode a scalar, i.e. a variable-length integer.
-// Pre stack: pos, scalar
+// Pre stack: pos, scalar, retdest
 // Post stack: (empty)
 global encode_rlp_scalar:
     PANIC // TODO: implement
 
 // RLP-encode a fixed-length 160-bit string. Assumes string < 2^160.
-// Pre stack: pos, string
+// Pre stack: pos, string, retdest
 // Post stack: (empty)
 global encode_rlp_160:
     PANIC // TODO: implement
 
 // RLP-encode a fixed-length 256-bit string.
-// Pre stack: pos, string
+// Pre stack: pos, string, retdest
 // Post stack: (empty)
 global encode_rlp_256:
     PANIC // TODO: implement
diff --git a/evm/src/cpu/kernel/asm/secp256k1/inverse_scalar.asm b/evm/src/cpu/kernel/asm/secp256k1/inverse_scalar.asm
deleted file mode 100644
index ce0af757..00000000
--- a/evm/src/cpu/kernel/asm/secp256k1/inverse_scalar.asm
+++ /dev/null
@@ -1,672 +0,0 @@
-/// Division modulo 0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141, the Secp256k1 scalar field order
-/// To replace with more efficient method using non-determinism later.
-
-%macro mulmodn_secp_scalar
-    // stack: x, y
-    %secp_scalar
-    // stack: N, x, y
-    SWAP2
-    // stack: y, x, N
-    MULMOD
-%endmacro
-
-%macro squaremodn_secp_scalar
-    // stack: x
-    DUP1
-    // stack: x, x
-    %mulmodn_secp_scalar
-%endmacro
-
-// Computes the inverse modulo N using x^-1 = x^(N-2) mod N and square-and-multiply modular exponentiation.
-%macro inverse_secp_scalar
-    DUP1
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    %squaremodn_secp_scalar
-    DUP2
-    %mulmodn_secp_scalar
-    SWAP1
-    // stack: x, x^-1
-    POP
-    // stack: x^-1
-%endmacro
diff --git a/evm/src/cpu/kernel/asm/secp256k1/lift_x.asm b/evm/src/cpu/kernel/asm/secp256k1/lift_x.asm
deleted file mode 100644
index cd392b61..00000000
--- a/evm/src/cpu/kernel/asm/secp256k1/lift_x.asm
+++ /dev/null
@@ -1,818 +0,0 @@
-// Returns y such that (x,y) is on Secp256k1 and y&1 = parity,
-// as well as a flag indicating whether such a y exists.
-%macro secp_lift_x
-    // stack: x, parity
-    %cubemodn_secp_base
-    // stack: x^3, parity
-    PUSH 7
-    // stack: 7, x^3, parity
-    %addmodn_secp_base
-    // stack: x^3+7, x, parity
-    DUP1
-    // stack: x^3+7, x^3+7, parity
-    %sqrt_secp_base
-    // stack: y, x^3+7, x, parity
-    SWAP1
-    // stack: x^3+7, y, parity
-    DUP2
-    // stack: y, x^3+7, y, parity
-    %squaremodn_secp_base
-    // stack: y^2, x^3+7, y, parity
-    EQ
-    // stack: sqrtOk, y, parity
-    SWAP2
-    // stack: parity, y, sqrtOk
-    DUP2
-    // stack: y, parity, y, sqrtOk
-    PUSH 1
-    // stack: 1, y, parity, y, sqrtOk
-    AND
-    // stack: 1 & y, parity, y, sqrtOk
-    EQ
-    // stack: correctParity, y, sqrtOk
-    DUP2
-    // stack: y, correctParity, y, sqrtOk
-    %secp_base
-    // stack: N, y, correctParity, y, sqrtOk
-    SUB
-    // stack: N - y, correctParity, y, sqrtOk
-    SWAP1
-    // stack: correctParity, N - y, y, sqrtOk
-    %select_bool
-    // stack: goody, sqrtOk
-%endmacro
-
-%macro cubemodn_secp_base
-    // stack: x
-    DUP1
-    // stack: x, x
-    %squaremodn_secp_base
-    // stack: x^2, x
-    %mulmodn_secp_base
-%endmacro
-
-%macro addmodn_secp_base
-    // stack: x, y
-    %secp_base
-    // stack: N, x, y
-    SWAP2
-    // stack: y, x, N
-    ADDMOD
-%endmacro
-
-// Returns a square root of x if one exists, otherwise an undefined value.
-// Computed as x^(q+1)/4, with q the Secp base field order.
-// To replace with more efficient method using non-determinism later.
-%macro sqrt_secp_base
-    // stack: x
-    DUP1
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    %squaremodn_secp_base
-    %squaremodn_secp_base
-    %squaremodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    %squaremodn_secp_base
-    SWAP1
-    // stack: x, x^-1
-    POP
-    // stack: x^-1
-%endmacro
\ No newline at end of file
diff --git a/evm/src/cpu/kernel/asm/secp256k1/moddiv.asm b/evm/src/cpu/kernel/asm/secp256k1/moddiv.asm
deleted file mode 100644
index 941fa33a..00000000
--- a/evm/src/cpu/kernel/asm/secp256k1/moddiv.asm
+++ /dev/null
@@ -1,786 +0,0 @@
-/// Division modulo 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f, the Secp256k1 base field order
-/// To replace with more efficient method using non-determinism later.
-
-// Returns y * (x^-1) where the inverse is taken modulo N
-%macro moddiv_secp_base
-    // stack: x, y
-    %inverse_secp_base
-    // stack: x^-1, y
-    %mulmodn_secp_base
-%endmacro
-
-%macro mulmodn_secp_base
-    // stack: x, y
-    %secp_base
-    // stack: N, x, y
-    SWAP2
-    // stack: y, x, N
-    MULMOD
-%endmacro
-
-%macro squaremodn_secp_base
-    // stack: x
-    DUP1
-    // stack: x, x
-    %mulmodn_secp_base
-%endmacro
-
-// Computes the inverse modulo N using x^-1 = x^(N-2) mod N and square-and-multiply modular exponentiation.
-%macro inverse_secp_base
-    DUP1
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    %squaremodn_secp_base
-    %squaremodn_secp_base
-    %squaremodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    %squaremodn_secp_base
-    %squaremodn_secp_base
-    DUP2
-    %mulmodn_secp_base
-    SWAP1
-    // stack: x, x^-1
-    POP
-    // stack: x^-1
-%endmacro
diff --git a/evm/src/cpu/kernel/asm/transactions/type_0.asm b/evm/src/cpu/kernel/asm/transactions/type_0.asm
index 543095a7..8711790d 100644
--- a/evm/src/cpu/kernel/asm/transactions/type_0.asm
+++ b/evm/src/cpu/kernel/asm/transactions/type_0.asm
@@ -14,87 +14,50 @@
 global process_type_0_txn:
     JUMPDEST
     // stack: (empty)
-    PUSH process_txn_with_len
     PUSH 0 // initial pos
-    // stack: pos, process_txn_with_len
-    %jump(decode_rlp_list_len)
-
-process_txn_with_len:
+    // stack: pos
+    %decode_rlp_list_len
     // We don't actually need the length.
     %stack (pos, len) -> (pos)
 
-    PUSH store_nonce
-    SWAP1
-    // stack: pos, store_nonce
-    %jump(decode_rlp_scalar)
-
-store_nonce:
+    // Decode the nonce and store it.
+    // stack: pos
+    %decode_rlp_scalar
     %stack (pos, nonce) -> (@TXN_FIELD_NONCE, nonce, pos)
     %mstore_current(@SEGMENT_NORMALIZED_TXN)
 
-    // stack: pos
-    PUSH store_gas_price
-    SWAP1
-    // stack: pos, store_gas_price
-    %jump(decode_rlp_scalar)
-
-store_gas_price:
+    // Decode the gas price and store it.
     // For legacy transactions, we set both the
     // TXN_FIELD_MAX_PRIORITY_FEE_PER_GAS and TXN_FIELD_MAX_FEE_PER_GAS
     // fields to gas_price.
+    // stack: pos
+    %decode_rlp_scalar
     %stack (pos, gas_price) -> (@TXN_FIELD_MAX_PRIORITY_FEE_PER_GAS, gas_price,
                                 @TXN_FIELD_MAX_FEE_PER_GAS, gas_price, pos)
     %mstore_current(@SEGMENT_NORMALIZED_TXN)
     %mstore_current(@SEGMENT_NORMALIZED_TXN)
 
+    // Decode the gas limit and store it.
     // stack: pos
-    PUSH store_gas_limit
-    SWAP1
-    // stack: pos, store_gas_limit
-    %jump(decode_rlp_scalar)
-
-store_gas_limit:
+    %decode_rlp_scalar
     %stack (pos, gas_limit) -> (@TXN_FIELD_GAS_LIMIT, gas_limit, pos)
     %mstore_current(@SEGMENT_NORMALIZED_TXN)
 
-    // Peak at the RLP to see if the next byte is zero.
-    // If so, there is no value field, so skip the store_to step.
+    // Decode the "to" field and store it.
     // stack: pos
-    DUP1
-    %mload_current(@SEGMENT_RLP_RAW)
-    ISZERO
-    // stack: to_empty, pos
-    %jumpi(parse_value)
-
-    // If we got here, there is a "to" field.
-    PUSH store_to
-    SWAP1
-    // stack: pos, store_to
-    %jump(decode_rlp_scalar)
-
-store_to:
+    %decode_rlp_scalar
     %stack (pos, to) -> (@TXN_FIELD_TO, to, pos)
     %mstore_current(@SEGMENT_NORMALIZED_TXN)
-    // stack: pos
 
-parse_value:
+    // Decode the value field and store it.
     // stack: pos
-    PUSH store_value
-    SWAP1
-    // stack: pos, store_value
-    %jump(decode_rlp_scalar)
-
-store_value:
+    %decode_rlp_scalar
     %stack (pos, value) -> (@TXN_FIELD_VALUE, value, pos)
     %mstore_current(@SEGMENT_NORMALIZED_TXN)
 
+    // Decode the data length, store it, and compute new_pos after any data.
     // stack: pos
-    PUSH store_data_len
-    SWAP1
-    // stack: pos, store_data_len
-    %jump(decode_rlp_string_len)
-
-store_data_len:
+    %decode_rlp_string_len
     %stack (pos, data_len) -> (@TXN_FIELD_DATA_LEN, data_len, pos, data_len, pos, data_len)
     %mstore_current(@SEGMENT_NORMALIZED_TXN)
     // stack: pos, data_len, pos, data_len
@@ -114,12 +77,7 @@ store_data_len:
 
 parse_v:
     // stack: pos
-    PUSH process_v
-    SWAP1
-    // stack: pos, process_v
-    %jump(decode_rlp_scalar)
-
-process_v:
+    %decode_rlp_scalar
     // stack: pos, v
     SWAP1
     // stack: v, pos
@@ -163,22 +121,12 @@ process_v_new_style:
 
 parse_r:
     // stack: pos
-    PUSH store_r
-    SWAP1
-    // stack: pos, store_r
-    %jump(decode_rlp_scalar)
-
-store_r:
+    %decode_rlp_scalar
     %stack (pos, r) -> (@TXN_FIELD_R, r, pos)
     %mstore_current(@SEGMENT_NORMALIZED_TXN)
 
     // stack: pos
-    PUSH store_s
-    SWAP1
-    // stack: pos, store_s
-    %jump(decode_rlp_scalar)
-
-store_s:
+    %decode_rlp_scalar
     %stack (pos, s) -> (@TXN_FIELD_S, s)
     %mstore_current(@SEGMENT_NORMALIZED_TXN)
     // stack: (empty)
diff --git a/evm/src/cpu/kernel/asm/assertions.asm b/evm/src/cpu/kernel/asm/util/assertions.asm
similarity index 100%
rename from evm/src/cpu/kernel/asm/assertions.asm
rename to evm/src/cpu/kernel/asm/util/assertions.asm
diff --git a/evm/src/cpu/kernel/asm/basic_macros.asm b/evm/src/cpu/kernel/asm/util/basic_macros.asm
similarity index 84%
rename from evm/src/cpu/kernel/asm/basic_macros.asm
rename to evm/src/cpu/kernel/asm/util/basic_macros.asm
index 9583409c..8876fd1e 100644
--- a/evm/src/cpu/kernel/asm/basic_macros.asm
+++ b/evm/src/cpu/kernel/asm/util/basic_macros.asm
@@ -120,7 +120,7 @@
     // stack: input, ...
     PUSH $c
     // stack: c, input, ...
-    GE // Check it backwards: (input <= c) == (c >= input)
+    LT ISZERO // Check it backwards: (input <= c) == !(c < input)
     // stack: input <= c, ...
 %endmacro
 
@@ -136,10 +136,15 @@
     // stack: input, ...
     PUSH $c
     // stack: c, input, ...
-    LE // Check it backwards: (input >= c) == (c <= input)
+    GT ISZERO // Check it backwards: (input >= c) == !(c > input)
     // stack: input >= c, ...
 %endmacro
 
+%macro consume_gas_const(c)
+    PUSH $c
+    CONSUME_GAS
+%endmacro
+
 // If pred is zero, yields z; otherwise, yields nz
 %macro select
     // stack: pred, nz, z
@@ -203,4 +208,28 @@
     // stack: 0xffffffff, x
     and
     // stack: 0xffffffff & x
 %endmacro
+
+// Pops x and y and leaves min(x, y), chosen by %select_bool from the x < y flag.
+%macro min
+    // stack: x, y
+    DUP2
+    DUP2
+    // stack: x, y, x, y
+    LT
+    // stack: x < y, x, y
+    %select_bool
+    // stack: min
+%endmacro
+
+// Pops x and y and leaves max(x, y), chosen by %select_bool from the x > y flag.
+%macro max
+    // stack: x, y
+    DUP2
+    DUP2
+    // stack: x, y, x, y
+    GT
+    // stack: x > y, x, y
+    %select_bool
+    // stack: max
+%endmacro
diff --git a/evm/src/cpu/kernel/assembler.rs b/evm/src/cpu/kernel/assembler.rs
index 4dbc46ca..14ec9aa0 100644
--- a/evm/src/cpu/kernel/assembler.rs
+++ b/evm/src/cpu/kernel/assembler.rs
@@ -5,9 +5,13 @@ use itertools::izip;
 use log::debug;
 
 use super::ast::PushTarget;
-use crate::cpu::kernel::ast::{Literal, StackReplacement};
+use crate::cpu::kernel::ast::Item::LocalLabelDeclaration;
+use crate::cpu::kernel::ast::StackReplacement;
 use crate::cpu::kernel::keccak_util::hash_kernel;
+use crate::cpu::kernel::optimizer::optimize_asm;
+use crate::cpu::kernel::prover_input::ProverInputFn;
 use crate::cpu::kernel::stack_manipulation::expand_stack_manipulation;
+use crate::cpu::kernel::utils::u256_to_trimmed_be_bytes;
 use crate::cpu::kernel::{
     ast::{File, Item},
     opcodes::{get_opcode, get_push_opcode},
@@ -16,7 +20,7 @@ use crate::cpu::kernel::{
 /// The number of bytes to push when pushing an offset within the code (i.e. when assembling jumps).
 /// Ideally we would automatically use the minimal number of bytes required, but that would be
 /// nontrivial given the circular dependency between an offset and its size.
-const BYTES_PER_OFFSET: u8 = 3;
+pub(crate) const BYTES_PER_OFFSET: u8 = 3;
 
 #[derive(PartialEq, Eq, Debug)]
 pub struct Kernel {
@@ -27,15 +31,23 @@ pub struct Kernel {
     pub(crate) code_hash: [u32; 8],
 
     pub(crate) global_labels: HashMap<String, usize>,
+
+    /// Map from `PROVER_INPUT` offsets to their corresponding `ProverInputFn`.
+    pub(crate) prover_inputs: HashMap<usize, ProverInputFn>,
 }
 
 impl Kernel {
-    fn new(code: Vec<u8>, global_labels: HashMap<String, usize>) -> Self {
+    fn new(
+        code: Vec<u8>,
+        global_labels: HashMap<String, usize>,
+        prover_inputs: HashMap<usize, ProverInputFn>,
+    ) -> Self {
         let code_hash = hash_kernel(&code);
         Self {
             code,
             code_hash,
             global_labels,
+            prover_inputs,
         }
     }
 }
@@ -54,18 +66,32 @@ impl Macro {
     }
 }
 
-pub(crate) fn assemble(files: Vec<File>, constants: HashMap<String, U256>) -> Kernel {
+pub(crate) fn assemble(
+    files: Vec<File>,
+    constants: HashMap<String, U256>,
+    optimize: bool,
+) -> Kernel {
     let macros = find_macros(&files);
     let mut global_labels = HashMap::new();
+    let mut prover_inputs = HashMap::new();
     let mut offset = 0;
     let mut expanded_files = Vec::with_capacity(files.len());
     let mut local_labels = Vec::with_capacity(files.len());
+    let mut macro_counter = 0;
     for file in files {
-        let expanded_file = expand_macros(file.body, &macros);
+        let expanded_file = expand_macros(file.body, &macros, &mut macro_counter);
         let expanded_file = expand_repeats(expanded_file);
         let expanded_file = inline_constants(expanded_file, &constants);
-        let expanded_file = expand_stack_manipulation(expanded_file);
-        local_labels.push(find_labels(&expanded_file, &mut offset, &mut global_labels));
+        let mut expanded_file = expand_stack_manipulation(expanded_file);
+        if optimize {
+            optimize_asm(&mut expanded_file);
+        }
+        local_labels.push(find_labels(
+            &expanded_file,
+            &mut offset,
+            &mut global_labels,
+            &mut prover_inputs,
+        ));
         expanded_files.push(expanded_file);
     }
     let mut code = vec![];
@@ -76,7 +102,7 @@ pub(crate) fn assemble(files: Vec<File>, constants: HashMap<String, U256>) -> Ke
         debug!("Assembled file size: {} bytes", file_len);
     }
     assert_eq!(code.len(), offset, "Code length doesn't match offset.");
-    Kernel::new(code, global_labels)
+    Kernel::new(code, global_labels, prover_inputs)
 }
 
 fn find_macros(files: &[File]) -> HashMap<String, Macro> {
@@ -96,7 +122,11 @@ fn find_macros(files: &[File]) -> HashMap<String, Macro> {
     macros
 }
 
-fn expand_macros(body: Vec<Item>, macros: &HashMap<String, Macro>) -> Vec<Item> {
+fn expand_macros(
+    body: Vec<Item>,
+    macros: &HashMap<String, Macro>,
+    macro_counter: &mut u32,
+) -> Vec<Item> {
     let mut expanded = vec![];
     for item in body {
         match item {
@@ -104,7 +134,7 @@ fn expand_macros(body: Vec<Item>, macros: &HashMap<String, Macro>) -> Vec<Item>
                 // At this phase, we no longer need macro definitions.
             }
             Item::MacroCall(m, args) => {
-                expanded.extend(expand_macro_call(m, args, macros));
+                expanded.extend(expand_macro_call(m, args, macros, macro_counter));
             }
             item => {
                 expanded.push(item);
@@ -118,6 +148,7 @@ fn expand_macro_call(
     name: String,
     args: Vec<PushTarget>,
     macros: &HashMap<String, Macro>,
+    macro_counter: &mut u32,
 ) -> Vec<Item> {
     let _macro = macros
         .get(&name)
@@ -132,6 +163,8 @@ fn expand_macro_call(
         args.len()
     );
 
+    let get_actual_label = |macro_label| format!("@{}.{}", macro_counter, macro_label);
+
     let get_arg = |var| {
         let param_index = _macro.get_param_index(var);
         args[param_index].clone()
@@ -140,10 +173,13 @@ fn expand_macro_call(
     let expanded_item = _macro
         .items
         .iter()
-        .map(|item| {
-            if let Item::Push(PushTarget::MacroVar(var)) = item {
-                Item::Push(get_arg(var))
-            } else if let Item::MacroCall(name, args) = item {
+        .map(|item| match item {
+            Item::MacroLabelDeclaration(label) => LocalLabelDeclaration(get_actual_label(label)),
+            Item::Push(PushTarget::MacroLabel(label)) => {
+                Item::Push(PushTarget::Label(get_actual_label(label)))
+            }
+            Item::Push(PushTarget::MacroVar(var)) => Item::Push(get_arg(var)),
+            Item::MacroCall(name, args) => {
                 let expanded_args = args
                     .iter()
                     .map(|arg| {
@@ -155,21 +191,35 @@ fn expand_macro_call(
                     })
                     .collect();
                 Item::MacroCall(name.clone(), expanded_args)
-            } else {
-                item.clone()
             }
+            Item::StackManipulation(before, after) => {
+                let after = after
+                    .iter()
+                    .map(|replacement| {
+                        if let StackReplacement::MacroLabel(label) = replacement {
+                            StackReplacement::Identifier(get_actual_label(label))
+                        } else {
+                            replacement.clone()
+                        }
+                    })
+                    .collect();
+                Item::StackManipulation(before.clone(), after)
+            }
+            _ => item.clone(),
         })
         .collect();
 
+    *macro_counter += 1;
+
     // Recursively expand any macros in the expanded code.
-    expand_macros(expanded_item, macros)
+    expand_macros(expanded_item, macros, macro_counter)
 }
 
 fn expand_repeats(body: Vec<Item>) -> Vec<Item> {
     let mut expanded = vec![];
     for item in body {
         if let Item::Repeat(count, block) = item {
-            let reps = count.to_u256().as_usize();
+            let reps = count.as_usize();
             for _ in 0..reps {
                 expanded.extend(block.clone());
             }
@@ -182,12 +232,9 @@ fn expand_repeats(body: Vec<Item>) -> Vec<Item> {
 
 fn inline_constants(body: Vec<Item>, constants: &HashMap<String, U256>) -> Vec<Item> {
     let resolve_const = |c| {
-        Literal::Decimal(
-            constants
-                .get(&c)
-                .unwrap_or_else(|| panic!("No such constant: {}", c))
-                .to_string(),
-        )
+        *constants
+            .get(&c)
+            .unwrap_or_else(|| panic!("No such constant: {}", c))
     };
 
     body.into_iter()
@@ -217,6 +264,7 @@ fn find_labels(
     body: &[Item],
     offset: &mut usize,
     global_labels: &mut HashMap<String, usize>,
+    prover_inputs: &mut HashMap<usize, ProverInputFn>,
 ) -> HashMap<String, usize> {
     // Discover the offset of each label in this file.
     let mut local_labels = HashMap::<String, usize>::new();
@@ -225,7 +273,8 @@ fn find_labels(
             Item::MacroDef(_, _, _)
             | Item::MacroCall(_, _)
             | Item::Repeat(_, _)
-            | Item::StackManipulation(_, _) => {
+            | Item::StackManipulation(_, _)
+            | Item::MacroLabelDeclaration(_) => {
                 panic!("Item should have been expanded already: {:?}", item);
             }
             Item::GlobalLabelDeclaration(label) => {
@@ -237,6 +286,10 @@ fn find_labels(
                 assert!(old.is_none(), "Duplicate local label: {}", label);
             }
             Item::Push(target) => *offset += 1 + push_target_size(target) as usize,
+            Item::ProverInput(prover_input_fn) => {
+                prover_inputs.insert(*offset, prover_input_fn.clone());
+                *offset += 1;
+            }
             Item::StandardOp(_) => *offset += 1,
             Item::Bytes(bytes) => *offset += bytes.len(),
         }
@@ -256,7 +309,8 @@ fn assemble_file(
             Item::MacroDef(_, _, _)
             | Item::MacroCall(_, _)
             | Item::Repeat(_, _)
-            | Item::StackManipulation(_, _) => {
+            | Item::StackManipulation(_, _)
+            | Item::MacroLabelDeclaration(_) => {
                 panic!("Item should have been expanded already: {:?}", item);
             }
             Item::GlobalLabelDeclaration(_) | Item::LocalLabelDeclaration(_) => {
@@ -264,7 +318,7 @@ fn assemble_file(
             }
             Item::Push(target) => {
                 let target_bytes: Vec<u8> = match target {
-                    PushTarget::Literal(literal) => literal.to_trimmed_be_bytes(),
+                    PushTarget::Literal(n) => u256_to_trimmed_be_bytes(&n),
                     PushTarget::Label(label) => {
                         let offset = local_labels
                             .get(&label)
@@ -277,16 +331,20 @@ fn assemble_file(
                             .map(|i| offset.to_le_bytes()[i as usize])
                             .collect()
                     }
+                    PushTarget::MacroLabel(v) => panic!("Macro label not in a macro: {}", v),
                     PushTarget::MacroVar(v) => panic!("Variable not in a macro: {}", v),
                     PushTarget::Constant(c) => panic!("Constant wasn't inlined: {}", c),
                 };
                 code.push(get_push_opcode(target_bytes.len() as u8));
                 code.extend(target_bytes);
             }
+            Item::ProverInput(_) => {
+                code.push(get_opcode("PROVER_INPUT"));
+            }
             Item::StandardOp(opcode) => {
                 code.push(get_opcode(&opcode));
             }
-            Item::Bytes(bytes) => code.extend(bytes.iter().map(|b| b.to_u8())),
+            Item::Bytes(bytes) => code.extend(bytes),
         }
     }
 }
@@ -294,8 +352,9 @@ fn assemble_file(
 /// The size of a `PushTarget`, in bytes.
 fn push_target_size(target: &PushTarget) -> u8 {
     match target {
-        PushTarget::Literal(lit) => lit.to_trimmed_be_bytes().len() as u8,
+        PushTarget::Literal(n) => u256_to_trimmed_be_bytes(n).len() as u8,
         PushTarget::Label(_) => BYTES_PER_OFFSET,
+        PushTarget::MacroLabel(v) => panic!("Macro label not in a macro: {}", v),
         PushTarget::MacroVar(v) => panic!("Variable not in a macro: {}", v),
         PushTarget::Constant(c) => panic!("Constant wasn't inlined: {}", c),
     }
@@ -357,10 +416,10 @@ mod tests {
         expected_global_labels.insert("function_1".to_string(), 0);
         expected_global_labels.insert("function_2".to_string(), 3);
 
-        let expected_kernel = Kernel::new(expected_code, expected_global_labels);
+        let expected_kernel = Kernel::new(expected_code, expected_global_labels, HashMap::new());
 
         let program = vec![file_1, file_2];
-        assert_eq!(assemble(program, HashMap::new()), expected_kernel);
+        assert_eq!(assemble(program, HashMap::new(), false), expected_kernel);
     }
 
     #[test]
@@ -378,7 +437,7 @@ mod tests {
                 Item::StandardOp("JUMPDEST".to_string()),
             ],
         };
-        assemble(vec![file_1, file_2], HashMap::new());
+        assemble(vec![file_1, file_2], HashMap::new(), false);
     }
 
     #[test]
@@ -392,24 +451,15 @@ mod tests {
                 Item::StandardOp("ADD".to_string()),
             ],
         };
-        assemble(vec![file], HashMap::new());
+        assemble(vec![file], HashMap::new(), false);
     }
 
     #[test]
     fn literal_bytes() {
         let file = File {
-            body: vec![
-                Item::Bytes(vec![
-                    Literal::Hex("12".to_string()),
-                    Literal::Decimal("42".to_string()),
-                ]),
-                Item::Bytes(vec![
-                    Literal::Hex("fe".to_string()),
-                    Literal::Decimal("255".to_string()),
-                ]),
-            ],
+            body: vec![Item::Bytes(vec![0x12, 42]), Item::Bytes(vec![0xFE, 255])],
         };
-        let code = assemble(vec![file], HashMap::new()).code;
+        let code = assemble(vec![file], HashMap::new(), false).code;
         assert_eq!(code, vec![0x12, 42, 0xfe, 255]);
     }
 
@@ -426,15 +476,31 @@ mod tests {
 
     #[test]
     fn macro_with_vars() {
-        let kernel = parse_and_assemble(&[
+        let files = &[
             "%macro add(x, y) PUSH $x PUSH $y ADD %endmacro",
             "%add(2, 3)",
-        ]);
+        ];
+        let kernel = parse_and_assemble_ext(files, HashMap::new(), false);
         let push1 = get_push_opcode(1);
         let add = get_opcode("ADD");
         assert_eq!(kernel.code, vec![push1, 2, push1, 3, add]);
     }
 
+    #[test]
+    fn macro_with_label() {
+        let files = &[
+            "%macro spin %%start: PUSH %%start JUMP %endmacro",
+            "%spin %spin",
+        ];
+        let kernel = parse_and_assemble_ext(files, HashMap::new(), false);
+        let push3 = get_push_opcode(BYTES_PER_OFFSET);
+        let jump = get_opcode("JUMP");
+        assert_eq!(
+            kernel.code,
+            vec![push3, 0, 0, 0, jump, push3, 0, 0, 5, jump]
+        );
+    }
+
     #[test]
     fn macro_in_macro_with_vars() {
         let kernel = parse_and_assemble(&[
@@ -467,7 +533,7 @@ mod tests {
         let mut constants = HashMap::new();
         constants.insert("DEAD_BEEF".into(), 0xDEADBEEFu64.into());
 
-        let kernel = parse_and_assemble_with_constants(code, constants);
+        let kernel = parse_and_assemble_ext(code, constants, true);
         let push4 = get_push_opcode(4);
         assert_eq!(kernel.code, vec![push4, 0xDE, 0xAD, 0xBE, 0xEF]);
     }
@@ -482,8 +548,13 @@ mod tests {
     #[test]
     fn stack_manipulation() {
         let pop = get_opcode("POP");
+        let dup1 = get_opcode("DUP1");
         let swap1 = get_opcode("SWAP1");
         let swap2 = get_opcode("SWAP2");
+        let push_label = get_push_opcode(BYTES_PER_OFFSET);
+
+        let kernel = parse_and_assemble(&["%stack (a) -> (a)"]);
+        assert_eq!(kernel.code, vec![]);
 
         let kernel = parse_and_assemble(&["%stack (a, b, c) -> (c, b, a)"]);
         assert_eq!(kernel.code, vec![swap2]);
@@ -493,19 +564,27 @@ mod tests {
 
         let mut consts = HashMap::new();
         consts.insert("LIFE".into(), 42.into());
-        parse_and_assemble_with_constants(&["%stack (a, b) -> (b, @LIFE)"], consts);
+        parse_and_assemble_ext(&["%stack (a, b) -> (b, @LIFE)"], consts, true);
         // We won't check the code since there are two equally efficient implementations.
+
+        let kernel = parse_and_assemble(&["start: %stack (a, b) -> (start)"]);
+        assert_eq!(kernel.code, vec![pop, pop, push_label, 0, 0, 0]);
+
+        // The "start" label gets shadowed by the "start" named stack item.
+        let kernel = parse_and_assemble(&["start: %stack (start) -> (start, start)"]);
+        assert_eq!(kernel.code, vec![dup1]);
     }
 
     fn parse_and_assemble(files: &[&str]) -> Kernel {
-        parse_and_assemble_with_constants(files, HashMap::new())
+        parse_and_assemble_ext(files, HashMap::new(), true)
     }
 
-    fn parse_and_assemble_with_constants(
+    fn parse_and_assemble_ext(
         files: &[&str],
         constants: HashMap<String, U256>,
+        optimize: bool,
     ) -> Kernel {
         let parsed_files = files.iter().map(|f| parse(f)).collect_vec();
-        assemble(parsed_files, constants)
+        assemble(parsed_files, constants, optimize)
     }
 }
diff --git a/evm/src/cpu/kernel/ast.rs b/evm/src/cpu/kernel/ast.rs
index 92728104..24cf01e1 100644
--- a/evm/src/cpu/kernel/ast.rs
+++ b/evm/src/cpu/kernel/ast.rs
@@ -1,19 +1,20 @@
 use ethereum_types::U256;
-use plonky2_util::ceil_div_usize;
+
+use crate::cpu::kernel::prover_input::ProverInputFn;
 
 #[derive(Debug)]
 pub(crate) struct File {
     pub(crate) body: Vec<Item>,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Eq, PartialEq, Clone, Debug)]
 pub(crate) enum Item {
     /// Defines a new macro: name, params, body.
     MacroDef(String, Vec<String>, Vec<Item>),
     /// Calls a macro: name, args.
     MacroCall(String, Vec<PushTarget>),
     /// Repetition, like `%rep` in NASM.
-    Repeat(Literal, Vec<Item>),
+    Repeat(U256, Vec<Item>),
     /// A directive to manipulate the stack according to a specified pattern.
     /// The first list gives names to items on the top of the stack.
     /// The second list specifies replacement items.
@@ -23,88 +24,34 @@ pub(crate) enum Item {
     GlobalLabelDeclaration(String),
     /// Declares a label that is local to the current file.
     LocalLabelDeclaration(String),
+    /// Declares a label that is local to the macro it's declared in.
+    MacroLabelDeclaration(String),
     /// A `PUSH` operation.
     Push(PushTarget),
+    /// A `ProverInput` operation.
+    ProverInput(ProverInputFn),
     /// Any opcode besides a PUSH opcode.
     StandardOp(String),
     /// Literal hex data; should contain an even number of hex chars.
-    Bytes(Vec<Literal>),
+    Bytes(Vec<u8>),
 }
 
-#[derive(Clone, Debug)]
+#[derive(Eq, PartialEq, Clone, Debug)]
 pub(crate) enum StackReplacement {
-    NamedItem(String),
-    Literal(Literal),
+    /// Can be either a named item or a label.
+    Identifier(String),
+    Literal(U256),
+    MacroLabel(String),
     MacroVar(String),
     Constant(String),
 }
 
 /// The target of a `PUSH` operation.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Eq, PartialEq, Hash)]
 pub(crate) enum PushTarget {
-    Literal(Literal),
+    Literal(U256),
     Label(String),
+    MacroLabel(String),
     MacroVar(String),
     Constant(String),
 }
-
-#[derive(Clone, Debug, Eq, PartialEq, Hash)]
-pub(crate) enum Literal {
-    Decimal(String),
-    Hex(String),
-}
-
-impl Literal {
-    pub(crate) fn to_trimmed_be_bytes(&self) -> Vec<u8> {
-        let u256 = self.to_u256();
-        let num_bytes = ceil_div_usize(u256.bits(), 8).max(1);
-        // `byte` is little-endian, so we manually reverse it.
-        (0..num_bytes).rev().map(|i| u256.byte(i)).collect()
-    }
-
-    pub(crate) fn to_u256(&self) -> U256 {
-        let (src, radix) = match self {
-            Literal::Decimal(s) => (s, 10),
-            Literal::Hex(s) => (s, 16),
-        };
-        U256::from_str_radix(src, radix)
-            .unwrap_or_else(|_| panic!("Not a valid u256 literal: {:?}", self))
-    }
-
-    pub(crate) fn to_u8(&self) -> u8 {
-        let (src, radix) = match self {
-            Literal::Decimal(s) => (s, 10),
-            Literal::Hex(s) => (s, 16),
-        };
-        u8::from_str_radix(src, radix)
-            .unwrap_or_else(|_| panic!("Not a valid u8 literal: {:?}", self))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::cpu::kernel::ast::*;
-
-    #[test]
-    fn literal_to_be_bytes() {
-        assert_eq!(
-            Literal::Decimal("0".into()).to_trimmed_be_bytes(),
-            vec![0x00]
-        );
-
-        assert_eq!(
-            Literal::Decimal("768".into()).to_trimmed_be_bytes(),
-            vec![0x03, 0x00]
-        );
-
-        assert_eq!(
-            Literal::Hex("a1b2".into()).to_trimmed_be_bytes(),
-            vec![0xa1, 0xb2]
-        );
-
-        assert_eq!(
-            Literal::Hex("1b2".into()).to_trimmed_be_bytes(),
-            vec![0x1, 0xb2]
-        );
-    }
-}
diff --git a/evm/src/cpu/kernel/constants.rs b/evm/src/cpu/kernel/constants.rs
new file mode 100644
index 00000000..5bc5908e
--- /dev/null
+++ b/evm/src/cpu/kernel/constants.rs
@@ -0,0 +1,87 @@
+use std::collections::HashMap;
+
+use ethereum_types::U256;
+use hex_literal::hex;
+
+use crate::cpu::kernel::context_metadata::ContextMetadata;
+use crate::cpu::kernel::global_metadata::GlobalMetadata;
+use crate::cpu::kernel::txn_fields::NormalizedTxnField;
+use crate::memory::segments::Segment;
+
+/// Builds the name -> value table of constants that kernel assembly code can reference.
+pub fn evm_constants() -> HashMap<String, U256> {
+    let mut constants = HashMap::new();
+    for (name, bytes) in EC_CONSTANTS {
+        constants.insert(name.into(), U256::from_big_endian(&bytes));
+    }
+    for (name, gas) in GAS_CONSTANTS {
+        constants.insert(name.into(), U256::from(gas));
+    }
+    for segment in Segment::all() {
+        constants.insert(segment.var_name().into(), U256::from(segment as u32));
+    }
+    for field in NormalizedTxnField::all() {
+        constants.insert(field.var_name().into(), U256::from(field as u32));
+    }
+    for field in GlobalMetadata::all() {
+        constants.insert(field.var_name().into(), U256::from(field as u32));
+    }
+    for field in ContextMetadata::all() {
+        constants.insert(field.var_name().into(), U256::from(field as u32));
+    }
+    constants
+}
+
+const EC_CONSTANTS: [(&str, [u8; 32]); 3] = [
+    (
+        "BN_BASE",
+        hex!("30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47"),
+    ),
+    (
+        "SECP_BASE",
+        hex!("fffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f"),
+    ),
+    (
+        "SECP_SCALAR",
+        hex!("fffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141"),
+    ),
+];
+
+const GAS_CONSTANTS: [(&str, u16); 36] = [
+    ("GAS_ZERO", 0),
+    ("GAS_JUMPDEST", 1),
+    ("GAS_BASE", 2),
+    ("GAS_VERYLOW", 3),
+    ("GAS_LOW", 5),
+    ("GAS_MID", 8),
+    ("GAS_HIGH", 10),
+    ("GAS_WARMACCESS", 100),
+    ("GAS_ACCESSLISTADDRESS", 2_400),
+    ("GAS_ACCESSLISTSTORAGE", 1_900),
+    ("GAS_COLDACCOUNTACCESS", 2_600),
+    ("GAS_COLDSLOAD", 2_100),
+    ("GAS_SSET", 20_000),
+    ("GAS_SRESET", 2_900),
+    ("REFUND_SCLEAR", 15_000),
+    ("REFUND_SELFDESTRUCT", 24_000),
+    ("GAS_SELFDESTRUCT", 5_000),
+    ("GAS_CREATE", 32_000),
+    ("GAS_CODEDEPOSIT", 200),
+    ("GAS_CALLVALUE", 9_000),
+    ("GAS_CALLSTIPEND", 2_300),
+    ("GAS_NEWACCOUNT", 25_000),
+    ("GAS_EXP", 10),
+    ("GAS_EXPBYTE", 50),
+    ("GAS_MEMORY", 3),
+    ("GAS_TXCREATE", 32_000),
+    ("GAS_TXDATAZERO", 4),
+    ("GAS_TXDATANONZERO", 16),
+    ("GAS_TRANSACTION", 21_000),
+    ("GAS_LOG", 375),
+    ("GAS_LOGDATA", 8),
+    ("GAS_LOGTOPIC", 375),
+    ("GAS_KECCAK256", 30),
+    ("GAS_KECCAK256WORD", 6),
+    ("GAS_COPY", 3),
+    ("GAS_BLOCKHASH", 20),
+];
diff --git a/evm/src/cpu/kernel/context_metadata.rs b/evm/src/cpu/kernel/context_metadata.rs
new file mode 100644
index 00000000..5b6ce303
--- /dev/null
+++ b/evm/src/cpu/kernel/context_metadata.rs
@@ -0,0 +1,51 @@
+/// These metadata fields contain VM state specific to a particular context.
+#[derive(Copy, Clone, Eq, PartialEq, Hash, Ord, PartialOrd, Debug)]
+pub(crate) enum ContextMetadata {
+    /// The ID of the context which created this one.
+    ParentContext = 0,
+    /// The program counter to return to when we return to the parent context.
+    ParentProgramCounter = 1,
+    CalldataSize = 2,
+    ReturndataSize = 3,
+    /// The address of the account associated with this context.
+    Address = 4,
+    /// The size of the code under the account associated with this context.
+    /// While this information could be obtained from the state trie, it is best to cache it since
+    /// the `CODESIZE` instruction is very cheap.
+    CodeSize = 5,
+    /// The address of the caller who spawned this context.
+    Caller = 6,
+    /// The value (in wei) deposited by the caller.
+    CallValue = 7,
+}
+
+impl ContextMetadata {
+    pub(crate) const COUNT: usize = 8;
+
+    pub(crate) fn all() -> [Self; Self::COUNT] {
+        [
+            Self::ParentContext,
+            Self::ParentProgramCounter,
+            Self::CalldataSize,
+            Self::ReturndataSize,
+            Self::Address,
+            Self::CodeSize,
+            Self::Caller,
+            Self::CallValue,
+        ]
+    }
+
+    /// The name under which this field is exposed to kernel assembly code.
+    pub(crate) fn var_name(&self) -> &'static str {
+        match self {
+            Self::ParentContext => "CTX_METADATA_PARENT_CONTEXT",
+            Self::ParentProgramCounter => "CTX_METADATA_PARENT_PC",
+            Self::CalldataSize => "CTX_METADATA_CALLDATA_SIZE",
+            Self::ReturndataSize => "CTX_METADATA_RETURNDATA_SIZE",
+            Self::Address => "CTX_METADATA_ADDRESS",
+            Self::CodeSize => "CTX_METADATA_CODE_SIZE",
+            Self::Caller => "CTX_METADATA_CALLER",
+            Self::CallValue => "CTX_METADATA_CALL_VALUE",
+        }
+    }
+}
diff --git a/evm/src/cpu/kernel/cost_estimator.rs b/evm/src/cpu/kernel/cost_estimator.rs
new file mode 100644
index 00000000..3dfcf63a
--- /dev/null
+++ b/evm/src/cpu/kernel/cost_estimator.rs
@@ -0,0 +1,37 @@
+use crate::cpu::kernel::assembler::BYTES_PER_OFFSET;
+use crate::cpu::kernel::ast::Item;
+use crate::cpu::kernel::ast::Item::*;
+use crate::cpu::kernel::ast::PushTarget::*;
+use crate::cpu::kernel::utils::u256_to_trimmed_be_bytes;
+
+pub(crate) fn is_code_improved(before: &[Item], after: &[Item]) -> bool {
+    cost_estimate(after) < cost_estimate(before)
+}
+
+fn cost_estimate(code: &[Item]) -> u32 {
+    code.iter().map(cost_estimate_item).sum()
+}
+
+/// Cost of one item: declarations count as zero, pushes by operand size, other ops as one unit.
+/// Panics on any item kind not listed here.
+fn cost_estimate_item(item: &Item) -> u32 {
+    match item {
+        MacroDef(..) | GlobalLabelDeclaration(_) | LocalLabelDeclaration(_) => 0,
+        Push(Literal(n)) => cost_estimate_push(u256_to_trimmed_be_bytes(n).len()),
+        Push(Label(_)) => cost_estimate_push(BYTES_PER_OFFSET as usize),
+        ProverInput(_) => 1,
+        StandardOp(op) => cost_estimate_standard_op(op.as_str()),
+        _ => panic!("Unexpected item: {:?}", item),
+    }
+}
+
+fn cost_estimate_standard_op(_op: &str) -> u32 {
+    // For now we just treat any standard operation as having the same cost. This is pretty naive,
+    // but should work fine with our current set of simple optimization rules.
+    1
+}
+
+fn cost_estimate_push(num_bytes: usize) -> u32 {
+    // TODO: Once PUSH is actually implemented, check if this needs to be revised.
+    num_bytes as u32
+}
diff --git a/evm/src/cpu/kernel/evm_asm.pest b/evm/src/cpu/kernel/evm_asm.pest
index 78938b64..8ea7de4b 100644
--- a/evm/src/cpu/kernel/evm_asm.pest
+++ b/evm/src/cpu/kernel/evm_asm.pest
@@ -15,7 +15,7 @@ literal = { literal_hex | literal_decimal }
 variable = ${ "$" ~ identifier }
 constant = ${ "@" ~ identifier }
 
-item = { macro_def | macro_call | repeat | stack | global_label | local_label | bytes_item | push_instruction | nullary_instruction }
+item = { macro_def | macro_call | repeat | stack | global_label_decl | local_label_decl | macro_label_decl | bytes_item | push_instruction | prover_input_instruction | nullary_instruction }
 macro_def = { ^"%macro" ~ identifier ~ paramlist? ~ item* ~ ^"%endmacro" }
 macro_call = ${ "%" ~ !(^"macro" | ^"endmacro" | ^"rep" | ^"endrep" | ^"stack") ~ identifier ~ macro_arglist? }
 repeat = { ^"%rep" ~ literal ~ item* ~ ^"%endrep" }
@@ -23,12 +23,16 @@ paramlist = { "(" ~ identifier ~ ("," ~ identifier)* ~ ")" }
 macro_arglist = !{ "(" ~ push_target ~ ("," ~ push_target)* ~ ")" }
 stack = { ^"%stack" ~ paramlist ~ "->" ~ stack_replacements }
 stack_replacements = { "(" ~ stack_replacement ~ ("," ~ stack_replacement)* ~ ")" }
-stack_replacement = { literal | identifier | constant }
-global_label = { ^"GLOBAL " ~ identifier ~ ":" }
-local_label = { identifier ~ ":" }
+stack_replacement = { literal | identifier | constant | macro_label | variable }
+global_label_decl = ${ ^"GLOBAL " ~ identifier ~ ":" }
+local_label_decl = ${ identifier ~ ":" }
+macro_label_decl = ${ "%%" ~ identifier ~ ":" }
+macro_label = ${ "%%" ~ identifier }
 bytes_item = { ^"BYTES " ~ literal ~ ("," ~ literal)* }
 push_instruction = { ^"PUSH " ~ push_target }
-push_target = { literal | identifier | variable | constant }
+push_target = { literal | identifier | macro_label | variable | constant }
+prover_input_instruction = { ^"PROVER_INPUT" ~ "(" ~ prover_input_fn ~ ")" }
+prover_input_fn = { identifier ~ ("::" ~ identifier)*}
 nullary_instruction = { identifier }
 
 file = { SOI ~ item* ~ silent_eoi }
diff --git a/evm/src/cpu/kernel/global_metadata.rs b/evm/src/cpu/kernel/global_metadata.rs
new file mode 100644
index 00000000..6343a2e6
--- /dev/null
+++ b/evm/src/cpu/kernel/global_metadata.rs
@@ -0,0 +1,29 @@
+/// These metadata fields contain global VM state, stored in the `Segment::Metadata` segment of the
+/// kernel's context (which is zero).
+#[derive(Copy, Clone, Eq, PartialEq, Hash, Ord, PartialOrd, Debug)]
+pub(crate) enum GlobalMetadata {
+    /// The largest context ID that has been used so far in this execution. Tracking this allows us
+    /// to give each new context a unique ID, so that its memory will be zero-initialized.
+    LargestContext = 0,
+    /// The address of the sender of the transaction.
+    Origin = 1,
+    /// The size of active memory, in bytes.
+    MemorySize = 2,
+}
+
+impl GlobalMetadata {
+    pub(crate) const COUNT: usize = 3;
+
+    pub(crate) fn all() -> [Self; Self::COUNT] {
+        [Self::LargestContext, Self::Origin, Self::MemorySize]
+    }
+
+    /// The name under which this field is exposed to kernel assembly code.
+    pub(crate) fn var_name(&self) -> &'static str {
+        match self {
+            Self::LargestContext => "GLOBAL_METADATA_LARGEST_CONTEXT",
+            Self::Origin => "GLOBAL_METADATA_ORIGIN",
+            Self::MemorySize => "GLOBAL_METADATA_MEMORY_SIZE",
+        }
+    }
+}
diff --git a/evm/src/cpu/kernel/interpreter.rs b/evm/src/cpu/kernel/interpreter.rs
index 016e3c44..6a5b794f 100644
--- a/evm/src/cpu/kernel/interpreter.rs
+++ b/evm/src/cpu/kernel/interpreter.rs
@@ -1,16 +1,22 @@
+use std::collections::HashMap;
+
 use anyhow::{anyhow, bail};
 use ethereum_types::{BigEndianHash, U256, U512};
 use keccak_hash::keccak;
 
-use crate::generation::memory::MemoryContextState;
+use crate::cpu::kernel::aggregator::KERNEL;
+use crate::cpu::kernel::assembler::Kernel;
+use crate::cpu::kernel::prover_input::ProverInputFn;
+use crate::cpu::kernel::txn_fields::NormalizedTxnField;
+use crate::generation::memory::{MemoryContextState, MemorySegmentState};
 use crate::memory::segments::Segment;
 
 /// Halt interpreter execution whenever a jump to this offset is done.
-const HALT_OFFSET: usize = 0xdeadbeef;
+const DEFAULT_HALT_OFFSET: usize = 0xdeadbeef;
 
 #[derive(Debug)]
 pub(crate) struct InterpreterMemory {
-    context_memory: Vec<MemoryContextState>,
+    pub(crate) context_memory: Vec<MemoryContextState>,
 }
 
 impl Default for InterpreterMemory {
@@ -21,6 +27,18 @@ impl Default for InterpreterMemory {
     }
 }
 
+impl InterpreterMemory {
+    fn with_code_and_stack(code: &[u8], stack: Vec<U256>) -> Self {
+        let mut mem = Self::default();
+        for (i, b) in code.iter().copied().enumerate() {
+            mem.context_memory[0].segments[Segment::Code as usize].set(i, b.into());
+        }
+        mem.context_memory[0].segments[Segment::Stack as usize].content = stack;
+
+        mem
+    }
+}
+
 impl InterpreterMemory {
     fn mload_general(&self, context: usize, segment: Segment, offset: usize) -> U256 {
         self.context_memory[context].segments[segment as usize].get(offset)
@@ -31,78 +49,129 @@ impl InterpreterMemory {
     }
 }
 
-// TODO: Remove `code` and `stack` fields as they are contained in `memory`.
-pub(crate) struct Interpreter<'a> {
-    code: &'a [u8],
+pub struct Interpreter<'a> {
+    kernel_mode: bool,
     jumpdests: Vec<usize>,
     offset: usize,
-    pub(crate) stack: Vec<U256>,
     context: usize,
-    memory: InterpreterMemory,
-    /// Non-deterministic prover inputs, stored backwards so that popping the last item gives the
-    /// next prover input.
+    pub(crate) memory: InterpreterMemory,
+    prover_inputs_map: &'a HashMap<usize, ProverInputFn>,
     prover_inputs: Vec<U256>,
+    pub(crate) halt_offsets: Vec<usize>,
     running: bool,
 }
 
-pub(crate) fn run(
-    code: &[u8],
+pub fn run_with_kernel(
+    // TODO: Remove param and just use KERNEL.
+    kernel: &Kernel,
     initial_offset: usize,
     initial_stack: Vec<U256>,
 ) -> anyhow::Result<Interpreter> {
-    run_with_input(code, initial_offset, initial_stack, vec![])
+    run(
+        &kernel.code,
+        initial_offset,
+        initial_stack,
+        &kernel.prover_inputs,
+    )
 }
 
-pub(crate) fn run_with_input(
-    code: &[u8],
+pub fn run<'a>(
+    code: &'a [u8],
     initial_offset: usize,
     initial_stack: Vec<U256>,
-    mut prover_inputs: Vec<U256>,
-) -> anyhow::Result<Interpreter> {
-    // Prover inputs are stored backwards, so that popping the last item gives the next input.
-    prover_inputs.reverse();
-
-    let mut interpreter = Interpreter {
-        code,
-        jumpdests: find_jumpdests(code),
-        offset: initial_offset,
-        stack: initial_stack,
-        context: 0,
-        memory: InterpreterMemory::default(),
-        prover_inputs,
-        running: true,
-    };
-
-    while interpreter.running {
-        interpreter.run_opcode()?;
-    }
-
+    prover_inputs: &'a HashMap<usize, ProverInputFn>,
+) -> anyhow::Result<Interpreter<'a>> {
+    let mut interpreter = Interpreter::new(code, initial_offset, initial_stack, prover_inputs);
+    interpreter.run()?;
     Ok(interpreter)
 }
 
 impl<'a> Interpreter<'a> {
-    fn slice(&self, n: usize) -> &[u8] {
-        &self.code[self.offset..self.offset + n]
+    pub(crate) fn new_with_kernel(initial_offset: usize, initial_stack: Vec<U256>) -> Self {
+        Self::new(
+            &KERNEL.code,
+            initial_offset,
+            initial_stack,
+            &KERNEL.prover_inputs,
+        )
+    }
+
+    pub(crate) fn new(
+        code: &'a [u8],
+        initial_offset: usize,
+        initial_stack: Vec<U256>,
+        prover_inputs: &'a HashMap<usize, ProverInputFn>,
+    ) -> Self {
+        Self {
+            kernel_mode: true,
+            jumpdests: find_jumpdests(code),
+            offset: initial_offset,
+            memory: InterpreterMemory::with_code_and_stack(code, initial_stack),
+            prover_inputs_map: prover_inputs,
+            prover_inputs: Vec::new(),
+            context: 0,
+            halt_offsets: vec![DEFAULT_HALT_OFFSET],
+            running: true,
+        }
+    }
+
+    pub(crate) fn run(&mut self) -> anyhow::Result<()> {
+        while self.running {
+            self.run_opcode()?;
+        }
+        Ok(())
+    }
+
+    fn code(&self) -> &MemorySegmentState {
+        &self.memory.context_memory[self.context].segments[Segment::Code as usize]
+    }
+
+    fn code_slice(&self, n: usize) -> Vec<u8> {
+        self.code().content[self.offset..self.offset + n]
+            .iter()
+            .map(|u256| u256.byte(0))
+            .collect::<Vec<_>>()
+    }
+
+    pub(crate) fn get_txn_field(&self, field: NormalizedTxnField) -> U256 {
+        self.memory.context_memory[0].segments[Segment::TxnFields as usize].content[field as usize]
+    }
+
+    pub(crate) fn get_txn_data(&self) -> &[U256] {
+        &self.memory.context_memory[0].segments[Segment::TxnData as usize].content
+    }
+
+    pub(crate) fn set_rlp_memory(&mut self, rlp: Vec<u8>) {
+        self.memory.context_memory[0].segments[Segment::RlpRaw as usize].content =
+            rlp.into_iter().map(U256::from).collect();
     }
 
     fn incr(&mut self, n: usize) {
         self.offset += n;
     }
 
+    pub(crate) fn stack(&self) -> &[U256] {
+        &self.memory.context_memory[self.context].segments[Segment::Stack as usize].content
+    }
+
+    fn stack_mut(&mut self) -> &mut Vec<U256> {
+        &mut self.memory.context_memory[self.context].segments[Segment::Stack as usize].content
+    }
+
     fn push(&mut self, x: U256) {
-        self.stack.push(x);
+        self.stack_mut().push(x);
     }
 
     fn push_bool(&mut self, x: bool) {
-        self.stack.push(if x { U256::one() } else { U256::zero() });
+        self.push(if x { U256::one() } else { U256::zero() });
     }
 
     fn pop(&mut self) -> U256 {
-        self.stack.pop().expect("Pop on empty stack.")
+        self.stack_mut().pop().expect("Pop on empty stack.")
     }
 
     fn run_opcode(&mut self) -> anyhow::Result<()> {
-        let opcode = self.code.get(self.offset).copied().unwrap_or_default();
+        let opcode = self.code().get(self.offset).byte(0);
         self.incr(1);
         match opcode {
             0x00 => self.run_stop(),                                   // "STOP",
@@ -128,7 +197,7 @@ impl<'a> Interpreter<'a> {
             0x18 => self.run_xor(),                                    // "XOR",
             0x19 => self.run_not(),                                    // "NOT",
             0x1a => todo!(),                                           // "BYTE",
-            0x1b => todo!(),                                           // "SHL",
+            0x1b => self.run_shl(),                                    // "SHL",
             0x1c => todo!(),                                           // "SHR",
             0x1d => todo!(),                                           // "SAR",
             0x20 => self.run_keccak256(),                              // "KECCAK256",
@@ -311,6 +380,12 @@ impl<'a> Interpreter<'a> {
         self.push(!x);
     }
 
+    /// SHL: pops shift, then value. Shifts >= 256 give 0 (`U256 << U256` panics past `usize`).
+    fn run_shl(&mut self) {
+        let (shift, x) = (self.pop(), self.pop());
+        self.push(if shift < 256.into() { x << shift } else { U256::zero() });
+    }
+
     fn run_keccak256(&mut self) {
         let offset = self.pop().as_usize();
         let size = self.pop().as_usize();
@@ -326,11 +401,13 @@ impl<'a> Interpreter<'a> {
     }
 
     fn run_prover_input(&mut self) -> anyhow::Result<()> {
-        let input = self
-            .prover_inputs
-            .pop()
-            .ok_or_else(|| anyhow!("Out of prover inputs"))?;
-        self.stack.push(input);
+        let prover_input_fn = self
+            .prover_inputs_map
+            .get(&(self.offset - 1))
+            .ok_or_else(|| anyhow!("Offset not in prover inputs."))?;
+        let output = prover_input_fn.run(self.stack());
+        self.push(output);
+        self.prover_inputs.push(output);
         Ok(())
     }
 
@@ -376,40 +453,43 @@ impl<'a> Interpreter<'a> {
 
     fn run_jump(&mut self) {
         let x = self.pop().as_usize();
-        self.offset = x;
-        if self.offset == HALT_OFFSET {
-            self.running = false;
-        } else if self.jumpdests.binary_search(&self.offset).is_err() {
-            panic!("Destination is not a JUMPDEST.");
-        }
+        self.jump_to(x);
     }
 
     fn run_jumpi(&mut self) {
         let x = self.pop().as_usize();
         let b = self.pop();
         if !b.is_zero() {
-            self.offset = x;
-            if self.offset == HALT_OFFSET {
-                self.running = false;
-            } else if self.jumpdests.binary_search(&self.offset).is_err() {
-                panic!("Destination is not a JUMPDEST.");
-            }
+            self.jump_to(x);
+        }
+    }
+
+    fn jump_to(&mut self, offset: usize) {
+        // The JUMPDEST rule is not enforced in kernel mode.
+        if !self.kernel_mode && self.jumpdests.binary_search(&offset).is_err() {
+            panic!("Destination is not a JUMPDEST.");
+        }
+
+        self.offset = offset;
+
+        if self.halt_offsets.contains(&offset) {
+            self.running = false;
         }
     }
 
     fn run_push(&mut self, num_bytes: u8) {
-        let x = U256::from_big_endian(self.slice(num_bytes as usize));
+        let x = U256::from_big_endian(&self.code_slice(num_bytes as usize));
         self.incr(num_bytes as usize);
         self.push(x);
     }
 
     fn run_dup(&mut self, n: u8) {
-        self.push(self.stack[self.stack.len() - n as usize]);
+        self.push(self.stack()[self.stack().len() - n as usize]);
     }
 
     fn run_swap(&mut self, n: u8) {
-        let len = self.stack.len();
-        self.stack.swap(len - 1, len - n as usize - 1);
+        let len = self.stack().len();
+        self.stack_mut().swap(len - 1, len - n as usize - 1);
     }
 
     fn run_get_context(&mut self) {
@@ -458,7 +538,9 @@ fn find_jumpdests(code: &[u8]) -> Vec<usize> {
 
 #[cfg(test)]
 mod tests {
-    use crate::cpu::kernel::interpreter::{run, Interpreter};
+    use std::collections::HashMap;
+
+    use crate::cpu::kernel::interpreter::run;
     use crate::memory::segments::Segment;
 
     #[test]
@@ -466,7 +548,10 @@ mod tests {
         let code = vec![
             0x60, 0x1, 0x60, 0x2, 0x1, 0x63, 0xde, 0xad, 0xbe, 0xef, 0x56,
         ]; // PUSH1, 1, PUSH1, 2, ADD, PUSH4 deadbeef, JUMP
-        assert_eq!(run(&code, 0, vec![])?.stack, vec![0x3.into()]);
+        assert_eq!(
+            run(&code, 0, vec![], &HashMap::new())?.stack(),
+            &[0x3.into()],
+        );
         Ok(())
     }
 
@@ -489,15 +574,15 @@ mod tests {
             0x60, 0xff, 0x60, 0x0, 0x52, 0x60, 0, 0x51, 0x60, 0x1, 0x51, 0x60, 0x42, 0x60, 0x27,
             0x53,
         ];
-        let run = run(&code, 0, vec![])?;
-        let Interpreter { stack, memory, .. } = run;
-        assert_eq!(stack, vec![0xff.into(), 0xff00.into()]);
+        let pis = HashMap::new();
+        let run = run(&code, 0, vec![], &pis)?;
+        assert_eq!(run.stack(), &[0xff.into(), 0xff00.into()]);
         assert_eq!(
-            memory.context_memory[0].segments[Segment::MainMemory as usize].get(0x27),
+            run.memory.context_memory[0].segments[Segment::MainMemory as usize].get(0x27),
             0x42.into()
         );
         assert_eq!(
-            memory.context_memory[0].segments[Segment::MainMemory as usize].get(0x1f),
+            run.memory.context_memory[0].segments[Segment::MainMemory as usize].get(0x1f),
             0xff.into()
         );
         Ok(())
diff --git a/evm/src/cpu/kernel/mod.rs b/evm/src/cpu/kernel/mod.rs
index 1d545260..4879ad76 100644
--- a/evm/src/cpu/kernel/mod.rs
+++ b/evm/src/cpu/kernel/mod.rs
@@ -1,11 +1,18 @@
 pub mod aggregator;
 pub mod assembler;
 mod ast;
+mod constants;
+mod context_metadata;
+mod cost_estimator;
+mod global_metadata;
 pub(crate) mod keccak_util;
 mod opcodes;
+mod optimizer;
 mod parser;
+pub mod prover_input;
 mod stack_manipulation;
 mod txn_fields;
+mod utils;
 
 #[cfg(test)]
 mod interpreter;
@@ -15,12 +22,12 @@ mod tests;
 use assembler::assemble;
 use parser::parse;
 
-use crate::cpu::kernel::aggregator::evm_constants;
+use crate::cpu::kernel::constants::evm_constants;
 
 /// Assemble files, outputting bytes.
 /// This is for debugging the kernel only.
 pub fn assemble_to_bytes(files: &[String]) -> Vec<u8> {
     let parsed_files: Vec<_> = files.iter().map(|f| parse(f)).collect();
-    let kernel = assemble(parsed_files, evm_constants());
+    let kernel = assemble(parsed_files, evm_constants(), true);
     kernel.code
 }
diff --git a/evm/src/cpu/kernel/optimizer.rs b/evm/src/cpu/kernel/optimizer.rs
new file mode 100644
index 00000000..2a1db6d3
--- /dev/null
+++ b/evm/src/cpu/kernel/optimizer.rs
@@ -0,0 +1,260 @@
+use ethereum_types::U256;
+use Item::{Push, StandardOp};
+use PushTarget::Literal;
+
+use crate::cpu::kernel::ast::Item::{GlobalLabelDeclaration, LocalLabelDeclaration};
+use crate::cpu::kernel::ast::PushTarget::Label;
+use crate::cpu::kernel::ast::{Item, PushTarget};
+use crate::cpu::kernel::cost_estimator::is_code_improved;
+use crate::cpu::kernel::utils::{replace_windows, u256_from_bool};
+
+pub(crate) fn optimize_asm(code: &mut Vec<Item>) {
+    // Run the optimizer until nothing changes.
+    loop {
+        let old_code = code.clone();
+        optimize_asm_once(code);
+        if code == &old_code {
+            break;
+        }
+    }
+}
+
+/// A single optimization pass.
+fn optimize_asm_once(code: &mut Vec<Item>) {
+    constant_propagation(code);
+    no_op_jumps(code);
+    remove_swapped_pushes(code);
+    remove_swaps_commutative(code);
+    remove_ignored_values(code);
+}
+
+/// Constant propagation: folds literal pushes followed by a pure op into a single literal push.
+fn constant_propagation(code: &mut Vec<Item>) {
+    // Constant propagation for unary ops: `[PUSH x, UNARYOP] -> [PUSH UNARYOP(x)]`
+    replace_windows_if_better(code, |window| {
+        if let [Push(Literal(x)), StandardOp(op)] = window {
+            match op.as_str() {
+                "ISZERO" => Some(vec![Push(Literal(u256_from_bool(x.is_zero())))]),
+                "NOT" => Some(vec![Push(Literal(!x))]),
+                _ => None,
+            }
+        } else {
+            None
+        }
+    });
+
+    // Binary ops: `[PUSH y, PUSH x, BINOP] -> [PUSH BINOP(x, y)]`, where x is the top of the stack.
+    replace_windows_if_better(code, |window| {
+        if let [Push(Literal(y)), Push(Literal(x)), StandardOp(op)] = window {
+            match op.as_str() {
+                "ADD" => Some(x.overflowing_add(y).0),
+                "SUB" => Some(x.overflowing_sub(y).0),
+                "MUL" => Some(x.overflowing_mul(y).0),
+                "DIV" => Some(x.checked_div(y).unwrap_or(U256::zero())),
+                "MOD" => Some(x.checked_rem(y).unwrap_or(U256::zero())),
+                "EXP" => Some(x.overflowing_pow(y).0),
+                // Shifts: x (top of stack) is the shift amount and y the value being shifted.
+                // `U256 << U256` panics if the shift overflows `usize`, so bound it explicitly.
+                "SHL" => Some(if x < 256.into() { y << x } else { U256::zero() }),
+                "SHR" => Some(if x < 256.into() { y >> x } else { U256::zero() }),
+                "AND" => Some(x & y),
+                "OR" => Some(x | y),
+                "XOR" => Some(x ^ y),
+                "LT" => Some(u256_from_bool(x < y)),
+                "GT" => Some(u256_from_bool(x > y)),
+                "EQ" => Some(u256_from_bool(x == y)),
+                // BYTE counts from the most significant byte; `U256::byte` from the least.
+                "BYTE" if x < 32.into() => Some(y.byte(31 - x.as_usize()).into()),
+                "BYTE" => Some(U256::zero()),
+                _ => None,
+            }
+            .map(|res| vec![Push(Literal(res))])
+        } else {
+            None
+        }
+    });
+}
+
+/// Remove no-op jumps: `[PUSH label, JUMP, label:] -> [label:]`, keeping the label's scope.
+fn no_op_jumps(code: &mut Vec<Item>) {
+    replace_windows(code, |window| {
+        if let [Push(Label(l)), StandardOp(jump), decl] = window
+            && &jump == "JUMP"
+            && (decl == LocalLabelDeclaration(l.clone()) || decl == GlobalLabelDeclaration(l.clone()))
+        {
+            Some(vec![decl])
+        } else {
+            None
+        }
+    });
+}
+
+/// Remove swaps: `[PUSH x, PUSH y, SWAP1] -> [PUSH y, PUSH x]`.
+// Could be generalized to recognize more than two pushes.
+fn remove_swapped_pushes(code: &mut Vec<Item>) {
+    replace_windows(code, |window| {
+        if let [Push(x), Push(y), StandardOp(swap1)] = window
+            && &swap1 == "SWAP1" {
+            Some(vec![Push(y), Push(x)])
+        } else {
+            None
+        }
+    });
+}
+
+/// Remove SWAP1 before a commutative function.
+fn remove_swaps_commutative(code: &mut Vec<Item>) {
+    replace_windows(code, |window| {
+        if let [StandardOp(swap1), StandardOp(f)] = window && &swap1 == "SWAP1" {
+            let commutative = matches!(f.as_str(), "ADD" | "MUL" | "AND" | "OR" | "XOR" | "EQ");
+            commutative.then_some(vec![StandardOp(f)])
+        } else {
+            None
+        }
+    });
+}
+
+/// Remove push-pop type patterns, such as: `[DUP1, POP]`.
+// Could be extended to other non-side-effecting operations, e.g. [DUP1, ADD, POP] -> [POP].
+fn remove_ignored_values(code: &mut Vec<Item>) {
+    replace_windows(code, |[a, b]| {
+        if let StandardOp(pop) = b && &pop == "POP" {
+            match a {
+                Push(_) => Some(vec![]),
+                StandardOp(dup) if dup.starts_with("DUP") => Some(vec![]),
+                _ => None,
+            }
+        } else {
+            None
+        }
+    });
+}
+
+/// Like `replace_windows`, but specifically for code, and only makes replacements if our cost
+/// estimator thinks that the new code is more efficient.
+fn replace_windows_if_better<const W: usize, F>(code: &mut Vec<Item>, maybe_replace: F)
+where
+    F: Fn([Item; W]) -> Option<Vec<Item>>,
+{
+    replace_windows(code, |window| {
+        maybe_replace(window.clone()).filter(|suggestion| is_code_improved(&window, suggestion))
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_constant_propagation_iszero() {
+        let mut code = vec![Push(Literal(3.into())), StandardOp("ISZERO".into())];
+        constant_propagation(&mut code);
+        assert_eq!(code, vec![Push(Literal(0.into()))]);
+    }
+
+    #[test]
+    fn test_constant_propagation_add_overflowing() {
+        let mut code = vec![
+            Push(Literal(U256::max_value())),
+            Push(Literal(U256::max_value())),
+            StandardOp("ADD".into()),
+        ];
+        constant_propagation(&mut code);
+        assert_eq!(code, vec![Push(Literal(U256::max_value() - 1))]);
+    }
+
+    #[test]
+    fn test_constant_propagation_sub_underflowing() {
+        let original = vec![
+            Push(Literal(U256::one())),
+            Push(Literal(U256::zero())),
+            StandardOp("SUB".into()),
+        ];
+        let mut code = original.clone();
+        constant_propagation(&mut code);
+        // Constant propagation could replace the code with [PUSH U256::MAX], but that's actually
+        // more expensive, so the code shouldn't be changed.
+        // (The code could also be replaced with [PUSH 0; NOT], which would be an improvement, but
+        // our optimizer isn't smart enough yet.)
+        assert_eq!(code, original);
+    }
+
+    #[test]
+    fn test_constant_propagation_mul() {
+        let mut code = vec![
+            Push(Literal(3.into())),
+            Push(Literal(4.into())),
+            StandardOp("MUL".into()),
+        ];
+        constant_propagation(&mut code);
+        assert_eq!(code, vec![Push(Literal(12.into()))]);
+    }
+
+    #[test]
+    fn test_constant_propagation_div() {
+        let mut code = vec![
+            Push(Literal(3.into())),
+            Push(Literal(8.into())),
+            StandardOp("DIV".into()),
+        ];
+        constant_propagation(&mut code);
+        assert_eq!(code, vec![Push(Literal(2.into()))]);
+    }
+
+    #[test]
+    fn test_constant_propagation_div_zero() {
+        let mut code = vec![
+            Push(Literal(0.into())),
+            Push(Literal(1.into())),
+            StandardOp("DIV".into()),
+        ];
+        constant_propagation(&mut code);
+        assert_eq!(code, vec![Push(Literal(0.into()))]);
+    }
+
+    #[test]
+    fn test_no_op_jump() {
+        let mut code = vec![
+            Push(Label("mylabel".into())),
+            StandardOp("JUMP".into()),
+            LocalLabelDeclaration("mylabel".into()),
+        ];
+        no_op_jumps(&mut code);
+        assert_eq!(code, vec![LocalLabelDeclaration("mylabel".into())]);
+    }
+
+    #[test]
+    fn test_remove_swapped_pushes() {
+        let mut code = vec![
+            Push(Literal("42".into())),
+            Push(Label("mylabel".into())),
+            StandardOp("SWAP1".into()),
+        ];
+        remove_swapped_pushes(&mut code);
+        assert_eq!(
+            code,
+            vec![Push(Label("mylabel".into())), Push(Literal("42".into()))]
+        );
+    }
+
+    #[test]
+    fn test_remove_swap_mul() {
+        let mut code = vec![StandardOp("SWAP1".into()), StandardOp("MUL".into())];
+        remove_swaps_commutative(&mut code);
+        assert_eq!(code, vec![StandardOp("MUL".into())]);
+    }
+
+    #[test]
+    fn test_remove_push_pop() {
+        let mut code = vec![Push(Literal("42".into())), StandardOp("POP".into())];
+        remove_ignored_values(&mut code);
+        assert_eq!(code, vec![]);
+    }
+
+    #[test]
+    fn test_remove_dup_pop() {
+        let mut code = vec![StandardOp("DUP5".into()), StandardOp("POP".into())];
+        remove_ignored_values(&mut code);
+        assert_eq!(code, vec![]);
+    }
+}
diff --git a/evm/src/cpu/kernel/parser.rs b/evm/src/cpu/kernel/parser.rs
index aa84ee05..9ed578d4 100644
--- a/evm/src/cpu/kernel/parser.rs
+++ b/evm/src/cpu/kernel/parser.rs
@@ -1,7 +1,10 @@
+use std::str::FromStr;
+
+use ethereum_types::U256;
 use pest::iterators::Pair;
 use pest::Parser;
 
-use crate::cpu::kernel::ast::{File, Item, Literal, PushTarget, StackReplacement};
+use crate::cpu::kernel::ast::{File, Item, PushTarget, StackReplacement};
 
 /// Parses EVM assembly code.
 #[derive(pest_derive::Parser)]
@@ -25,15 +28,27 @@ fn parse_item(item: Pair<Rule>) -> Item {
         Rule::macro_call => parse_macro_call(item),
         Rule::repeat => parse_repeat(item),
         Rule::stack => parse_stack(item),
-        Rule::global_label => {
+        Rule::global_label_decl => {
             Item::GlobalLabelDeclaration(item.into_inner().next().unwrap().as_str().into())
         }
-        Rule::local_label => {
+        Rule::local_label_decl => {
             Item::LocalLabelDeclaration(item.into_inner().next().unwrap().as_str().into())
         }
-        Rule::bytes_item => Item::Bytes(item.into_inner().map(parse_literal).collect()),
+        Rule::macro_label_decl => {
+            Item::MacroLabelDeclaration(item.into_inner().next().unwrap().as_str().into())
+        }
+        Rule::bytes_item => Item::Bytes(item.into_inner().map(parse_literal_u8).collect()),
         Rule::push_instruction => Item::Push(parse_push_target(item.into_inner().next().unwrap())),
-        Rule::nullary_instruction => Item::StandardOp(item.as_str().into()),
+        Rule::prover_input_instruction => Item::ProverInput(
+            item.into_inner()
+                .next()
+                .unwrap()
+                .into_inner()
+                .map(|x| x.as_str().into())
+                .collect::<Vec<_>>()
+                .into(),
+        ),
+        Rule::nullary_instruction => Item::StandardOp(item.as_str().to_uppercase()),
         _ => panic!("Unexpected {:?}", item.as_rule()),
     }
 }
@@ -75,7 +90,7 @@ fn parse_macro_call(item: Pair<Rule>) -> Item {
 fn parse_repeat(item: Pair<Rule>) -> Item {
     assert_eq!(item.as_rule(), Rule::repeat);
     let mut inner = item.into_inner().peekable();
-    let count = parse_literal(inner.next().unwrap());
+    let count = parse_literal_u256(inner.next().unwrap());
     Item::Repeat(count, inner.map(parse_item).collect())
 }
 
@@ -103,8 +118,11 @@ fn parse_stack_replacement(target: Pair<Rule>) -> StackReplacement {
     assert_eq!(target.as_rule(), Rule::stack_replacement);
     let inner = target.into_inner().next().unwrap();
     match inner.as_rule() {
-        Rule::identifier => StackReplacement::NamedItem(inner.as_str().into()),
-        Rule::literal => StackReplacement::Literal(parse_literal(inner)),
+        Rule::identifier => StackReplacement::Identifier(inner.as_str().into()),
+        Rule::literal => StackReplacement::Literal(parse_literal_u256(inner)),
+        Rule::macro_label => {
+            StackReplacement::MacroLabel(inner.into_inner().next().unwrap().as_str().into())
+        }
         Rule::variable => {
             StackReplacement::MacroVar(inner.into_inner().next().unwrap().as_str().into())
         }
@@ -119,19 +137,39 @@ fn parse_push_target(target: Pair<Rule>) -> PushTarget {
     assert_eq!(target.as_rule(), Rule::push_target);
     let inner = target.into_inner().next().unwrap();
     match inner.as_rule() {
-        Rule::literal => PushTarget::Literal(parse_literal(inner)),
+        Rule::literal => PushTarget::Literal(parse_literal_u256(inner)),
         Rule::identifier => PushTarget::Label(inner.as_str().into()),
+        Rule::macro_label => {
+            PushTarget::MacroLabel(inner.into_inner().next().unwrap().as_str().into())
+        }
         Rule::variable => PushTarget::MacroVar(inner.into_inner().next().unwrap().as_str().into()),
         Rule::constant => PushTarget::Constant(inner.into_inner().next().unwrap().as_str().into()),
         _ => panic!("Unexpected {:?}", inner.as_rule()),
     }
 }
 
-fn parse_literal(literal: Pair<Rule>) -> Literal {
+fn parse_literal_u8(literal: Pair<Rule>) -> u8 {
     let literal = literal.into_inner().next().unwrap();
     match literal.as_rule() {
-        Rule::literal_decimal => Literal::Decimal(literal.as_str().into()),
-        Rule::literal_hex => Literal::Hex(parse_hex(literal)),
+        Rule::literal_decimal => {
+            u8::from_str(literal.as_str()).expect("Failed to parse literal decimal byte")
+        }
+        Rule::literal_hex => {
+            u8::from_str_radix(&parse_hex(literal), 16).expect("Failed to parse literal hex byte")
+        }
+        _ => panic!("Unexpected {:?}", literal.as_rule()),
+    }
+}
+
+fn parse_literal_u256(literal: Pair<Rule>) -> U256 {
+    let literal = literal.into_inner().next().unwrap();
+    match literal.as_rule() {
+        Rule::literal_decimal => {
+            U256::from_dec_str(literal.as_str()).expect("Failed to parse literal decimal")
+        }
+        Rule::literal_hex => {
+            U256::from_str_radix(&parse_hex(literal), 16).expect("Failed to parse literal hex")
+        }
         _ => panic!("Unexpected {:?}", literal.as_rule()),
     }
 }
diff --git a/evm/src/cpu/kernel/prover_input.rs b/evm/src/cpu/kernel/prover_input.rs
new file mode 100644
index 00000000..38e1914e
--- /dev/null
+++ b/evm/src/cpu/kernel/prover_input.rs
@@ -0,0 +1,183 @@
+//! Prover input functions, i.e. the functions that can be named in a `PROVER_INPUT(...)`
+//! instruction, together with their evaluation on a stack.
+
+use std::str::FromStr;
+
+use ethereum_types::U256;
+
+use crate::cpu::kernel::prover_input::Field::{
+    Bn254Base, Bn254Scalar, Secp256k1Base, Secp256k1Scalar,
+};
+use crate::cpu::kernel::prover_input::FieldOp::{Inverse, Sqrt};
+
+/// Prover input function represented as a scoped function name.
+/// Example: `PROVER_INPUT(ff::bn254_base::inverse)` is represented as `ProverInputFn([ff, bn254_base, inverse])`.
+#[derive(PartialEq, Eq, Debug, Clone)]
+pub struct ProverInputFn(Vec<String>);
+
+impl From<Vec<String>> for ProverInputFn {
+    /// Wraps the components of a scoped name, outermost scope first.
+    fn from(v: Vec<String>) -> Self {
+        Self(v)
+    }
+}
+
+impl ProverInputFn {
+    /// Runs the function on `stack` and returns its result.
+    ///
+    /// The first component of the name selects the namespace: `ff` is handled by `run_ff`
+    /// and `mpt` by `run_mpt`.
+    ///
+    /// # Panics
+    /// Panics if the name is empty or its first component is not a known namespace, and
+    /// whenever the selected namespace handler panics.
+    pub fn run(&self, stack: &[U256]) -> U256 {
+        match self.0.first().map(String::as_str) {
+            Some("ff") => self.run_ff(stack),
+            Some("mpt") => self.run_mpt(stack),
+            _ => panic!("Unrecognized prover input function."),
+        }
+    }
+
+    /// Finite field operations, named `ff::<field>::<op>`.
+    ///
+    /// The operand is the last element of `stack`; name components after `<op>` are ignored.
+    ///
+    /// # Panics
+    /// Panics if the field or the operation is missing or unrecognized, or if `stack` is empty.
+    fn run_ff(&self, stack: &[U256]) -> U256 {
+        let (field, op) = match self.0.as_slice() {
+            [_, field, op, ..] => (field.as_str(), op.as_str()),
+            _ => panic!("Expected a prover input function of the form ff::<field>::<op>."),
+        };
+        let field = Field::from_str(field).unwrap_or_else(|e| panic!("{}", e));
+        let op = FieldOp::from_str(op).unwrap_or_else(|e| panic!("{}", e));
+        let x = *stack.last().expect("Empty stack");
+        field.op(op, x)
+    }
+
+    /// MPT operations. Not implemented yet, so this always panics.
+    fn run_mpt(&self, _stack: &[U256]) -> U256 {
+        todo!()
+    }
+}
+
+/// Fields supported by the `ff` prover input functions.
+#[derive(Debug, Clone, Copy)]
+enum Field {
+    Bn254Base,
+    Bn254Scalar,
+    Secp256k1Base,
+    Secp256k1Scalar,
+}
+
+/// Operations supported by the `ff` prover input functions.
+#[derive(Debug, Clone, Copy)]
+enum FieldOp {
+    Inverse,
+    Sqrt,
+}
+
+impl FromStr for Field {
+    type Err = String;
+
+    /// Parses the field component of a scoped name, e.g. `bn254_base`.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "bn254_base" => Ok(Bn254Base),
+            "bn254_scalar" => Ok(Bn254Scalar),
+            "secp256k1_base" => Ok(Secp256k1Base),
+            "secp256k1_scalar" => Ok(Secp256k1Scalar),
+            _ => Err(format!("Unrecognized field: {}", s)),
+        }
+    }
+}
+
+impl FromStr for FieldOp {
+    type Err = String;
+
+    /// Parses the operation component of a scoped name, e.g. `inverse`.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "inverse" => Ok(Inverse),
+            "sqrt" => Ok(Sqrt),
+            _ => Err(format!("Unrecognized field operation: {}", s)),
+        }
+    }
+}
+
+impl Field {
+    /// Returns the order of the field. Not implemented yet for `Bn254Scalar`.
+    fn order(&self) -> U256 {
+        match self {
+            Field::Bn254Base => {
+                U256::from_str("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
+                    .unwrap()
+            }
+            Field::Bn254Scalar => todo!(),
+            Field::Secp256k1Base => {
+                U256::from_str("0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f")
+                    .unwrap()
+            }
+            Field::Secp256k1Scalar => {
+                U256::from_str("0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141")
+                    .unwrap()
+            }
+        }
+    }
+
+    /// Applies `op` to `x` in this field.
+    fn op(&self, op: FieldOp, x: U256) -> U256 {
+        match op {
+            FieldOp::Inverse => self.inverse(x),
+            FieldOp::Sqrt => self.sqrt(x),
+        }
+    }
+
+    /// Returns `x^(n - 2) mod n`, where `n` is the field order. This is the inverse of `x` for
+    /// nonzero `x` provided `n` is prime (Fermat's little theorem); for `x = 0` the result is 0.
+    ///
+    /// # Panics
+    /// Panics if `x` is not reduced, i.e. `x >= n`.
+    fn inverse(&self, x: U256) -> U256 {
+        let n = self.order();
+        assert!(x < n);
+        modexp(x, n - 2, n)
+    }
+
+    /// Returns `x^((n + 1) / 4) mod n`, where `n` is the field order. This is a square root of
+    /// `x` when `x` is a quadratic residue, which is not checked here.
+    ///
+    /// # Panics
+    /// Panics if `x >= n`, or if `n + 1` is not divisible by 4 (the formula needs that).
+    fn sqrt(&self, x: U256) -> U256 {
+        let n = self.order();
+        assert!(x < n);
+        let (q, r) = (n + 1).div_mod(4.into());
+        assert!(
+            r.is_zero(),
+            "Only naive sqrt implementation for now. If needed implement Tonelli-Shanks."
+        );
+        modexp(x, q, n)
+    }
+}
+
+/// Returns `a * b mod n`. The product is computed at double width, so it cannot overflow.
+fn mulmod(a: U256, b: U256, n: U256) -> U256 {
+    U256::try_from(a.full_mul(b) % n).expect("a value reduced modulo a U256 fits in a U256")
+}
+
+/// Returns `x^e mod n` by square-and-multiply, scanning all 256 bits of `e` from the least
+/// significant one. `n` must be nonzero.
+fn modexp(x: U256, e: U256, n: U256) -> U256 {
+    let mut current = x;
+    let mut product = U256::one();
+
+    for j in 0..256 {
+        if e.bit(j) {
+            product = mulmod(product, current, n);
+        }
+        current = mulmod(current, current, n);
+    }
+    product
+}
diff --git a/evm/src/cpu/kernel/stack_manipulation.rs b/evm/src/cpu/kernel/stack_manipulation.rs
index 63d0566c..a1f02c7e 100644
--- a/evm/src/cpu/kernel/stack_manipulation.rs
+++ b/evm/src/cpu/kernel/stack_manipulation.rs
@@ -5,8 +5,10 @@ use std::collections::{BinaryHeap, HashMap};
 use itertools::Itertools;
 
 use crate::cpu::columns::NUM_CPU_COLUMNS;
-use crate::cpu::kernel::ast::{Item, Literal, PushTarget, StackReplacement};
+use crate::cpu::kernel::assembler::BYTES_PER_OFFSET;
+use crate::cpu::kernel::ast::{Item, PushTarget, StackReplacement};
 use crate::cpu::kernel::stack_manipulation::StackOp::Pop;
+use crate::cpu::kernel::utils::u256_to_trimmed_be_bytes;
 use crate::memory;
 
 pub(crate) fn expand_stack_manipulation(body: Vec<Item>) -> Vec<Item> {
@@ -22,23 +24,27 @@ pub(crate) fn expand_stack_manipulation(body: Vec<Item>) -> Vec<Item> {
 }
 
 fn expand(names: Vec<String>, replacements: Vec<StackReplacement>) -> Vec<Item> {
-    let mut src = names.into_iter().map(StackItem::NamedItem).collect_vec();
-
-    let unique_literals = replacements
+    let mut src = names
         .iter()
-        .filter_map(|item| match item {
-            StackReplacement::Literal(n) => Some(n.clone()),
-            _ => None,
-        })
-        .unique()
+        .cloned()
+        .map(StackItem::NamedItem)
         .collect_vec();
 
     let mut dst = replacements
         .into_iter()
         .map(|item| match item {
-            StackReplacement::NamedItem(name) => StackItem::NamedItem(name),
-            StackReplacement::Literal(n) => StackItem::Literal(n),
-            StackReplacement::MacroVar(_) | StackReplacement::Constant(_) => {
+            StackReplacement::Identifier(name) => {
+                // May be either a named item or a label. Named items have precedence.
+                if names.contains(&name) {
+                    StackItem::NamedItem(name)
+                } else {
+                    StackItem::PushTarget(PushTarget::Label(name))
+                }
+            }
+            StackReplacement::Literal(n) => StackItem::PushTarget(PushTarget::Literal(n)),
+            StackReplacement::MacroLabel(_)
+            | StackReplacement::MacroVar(_)
+            | StackReplacement::Constant(_) => {
                 panic!("Should have been expanded already: {:?}", item)
             }
         })
@@ -49,7 +55,16 @@ fn expand(names: Vec<String>, replacements: Vec<StackReplacement>) -> Vec<Item>
     src.reverse();
     dst.reverse();
 
-    let path = shortest_path(src, dst, unique_literals);
+    let unique_push_targets = dst
+        .iter()
+        .filter_map(|item| match item {
+            StackItem::PushTarget(target) => Some(target.clone()),
+            _ => None,
+        })
+        .unique()
+        .collect_vec();
+
+    let path = shortest_path(src, dst, unique_push_targets);
     path.into_iter().map(StackOp::into_item).collect()
 }
 
@@ -58,7 +73,7 @@ fn expand(names: Vec<String>, replacements: Vec<StackReplacement>) -> Vec<Item>
 fn shortest_path(
     src: Vec<StackItem>,
     dst: Vec<StackItem>,
-    unique_literals: Vec<Literal>,
+    unique_push_targets: Vec<PushTarget>,
 ) -> Vec<StackOp> {
     // Nodes to visit, starting with the lowest-cost node.
     let mut queue = BinaryHeap::new();
@@ -93,7 +108,7 @@ fn shortest_path(
             continue;
         }
 
-        for op in next_ops(&node.stack, &dst, &unique_literals) {
+        for op in next_ops(&node.stack, &dst, &unique_push_targets) {
             let neighbor = match op.apply_to(node.stack.clone()) {
                 Some(n) => n,
                 None => continue,
@@ -151,19 +166,23 @@ impl Ord for Node {
 #[derive(Eq, PartialEq, Hash, Clone, Debug)]
 enum StackItem {
     NamedItem(String),
-    Literal(Literal),
+    PushTarget(PushTarget),
 }
 
 #[derive(Clone, Debug)]
 enum StackOp {
-    Push(Literal),
+    Push(PushTarget),
     Pop,
     Dup(u8),
     Swap(u8),
 }
 
 /// A set of candidate operations to consider for the next step in the path from `src` to `dst`.
-fn next_ops(src: &[StackItem], dst: &[StackItem], unique_literals: &[Literal]) -> Vec<StackOp> {
+fn next_ops(
+    src: &[StackItem],
+    dst: &[StackItem],
+    unique_push_targets: &[PushTarget],
+) -> Vec<StackOp> {
     if let Some(top) = src.last() && !dst.contains(top) {
         // If the top of src doesn't appear in dst, don't bother with anything other than a POP.
         return vec![StackOp::Pop]
@@ -172,12 +191,12 @@ fn next_ops(src: &[StackItem], dst: &[StackItem], unique_literals: &[Literal]) -
     let mut ops = vec![StackOp::Pop];
 
     ops.extend(
-        unique_literals
+        unique_push_targets
             .iter()
-            // Only consider pushing this literal if we need more occurrences of it, otherwise swaps
+            // Only consider pushing this target if we need more occurrences of it, otherwise swaps
             // will be a better way to rearrange the existing occurrences as needed.
-            .filter(|lit| {
-                let item = StackItem::Literal((*lit).clone());
+            .filter(|push_target| {
+                let item = StackItem::PushTarget((*push_target).clone());
                 let src_count = src.iter().filter(|x| **x == item).count();
                 let dst_count = dst.iter().filter(|x| **x == item).count();
                 src_count < dst_count
@@ -209,8 +228,16 @@ fn next_ops(src: &[StackItem], dst: &[StackItem], unique_literals: &[Literal]) -
 impl StackOp {
     fn cost(&self) -> u32 {
         let (cpu_rows, memory_rows) = match self {
-            StackOp::Push(n) => {
-                let bytes = n.to_trimmed_be_bytes().len() as u32;
+            StackOp::Push(target) => {
+                let bytes = match target {
+                    PushTarget::Literal(n) => u256_to_trimmed_be_bytes(n).len() as u32,
+                    PushTarget::Label(_) => BYTES_PER_OFFSET as u32,
+                    PushTarget::MacroLabel(_)
+                    | PushTarget::MacroVar(_)
+                    | PushTarget::Constant(_) => {
+                        panic!("Target should have been expanded already: {:?}", target)
+                    }
+                };
                 // This is just a rough estimate; we can update it after implementing PUSH.
                 (bytes, bytes)
             }
@@ -232,8 +259,8 @@ impl StackOp {
     fn apply_to(&self, mut stack: Vec<StackItem>) -> Option<Vec<StackItem>> {
         let len = stack.len();
         match self {
-            StackOp::Push(n) => {
-                stack.push(StackItem::Literal(n.clone()));
+            StackOp::Push(target) => {
+                stack.push(StackItem::PushTarget(target.clone()));
             }
             Pop => {
                 stack.pop()?;
@@ -253,7 +280,7 @@ impl StackOp {
 
     fn into_item(self) -> Item {
         match self {
-            StackOp::Push(n) => Item::Push(PushTarget::Literal(n)),
+            StackOp::Push(target) => Item::Push(target),
             Pop => Item::StandardOp("POP".into()),
             StackOp::Dup(n) => Item::StandardOp(format!("DUP{}", n)),
             StackOp::Swap(n) => Item::StandardOp(format!("SWAP{}", n)),
diff --git a/evm/src/cpu/kernel/tests/curve_ops.rs b/evm/src/cpu/kernel/tests/curve_ops.rs
index 6d8c6696..0aaa94ea 100644
--- a/evm/src/cpu/kernel/tests/curve_ops.rs
+++ b/evm/src/cpu/kernel/tests/curve_ops.rs
@@ -4,7 +4,7 @@ mod bn {
     use ethereum_types::U256;
 
     use crate::cpu::kernel::aggregator::combined_kernel;
-    use crate::cpu::kernel::interpreter::run;
+    use crate::cpu::kernel::interpreter::run_with_kernel;
     use crate::cpu::kernel::tests::u256ify;
 
     #[test]
@@ -43,76 +43,110 @@ mod bn {
 
         // Standard addition #1
         let initial_stack = u256ify(["0xdeadbeef", point0.1, point0.0, point1.1, point1.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point2.1, point2.0])?);
         // Standard addition #2
         let initial_stack = u256ify(["0xdeadbeef", point1.1, point1.0, point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point2.1, point2.0])?);
 
         // Standard doubling #1
         let initial_stack = u256ify(["0xdeadbeef", point0.1, point0.0, point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point3.1, point3.0])?);
         // Standard doubling #2
         let initial_stack = u256ify(["0xdeadbeef", point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_double, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_double, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point3.1, point3.0])?);
         // Standard doubling #3
         let initial_stack = u256ify(["0xdeadbeef", "0x2", point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_mul, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_mul, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point3.1, point3.0])?);
 
         // Addition with identity #1
         let initial_stack = u256ify(["0xdeadbeef", identity.1, identity.0, point1.1, point1.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point1.1, point1.0])?);
         // Addition with identity #2
         let initial_stack = u256ify(["0xdeadbeef", point1.1, point1.0, identity.1, identity.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point1.1, point1.0])?);
         // Addition with identity #3
         let initial_stack =
             u256ify(["0xdeadbeef", identity.1, identity.0, identity.1, identity.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([identity.1, identity.0])?);
 
         // Addition with invalid point(s) #1
         let initial_stack = u256ify(["0xdeadbeef", point0.1, point0.0, invalid.1, invalid.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, vec![U256::MAX, U256::MAX]);
         // Addition with invalid point(s) #2
         let initial_stack = u256ify(["0xdeadbeef", invalid.1, invalid.0, point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, vec![U256::MAX, U256::MAX]);
         // Addition with invalid point(s) #3
         let initial_stack = u256ify(["0xdeadbeef", invalid.1, invalid.0, identity.1, identity.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, vec![U256::MAX, U256::MAX]);
         // Addition with invalid point(s) #4
         let initial_stack = u256ify(["0xdeadbeef", invalid.1, invalid.0, invalid.1, invalid.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, vec![U256::MAX, U256::MAX]);
 
         // Scalar multiplication #1
         let initial_stack = u256ify(["0xdeadbeef", s, point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_mul, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_mul, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point4.1, point4.0])?);
         // Scalar multiplication #2
         let initial_stack = u256ify(["0xdeadbeef", "0x0", point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_mul, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_mul, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([identity.1, identity.0])?);
         // Scalar multiplication #3
         let initial_stack = u256ify(["0xdeadbeef", "0x1", point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_mul, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_mul, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point0.1, point0.0])?);
         // Scalar multiplication #4
         let initial_stack = u256ify(["0xdeadbeef", s, identity.1, identity.0])?;
-        let stack = run(&kernel.code, ec_mul, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_mul, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([identity.1, identity.0])?);
         // Scalar multiplication #5
         let initial_stack = u256ify(["0xdeadbeef", s, invalid.1, invalid.0])?;
-        let stack = run(&kernel.code, ec_mul, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_mul, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, vec![U256::MAX, U256::MAX]);
 
         // Multiple calls
@@ -126,7 +160,9 @@ mod bn {
             point0.1,
             point0.0,
         ])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point4.1, point4.0])?);
 
         Ok(())
@@ -138,7 +174,7 @@ mod secp {
     use anyhow::Result;
 
     use crate::cpu::kernel::aggregator::combined_kernel;
-    use crate::cpu::kernel::interpreter::run;
+    use crate::cpu::kernel::interpreter::{run, run_with_kernel};
     use crate::cpu::kernel::tests::u256ify;
 
     #[test]
@@ -176,55 +212,79 @@ mod secp {
 
         // Standard addition #1
         let initial_stack = u256ify(["0xdeadbeef", point0.1, point0.0, point1.1, point1.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point2.1, point2.0])?);
         // Standard addition #2
         let initial_stack = u256ify(["0xdeadbeef", point1.1, point1.0, point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run(&kernel.code, ec_add, initial_stack, &kernel.prover_inputs)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point2.1, point2.0])?);
 
         // Standard doubling #1
         let initial_stack = u256ify(["0xdeadbeef", point0.1, point0.0, point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point3.1, point3.0])?);
         // Standard doubling #2
         let initial_stack = u256ify(["0xdeadbeef", point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_double, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_double, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point3.1, point3.0])?);
         // Standard doubling #3
         let initial_stack = u256ify(["0xdeadbeef", "0x2", point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_mul, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_mul, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point3.1, point3.0])?);
 
         // Addition with identity #1
         let initial_stack = u256ify(["0xdeadbeef", identity.1, identity.0, point1.1, point1.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point1.1, point1.0])?);
         // Addition with identity #2
         let initial_stack = u256ify(["0xdeadbeef", point1.1, point1.0, identity.1, identity.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point1.1, point1.0])?);
         // Addition with identity #3
         let initial_stack =
             u256ify(["0xdeadbeef", identity.1, identity.0, identity.1, identity.0])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([identity.1, identity.0])?);
 
         // Scalar multiplication #1
         let initial_stack = u256ify(["0xdeadbeef", s, point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_mul, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_mul, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point4.1, point4.0])?);
         // Scalar multiplication #2
         let initial_stack = u256ify(["0xdeadbeef", "0x0", point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_mul, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_mul, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([identity.1, identity.0])?);
         // Scalar multiplication #3
         let initial_stack = u256ify(["0xdeadbeef", "0x1", point0.1, point0.0])?;
-        let stack = run(&kernel.code, ec_mul, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_mul, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point0.1, point0.0])?);
         // Scalar multiplication #4
         let initial_stack = u256ify(["0xdeadbeef", s, identity.1, identity.0])?;
-        let stack = run(&kernel.code, ec_mul, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_mul, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([identity.1, identity.0])?);
 
         // Multiple calls
@@ -238,7 +298,9 @@ mod secp {
             point0.1,
             point0.0,
         ])?;
-        let stack = run(&kernel.code, ec_add, initial_stack)?.stack;
+        let stack = run_with_kernel(&kernel, ec_add, initial_stack)?
+            .stack()
+            .to_vec();
         assert_eq!(stack, u256ify([point4.1, point4.0])?);
 
         Ok(())
diff --git a/evm/src/cpu/kernel/tests/ecrecover.rs b/evm/src/cpu/kernel/tests/ecrecover.rs
index 78bdea3e..c01adc53 100644
--- a/evm/src/cpu/kernel/tests/ecrecover.rs
+++ b/evm/src/cpu/kernel/tests/ecrecover.rs
@@ -5,7 +5,7 @@ use ethereum_types::U256;
 
 use crate::cpu::kernel::aggregator::combined_kernel;
 use crate::cpu::kernel::assembler::Kernel;
-use crate::cpu::kernel::interpreter::run;
+use crate::cpu::kernel::interpreter::run_with_kernel;
 use crate::cpu::kernel::tests::u256ify;
 
 fn test_valid_ecrecover(
@@ -18,7 +18,9 @@ fn test_valid_ecrecover(
 ) -> Result<()> {
     let ecrecover = kernel.global_labels["ecrecover"];
     let initial_stack = u256ify(["0xdeadbeef", s, r, v, hash])?;
-    let stack = run(&kernel.code, ecrecover, initial_stack)?.stack;
+    let stack = run_with_kernel(kernel, ecrecover, initial_stack)?
+        .stack()
+        .to_vec();
     assert_eq!(stack[0], U256::from_str(expected).unwrap());
 
     Ok(())
@@ -27,7 +29,9 @@ fn test_valid_ecrecover(
 fn test_invalid_ecrecover(hash: &str, v: &str, r: &str, s: &str, kernel: &Kernel) -> Result<()> {
     let ecrecover = kernel.global_labels["ecrecover"];
     let initial_stack = u256ify(["0xdeadbeef", s, r, v, hash])?;
-    let stack = run(&kernel.code, ecrecover, initial_stack)?.stack;
+    let stack = run_with_kernel(kernel, ecrecover, initial_stack)?
+        .stack()
+        .to_vec();
     assert_eq!(stack, vec![U256::MAX]);
 
     Ok(())
diff --git a/evm/src/cpu/kernel/tests/exp.rs b/evm/src/cpu/kernel/tests/exp.rs
index 25c88623..25bc5ad3 100644
--- a/evm/src/cpu/kernel/tests/exp.rs
+++ b/evm/src/cpu/kernel/tests/exp.rs
@@ -1,11 +1,9 @@
-use std::str::FromStr;
-
 use anyhow::Result;
 use ethereum_types::U256;
 use rand::{thread_rng, Rng};
 
 use crate::cpu::kernel::aggregator::combined_kernel;
-use crate::cpu::kernel::interpreter::run;
+use crate::cpu::kernel::interpreter::{run, run_with_kernel};
 
 #[test]
 fn test_exp() -> Result<()> {
@@ -17,27 +15,39 @@ fn test_exp() -> Result<()> {
     let b = U256([0; 4].map(|_| rng.gen()));
 
     // Random input
-    let initial_stack = vec![U256::from_str("0xdeadbeef")?, b, a];
-    let stack_with_kernel = run(&kernel.code, exp, initial_stack)?.stack;
+    let initial_stack = vec![0xDEADBEEFu32.into(), b, a];
+    let stack_with_kernel = run_with_kernel(&kernel, exp, initial_stack)?
+        .stack()
+        .to_vec();
     let initial_stack = vec![b, a];
     let code = [0xa, 0x63, 0xde, 0xad, 0xbe, 0xef, 0x56]; // EXP, PUSH4 deadbeef, JUMP
-    let stack_with_opcode = run(&code, 0, initial_stack)?.stack;
+    let stack_with_opcode = run(&code, 0, initial_stack, &kernel.prover_inputs)?
+        .stack()
+        .to_vec();
     assert_eq!(stack_with_kernel, stack_with_opcode);
 
     // 0 base
-    let initial_stack = vec![U256::from_str("0xdeadbeef")?, b, U256::zero()];
-    let stack_with_kernel = run(&kernel.code, exp, initial_stack)?.stack;
+    let initial_stack = vec![0xDEADBEEFu32.into(), b, U256::zero()];
+    let stack_with_kernel = run_with_kernel(&kernel, exp, initial_stack)?
+        .stack()
+        .to_vec();
     let initial_stack = vec![b, U256::zero()];
     let code = [0xa, 0x63, 0xde, 0xad, 0xbe, 0xef, 0x56]; // EXP, PUSH4 deadbeef, JUMP
-    let stack_with_opcode = run(&code, 0, initial_stack)?.stack;
+    let stack_with_opcode = run(&code, 0, initial_stack, &kernel.prover_inputs)?
+        .stack()
+        .to_vec();
     assert_eq!(stack_with_kernel, stack_with_opcode);
 
     // 0 exponent
-    let initial_stack = vec![U256::from_str("0xdeadbeef")?, U256::zero(), a];
-    let stack_with_kernel = run(&kernel.code, exp, initial_stack)?.stack;
+    let initial_stack = vec![0xDEADBEEFu32.into(), U256::zero(), a];
+    let stack_with_kernel = run_with_kernel(&kernel, exp, initial_stack)?
+        .stack()
+        .to_vec();
     let initial_stack = vec![U256::zero(), a];
     let code = [0xa, 0x63, 0xde, 0xad, 0xbe, 0xef, 0x56]; // EXP, PUSH4 deadbeef, JUMP
-    let stack_with_opcode = run(&code, 0, initial_stack)?.stack;
+    let stack_with_opcode = run(&code, 0, initial_stack, &kernel.prover_inputs)?
+        .stack()
+        .to_vec();
     assert_eq!(stack_with_kernel, stack_with_opcode);
 
     Ok(())
diff --git a/evm/src/cpu/kernel/tests/mod.rs b/evm/src/cpu/kernel/tests/mod.rs
index 100ef377..ab92c5a0 100644
--- a/evm/src/cpu/kernel/tests/mod.rs
+++ b/evm/src/cpu/kernel/tests/mod.rs
@@ -1,6 +1,8 @@
 mod curve_ops;
 mod ecrecover;
 mod exp;
+mod rlp;
+mod transaction_parsing;
 
 use std::str::FromStr;
 
diff --git a/evm/src/cpu/kernel/tests/rlp.rs b/evm/src/cpu/kernel/tests/rlp.rs
new file mode 100644
index 00000000..a1ca3609
--- /dev/null
+++ b/evm/src/cpu/kernel/tests/rlp.rs
@@ -0,0 +1,114 @@
+use anyhow::Result;
+
+use crate::cpu::kernel::aggregator::KERNEL;
+use crate::cpu::kernel::interpreter::Interpreter;
+
+#[test]
+fn test_decode_rlp_string_len_short() -> Result<()> {
+    // A couple dummy bytes, followed by "0x70" which is its own encoding.
+    let rlp = vec![123, 234, 0x70];
+
+    // The stack starts as [0xDEADBEEF, 2]; 2 is the index of the 0x70 byte in RLP memory.
+    let stack_in = vec![0xDEADBEEFu32.into(), 2.into()];
+    let entry = KERNEL.global_labels["decode_rlp_string_len"];
+
+    let mut interp = Interpreter::new_with_kernel(entry, stack_in);
+    interp.set_rlp_memory(rlp);
+    interp.run()?;
+
+    assert_eq!(interp.stack(), vec![1.into(), 2.into()]); // len, pos
+    Ok(())
+}
+
+#[test]
+fn test_decode_rlp_string_len_medium() -> Result<()> {
+    // A couple dummy bytes, followed by the RLP encoding of "1 2 3 4 5".
+    let rlp = vec![123, 234, 0x85, 1, 2, 3, 4, 5];
+
+    // The stack starts as [0xDEADBEEF, 2]; 2 is the index of the 0x85 prefix byte.
+    let stack_in = vec![0xDEADBEEFu32.into(), 2.into()];
+    let entry = KERNEL.global_labels["decode_rlp_string_len"];
+
+    let mut interp = Interpreter::new_with_kernel(entry, stack_in);
+    interp.set_rlp_memory(rlp);
+    interp.run()?;
+
+    assert_eq!(interp.stack(), vec![5.into(), 3.into()]); // len, pos
+    Ok(())
+}
+
+#[test]
+fn test_decode_rlp_string_len_long() -> Result<()> {
+    // A couple dummy bytes, followed by the RLP encoding of the string "1 2 3 ... 56".
+    // The 0xb8 prefix is followed by a single length byte (56), then the 56 payload bytes.
+    let rlp = vec![
+        123, 234, 0xb8, 56, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+        44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
+    ];
+
+    let stack_in = vec![0xDEADBEEFu32.into(), 2.into()];
+    let entry = KERNEL.global_labels["decode_rlp_string_len"];
+
+    let mut interp = Interpreter::new_with_kernel(entry, stack_in);
+    interp.set_rlp_memory(rlp);
+    interp.run()?;
+
+    assert_eq!(interp.stack(), vec![56.into(), 4.into()]); // len, pos
+    Ok(())
+}
+
+#[test]
+fn test_decode_rlp_list_len_short() -> Result<()> {
+    // The RLP encoding of [1, 2, [3, 4]].
+    let rlp = vec![0xc5, 1, 2, 0xc2, 3, 4];
+
+    // The stack starts as [0xDEADBEEF, 0]; the encoded list begins at index 0 of RLP memory.
+    let stack_in = vec![0xDEADBEEFu32.into(), 0.into()];
+    let entry = KERNEL.global_labels["decode_rlp_list_len"];
+
+    let mut interp = Interpreter::new_with_kernel(entry, stack_in);
+    interp.set_rlp_memory(rlp);
+    interp.run()?;
+
+    assert_eq!(interp.stack(), vec![5.into(), 1.into()]); // len, pos
+    Ok(())
+}
+
+#[test]
+fn test_decode_rlp_list_len_long() -> Result<()> {
+    // The RLP encoding of [1, ..., 56].
+    // The 0xf8 prefix is followed by a single length byte (56), then the 56 list items.
+    let rlp = vec![
+        0xf8, 56, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+        23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+        46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
+    ];
+
+    let stack_in = vec![0xDEADBEEFu32.into(), 0.into()];
+    let entry = KERNEL.global_labels["decode_rlp_list_len"];
+
+    let mut interp = Interpreter::new_with_kernel(entry, stack_in);
+    interp.set_rlp_memory(rlp);
+    interp.run()?;
+
+    assert_eq!(interp.stack(), vec![56.into(), 2.into()]); // len, pos
+    Ok(())
+}
+
+#[test]
+fn test_decode_rlp_scalar() -> Result<()> {
+    // The RLP encoding of "12 34 56".
+    let rlp = vec![0x83, 0x12, 0x34, 0x56];
+
+    // The stack starts as [0xDEADBEEF, 0]; the encoded scalar begins at index 0 of RLP memory.
+    let stack_in = vec![0xDEADBEEFu32.into(), 0.into()];
+    let entry = KERNEL.global_labels["decode_rlp_scalar"];
+
+    let mut interp = Interpreter::new_with_kernel(entry, stack_in);
+    interp.set_rlp_memory(rlp);
+    interp.run()?;
+
+    assert_eq!(interp.stack(), vec![0x123456.into(), 4.into()]); // scalar, pos
+    Ok(())
+}
diff --git a/evm/src/cpu/kernel/tests/transaction_parsing/mod.rs b/evm/src/cpu/kernel/tests/transaction_parsing/mod.rs
new file mode 100644
index 00000000..fb50625f
--- /dev/null
+++ b/evm/src/cpu/kernel/tests/transaction_parsing/mod.rs
@@ -0,0 +1 @@
+mod parse_type_0_txn;
diff --git a/evm/src/cpu/kernel/tests/transaction_parsing/parse_type_0_txn.rs b/evm/src/cpu/kernel/tests/transaction_parsing/parse_type_0_txn.rs
new file mode 100644
index 00000000..c01474ce
--- /dev/null
+++ b/evm/src/cpu/kernel/tests/transaction_parsing/parse_type_0_txn.rs
@@ -0,0 +1,65 @@
+use anyhow::Result;
+use ethereum_types::U256;
+use hex_literal::hex;
+use NormalizedTxnField::*;
+
+use crate::cpu::kernel::aggregator::KERNEL;
+use crate::cpu::kernel::interpreter::Interpreter;
+use crate::cpu::kernel::txn_fields::NormalizedTxnField;
+
+/// Runs the kernel's `process_type_0_txn` on an RLP-encoded signed transaction, halting at
+/// `process_normalized_txn`, and checks the normalized transaction fields that were stored.
+#[test]
+fn process_type_0_txn() -> Result<()> {
+    let process_type_0_txn = KERNEL.global_labels["process_type_0_txn"];
+    let process_normalized_txn = KERNEL.global_labels["process_normalized_txn"];
+
+    let mut interpreter = Interpreter::new_with_kernel(process_type_0_txn, vec![]);
+
+    // When we reach process_normalized_txn, we're done with parsing and normalizing.
+    // Processing normalized transactions is outside the scope of this test.
+    interpreter.halt_offsets.push(process_normalized_txn);
+
+    // Generated with py-evm:
+    // import eth, eth_keys, eth_utils, rlp
+    // genesis_params = { 'difficulty': eth.constants.GENESIS_DIFFICULTY }
+    // chain = eth.chains.mainnet.MainnetChain.from_genesis(eth.db.atomic.AtomicDB(), genesis_params, {})
+    // unsigned_txn = chain.create_unsigned_transaction(
+    //     nonce=5,
+    //     gas_price=10,
+    //     gas=22_000,
+    //     to=eth.constants.ZERO_ADDRESS,
+    //     value=100,
+    //     data=b'\x42\x42',
+    // )
+    // sk = eth_keys.keys.PrivateKey(eth_utils.decode_hex('4c0883a69102937d6231471b5dbb6204fe5129617082792ae468d01a3f362318'))
+    // signed_txn = unsigned_txn.as_signed_transaction(sk)
+    // rlp.encode(signed_txn).hex()
+    interpreter.set_rlp_memory(hex!("f861050a8255f0940000000000000000000000000000000000000000648242421ca07c5c61ed975ebd286f6b027b8c504842e50a47d318e1e801719dd744fe93e6c6a01e7b5119b57dd54e175ff2f055c91f3ab1b53eba0b2c184f347cdff0e745aca2").to_vec());
+
+    interpreter.run()?;
+
+    assert_eq!(interpreter.get_txn_field(ChainIdPresent), 0.into());
+    assert_eq!(interpreter.get_txn_field(ChainId), 0.into());
+    assert_eq!(interpreter.get_txn_field(Nonce), 5.into());
+    // The single legacy `gas_price` (10) is expected in both normalized fee fields.
+    assert_eq!(interpreter.get_txn_field(MaxPriorityFeePerGas), 10.into());
+    assert_eq!(interpreter.get_txn_field(MaxFeePerGas), 10.into());
+    assert_eq!(interpreter.get_txn_field(GasLimit), 22_000.into());
+    assert_eq!(interpreter.get_txn_field(To), 0.into());
+    assert_eq!(interpreter.get_txn_field(Value), 100.into());
+    assert_eq!(interpreter.get_txn_field(DataLen), 2.into());
+    assert_eq!(interpreter.get_txn_data(), &[0x42.into(), 0x42.into()]);
+    // The encoded `v` is 0x1c (28 = 27 + 1), hence a y parity of 1.
+    assert_eq!(interpreter.get_txn_field(YParity), 1.into());
+    let expected_r = U256::from_big_endian(&hex!(
+        "7c5c61ed975ebd286f6b027b8c504842e50a47d318e1e801719dd744fe93e6c6"
+    ));
+    let expected_s = U256::from_big_endian(&hex!(
+        "1e7b5119b57dd54e175ff2f055c91f3ab1b53eba0b2c184f347cdff0e745aca2"
+    ));
+    assert_eq!(interpreter.get_txn_field(R), expected_r);
+    assert_eq!(interpreter.get_txn_field(S), expected_s);
+
+    Ok(())
+}
diff --git a/evm/src/cpu/kernel/utils.rs b/evm/src/cpu/kernel/utils.rs
new file mode 100644
index 00000000..8900b8e2
--- /dev/null
+++ b/evm/src/cpu/kernel/utils.rs
@@ -0,0 +1,71 @@
+use std::fmt::Debug;
+
+use ethereum_types::U256;
+use plonky2_util::ceil_div_usize;
+
+/// Slides a window of length `W` across `vec`, handing a copy of each window to `maybe_replace`.
+///
+/// On `Some(replacement)` the window is spliced out for `replacement`, and the scan backs up by
+/// `W - 1` slots (clamped at 0) to revisit windows touching the change; on `None` it moves on by one.
+pub(crate) fn replace_windows<const W: usize, T, F>(vec: &mut Vec<T>, maybe_replace: F)
+where
+    T: Clone + Debug,
+    F: Fn([T; W]) -> Option<Vec<T>>,
+{
+    let mut start = 0;
+    while let Some(slice) = vec.get(start..start + W) {
+        let window: [T; W] = slice.to_vec().try_into().unwrap();
+        match maybe_replace(window) {
+            Some(replacement) => {
+                vec.splice(start..start + W, replacement);
+                start = start.saturating_sub(W - 1);
+            }
+            None => start += 1,
+        }
+    }
+}
+
+/// Trimmed big-endian bytes of `u256` (`[0x00]` for zero); `U256::byte` itself is little-endian.
+pub(crate) fn u256_to_trimmed_be_bytes(u256: &U256) -> Vec<u8> {
+    let byte_len = ceil_div_usize(u256.bits(), 8).max(1);
+    (1..=byte_len).map(|k| u256.byte(byte_len - k)).collect()
+}
+
+/// Converts `b` to a `U256`: one for `true`, zero for `false`.
+pub(crate) fn u256_from_bool(b: bool) -> U256 {
+    match b {
+        true => U256::one(),
+        false => U256::zero(),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_replace_windows() {
+        // Summing each adjacent pair folds the whole vector down to its total.
+        let mut nums = vec![1, 2, 3, 4, 5];
+        replace_windows(&mut nums, |[lhs, rhs]| Some(vec![lhs + rhs]));
+        assert_eq!(nums, vec![15u32]);
+
+        // Splitting every composite number into two factors leaves only primes and ones.
+        let mut nums = vec![9, 1, 6, 8, 15, 7, 9];
+        replace_windows(&mut nums, |[n]| {
+            (2..n).find(|d| n % d == 0).map(|d| vec![d, n / d])
+        });
+        assert_eq!(nums, vec![3, 3, 1, 2, 3, 2, 2, 2, 3, 5, 7, 3, 3]);
+    }
+
+    #[test]
+    fn literal_to_be_bytes() {
+        assert_eq!(u256_to_trimmed_be_bytes(&U256::from(0)), vec![0x00]);
+
+        assert_eq!(u256_to_trimmed_be_bytes(&U256::from(768)), vec![0x03, 0x00]);
+
+        assert_eq!(u256_to_trimmed_be_bytes(&U256::from(0xa1b2)), vec![0xa1, 0xb2]);
+
+        assert_eq!(u256_to_trimmed_be_bytes(&U256::from(0x1b2)), vec![0x1, 0xb2]);
+    }
+}
diff --git a/evm/src/cpu/mod.rs b/evm/src/cpu/mod.rs
index 8da8a125..6c767998 100644
--- a/evm/src/cpu/mod.rs
+++ b/evm/src/cpu/mod.rs
@@ -1,5 +1,6 @@
 pub(crate) mod bootstrap_kernel;
 pub(crate) mod columns;
+mod control_flow;
 pub mod cpu_stark;
 pub(crate) mod decode;
 pub mod kernel;
diff --git a/evm/src/cpu/simple_logic/eq_iszero.rs b/evm/src/cpu/simple_logic/eq_iszero.rs
index 97e000b6..75bb8bb6 100644
--- a/evm/src/cpu/simple_logic/eq_iszero.rs
+++ b/evm/src/cpu/simple_logic/eq_iszero.rs
@@ -9,6 +9,7 @@ use crate::cpu::columns::CpuColumnsView;
 const LIMB_SIZE: usize = 16;
 
 pub fn generate<F: RichField>(lv: &mut CpuColumnsView<F>) {
+    let logic = lv.general.logic_mut();
     let eq_filter = lv.is_eq.to_canonical_u64();
     let iszero_filter = lv.is_iszero.to_canonical_u64();
     assert!(eq_filter <= 1);
@@ -20,9 +21,10 @@ pub fn generate<F: RichField>(lv: &mut CpuColumnsView<F>) {
     }
 
     let diffs = if eq_filter == 1 {
-        lv.logic_input0
+        logic
+            .input0
             .into_iter()
-            .zip(lv.logic_input1)
+            .zip(logic.input1)
             .map(|(in0, in1)| {
                 assert_eq!(in0.to_canonical_u64() >> LIMB_SIZE, 0);
                 assert_eq!(in1.to_canonical_u64() >> LIMB_SIZE, 0);
@@ -31,7 +33,7 @@ pub fn generate<F: RichField>(lv: &mut CpuColumnsView<F>) {
             })
             .sum()
     } else if iszero_filter == 1 {
-        lv.logic_input0.into_iter().sum()
+        logic.input0.into_iter().sum()
     } else {
         panic!()
     };
@@ -39,8 +41,8 @@ pub fn generate<F: RichField>(lv: &mut CpuColumnsView<F>) {
     lv.simple_logic_diff = diffs;
     lv.simple_logic_diff_inv = diffs.try_inverse().unwrap_or(F::ZERO);
 
-    lv.logic_output[0] = F::from_bool(diffs == F::ZERO);
-    for out_limb_ref in lv.logic_output[1..].iter_mut() {
+    logic.output[0] = F::from_bool(diffs == F::ZERO);
+    for out_limb_ref in logic.output[1..].iter_mut() {
         *out_limb_ref = F::ZERO;
     }
 }
@@ -49,17 +51,18 @@ pub fn eval_packed<P: PackedField>(
     lv: &CpuColumnsView<P>,
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
+    let logic = lv.general.logic();
     let eq_filter = lv.is_eq;
     let iszero_filter = lv.is_iszero;
     let eq_or_iszero_filter = eq_filter + iszero_filter;
 
-    let ls_bit = lv.logic_output[0];
+    let ls_bit = logic.output[0];
 
     // Handle EQ and ISZERO. Most limbs of the output are 0, but the least-significant one is
     // either 0 or 1.
     yield_constr.constraint(eq_or_iszero_filter * ls_bit * (ls_bit - P::ONES));
 
-    for &bit in &lv.logic_output[1..] {
+    for &bit in &logic.output[1..] {
         yield_constr.constraint(eq_or_iszero_filter * bit);
     }
 
@@ -67,13 +70,13 @@ pub fn eval_packed<P: PackedField>(
     let diffs = lv.simple_logic_diff;
     let diffs_inv = lv.simple_logic_diff_inv;
     {
-        let input0_sum: P = lv.logic_input0.into_iter().sum();
+        let input0_sum: P = logic.input0.into_iter().sum();
         yield_constr.constraint(iszero_filter * (diffs - input0_sum));
 
-        let sum_squared_diffs: P = lv
-            .logic_input0
+        let sum_squared_diffs: P = logic
+            .input0
             .into_iter()
-            .zip(lv.logic_input1)
+            .zip(logic.input1)
             .map(|(in0, in1)| (in0 - in1).square())
             .sum();
         yield_constr.constraint(eq_filter * (diffs - sum_squared_diffs));
@@ -90,11 +93,12 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     lv: &CpuColumnsView<ExtensionTarget<D>>,
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
+    let logic = lv.general.logic();
     let eq_filter = lv.is_eq;
     let iszero_filter = lv.is_iszero;
     let eq_or_iszero_filter = builder.add_extension(eq_filter, iszero_filter);
 
-    let ls_bit = lv.logic_output[0];
+    let ls_bit = logic.output[0];
 
     // Handle EQ and ISZERO. Most limbs of the output are 0, but the least-significant one is
     // either 0 or 1.
@@ -104,7 +108,7 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
         yield_constr.constraint(builder, constr);
     }
 
-    for &bit in &lv.logic_output[1..] {
+    for &bit in &logic.output[1..] {
         let constr = builder.mul_extension(eq_or_iszero_filter, bit);
         yield_constr.constraint(builder, constr);
     }
@@ -113,14 +117,14 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     let diffs = lv.simple_logic_diff;
     let diffs_inv = lv.simple_logic_diff_inv;
     {
-        let input0_sum = builder.add_many_extension(lv.logic_input0);
+        let input0_sum = builder.add_many_extension(logic.input0);
         {
             let constr = builder.sub_extension(diffs, input0_sum);
             let constr = builder.mul_extension(iszero_filter, constr);
             yield_constr.constraint(builder, constr);
         }
 
-        let sum_squared_diffs = lv.logic_input0.into_iter().zip(lv.logic_input1).fold(
+        let sum_squared_diffs = logic.input0.into_iter().zip(logic.input1).fold(
             builder.zero_extension(),
             |acc, (in0, in1)| {
                 let diff = builder.sub_extension(in0, in1);
diff --git a/evm/src/cpu/simple_logic/not.rs b/evm/src/cpu/simple_logic/not.rs
index d1ba4d46..efbf51a6 100644
--- a/evm/src/cpu/simple_logic/not.rs
+++ b/evm/src/cpu/simple_logic/not.rs
@@ -17,7 +17,8 @@ pub fn generate<F: RichField>(lv: &mut CpuColumnsView<F>) {
     }
     assert_eq!(is_not_filter, 1);
 
-    for (input, output_ref) in lv.logic_input0.into_iter().zip(lv.logic_output.iter_mut()) {
+    let logic = lv.general.logic_mut();
+    for (input, output_ref) in logic.input0.into_iter().zip(logic.output.iter_mut()) {
         let input = input.to_canonical_u64();
         assert_eq!(input >> LIMB_SIZE, 0);
         let output = input ^ ALL_1_LIMB;
@@ -30,10 +31,11 @@ pub fn eval_packed<P: PackedField>(
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
     // This is simple: just do output = 0xffff - input.
+    let logic = lv.general.logic();
     let cycle_filter = lv.is_cpu_cycle;
     let is_not_filter = lv.is_not;
     let filter = cycle_filter * is_not_filter;
-    for (input, output) in lv.logic_input0.into_iter().zip(lv.logic_output) {
+    for (input, output) in logic.input0.into_iter().zip(logic.output) {
         yield_constr
             .constraint(filter * (output + input - P::Scalar::from_canonical_u64(ALL_1_LIMB)));
     }
@@ -44,10 +46,11 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     lv: &CpuColumnsView<ExtensionTarget<D>>,
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
+    let logic = lv.general.logic();
     let cycle_filter = lv.is_cpu_cycle;
     let is_not_filter = lv.is_not;
     let filter = builder.mul_extension(cycle_filter, is_not_filter);
-    for (input, output) in lv.logic_input0.into_iter().zip(lv.logic_output) {
+    for (input, output) in logic.input0.into_iter().zip(logic.output) {
         let constr = builder.add_extension(output, input);
         let constr = builder.arithmetic_extension(
             F::ONE,
diff --git a/evm/src/memory/memory_stark.rs b/evm/src/memory/memory_stark.rs
index 82e10869..5a17ed20 100644
--- a/evm/src/memory/memory_stark.rs
+++ b/evm/src/memory/memory_stark.rs
@@ -2,6 +2,7 @@ use std::marker::PhantomData;
 
 use ethereum_types::U256;
 use itertools::Itertools;
+use maybe_rayon::*;
 use plonky2::field::extension::{Extendable, FieldExtension};
 use plonky2::field::packed::PackedField;
 use plonky2::field::polynomial::PolynomialValues;
@@ -10,7 +11,6 @@ use plonky2::hash::hash_types::RichField;
 use plonky2::timed;
 use plonky2::util::timing::TimingTree;
 use plonky2::util::transpose;
-use rayon::prelude::*;
 
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
 use crate::cross_table_lookup::Column;
@@ -499,7 +499,12 @@ pub(crate) mod tests {
 
                 let (context, segment, virt, vals) = if is_read {
                     let written: Vec<_> = current_memory_values.keys().collect();
-                    let &(context, segment, virt) = written[rng.gen_range(0..written.len())];
+                    let &(mut context, mut segment, mut virt) =
+                        written[rng.gen_range(0..written.len())];
+                    while new_writes_this_cycle.contains_key(&(context, segment, virt)) {
+                        (context, segment, virt) = *written[rng.gen_range(0..written.len())];
+                    }
+
                     let &vals = current_memory_values
                         .get(&(context, segment, virt))
                         .unwrap();
diff --git a/evm/src/memory/segments.rs b/evm/src/memory/segments.rs
index 15545ea0..712db644 100644
--- a/evm/src/memory/segments.rs
+++ b/evm/src/memory/segments.rs
@@ -13,20 +13,21 @@ pub(crate) enum Segment {
     Returndata = 4,
     /// A segment which contains a few fixed-size metadata fields, such as the caller's context, or the
     /// size of `CALLDATA` and `RETURNDATA`.
-    Metadata = 5,
+    GlobalMetadata = 5,
+    ContextMetadata = 6,
     /// General purpose kernel memory, used by various kernel functions.
     /// In general, calling a helper function can result in this memory being clobbered.
-    KernelGeneral = 6,
+    KernelGeneral = 7,
     /// Contains normalized transaction fields; see `TxnField`.
-    TxnFields = 7,
+    TxnFields = 8,
     /// Contains the data field of a transaction.
-    TxnData = 8,
+    TxnData = 9,
     /// Raw RLP data.
-    RlpRaw = 9,
+    RlpRaw = 10,
 }
 
 impl Segment {
-    pub(crate) const COUNT: usize = 10;
+    pub(crate) const COUNT: usize = 11;
 
     pub(crate) fn all() -> [Self; Self::COUNT] {
         [
@@ -35,7 +36,8 @@ impl Segment {
             Self::MainMemory,
             Self::Calldata,
             Self::Returndata,
-            Self::Metadata,
+            Self::GlobalMetadata,
+            Self::ContextMetadata,
             Self::KernelGeneral,
             Self::TxnFields,
             Self::TxnData,
@@ -51,7 +53,8 @@ impl Segment {
             Segment::MainMemory => "SEGMENT_MAIN_MEMORY",
             Segment::Calldata => "SEGMENT_CALLDATA",
             Segment::Returndata => "SEGMENT_RETURNDATA",
-            Segment::Metadata => "SEGMENT_METADATA",
+            Segment::GlobalMetadata => "SEGMENT_GLOBAL_METADATA",
+            Segment::ContextMetadata => "SEGMENT_CONTEXT_METADATA",
             Segment::KernelGeneral => "SEGMENT_KERNEL_GENERAL",
             Segment::TxnFields => "SEGMENT_NORMALIZED_TXN",
             Segment::TxnData => "SEGMENT_TXN_DATA",
@@ -67,7 +70,8 @@ impl Segment {
             Segment::MainMemory => 8,
             Segment::Calldata => 8,
             Segment::Returndata => 8,
-            Segment::Metadata => 256,
+            Segment::GlobalMetadata => 256,
+            Segment::ContextMetadata => 256,
             Segment::KernelGeneral => 256,
             Segment::TxnFields => 256,
             Segment::TxnData => 256,
diff --git a/evm/src/permutation.rs b/evm/src/permutation.rs
index a4039ad2..c21a06de 100644
--- a/evm/src/permutation.rs
+++ b/evm/src/permutation.rs
@@ -1,6 +1,7 @@
 //! Permutation arguments.
 
 use itertools::Itertools;
+use maybe_rayon::*;
 use plonky2::field::batch_util::batch_multiply_inplace;
 use plonky2::field::extension::{Extendable, FieldExtension};
 use plonky2::field::packed::PackedField;
@@ -16,7 +17,6 @@ use plonky2::plonk::plonk_common::{
     reduce_with_powers, reduce_with_powers_circuit, reduce_with_powers_ext_circuit,
 };
 use plonky2::util::reducing::{ReducingFactor, ReducingFactorTarget};
-use rayon::prelude::*;
 
 use crate::config::StarkConfig;
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
diff --git a/evm/src/proof.rs b/evm/src/proof.rs
index b1275844..4f81308d 100644
--- a/evm/src/proof.rs
+++ b/evm/src/proof.rs
@@ -1,4 +1,5 @@
 use itertools::Itertools;
+use maybe_rayon::*;
 use plonky2::field::extension::{Extendable, FieldExtension};
 use plonky2::fri::oracle::PolynomialBatch;
 use plonky2::fri::proof::{
@@ -12,7 +13,6 @@ use plonky2::hash::merkle_tree::MerkleCap;
 use plonky2::iop::ext_target::ExtensionTarget;
 use plonky2::iop::target::Target;
 use plonky2::plonk::config::GenericConfig;
-use rayon::prelude::*;
 
 use crate::config::StarkConfig;
 use crate::permutation::GrandProductChallengeSet;
diff --git a/evm/src/prover.rs b/evm/src/prover.rs
index 346224a5..8be39b6c 100644
--- a/evm/src/prover.rs
+++ b/evm/src/prover.rs
@@ -1,6 +1,7 @@
 use std::any::type_name;
 
 use anyhow::{ensure, Result};
+use maybe_rayon::*;
 use plonky2::field::extension::Extendable;
 use plonky2::field::packable::Packable;
 use plonky2::field::packed::PackedField;
@@ -15,7 +16,6 @@ use plonky2::timed;
 use plonky2::util::timing::TimingTree;
 use plonky2::util::transpose;
 use plonky2_util::{log2_ceil, log2_strict};
-use rayon::prelude::*;
 
 use crate::all_stark::{AllStark, Table};
 use crate::config::StarkConfig;
diff --git a/evm/src/recursive_verifier.rs b/evm/src/recursive_verifier.rs
index 17db048b..b69a5519 100644
--- a/evm/src/recursive_verifier.rs
+++ b/evm/src/recursive_verifier.rs
@@ -71,46 +71,66 @@ pub fn verify_proof_circuit<
         &nums_permutation_zs,
     );
 
-    verify_stark_proof_with_challenges_circuit::<F, C, _, D>(
+    with_context!(
         builder,
-        cpu_stark,
-        &all_proof.stark_proofs[Table::Cpu as usize],
-        &stark_challenges[Table::Cpu as usize],
-        &ctl_vars_per_table[Table::Cpu as usize],
-        inner_config,
+        "verify CPU proof",
+        verify_stark_proof_with_challenges_circuit::<F, C, _, D>(
+            builder,
+            cpu_stark,
+            &all_proof.stark_proofs[Table::Cpu as usize],
+            &stark_challenges[Table::Cpu as usize],
+            &ctl_vars_per_table[Table::Cpu as usize],
+            inner_config,
+        )
     );
-    verify_stark_proof_with_challenges_circuit::<F, C, _, D>(
+    with_context!(
         builder,
-        keccak_stark,
-        &all_proof.stark_proofs[Table::Keccak as usize],
-        &stark_challenges[Table::Keccak as usize],
-        &ctl_vars_per_table[Table::Keccak as usize],
-        inner_config,
+        "verify Keccak proof",
+        verify_stark_proof_with_challenges_circuit::<F, C, _, D>(
+            builder,
+            keccak_stark,
+            &all_proof.stark_proofs[Table::Keccak as usize],
+            &stark_challenges[Table::Keccak as usize],
+            &ctl_vars_per_table[Table::Keccak as usize],
+            inner_config,
+        )
     );
-    verify_stark_proof_with_challenges_circuit::<F, C, _, D>(
+    with_context!(
         builder,
-        logic_stark,
-        &all_proof.stark_proofs[Table::Logic as usize],
-        &stark_challenges[Table::Logic as usize],
-        &ctl_vars_per_table[Table::Logic as usize],
-        inner_config,
+        "verify logic proof",
+        verify_stark_proof_with_challenges_circuit::<F, C, _, D>(
+            builder,
+            logic_stark,
+            &all_proof.stark_proofs[Table::Logic as usize],
+            &stark_challenges[Table::Logic as usize],
+            &ctl_vars_per_table[Table::Logic as usize],
+            inner_config,
+        )
     );
-    verify_stark_proof_with_challenges_circuit::<F, C, _, D>(
+    with_context!(
         builder,
-        memory_stark,
-        &all_proof.stark_proofs[Table::Memory as usize],
-        &stark_challenges[Table::Memory as usize],
-        &ctl_vars_per_table[Table::Memory as usize],
-        inner_config,
+        "verify memory proof",
+        verify_stark_proof_with_challenges_circuit::<F, C, _, D>(
+            builder,
+            memory_stark,
+            &all_proof.stark_proofs[Table::Memory as usize],
+            &stark_challenges[Table::Memory as usize],
+            &ctl_vars_per_table[Table::Memory as usize],
+            inner_config,
+        )
     );
 
-    verify_cross_table_lookups_circuit::<F, C, D>(
+    with_context!(
         builder,
-        cross_table_lookups,
-        &all_proof.stark_proofs,
-        ctl_challenges,
-        inner_config,
-    )
+        "verify cross-table lookups",
+        verify_cross_table_lookups_circuit::<F, C, D>(
+            builder,
+            cross_table_lookups,
+            &all_proof.stark_proofs,
+            ctl_challenges,
+            inner_config,
+        )
+    );
 }
 
 /// Recursively verifies an inner proof.
diff --git a/field/Cargo.toml b/field/Cargo.toml
index 748b65ac..1a72bd6c 100644
--- a/field/Cargo.toml
+++ b/field/Cargo.toml
@@ -4,12 +4,16 @@ description = "Finite field arithmetic"
 version = "0.1.0"
 edition = "2021"
 
+[features]
+default = ["rand"]
+rand = ["dep:rand"]
+
 [dependencies]
 plonky2_util = { path = "../util" }
 anyhow = "1.0.40"
 itertools = "0.10.0"
 num = { version = "0.4", features = [ "rand" ] }
-rand = "0.8.4"
+rand = { optional = true, version = "0.8.4" }
 serde = { version = "1.0", features = ["derive"] }
 unroll = "0.1.5"
 static_assertions = "1.1.0"
diff --git a/field/src/extension/quadratic.rs b/field/src/extension/quadratic.rs
index 5789ecc1..d68df42e 100644
--- a/field/src/extension/quadratic.rs
+++ b/field/src/extension/quadratic.rs
@@ -4,7 +4,6 @@ use std::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssi
 
 use num::bigint::BigUint;
 use num::Integer;
-use rand::Rng;
 use serde::{Deserialize, Serialize};
 
 use crate::extension::{Extendable, FieldExtension, Frobenius, OEF};
@@ -103,7 +102,8 @@ impl<F: Extendable<2>> Field for QuadraticExtension<F> {
         F::from_noncanonical_u128(n).into()
     }
 
-    fn rand_from_rng<R: Rng>(rng: &mut R) -> Self {
+    #[cfg(feature = "rand")]
+    fn rand_from_rng<R: rand::Rng>(rng: &mut R) -> Self {
         Self([F::rand_from_rng(rng), F::rand_from_rng(rng)])
     }
 }
diff --git a/field/src/extension/quartic.rs b/field/src/extension/quartic.rs
index ed8006f2..fc0cbcf8 100644
--- a/field/src/extension/quartic.rs
+++ b/field/src/extension/quartic.rs
@@ -5,7 +5,6 @@ use std::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssi
 use num::bigint::BigUint;
 use num::traits::Pow;
 use num::Integer;
-use rand::Rng;
 use serde::{Deserialize, Serialize};
 
 use crate::extension::{Extendable, FieldExtension, Frobenius, OEF};
@@ -115,7 +114,8 @@ impl<F: Extendable<4>> Field for QuarticExtension<F> {
         F::from_noncanonical_u128(n).into()
     }
 
-    fn rand_from_rng<R: Rng>(rng: &mut R) -> Self {
+    #[cfg(feature = "rand")]
+    fn rand_from_rng<R: rand::Rng>(rng: &mut R) -> Self {
         Self::from_basefield_array([
             F::rand_from_rng(rng),
             F::rand_from_rng(rng),
diff --git a/field/src/extension/quintic.rs b/field/src/extension/quintic.rs
index 7a992b7d..564674c3 100644
--- a/field/src/extension/quintic.rs
+++ b/field/src/extension/quintic.rs
@@ -4,7 +4,6 @@ use std::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssi
 
 use num::bigint::BigUint;
 use num::traits::Pow;
-use rand::Rng;
 use serde::{Deserialize, Serialize};
 
 use crate::extension::{Extendable, FieldExtension, Frobenius, OEF};
@@ -112,7 +111,8 @@ impl<F: Extendable<5>> Field for QuinticExtension<F> {
         F::from_noncanonical_u128(n).into()
     }
 
-    fn rand_from_rng<R: Rng>(rng: &mut R) -> Self {
+    #[cfg(feature = "rand")]
+    fn rand_from_rng<R: rand::Rng>(rng: &mut R) -> Self {
         Self::from_basefield_array([
             F::rand_from_rng(rng),
             F::rand_from_rng(rng),
diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs
index 545d515a..c5075b5d 100644
--- a/field/src/goldilocks_field.rs
+++ b/field/src/goldilocks_field.rs
@@ -6,7 +6,6 @@ use std::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssi
 
 use num::{BigUint, Integer};
 use plonky2_util::{assume, branch_hint};
-use rand::Rng;
 use serde::{Deserialize, Serialize};
 
 use crate::inversion::try_inverse_u64;
@@ -105,7 +104,8 @@ impl Field for GoldilocksField {
         reduce128(n)
     }
 
-    fn rand_from_rng<R: Rng>(rng: &mut R) -> Self {
+    #[cfg(feature = "rand")]
+    fn rand_from_rng<R: rand::Rng>(rng: &mut R) -> Self {
         Self::from_canonical_u64(rng.gen_range(0..Self::ORDER))
     }
 
diff --git a/field/src/secp256k1_base.rs b/field/src/secp256k1_base.rs
index 9dd41a5d..9e39b982 100644
--- a/field/src/secp256k1_base.rs
+++ b/field/src/secp256k1_base.rs
@@ -5,9 +5,8 @@ use std::iter::{Product, Sum};
 use std::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
 
 use itertools::Itertools;
-use num::bigint::{BigUint, RandBigInt};
+use num::bigint::BigUint;
 use num::{Integer, One};
-use rand::Rng;
 use serde::{Deserialize, Serialize};
 
 use crate::types::{Field, PrimeField};
@@ -133,7 +132,9 @@ impl Field for Secp256K1Base {
         Self([n.0, n.1 as u64, 0, 0])
     }
 
-    fn rand_from_rng<R: Rng>(rng: &mut R) -> Self {
+    #[cfg(feature = "rand")]
+    fn rand_from_rng<R: rand::Rng>(rng: &mut R) -> Self {
+        use num::bigint::RandBigInt;
         Self::from_biguint(rng.gen_biguint_below(&Self::order()))
     }
 }
diff --git a/field/src/secp256k1_scalar.rs b/field/src/secp256k1_scalar.rs
index ec1ad19e..eea67fab 100644
--- a/field/src/secp256k1_scalar.rs
+++ b/field/src/secp256k1_scalar.rs
@@ -6,9 +6,8 @@ use std::iter::{Product, Sum};
 use std::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
 
 use itertools::Itertools;
-use num::bigint::{BigUint, RandBigInt};
+use num::bigint::BigUint;
 use num::{Integer, One};
-use rand::Rng;
 use serde::{Deserialize, Serialize};
 
 use crate::types::{Field, PrimeField};
@@ -142,7 +141,9 @@ impl Field for Secp256K1Scalar {
         Self([n.0, n.1 as u64, 0, 0])
     }
 
-    fn rand_from_rng<R: Rng>(rng: &mut R) -> Self {
+    #[cfg(feature = "rand")]
+    fn rand_from_rng<R: rand::Rng>(rng: &mut R) -> Self {
+        use num::bigint::RandBigInt;
         Self::from_biguint(rng.gen_biguint_below(&Self::order()))
     }
 }
diff --git a/field/src/types.rs b/field/src/types.rs
index 81945e5a..b7335704 100644
--- a/field/src/types.rs
+++ b/field/src/types.rs
@@ -6,7 +6,6 @@ use std::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssi
 use num::bigint::BigUint;
 use num::{Integer, One, ToPrimitive, Zero};
 use plonky2_util::bits_u64;
-use rand::Rng;
 use serde::de::DeserializeOwned;
 use serde::Serialize;
 
@@ -312,7 +311,8 @@ pub trait Field:
         Self::from_noncanonical_u128(n)
     }
 
-    fn rand_from_rng<R: Rng>(rng: &mut R) -> Self;
+    #[cfg(feature = "rand")]
+    fn rand_from_rng<R: rand::Rng>(rng: &mut R) -> Self;
 
     fn exp_power_of_2(&self, power_log: usize) -> Self {
         let mut res = *self;
@@ -391,14 +391,17 @@ pub trait Field:
         }
     }
 
+    #[cfg(feature = "rand")]
     fn rand() -> Self {
         Self::rand_from_rng(&mut rand::thread_rng())
     }
 
+    #[cfg(feature = "rand")]
     fn rand_arr<const N: usize>() -> [Self; N] {
         Self::rand_vec(N).try_into().unwrap()
     }
 
+    #[cfg(feature = "rand")]
     fn rand_vec(n: usize) -> Vec<Self> {
         (0..n).map(|_| Self::rand()).collect()
     }
diff --git a/maybe_rayon/Cargo.toml b/maybe_rayon/Cargo.toml
new file mode 100644
index 00000000..f8cc95fb
--- /dev/null
+++ b/maybe_rayon/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "maybe_rayon"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+[features]
+parallel = ["rayon"]
+
+[dependencies]
+rayon = { version = "1.5.3", optional = true }
diff --git a/maybe_rayon/src/lib.rs b/maybe_rayon/src/lib.rs
new file mode 100644
index 00000000..1a9bd823
--- /dev/null
+++ b/maybe_rayon/src/lib.rs
@@ -0,0 +1,291 @@
+//! Thin compatibility layer over `rayon`.
+//!
+//! With the `parallel` feature enabled, the `Maybe*` traits and [`join`] forward to `rayon`.
+//! Without it they fall back to the sequential `std` equivalents, so callers can keep writing
+//! `par_iter()`, `par_chunks()`, etc. whether or not `rayon` is compiled in.
+//!
+//! In the `parallel` blanket impls, calls such as `self.par_iter()` dispatch to the `rayon`
+//! trait named in the bound on `T` (bounds on a type parameter are looked up first), not to the
+//! `Maybe*` trait being implemented.
+
+// `Iterator` and `IntoIterator` come from the prelude; only the slice adaptors need importing.
+#[cfg(not(feature = "parallel"))]
+use std::slice::{Chunks, ChunksExact, ChunksExactMut, ChunksMut};
+
+#[cfg(feature = "parallel")]
+pub use rayon::prelude::{
+    IndexedParallelIterator, ParallelDrainFull, ParallelDrainRange, ParallelExtend,
+    ParallelIterator,
+};
+#[cfg(feature = "parallel")]
+use rayon::{
+    prelude::*,
+    slice::{
+        Chunks as ParChunks, ChunksExact as ParChunksExact, ChunksExactMut as ParChunksExactMut,
+        ChunksMut as ParChunksMut, ParallelSlice, ParallelSliceMut,
+    },
+};
+
+/// `par_iter()` over a shared borrow.
+///
+/// With `parallel`, implemented for every `IntoParallelRefIterator`; otherwise only for `Vec<T>`
+/// and `[T]`, where it returns a plain `std::slice::Iter`.
+pub trait MaybeParIter<'data> {
+    #[cfg(feature = "parallel")]
+    type Item: Send + 'data;
+
+    #[cfg(feature = "parallel")]
+    type Iter: ParallelIterator<Item = Self::Item>;
+
+    #[cfg(not(feature = "parallel"))]
+    type Item;
+
+    #[cfg(not(feature = "parallel"))]
+    type Iter: Iterator<Item = Self::Item>;
+
+    fn par_iter(&'data self) -> Self::Iter;
+}
+
+#[cfg(feature = "parallel")]
+impl<'data, T> MaybeParIter<'data> for T
+where
+    T: ?Sized + IntoParallelRefIterator<'data>,
+{
+    type Item = T::Item;
+    type Iter = T::Iter;
+
+    fn par_iter(&'data self) -> Self::Iter {
+        self.par_iter()
+    }
+}
+
+#[cfg(not(feature = "parallel"))]
+impl<'data, T: 'data> MaybeParIter<'data> for Vec<T> {
+    type Item = &'data T;
+    type Iter = std::slice::Iter<'data, T>;
+
+    fn par_iter(&'data self) -> Self::Iter {
+        self.iter()
+    }
+}
+
+#[cfg(not(feature = "parallel"))]
+impl<'data, T: 'data> MaybeParIter<'data> for [T] {
+    type Item = &'data T;
+    type Iter = std::slice::Iter<'data, T>;
+
+    fn par_iter(&'data self) -> Self::Iter {
+        self.iter()
+    }
+}
+
+/// `par_iter_mut()` over an exclusive borrow.
+///
+/// With `parallel`, implemented for every `IntoParallelRefMutIterator`; otherwise only for
+/// `Vec<T>` and `[T]`, where it returns a plain `std::slice::IterMut`.
+pub trait MaybeParIterMut<'data> {
+    #[cfg(feature = "parallel")]
+    type Item: Send + 'data;
+
+    #[cfg(feature = "parallel")]
+    type Iter: ParallelIterator<Item = Self::Item>;
+
+    #[cfg(not(feature = "parallel"))]
+    type Item;
+
+    #[cfg(not(feature = "parallel"))]
+    type Iter: Iterator<Item = Self::Item>;
+
+    fn par_iter_mut(&'data mut self) -> Self::Iter;
+}
+
+#[cfg(feature = "parallel")]
+impl<'data, T> MaybeParIterMut<'data> for T
+where
+    T: ?Sized + IntoParallelRefMutIterator<'data>,
+{
+    type Item = T::Item;
+    type Iter = T::Iter;
+
+    fn par_iter_mut(&'data mut self) -> Self::Iter {
+        self.par_iter_mut()
+    }
+}
+
+#[cfg(not(feature = "parallel"))]
+impl<'data, T: 'data> MaybeParIterMut<'data> for Vec<T> {
+    type Item = &'data mut T;
+    type Iter = std::slice::IterMut<'data, T>;
+
+    fn par_iter_mut(&'data mut self) -> Self::Iter {
+        self.iter_mut()
+    }
+}
+
+#[cfg(not(feature = "parallel"))]
+impl<'data, T: 'data> MaybeParIterMut<'data> for [T] {
+    type Item = &'data mut T;
+    type Iter = std::slice::IterMut<'data, T>;
+
+    fn par_iter_mut(&'data mut self) -> Self::Iter {
+        self.iter_mut()
+    }
+}
+
+/// Consuming `into_par_iter()`.
+///
+/// With `parallel`, implemented for every `IntoParallelIterator`; otherwise for every
+/// `IntoIterator`, where it is simply `into_iter()`.
+pub trait MaybeIntoParIter {
+    #[cfg(feature = "parallel")]
+    type Item: Send;
+
+    #[cfg(feature = "parallel")]
+    type Iter: ParallelIterator<Item = Self::Item>;
+
+    #[cfg(not(feature = "parallel"))]
+    type Item;
+
+    #[cfg(not(feature = "parallel"))]
+    type Iter: Iterator<Item = Self::Item>;
+
+    fn into_par_iter(self) -> Self::Iter;
+}
+
+#[cfg(feature = "parallel")]
+impl<T> MaybeIntoParIter for T
+where
+    T: IntoParallelIterator,
+{
+    type Item = T::Item;
+    type Iter = T::Iter;
+
+    fn into_par_iter(self) -> Self::Iter {
+        self.into_par_iter()
+    }
+}
+
+#[cfg(not(feature = "parallel"))]
+impl<T> MaybeIntoParIter for T
+where
+    T: IntoIterator,
+{
+    type Item = T::Item;
+    type Iter = T::IntoIter;
+
+    fn into_par_iter(self) -> Self::Iter {
+        self.into_iter()
+    }
+}
+
+/// `par_chunks` / `par_chunks_exact` backed by `rayon`'s `ParallelSlice`.
+#[cfg(feature = "parallel")]
+pub trait MaybeParChunks<T: Sync> {
+    fn par_chunks(&self, chunk_size: usize) -> ParChunks<'_, T>;
+    fn par_chunks_exact(&self, chunk_size: usize) -> ParChunksExact<'_, T>;
+}
+
+/// Sequential stand-in: `par_chunks` / `par_chunks_exact` return `std::slice` chunk iterators.
+#[cfg(not(feature = "parallel"))]
+pub trait MaybeParChunks<T> {
+    fn par_chunks(&self, chunk_size: usize) -> Chunks<'_, T>;
+    fn par_chunks_exact(&self, chunk_size: usize) -> ChunksExact<'_, T>;
+}
+
+#[cfg(feature = "parallel")]
+impl<T: ParallelSlice<U> + ?Sized, U: Sync> MaybeParChunks<U> for T {
+    fn par_chunks(&self, chunk_size: usize) -> ParChunks<'_, U> {
+        self.par_chunks(chunk_size)
+    }
+    fn par_chunks_exact(&self, chunk_size: usize) -> ParChunksExact<'_, U> {
+        self.par_chunks_exact(chunk_size)
+    }
+}
+
+#[cfg(not(feature = "parallel"))]
+impl<T> MaybeParChunks<T> for [T] {
+    fn par_chunks(&self, chunk_size: usize) -> Chunks<'_, T> {
+        self.chunks(chunk_size)
+    }
+
+    fn par_chunks_exact(&self, chunk_size: usize) -> ChunksExact<'_, T> {
+        self.chunks_exact(chunk_size)
+    }
+}
+
+/// `par_chunks_mut` / `par_chunks_exact_mut` backed by `rayon`'s `ParallelSliceMut`.
+#[cfg(feature = "parallel")]
+pub trait MaybeParChunksMut<T: Send> {
+    fn par_chunks_mut(&mut self, chunk_size: usize) -> ParChunksMut<'_, T>;
+    fn par_chunks_exact_mut(&mut self, chunk_size: usize) -> ParChunksExactMut<'_, T>;
+}
+
+/// Sequential stand-in returning `std::slice` chunk iterators. `T: Send` is required here too.
+#[cfg(not(feature = "parallel"))]
+pub trait MaybeParChunksMut<T: Send> {
+    fn par_chunks_mut(&mut self, chunk_size: usize) -> ChunksMut<'_, T>;
+    fn par_chunks_exact_mut(&mut self, chunk_size: usize) -> ChunksExactMut<'_, T>;
+}
+
+#[cfg(feature = "parallel")]
+impl<T: ?Sized + ParallelSliceMut<U>, U: Send> MaybeParChunksMut<U> for T {
+    fn par_chunks_mut(&mut self, chunk_size: usize) -> ParChunksMut<'_, U> {
+        self.par_chunks_mut(chunk_size)
+    }
+    fn par_chunks_exact_mut(&mut self, chunk_size: usize) -> ParChunksExactMut<'_, U> {
+        self.par_chunks_exact_mut(chunk_size)
+    }
+}
+
+#[cfg(not(feature = "parallel"))]
+impl<T: Send> MaybeParChunksMut<T> for [T] {
+    fn par_chunks_mut(&mut self, chunk_size: usize) -> ChunksMut<'_, T> {
+        self.chunks_mut(chunk_size)
+    }
+    fn par_chunks_exact_mut(&mut self, chunk_size: usize) -> ChunksExactMut<'_, T> {
+        self.chunks_exact_mut(chunk_size)
+    }
+}
+
+/// Gives every `Iterator` a `find_any` method, implemented with `Iterator::find`.
+///
+/// The trait and its blanket impl carry no `cfg` gate, so they exist in both configurations.
+pub trait ParallelIteratorMock {
+    type Item;
+    fn find_any<P>(self, predicate: P) -> Option<Self::Item>
+    where
+        P: Fn(&Self::Item) -> bool + Sync + Send;
+}
+
+impl<T: Iterator> ParallelIteratorMock for T {
+    type Item = T::Item;
+
+    fn find_any<P>(mut self, predicate: P) -> Option<Self::Item>
+    where
+        P: Fn(&Self::Item) -> bool + Sync + Send,
+    {
+        self.find(predicate)
+    }
+}
+
+/// Runs both closures via `rayon::join` and returns their results as a pair.
+#[cfg(feature = "parallel")]
+pub fn join<A, B, RA, RB>(oper_a: A, oper_b: B) -> (RA, RB)
+where
+    A: FnOnce() -> RA + Send,
+    B: FnOnce() -> RB + Send,
+    RA: Send,
+    RB: Send,
+{
+    rayon::join(oper_a, oper_b)
+}
+
+/// Sequential stand-in for `rayon::join`: runs `oper_a`, then `oper_b`, on the calling thread.
+#[cfg(not(feature = "parallel"))]
+pub fn join<A, B, RA, RB>(oper_a: A, oper_b: B) -> (RA, RB)
+where
+    A: FnOnce() -> RA,
+    B: FnOnce() -> RB,
+{
+    (oper_a(), oper_b())
+}
diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml
index 9c019640..b3fa5113 100644
--- a/plonky2/Cargo.toml
+++ b/plonky2/Cargo.toml
@@ -10,16 +10,23 @@ categories = ["cryptography"]
 edition = "2021"
 default-run = "generate_constants"
 
+[features]
+default = ["parallel", "rand", "rand_chacha", "timing", "gate_testing"]
+parallel = ["maybe_rayon/parallel"]
+rand = ["dep:rand", "plonky2_field/rand"]
+gate_testing = ["rand"]
+rand_chacha = ["dep:rand_chacha"]
+timing = []
+
 [dependencies]
 plonky2_field = { path = "../field" }
 plonky2_util = { path = "../util" }
-env_logger = "0.9.0"
 log = "0.4.14"
 itertools = "0.10.0"
 num = { version = "0.4", features = [ "rand" ] }
-rand = "0.8.4"
-rand_chacha = "0.3.1"
-rayon = "1.5.1"
+rand = { version = "0.8.4", optional = true }
+rand_chacha = { version = "0.3.1", optional = true }
+maybe_rayon = { path = "../maybe_rayon" }
 unroll = "0.1.5"
 anyhow = "1.0.40"
 serde = { version = "1.0", features = ["derive"] }
@@ -28,14 +35,22 @@ keccak-hash = "0.8.0"
 static_assertions = "1.1.0"
 
 [dev-dependencies]
+rand = "0.8.4"
+rand_chacha = "0.3.1"
 criterion = "0.3.5"
+env_logger = "0.9.0"
 tynm = "0.1.6"
 structopt = "0.3.26"
 num_cpus = "1.13.1"
+rayon = "1.5.1"
 
 [target.'cfg(not(target_env = "msvc"))'.dev-dependencies]
 jemallocator = "0.3.2"
 
+[[bin]]
+name = "generate_constants"
+required-features = ["rand", "rand_chacha"]
+
 [[bench]]
 name = "field_arithmetic"
 harness = false
diff --git a/plonky2/examples/bench_recursion.rs b/plonky2/examples/bench_recursion.rs
index 1f2d127f..8073c9dc 100644
--- a/plonky2/examples/bench_recursion.rs
+++ b/plonky2/examples/bench_recursion.rs
@@ -2,7 +2,6 @@
 // custom CLI argument parsing (even with harness disabled). We could also have
 // put it in `src/bin/`, but then we wouldn't have access to
 // `[dev-dependencies]`.
-
 #![allow(incomplete_features)]
 #![feature(generic_const_exprs)]
 
diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs
index 312b458b..1f5b648f 100644
--- a/plonky2/src/fri/oracle.rs
+++ b/plonky2/src/fri/oracle.rs
@@ -1,11 +1,11 @@
 use itertools::Itertools;
+use maybe_rayon::*;
 use plonky2_field::extension::Extendable;
 use plonky2_field::fft::FftRootTable;
 use plonky2_field::packed::PackedField;
 use plonky2_field::polynomial::{PolynomialCoeffs, PolynomialValues};
 use plonky2_field::types::Field;
 use plonky2_util::{log2_strict, reverse_index_bits_in_place};
-use rayon::prelude::*;
 
 use crate::fri::proof::FriProof;
 use crate::fri::prover::fri_proof;
diff --git a/plonky2/src/fri/prover.rs b/plonky2/src/fri/prover.rs
index 6136a9a1..39e25869 100644
--- a/plonky2/src/fri/prover.rs
+++ b/plonky2/src/fri/prover.rs
@@ -1,8 +1,8 @@
 use itertools::Itertools;
+use maybe_rayon::*;
 use plonky2_field::extension::{flatten, unflatten, Extendable};
 use plonky2_field::polynomial::{PolynomialCoeffs, PolynomialValues};
 use plonky2_util::reverse_index_bits_in_place;
-use rayon::prelude::*;
 
 use crate::fri::proof::{FriInitialTreeProof, FriProof, FriQueryRound, FriQueryStep};
 use crate::fri::{FriConfig, FriParams};
diff --git a/plonky2/src/gates/exponentiation.rs b/plonky2/src/gates/exponentiation.rs
index db12d697..aa977308 100644
--- a/plonky2/src/gates/exponentiation.rs
+++ b/plonky2/src/gates/exponentiation.rs
@@ -23,7 +23,7 @@ use crate::plonk::vars::{
 
 /// A gate for raising a value to a power.
 #[derive(Clone, Debug)]
-pub(crate) struct ExponentiationGate<F: RichField + Extendable<D>, const D: usize> {
+pub struct ExponentiationGate<F: RichField + Extendable<D>, const D: usize> {
     pub num_power_bits: usize,
     pub _phantom: PhantomData<F>,
 }
diff --git a/plonky2/src/gates/interpolation.rs b/plonky2/src/gates/interpolation.rs
index c98f7fe3..1983e5aa 100644
--- a/plonky2/src/gates/interpolation.rs
+++ b/plonky2/src/gates/interpolation.rs
@@ -22,7 +22,7 @@ use crate::plonk::vars::{EvaluationTargets, EvaluationVars, EvaluationVarsBase};
 /// Interpolation gate with constraints of degree at most `1<<subgroup_bits`.
 /// `eval_unfiltered_recursively` uses less gates than `LowDegreeInterpolationGate`.
 #[derive(Copy, Clone, Debug)]
-pub(crate) struct HighDegreeInterpolationGate<F: RichField + Extendable<D>, const D: usize> {
+pub struct HighDegreeInterpolationGate<F: RichField + Extendable<D>, const D: usize> {
     pub subgroup_bits: usize,
     _phantom: PhantomData<F>,
 }
diff --git a/plonky2/src/gates/low_degree_interpolation.rs b/plonky2/src/gates/low_degree_interpolation.rs
index 4852792a..217f4f0a 100644
--- a/plonky2/src/gates/low_degree_interpolation.rs
+++ b/plonky2/src/gates/low_degree_interpolation.rs
@@ -23,7 +23,7 @@ use crate::plonk::vars::{EvaluationTargets, EvaluationVars, EvaluationVarsBase};
 /// Interpolation gate with constraints of degree 2.
 /// `eval_unfiltered_recursively` uses more gates than `HighDegreeInterpolationGate`.
 #[derive(Copy, Clone, Debug)]
-pub(crate) struct LowDegreeInterpolationGate<F: RichField + Extendable<D>, const D: usize> {
+pub struct LowDegreeInterpolationGate<F: RichField + Extendable<D>, const D: usize> {
     pub subgroup_bits: usize,
     _phantom: PhantomData<F>,
 }
diff --git a/plonky2/src/gates/mod.rs b/plonky2/src/gates/mod.rs
index d02f2978..df65b44c 100644
--- a/plonky2/src/gates/mod.rs
+++ b/plonky2/src/gates/mod.rs
@@ -14,8 +14,8 @@ pub mod multiplication_extension;
 pub mod noop;
 pub mod packed_util;
 pub mod poseidon;
-pub(crate) mod poseidon_mds;
-pub(crate) mod public_input;
+pub mod poseidon_mds;
+pub mod public_input;
 pub mod random_access;
 pub mod reducing;
 pub mod reducing_extension;
@@ -24,4 +24,5 @@ pub mod util;
 
 // Can't use #[cfg(test)] here because it needs to be visible to other crates.
 // See https://github.com/rust-lang/cargo/issues/8379
+#[cfg(any(feature = "gate_testing", test))]
 pub mod gate_testing;
diff --git a/plonky2/src/gates/random_access.rs b/plonky2/src/gates/random_access.rs
index b1f1d529..2df392bc 100644
--- a/plonky2/src/gates/random_access.rs
+++ b/plonky2/src/gates/random_access.rs
@@ -23,7 +23,7 @@ use crate::plonk::vars::{
 
 /// A gate for checking that a particular element of a list matches a given value.
 #[derive(Copy, Clone, Debug)]
-pub(crate) struct RandomAccessGate<F: RichField + Extendable<D>, const D: usize> {
+pub struct RandomAccessGate<F: RichField + Extendable<D>, const D: usize> {
     pub bits: usize,
     pub num_copies: usize,
     pub num_extra_constants: usize,
diff --git a/plonky2/src/hash/hash_types.rs b/plonky2/src/hash/hash_types.rs
index 281930a5..14303ad3 100644
--- a/plonky2/src/hash/hash_types.rs
+++ b/plonky2/src/hash/hash_types.rs
@@ -1,6 +1,5 @@
 use plonky2_field::goldilocks_field::GoldilocksField;
 use plonky2_field::types::{Field, PrimeField64};
-use rand::Rng;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 
 use crate::hash::poseidon::Poseidon;
@@ -37,7 +36,8 @@ impl<F: Field> HashOut<F> {
         Self { elements }
     }
 
-    pub fn rand_from_rng<R: Rng>(rng: &mut R) -> Self {
+    #[cfg(feature = "rand")]
+    pub fn rand_from_rng<R: rand::Rng>(rng: &mut R) -> Self {
         Self {
             elements: [
                 F::rand_from_rng(rng),
@@ -115,12 +115,14 @@ pub struct MerkleCapTarget(pub Vec<HashOutTarget>);
 pub struct BytesHash<const N: usize>(pub [u8; N]);
 
 impl<const N: usize> BytesHash<N> {
-    pub fn rand_from_rng<R: Rng>(rng: &mut R) -> Self {
+    #[cfg(feature = "rand")]
+    pub fn rand_from_rng<R: rand::Rng>(rng: &mut R) -> Self {
         let mut buf = [0; N];
         rng.fill_bytes(&mut buf);
         Self(buf)
     }
 
+    #[cfg(feature = "rand")]
     pub fn rand() -> Self {
         Self::rand_from_rng(&mut rand::thread_rng())
     }
diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 69cf2ef9..1da66bff 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -1,8 +1,8 @@
 use std::mem::MaybeUninit;
 use std::slice;
 
+use maybe_rayon::*;
 use plonky2_util::log2_strict;
-use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 
 use crate::hash::hash_types::RichField;
@@ -77,10 +77,12 @@ where
         let (right_digest_mem, right_digests_buf) = right_digests_buf.split_first_mut().unwrap();
         // Split `leaves` between both children.
         let (left_leaves, right_leaves) = leaves.split_at(leaves.len() / 2);
-        let (left_digest, right_digest) = rayon::join(
+
+        let (left_digest, right_digest) = maybe_rayon::join(
             || fill_subtree::<F, H>(left_digests_buf, left_leaves),
             || fill_subtree::<F, H>(right_digests_buf, right_leaves),
         );
+
         left_digest_mem.write(left_digest);
         right_digest_mem.write(right_digest);
         H::two_to_one(left_digest, right_digest)
diff --git a/plonky2/src/plonk/permutation_argument.rs b/plonky2/src/plonk/permutation_argument.rs
index 076c2a7a..3658a12d 100644
--- a/plonky2/src/plonk/permutation_argument.rs
+++ b/plonky2/src/plonk/permutation_argument.rs
@@ -1,8 +1,8 @@
 use std::collections::HashMap;
 
+use maybe_rayon::*;
 use plonky2_field::polynomial::PolynomialValues;
 use plonky2_field::types::Field;
-use rayon::prelude::*;
 
 use crate::iop::target::Target;
 use crate::iop::wire::Wire;
diff --git a/plonky2/src/plonk/proof.rs b/plonky2/src/plonk/proof.rs
index 18af1f73..922a24bb 100644
--- a/plonky2/src/plonk/proof.rs
+++ b/plonky2/src/plonk/proof.rs
@@ -1,6 +1,6 @@
 use anyhow::ensure;
+use maybe_rayon::*;
 use plonky2_field::extension::Extendable;
-use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 
 use crate::fri::oracle::PolynomialBatch;
diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs
index 26626208..3e81942b 100644
--- a/plonky2/src/plonk/prover.rs
+++ b/plonky2/src/plonk/prover.rs
@@ -2,11 +2,11 @@ use std::mem::swap;
 
 use anyhow::ensure;
 use anyhow::Result;
+use maybe_rayon::*;
 use plonky2_field::extension::Extendable;
 use plonky2_field::polynomial::{PolynomialCoeffs, PolynomialValues};
 use plonky2_field::zero_poly_coset::ZeroPolyOnCoset;
 use plonky2_util::{ceil_div_usize, log2_ceil};
-use rayon::prelude::*;
 
 use crate::field::types::Field;
 use crate::fri::oracle::PolynomialBatch;
diff --git a/plonky2/src/util/timing.rs b/plonky2/src/util/timing.rs
index 4250d688..42033038 100644
--- a/plonky2/src/util/timing.rs
+++ b/plonky2/src/util/timing.rs
@@ -1,8 +1,10 @@
+#[cfg(feature = "timing")]
 use std::time::{Duration, Instant};
 
 use log::{log, Level};
 
 /// The hierarchy of scopes, and the time consumed by each one. Useful for profiling.
+#[cfg(feature = "timing")]
 pub struct TimingTree {
     /// The name of this scope.
     name: String,
@@ -16,13 +18,25 @@ pub struct TimingTree {
     children: Vec<TimingTree>,
 }
 
+#[cfg(not(feature = "timing"))]
+pub struct TimingTree(Level);
+
+#[cfg(feature = "timing")]
 impl Default for TimingTree {
     fn default() -> Self {
         TimingTree::new("root", Level::Debug)
     }
 }
 
+#[cfg(not(feature = "timing"))]
+impl Default for TimingTree {
+    fn default() -> Self {
+        TimingTree::new("", Level::Debug)
+    }
+}
+
 impl TimingTree {
+    #[cfg(feature = "timing")]
     pub fn new(root_name: &str, level: Level) -> Self {
         Self {
             name: root_name.to_string(),
@@ -33,18 +47,26 @@ impl TimingTree {
         }
     }
 
+    #[cfg(not(feature = "timing"))]
+    pub fn new(_root_name: &str, level: Level) -> Self {
+        Self(level)
+    }
+
     /// Whether this scope is still in scope.
+    #[cfg(feature = "timing")]
     fn is_open(&self) -> bool {
         self.exit_time.is_none()
     }
 
     /// A description of the stack of currently-open scopes.
+    #[cfg(feature = "timing")]
     pub fn open_stack(&self) -> String {
         let mut stack = Vec::new();
         self.open_stack_helper(&mut stack);
         stack.join(" > ")
     }
 
+    #[cfg(feature = "timing")]
     fn open_stack_helper(&self, stack: &mut Vec<String>) {
         if self.is_open() {
             stack.push(self.name.clone());
@@ -54,6 +76,7 @@ impl TimingTree {
         }
     }
 
+    #[cfg(feature = "timing")]
     pub fn push(&mut self, ctx: &str, mut level: log::Level) {
         assert!(self.is_open());
 
@@ -76,7 +99,11 @@ impl TimingTree {
         })
     }
 
+    #[cfg(not(feature = "timing"))]
+    pub fn push(&mut self, _ctx: &str, _level: log::Level) {}
+
     /// Close the deepest open scope from this tree.
+    #[cfg(feature = "timing")]
     pub fn pop(&mut self) {
         assert!(self.is_open());
 
@@ -90,6 +117,10 @@ impl TimingTree {
         self.exit_time = Some(Instant::now());
     }
 
+    #[cfg(not(feature = "timing"))]
+    pub fn pop(&mut self) {}
+
+    #[cfg(feature = "timing")]
     fn duration(&self) -> Duration {
         self.exit_time
             .unwrap_or_else(Instant::now)
@@ -97,6 +128,7 @@ impl TimingTree {
     }
 
     /// Filter out children with a low duration.
+    #[cfg(feature = "timing")]
     pub fn filter(&self, min_delta: Duration) -> Self {
         Self {
             name: self.name.clone(),
@@ -112,10 +144,20 @@ impl TimingTree {
         }
     }
 
+    #[cfg(feature = "timing")]
     pub fn print(&self) {
         self.print_helper(0);
     }
 
+    #[cfg(not(feature = "timing"))]
+    pub fn print(&self) {
+        log!(
+            self.0,
+            "TimingTree is not supported without the 'timing' feature enabled"
+        );
+    }
+
+    #[cfg(feature = "timing")]
     fn print_helper(&self, depth: usize) {
         let prefix = "| ".repeat(depth);
         log!(
diff --git a/starky/Cargo.toml b/starky/Cargo.toml
index 4e67856d..80a26bfc 100644
--- a/starky/Cargo.toml
+++ b/starky/Cargo.toml
@@ -4,6 +4,10 @@ description = "Implementation of STARKs"
 version = "0.1.0"
 edition = "2021"
 
+[features]
+default = ["parallel"]
+parallel = ["maybe_rayon/parallel"]
+
 [dependencies]
 plonky2 = { path = "../plonky2" }
 plonky2_util = { path = "../util" }
@@ -11,4 +15,4 @@ anyhow = "1.0.40"
 env_logger = "0.9.0"
 itertools = "0.10.0"
 log = "0.4.14"
-rayon = "1.5.1"
+maybe_rayon = { path = "../maybe_rayon" }
diff --git a/starky/src/permutation.rs b/starky/src/permutation.rs
index 88361003..7d422171 100644
--- a/starky/src/permutation.rs
+++ b/starky/src/permutation.rs
@@ -1,6 +1,7 @@
 //! Permutation arguments.
 
 use itertools::Itertools;
+use maybe_rayon::*;
 use plonky2::field::batch_util::batch_multiply_inplace;
 use plonky2::field::extension::{Extendable, FieldExtension};
 use plonky2::field::packed::PackedField;
@@ -13,7 +14,6 @@ use plonky2::iop::target::Target;
 use plonky2::plonk::circuit_builder::CircuitBuilder;
 use plonky2::plonk::config::{AlgebraicHasher, GenericConfig, Hasher};
 use plonky2::util::reducing::{ReducingFactor, ReducingFactorTarget};
-use rayon::prelude::*;
 
 use crate::config::StarkConfig;
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
diff --git a/starky/src/proof.rs b/starky/src/proof.rs
index c321b484..c9900c08 100644
--- a/starky/src/proof.rs
+++ b/starky/src/proof.rs
@@ -1,4 +1,5 @@
 use itertools::Itertools;
+use maybe_rayon::*;
 use plonky2::field::extension::{Extendable, FieldExtension};
 use plonky2::fri::oracle::PolynomialBatch;
 use plonky2::fri::proof::{
@@ -12,7 +13,6 @@ use plonky2::hash::merkle_tree::MerkleCap;
 use plonky2::iop::ext_target::ExtensionTarget;
 use plonky2::iop::target::Target;
 use plonky2::plonk::config::GenericConfig;
-use rayon::prelude::*;
 
 use crate::config::StarkConfig;
 use crate::permutation::PermutationChallengeSet;
diff --git a/starky/src/prover.rs b/starky/src/prover.rs
index 6cdb1384..24593b45 100644
--- a/starky/src/prover.rs
+++ b/starky/src/prover.rs
@@ -2,6 +2,7 @@ use std::iter::once;
 
 use anyhow::{ensure, Result};
 use itertools::Itertools;
+use maybe_rayon::*;
 use plonky2::field::extension::Extendable;
 use plonky2::field::packable::Packable;
 use plonky2::field::packed::PackedField;
@@ -16,7 +17,6 @@ use plonky2::timed;
 use plonky2::util::timing::TimingTree;
 use plonky2::util::transpose;
 use plonky2_util::{log2_ceil, log2_strict};
-use rayon::prelude::*;
 
 use crate::config::StarkConfig;
 use crate::constraint_consumer::ConstraintConsumer;