diff --git a/evm/src/all_stark.rs b/evm/src/all_stark.rs
index 74f211d6..a2996eb4 100644
--- a/evm/src/all_stark.rs
+++ b/evm/src/all_stark.rs
@@ -1,5 +1,6 @@
 use std::iter;
 
+use itertools::Itertools;
 use plonky2::field::extension::Extendable;
 use plonky2::field::types::Field;
 use plonky2::hash::hash_types::RichField;
diff --git a/evm/src/arithmetic/arithmetic_stark.rs b/evm/src/arithmetic/arithmetic_stark.rs
index 52fc2f78..1d3af2b9 100644
--- a/evm/src/arithmetic/arithmetic_stark.rs
+++ b/evm/src/arithmetic/arithmetic_stark.rs
@@ -11,7 +11,7 @@ use plonky2::plonk::circuit_builder::CircuitBuilder;
 use plonky2::util::transpose;
 use static_assertions::const_assert;
 
-use super::columns::NUM_ARITH_COLUMNS;
+use super::columns::{op_flags, NUM_ARITH_COLUMNS};
 use super::shift;
 use crate::all_stark::Table;
 use crate::arithmetic::columns::{NUM_SHARED_COLS, RANGE_COUNTER, RC_FREQUENCIES, SHARED_COLS};
@@ -208,6 +208,12 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for ArithmeticStark<F, D>
         let lv: &[P; NUM_ARITH_COLUMNS] = vars.get_local_values().try_into().unwrap();
         let nv: &[P; NUM_ARITH_COLUMNS] = vars.get_next_values().try_into().unwrap();
 
+        // Flags must be boolean.
+        for flag_idx in op_flags() {
+            let flag = lv[flag_idx];
+            yield_constr.constraint(flag * (flag - P::ONES));
+        }
+
         // Check that `OPCODE_COL` holds 0 if the operation is not a range_check.
         let opcode_constraint = (P::ONES - lv[columns::IS_RANGE_CHECK]) * lv[columns::OPCODE_COL];
         yield_constr.constraint(opcode_constraint);
@@ -248,6 +254,13 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for ArithmeticStark<F, D>
         let nv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS] =
             vars.get_next_values().try_into().unwrap();
 
+        // Flags must be boolean.
+        for flag_idx in op_flags() {
+            let flag = lv[flag_idx];
+            let constraint = builder.mul_sub_extension(flag, flag, flag);
+            yield_constr.constraint(builder, constraint);
+        }
+
         // Check that `OPCODE_COL` holds 0 if the operation is not a range_check.
         let opcode_constraint = builder.arithmetic_extension(
             F::NEG_ONE,
diff --git a/evm/src/arithmetic/columns.rs b/evm/src/arithmetic/columns.rs
index bcfb29c1..dc535492 100644
--- a/evm/src/arithmetic/columns.rs
+++ b/evm/src/arithmetic/columns.rs
@@ -43,6 +43,10 @@ pub(crate) const IS_RANGE_CHECK: usize = IS_SHR + 1;
 pub(crate) const OPCODE_COL: usize = IS_RANGE_CHECK + 1;
 pub(crate) const START_SHARED_COLS: usize = OPCODE_COL + 1;
 
+pub(crate) const fn op_flags() -> Range<usize> {
+    IS_ADD..IS_RANGE_CHECK + 1
+}
+
 /// Within the Arithmetic Unit, there are shared columns which can be
 /// used by any arithmetic circuit, depending on which one is active
 /// this cycle.
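The new `flag * (flag - P::ONES)` constraints rely on the standard booleanity identity: over a field, `x * (x - 1) = 0` holds exactly when `x` is 0 or 1. A minimal standalone sketch of the identity being relied on (plain integers stand in for field elements; not part of the diff):

```rust
/// Booleanity check: p(x) = x * (x - 1) vanishes iff x is 0 or 1.
fn booleanity(x: i64) -> i64 {
    x * (x - 1)
}

fn main() {
    assert_eq!(booleanity(0), 0); // 0 is boolean
    assert_eq!(booleanity(1), 0); // 1 is boolean
    assert_ne!(booleanity(2), 0); // any other value fails the constraint
}
```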
diff --git a/evm/src/cpu/contextops.rs b/evm/src/cpu/contextops.rs
index 0afc070f..77debc75 100644
--- a/evm/src/cpu/contextops.rs
+++ b/evm/src/cpu/contextops.rs
@@ -105,7 +105,7 @@
 }
 
 /// Circuit version of `eval_packed_get`.
-/// Evalutes constraints for GET_CONTEXT.
+/// Evaluates constraints for GET_CONTEXT.
 fn eval_ext_circuit_get<F: RichField + Extendable<D>, const D: usize>(
     builder: &mut CircuitBuilder<F, D>,
     lv: &CpuColumnsView<ExtensionTarget<D>>,
diff --git a/evm/src/cpu/kernel/asm/bignum/cmp.asm b/evm/src/cpu/kernel/asm/bignum/cmp.asm
index d5abd238..cf6b547f 100644
--- a/evm/src/cpu/kernel/asm/bignum/cmp.asm
+++ b/evm/src/cpu/kernel/asm/bignum/cmp.asm
@@ -12,17 +12,19 @@ global cmp_bignum:
     // stack: len, a_start_loc, b_start_loc, retdest
     SWAP1
     // stack: a_start_loc, len, b_start_loc, retdest
-    DUP2
-    // stack: len, a_start_loc, len, b_start_loc, retdest
+    PUSH 1
+    DUP3
+    SUB
+    // stack: len-1, a_start_loc, len, b_start_loc, retdest
     ADD
-    %decrement
     // stack: a_end_loc, len, b_start_loc, retdest
     SWAP2
     // stack: b_start_loc, len, a_end_loc, retdest
-    DUP2
-    // stack: len, b_start_loc, len, a_end_loc, retdest
+    PUSH 1
+    DUP3
+    SUB
+    // stack: len-1, b_start_loc, len, a_end_loc, retdest
     ADD
-    %decrement
     // stack: b_end_loc, len, a_end_loc, retdest
     %stack (b, l, a) -> (l, a, b)
     // stack: len, a_end_loc, b_end_loc, retdest
diff --git a/evm/src/cpu/kernel/asm/bignum/shr.asm b/evm/src/cpu/kernel/asm/bignum/shr.asm
index adf95770..16d8403c 100644
--- a/evm/src/cpu/kernel/asm/bignum/shr.asm
+++ b/evm/src/cpu/kernel/asm/bignum/shr.asm
@@ -42,9 +42,9 @@ shr_loop:
     // stack: i, carry << 127 | a[i] >> 1, i, new_carry, start_loc, retdest
     %mstore_current_general
     // stack: i, new_carry, start_loc, retdest
-    DUP1
-    // stack: i, i, new_carry, start_loc, retdest
-    %decrement
+    PUSH 1
+    DUP2
+    SUB
     // stack: i-1, i, new_carry, start_loc, retdest
     SWAP1
     // stack: i, i-1, new_carry, start_loc, retdest
diff --git a/evm/src/cpu/kernel/asm/bloom_filter.asm b/evm/src/cpu/kernel/asm/bloom_filter.asm
index 0362d710..8fa84a80 100644
--- a/evm/src/cpu/kernel/asm/bloom_filter.asm
+++ b/evm/src/cpu/kernel/asm/bloom_filter.asm
@@ -142,10 +142,11 @@ logs_bloom_end:
 // Also updates the block bloom filter.
 %macro bloom_write_bit
     // stack: byte_index, byte_bit_index
-    DUP2
-    // stack: byte_bit_index, byte_index, byte_bit_index
+    PUSH 1
+    DUP3
+    // stack: byte_bit_index, 1, byte_index, byte_bit_index
     PUSH 7 SUB
-    PUSH 1 SWAP1 SHL
+    SHL
     // Updates the current txn bloom filter.
     SWAP2 POP DUP1
     %mload_kernel(@SEGMENT_TXN_BLOOM)
diff --git a/evm/src/cpu/kernel/asm/core/gas.asm b/evm/src/cpu/kernel/asm/core/gas.asm
index d5e4e9bb..2e16c373 100644
--- a/evm/src/cpu/kernel/asm/core/gas.asm
+++ b/evm/src/cpu/kernel/asm/core/gas.asm
@@ -122,7 +122,7 @@ global sys_gasprice:
 // L(n) = n - floor(n / 64)
 %macro all_but_one_64th
     // stack: n
-    DUP1 %div_const(64)
+    DUP1 %shr_const(6)
     // stack: floor(n / 64), n
     SWAP1 SUB
     // stack: n - floor(n / 64)
diff --git a/evm/src/cpu/kernel/asm/curve/bn254/field_arithmetic/util.asm b/evm/src/cpu/kernel/asm/curve/bn254/field_arithmetic/util.asm
index 6dbddddc..86b179ba 100644
--- a/evm/src/cpu/kernel/asm/curve/bn254/field_arithmetic/util.asm
+++ b/evm/src/cpu/kernel/asm/curve/bn254/field_arithmetic/util.asm
@@ -1,7 +1,7 @@
 // Load a single value from bn254 pairings memory.
 %macro mload_bn254_pairing
     // stack: offset
-    %mload_current(@SEGMENT_KERNEL_BN_PAIRING)
+    %mload_current(@SEGMENT_BN_PAIRING)
     // stack: value
 %endmacro
 
@@ -9,14 +9,14 @@
     // stack:
     PUSH $offset
     // stack: offset
-    %mload_current(@SEGMENT_KERNEL_BN_PAIRING)
+    %mload_current(@SEGMENT_BN_PAIRING)
     // stack: value
 %endmacro
 
 // Store a single value to bn254 pairings memory.
%macro mstore_bn254_pairing // stack: offset, value - %mstore_current(@SEGMENT_KERNEL_BN_PAIRING) + %mstore_current(@SEGMENT_BN_PAIRING) // stack: %endmacro @@ -24,7 +24,7 @@ // stack: value PUSH $offset // stack: offset, value - %mstore_current(@SEGMENT_KERNEL_BN_PAIRING) + %mstore_current(@SEGMENT_BN_PAIRING) // stack: %endmacro diff --git a/evm/src/cpu/kernel/asm/curve/secp256k1/ecrecover.asm b/evm/src/cpu/kernel/asm/curve/secp256k1/ecrecover.asm index c84536d8..c1103100 100644 --- a/evm/src/cpu/kernel/asm/curve/secp256k1/ecrecover.asm +++ b/evm/src/cpu/kernel/asm/curve/secp256k1/ecrecover.asm @@ -87,9 +87,9 @@ ecdsa_after_precompute_loop: %mul_const(2) ADD %mul_const(2) ADD %mul_const(2) ADD %stack (index, i, accx, accy, a0, a1, b0, b1, retdest) -> (index, index, i, accx, accy, a0, a1, b0, b1, retdest) %mul_const(2) %add_const(1) - %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mload_current(@SEGMENT_ECDSA_TABLE) SWAP1 %mul_const(2) - %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mload_current(@SEGMENT_ECDSA_TABLE) %stack (Px, Py, i, accx, accy, a0, a1, b0, b1, retdest) -> (Px, Py, accx, accy, ecdsa_after_precompute_loop_contd, i, a0, a1, b0, b1, retdest) %jump(secp_add_valid_points) ecdsa_after_precompute_loop_contd: @@ -97,8 +97,9 @@ ecdsa_after_precompute_loop_contd: ISZERO %jumpi(ecdsa_after_precompute_loop_end) %jump(secp_double) ecdsa_after_precompute_loop_contd2: - %stack (accx, accy, i, a0, a1, b0, b1, retdest) -> (i, accx, accy, a0, a1, b0, b1, retdest) - %decrement %jump(ecdsa_after_precompute_loop) + %stack (accx, accy, i, a0, a1, b0, b1, retdest) -> (i, 1, accx, accy, a0, a1, b0, b1, retdest) + SUB // i - 1 + %jump(ecdsa_after_precompute_loop) ecdsa_after_precompute_loop_end: // Check that the public key is not the point at infinity. See https://github.com/ethereum/eth-keys/pull/76 for discussion. DUP2 DUP2 ISZERO SWAP1 ISZERO MUL %jumpi(pk_is_infinity) diff --git a/evm/src/cpu/kernel/asm/curve/secp256k1/precomputation.asm b/evm/src/cpu/kernel/asm/curve/secp256k1/precomputation.asm index 3cea0315..b6bed1b0 100644 --- a/evm/src/cpu/kernel/asm/curve/secp256k1/precomputation.asm +++ b/evm/src/cpu/kernel/asm/curve/secp256k1/precomputation.asm @@ -1,27 +1,27 @@ // Initial stack: Gneg, Qneg, Qx, Qy, retdest -// Compute a*G ± b*phi(G) + c*Q ± d*phi(Q) for a,b,c,d in {0,1}^4 and store its x-coordinate at location `2*(8a+4b+2c+d)` and its y-coordinate at location `2*(8a+4b+2c+d)+1` in the SEGMENT_KERNEL_ECDSA_TABLE segment. +// Compute a*G ± b*phi(G) + c*Q ± d*phi(Q) for a,b,c,d in {0,1}^4 and store its x-coordinate at location `2*(8a+4b+2c+d)` and its y-coordinate at location `2*(8a+4b+2c+d)+1` in the SEGMENT_ECDSA_TABLE segment. global secp_precompute_table: // First store G, ± phi(G), G ± phi(G) // Use Gneg for the ±, e.g., ±phi(G) is computed as `Gneg * (-phi(G)) + (1-Gneg)*phi(G)` (note only the y-coordinate needs to be filtered). 
// stack: Gneg, Qneg, Qx, Qy, retdest PUSH 32670510020758816978083085130507043184471273380659243275938904335757337482424 PUSH 17 PUSH 55066263022277343669578718895168534326250603453777594175500187360389116729240 PUSH 16 - %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mstore_current(@SEGMENT_ECDSA_TABLE) %mstore_current(@SEGMENT_ECDSA_TABLE) DUP1 DUP1 %mul_const(32670510020758816978083085130507043184471273380659243275938904335757337482424) SWAP1 PUSH 1 SUB %mul_const(83121579216557378445487899878180864668798711284981320763518679672151497189239) ADD PUSH 9 PUSH 85340279321737800624759429340272274763154997815782306132637707972559913914315 PUSH 8 - %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mstore_current(@SEGMENT_ECDSA_TABLE) %mstore_current(@SEGMENT_ECDSA_TABLE) DUP1 DUP1 %mul_const(83121579216557378445487899878180864668798711284981320763518679672151497189239) SWAP1 PUSH 1 SUB %mul_const(100652675408719987021357910538015346127426077519185866739835120963490438734674) ADD PUSH 25 - %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mstore_current(@SEGMENT_ECDSA_TABLE) DUP1 %mul_const(91177636130617246552803821781935006617134368061721227770777272682868638699771) SWAP1 PUSH 1 SUB %mul_const(66837770201594535779099350687042404727408598709762866365333192677982385899440) ADD PUSH 24 - %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mstore_current(@SEGMENT_ECDSA_TABLE) // Then store Q, ±phi(Q), Q ± phi(Q) %stack (Qneg, Qx, Qy, retdest) -> (4, Qx, 5, Qy, Qx, @SECP_BASE, Qneg, Qx, Qy, retdest) - %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mstore_current(@SEGMENT_ECDSA_TABLE) %mstore_current(@SEGMENT_ECDSA_TABLE) // stack: Qx, @SECP_BASE, Qx, Qy, retdest PUSH @SECP_GLV_BETA MULMOD %stack (betaQx, Qneg, Qx, Qy, retdest) -> (Qneg, Qy, Qneg, betaQx, Qx, Qy, retdest) @@ -29,42 +29,42 @@ global secp_precompute_table: // stack: 1-Qneg, Qneg*Qy, betaQx, Qx, Qy, retdest DUP5 PUSH @SECP_BASE SUB MUL ADD %stack (selectQy, betaQx, Qx, Qy, retdest) -> (2, betaQx, 3, selectQy, betaQx, selectQy, Qx, Qy, precompute_table_contd, retdest) - %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mstore_current(@SEGMENT_ECDSA_TABLE) %mstore_current(@SEGMENT_ECDSA_TABLE) %jump(secp_add_valid_points_no_edge_case) precompute_table_contd: %stack (x, y, retdest) -> (6, x, 7, y, retdest) - %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mstore_current(@SEGMENT_ECDSA_TABLE) %mstore_current(@SEGMENT_ECDSA_TABLE) PUSH 2 // Use a loop to store a*G ± b*phi(G) + c*Q ± d*phi(Q) for a,b,c,d in {0,1}^4. 
precompute_table_loop: // stack: i, retdest - DUP1 %increment %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + DUP1 %increment %mload_current(@SEGMENT_ECDSA_TABLE) %stack (y, i, retdest) -> (i, y, i, retdest) - %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + %mload_current(@SEGMENT_ECDSA_TABLE) PUSH precompute_table_loop_contd DUP3 DUP3 - PUSH 9 %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) - PUSH 8 %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + PUSH 9 %mload_current(@SEGMENT_ECDSA_TABLE) + PUSH 8 %mload_current(@SEGMENT_ECDSA_TABLE) // stack: Gx, Gy, x, y, precompute_table_loop_contd, x, y, i, retdest %jump(secp_add_valid_points) precompute_table_loop_contd: %stack (Rx, Ry, x, y, i, retdest) -> (i, 8, Rx, i, 9, Ry, x, y, i, retdest) - ADD %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) ADD %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + ADD %mstore_current(@SEGMENT_ECDSA_TABLE) ADD %mstore_current(@SEGMENT_ECDSA_TABLE) DUP2 DUP2 - PUSH 17 %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) - PUSH 16 %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + PUSH 17 %mload_current(@SEGMENT_ECDSA_TABLE) + PUSH 16 %mload_current(@SEGMENT_ECDSA_TABLE) %stack (Gx, Gy, x, y, x, y, i, retdest) -> (Gx, Gy, x, y, precompute_table_loop_contd2, x, y, i, retdest) %jump(secp_add_valid_points) precompute_table_loop_contd2: %stack (Rx, Ry, x, y, i, retdest) -> (i, 16, Rx, i, 17, Ry, x, y, i, retdest) - ADD %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) ADD %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) - PUSH 25 %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) - PUSH 24 %mload_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + ADD %mstore_current(@SEGMENT_ECDSA_TABLE) ADD %mstore_current(@SEGMENT_ECDSA_TABLE) + PUSH 25 %mload_current(@SEGMENT_ECDSA_TABLE) + PUSH 24 %mload_current(@SEGMENT_ECDSA_TABLE) %stack (Gx, Gy, x, y, i, retdest) -> (Gx, Gy, x, y, precompute_table_loop_contd3, i, retdest) %jump(secp_add_valid_points) precompute_table_loop_contd3: %stack (Rx, Ry, i, retdest) -> (i, 24, Rx, i, 25, Ry, i, retdest) - ADD %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) ADD %mstore_kernel(@SEGMENT_KERNEL_ECDSA_TABLE) + ADD %mstore_current(@SEGMENT_ECDSA_TABLE) ADD %mstore_current(@SEGMENT_ECDSA_TABLE) %add_const(2) DUP1 %eq_const(8) %jumpi(precompute_table_end) %jump(precompute_table_loop) diff --git a/evm/src/cpu/kernel/asm/exp.asm b/evm/src/cpu/kernel/asm/exp.asm index 5dd67366..4b798e84 100644 --- a/evm/src/cpu/kernel/asm/exp.asm +++ b/evm/src/cpu/kernel/asm/exp.asm @@ -86,7 +86,7 @@ sys_exp_gas_loop_enter: // stack: e >> shift, shift, x, e, return_info %jumpi(sys_exp_gas_loop) // stack: shift_bits, x, e, return_info - %div_const(8) + %shr_const(3) // stack: byte_size_of_e := shift_bits / 8, x, e, return_info %mul_const(@GAS_EXPBYTE) %add_const(@GAS_EXP) diff --git a/evm/src/cpu/kernel/asm/hash/blake2/compression.asm b/evm/src/cpu/kernel/asm/hash/blake2/compression.asm index 454e5128..ba9ffc13 100644 --- a/evm/src/cpu/kernel/asm/hash/blake2/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/blake2/compression.asm @@ -21,10 +21,11 @@ compression_loop: // stack: addr, cur_block, retdest POP // stack: cur_block, retdest + PUSH 1 PUSH 0 %mload_current_general - // stack: num_blocks, cur_block, retdest - %decrement + // stack: num_blocks, 1, cur_block, retdest + SUB // stack: num_blocks - 1, cur_block, retdest DUP2 // stack: cur_block, num_blocks - 1, cur_block, retdest diff --git a/evm/src/cpu/kernel/asm/hash/sha2/main.asm b/evm/src/cpu/kernel/asm/hash/sha2/main.asm index b311262d..81ec3912 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/main.asm +++ 
b/evm/src/cpu/kernel/asm/hash/sha2/main.asm @@ -31,7 +31,7 @@ global sha2_pad: DUP1 // stack: num_bytes, num_bytes, retdest %add_const(8) - %div_const(64) + %shr_const(6) %increment // stack: num_blocks = (num_bytes+8)//64 + 1, num_bytes, retdest diff --git a/evm/src/cpu/kernel/asm/hash/sha2/ops.asm b/evm/src/cpu/kernel/asm/hash/sha2/ops.asm index 6a4c5e3b..d50e5c9a 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/ops.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/ops.asm @@ -34,7 +34,7 @@ // stack: rotr(x, 18), x, rotr(x, 7) SWAP1 // stack: x, rotr(x, 18), rotr(x, 7) - %div_const(8) // equivalent to %shr_const(3) + %shr_const(3) // stack: shr(x, 3), rotr(x, 18), rotr(x, 7) XOR XOR diff --git a/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm b/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm index 438875fb..c412a666 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm @@ -17,11 +17,12 @@ %decrement SWAP1 // stack: length >> (8 * i), last_addr - i - 2 - %div_const(256) // equivalent to %shr_const(8) + %shr_const(8) // stack: length >> (8 * (i + 1)), last_addr - i - 2 - DUP1 - // stack: length >> (8 * (i + 1)), length >> (8 * (i + 1)), last_addr - i - 2 - %mod_const(256) + PUSH 256 + DUP2 + // stack: length >> (8 * (i + 1)), 256, length >> (8 * (i + 1)), last_addr - i - 2 + MOD // stack: (length >> (8 * (i + 1))) % (1 << 8), length >> (8 * (i + 1)), last_addr - i - 2 DUP3 // stack: last_addr - i - 2, (length >> (8 * (i + 1))) % (1 << 8), length >> (8 * (i + 1)), last_addr - i - 2 diff --git a/evm/src/cpu/kernel/asm/journal/journal.asm b/evm/src/cpu/kernel/asm/journal/journal.asm index a0e5502d..2715bc6b 100644 --- a/evm/src/cpu/kernel/asm/journal/journal.asm +++ b/evm/src/cpu/kernel/asm/journal/journal.asm @@ -199,8 +199,9 @@ %endmacro %macro pop_checkpoint + PUSH 1 %mload_context_metadata(@CTX_METADATA_CHECKPOINTS_LEN) // stack: i - %decrement + SUB %mstore_context_metadata(@CTX_METADATA_CHECKPOINTS_LEN) %endmacro diff --git a/evm/src/cpu/kernel/asm/journal/log.asm b/evm/src/cpu/kernel/asm/journal/log.asm index 0b815fae..e1794397 100644 --- a/evm/src/cpu/kernel/asm/journal/log.asm +++ b/evm/src/cpu/kernel/asm/journal/log.asm @@ -8,8 +8,9 @@ global revert_log: // stack: entry_type, ptr, retdest POP // First, reduce the number of logs. + PUSH 1 %mload_global_metadata(@GLOBAL_METADATA_LOGS_LEN) - %decrement + SUB %mstore_global_metadata(@GLOBAL_METADATA_LOGS_LEN) // stack: ptr, retdest // Second, restore payload length. diff --git a/evm/src/cpu/kernel/asm/memory/core.asm b/evm/src/cpu/kernel/asm/memory/core.asm index 3a3a17a5..eef7ee1a 100644 --- a/evm/src/cpu/kernel/asm/memory/core.asm +++ b/evm/src/cpu/kernel/asm/memory/core.asm @@ -409,8 +409,7 @@ %mstore_u32 %endmacro -// Store a single byte to @SEGMENT_RLP_RAW. 
-%macro mstore_rlp +%macro swap_mstore // stack: addr, value SWAP1 MSTORE_GENERAL diff --git a/evm/src/cpu/kernel/asm/memory/metadata.asm b/evm/src/cpu/kernel/asm/memory/metadata.asm index dfbfb646..e69e292b 100644 --- a/evm/src/cpu/kernel/asm/memory/metadata.asm +++ b/evm/src/cpu/kernel/asm/memory/metadata.asm @@ -359,7 +359,7 @@ zero_hash: // stack: num_bytes %add_const(31) // stack: 31 + num_bytes - %div_const(32) + %shr_const(5) // stack: (num_bytes + 31) / 32 %endmacro @@ -372,7 +372,7 @@ zero_hash: SWAP1 // stack: num_words, num_words * GAS_MEMORY %square - %div_const(512) + %shr_const(9) // stack: num_words^2 / 512, num_words * GAS_MEMORY ADD // stack: cost = num_words^2 / 512 + num_words * GAS_MEMORY @@ -422,8 +422,9 @@ zero_hash: %endmacro %macro decrement_call_depth + PUSH 1 %mload_global_metadata(@GLOBAL_METADATA_CALL_STACK_DEPTH) - %decrement + SUB %mstore_global_metadata(@GLOBAL_METADATA_CALL_STACK_DEPTH) %endmacro diff --git a/evm/src/cpu/kernel/asm/mpt/hash/hash.asm b/evm/src/cpu/kernel/asm/mpt/hash/hash.asm index 29348f97..15fdd6a9 100644 --- a/evm/src/cpu/kernel/asm/mpt/hash/hash.asm +++ b/evm/src/cpu/kernel/asm/mpt/hash/hash.asm @@ -151,8 +151,8 @@ global encode_node_branch: // No value; append the empty string (0x80). // stack: value_ptr, rlp_pos', rlp_start, encode_value, cur_len, retdest - %stack (value_ptr, rlp_pos, rlp_start, encode_value) -> (rlp_pos, 0x80, rlp_pos, rlp_start) - %mstore_rlp + %stack (value_ptr, rlp_pos, rlp_start, encode_value) -> (0x80, rlp_pos, rlp_pos, rlp_start) + MSTORE_GENERAL // stack: rlp_pos', rlp_start, cur_len, retdest %increment // stack: rlp_pos'', rlp_start, cur_len, retdest @@ -192,9 +192,9 @@ encode_node_branch_prepend_prefix: SWAP1 DUP1 %sub_const(32) %jumpi(%%unpack) // Otherwise, result is a hash, and we need to add the prefix 0x80 + 32 = 160. // stack: result_len, result, cur_len, rlp_pos, rlp_start, node_payload_ptr, encode_value, retdest + DUP4 // rlp_pos PUSH 160 - DUP5 // rlp_pos - %mstore_rlp + MSTORE_GENERAL SWAP3 %increment SWAP3 // rlp_pos += 1 %%unpack: %stack (result_len, result, cur_len, rlp_pos, rlp_start, node_payload_ptr, encode_value, retdest) @@ -233,9 +233,9 @@ encode_node_extension_after_hex_prefix: // If result_len != 32, result is raw RLP, with an appropriate RLP prefix already. DUP4 %sub_const(32) %jumpi(encode_node_extension_unpack) // Otherwise, result is a hash, and we need to add the prefix 0x80 + 32 = 160. 
+ DUP1 // rlp_pos PUSH 160 - DUP2 // rlp_pos - %mstore_rlp + MSTORE_GENERAL %increment // rlp_pos += 1 encode_node_extension_unpack: %stack (rlp_pos, rlp_start, result, result_len, node_payload_ptr, cur_len) diff --git a/evm/src/cpu/kernel/asm/mpt/hash/hash_trie_specific.asm b/evm/src/cpu/kernel/asm/mpt/hash/hash_trie_specific.asm index 6abca0b6..cd07c01f 100644 --- a/evm/src/cpu/kernel/asm/mpt/hash/hash_trie_specific.asm +++ b/evm/src/cpu/kernel/asm/mpt/hash/hash_trie_specific.asm @@ -326,8 +326,8 @@ encode_nonzero_receipt_type: // stack: rlp_receipt_len, txn_type, rlp_addr, value_ptr, retdest DUP3 %encode_rlp_multi_byte_string_prefix // stack: rlp_addr, txn_type, old_rlp_addr, value_ptr, retdest - DUP2 DUP2 - %mstore_rlp + DUP1 DUP3 + MSTORE_GENERAL %increment // stack: rlp_addr, txn_type, old_rlp_addr, value_ptr, retdest %stack (rlp_addr, txn_type, old_rlp_addr, value_ptr, retdest) -> (rlp_addr, value_ptr, retdest) diff --git a/evm/src/cpu/kernel/asm/mpt/hex_prefix.asm b/evm/src/cpu/kernel/asm/mpt/hex_prefix.asm index 7dd02c34..cee87deb 100644 --- a/evm/src/cpu/kernel/asm/mpt/hex_prefix.asm +++ b/evm/src/cpu/kernel/asm/mpt/hex_prefix.asm @@ -27,7 +27,7 @@ first_byte: // stack: rlp_addr, num_nibbles, packed_nibbles, terminated, retdest // get the first nibble, if num_nibbles is odd, or zero otherwise SWAP2 - // stack: packed_nibbles, num_nibbbles, rlp_addr, terminated, retdest + // stack: packed_nibbles, num_nibbles, rlp_addr, terminated, retdest DUP2 DUP1 %mod_const(2) // stack: parity, num_nibbles, packed_nibbles, num_nibbles, rlp_addr, terminated, retdest @@ -50,7 +50,7 @@ first_byte: ADD // stack: first_byte, rlp_addr, retdest DUP2 - %mstore_rlp + %swap_mstore %increment // stack: rlp_addr', retdest SWAP1 @@ -88,7 +88,7 @@ rlp_header_medium: // stack: hp_len, rlp_addr, num_nibbles, packed_nibbles, terminated, retdest %add_const(0x80) // value = 0x80 + hp_len DUP2 - %mstore_rlp + %swap_mstore // stack: rlp_addr, num_nibbles, packed_nibbles, terminated, retdest // rlp_addr += 1 %increment @@ -108,14 +108,14 @@ rlp_header_large: // In practice hex-prefix length will never exceed 256, so the length of the // length will always be 1 byte in this case. + DUP2 // rlp_addr PUSH 0xb8 // value = 0xb7 + len_of_len = 0xb8 - DUP3 - %mstore_rlp + MSTORE_GENERAL // stack: rlp_addr, value, hp_len, i, rlp_addr, num_nibbles, packed_nibbles, terminated, retdest // stack: hp_len, rlp_addr, num_nibbles, packed_nibbles, terminated, retdest DUP2 %increment - %mstore_rlp + %swap_mstore // stack: rlp_addr, num_nibbles, packed_nibbles, terminated, retdest // rlp_addr += 2 diff --git a/evm/src/cpu/kernel/asm/mpt/util.asm b/evm/src/cpu/kernel/asm/mpt/util.asm index 27418b17..db13d889 100644 --- a/evm/src/cpu/kernel/asm/mpt/util.asm +++ b/evm/src/cpu/kernel/asm/mpt/util.asm @@ -11,9 +11,9 @@ %endmacro %macro initialize_rlp_segment - PUSH 0x80 PUSH @ENCODED_EMPTY_NODE_POS - %mstore_rlp + PUSH 0x80 + MSTORE_GENERAL %endmacro %macro alloc_rlp_block diff --git a/evm/src/cpu/kernel/asm/rlp/encode.asm b/evm/src/cpu/kernel/asm/rlp/encode.asm index 2319e780..dcd24515 100644 --- a/evm/src/cpu/kernel/asm/rlp/encode.asm +++ b/evm/src/cpu/kernel/asm/rlp/encode.asm @@ -29,12 +29,11 @@ global encode_rlp_256: // RLP-encode a fixed-length string with the given byte length. Assumes string < 2^(8 * len). 
global encode_rlp_fixed: // stack: len, rlp_addr, string, retdest - DUP1 + DUP2 + DUP2 %add_const(0x80) - // stack: first_byte, len, rlp_addr, string, retdest - DUP3 - // stack: rlp_addr, first_byte, len, rlp_addr, string, retdest - %mstore_rlp + // stack: first_byte, rlp_addr, len, rlp_addr, string, retdest + MSTORE_GENERAL // stack: len, rlp_addr, string, retdest SWAP1 %increment // increment rlp_addr @@ -51,19 +50,17 @@ encode_rlp_fixed_finish: // I.e. writes encode(encode(string). Assumes string < 2^(8 * len). global doubly_encode_rlp_fixed: // stack: len, rlp_addr, string, retdest - DUP1 + DUP2 + DUP2 %add_const(0x81) - // stack: first_byte, len, rlp_addr, string, retdest - DUP3 - // stack: rlp_addr, first_byte, len, rlp_addr, string, retdest - %mstore_rlp + // stack: first_byte, rlp_addr, len, rlp_addr, string, retdest + MSTORE_GENERAL // stack: len, rlp_addr, string, retdest - DUP1 + DUP2 %increment + DUP2 %add_const(0x80) - // stack: second_byte, len, original_rlp_addr, string, retdest - DUP3 %increment - // stack: rlp_addr', second_byte, len, rlp_addr, string, retdest - %mstore_rlp + // stack: second_byte, rlp_addr', len, original_rlp_addr, string, retdest + MSTORE_GENERAL // stack: len, rlp_addr, string, retdest SWAP1 %add_const(2) // advance past the two prefix bytes @@ -87,11 +84,12 @@ global encode_rlp_multi_byte_string_prefix: %jumpi(encode_rlp_multi_byte_string_prefix_large) // Medium case; prefix is 0x80 + str_len. // stack: rlp_addr, str_len, retdest - SWAP1 %add_const(0x80) - // stack: prefix, rlp_addr, retdest + PUSH 0x80 DUP2 - // stack: rlp_addr, prefix, rlp_addr, retdest - %mstore_rlp + // stack: rlp_addr, 0x80, rlp_addr, str_len, retdest + SWAP3 ADD + // stack: prefix, rlp_addr, rlp_addr, retdest + MSTORE_GENERAL // stack: rlp_addr, retdest %increment // stack: rlp_addr', retdest @@ -104,12 +102,11 @@ encode_rlp_multi_byte_string_prefix_large: %num_bytes // stack: len_of_len, rlp_addr, str_len, retdest SWAP1 - DUP2 // len_of_len + DUP1 // rlp_addr + DUP3 // len_of_len %add_const(0xb7) - // stack: first_byte, rlp_addr, len_of_len, str_len, retdest - DUP2 - // stack: rlp_addr, first_byte, rlp_addr, len_of_len, str_len, retdest - %mstore_rlp + // stack: first_byte, rlp_addr, rlp_addr, len_of_len, str_len, retdest + MSTORE_GENERAL // stack: rlp_addr, len_of_len, str_len, retdest %increment // stack: rlp_addr', len_of_len, str_len, retdest @@ -132,12 +129,11 @@ global encode_rlp_list_prefix: %jumpi(encode_rlp_list_prefix_large) // Small case: prefix is just 0xc0 + length. // stack: rlp_addr, payload_len, retdest - SWAP1 + DUP1 + SWAP2 %add_const(0xc0) - // stack: prefix, rlp_addr, retdest - DUP2 - // stack: rlp_addr, prefix, rlp_addr, retdest - %mstore_rlp + // stack: prefix, rlp_addr, rlp_addr, retdest + MSTORE_GENERAL // stack: rlp_addr, retdest %increment SWAP1 @@ -147,10 +143,10 @@ encode_rlp_list_prefix_large: // stack: rlp_addr, payload_len, retdest DUP2 %num_bytes // stack: len_of_len, rlp_addr, payload_len, retdest - DUP1 %add_const(0xf7) - // stack: first_byte, len_of_len, rlp_addr, payload_len, retdest - DUP3 // rlp_addr - %mstore_rlp + DUP2 + DUP2 %add_const(0xf7) + // stack: first_byte, rlp_addr, len_of_len, rlp_addr, payload_len, retdest + MSTORE_GENERAL // stack: len_of_len, rlp_addr, payload_len, retdest SWAP1 %increment // stack: rlp_addr', len_of_len, payload_len, retdest @@ -184,10 +180,10 @@ global prepend_rlp_list_prefix: // If we got here, we have a small list, so we prepend 0xc0 + len at rlp_address 8. 
// stack: payload_len, end_rlp_addr, start_rlp_addr, retdest - DUP1 %add_const(0xc0) - // stack: prefix_byte, payload_len, end_rlp_addr, start_rlp_addr, retdest - DUP4 %decrement // offset of prefix - %mstore_rlp + PUSH 1 DUP4 SUB // offset of prefix + DUP2 %add_const(0xc0) + // stack: prefix_byte, start_rlp_addr-1, payload_len, end_rlp_addr, start_rlp_addr, retdest + MSTORE_GENERAL // stack: payload_len, end_rlp_addr, start_rlp_addr, retdest %increment // stack: rlp_len, end_rlp_addr, start_rlp_addr, retdest @@ -204,10 +200,10 @@ prepend_rlp_list_prefix_big: DUP1 %num_bytes // stack: len_of_len, payload_len, end_rlp_addr, start_rlp_addr, retdest DUP1 - DUP5 %decrement // start_rlp_addr - 1 + PUSH 1 DUP6 SUB // start_rlp_addr - 1 SUB // stack: prefix_start_rlp_addr, len_of_len, payload_len, end_rlp_addr, start_rlp_addr, retdest - DUP2 %add_const(0xf7) DUP2 %mstore_rlp // rlp[prefix_start_rlp_addr] = 0xf7 + len_of_len + DUP2 %add_const(0xf7) DUP2 %swap_mstore // rlp[prefix_start_rlp_addr] = 0xf7 + len_of_len // stack: prefix_start_rlp_addr, len_of_len, payload_len, end_rlp_addr, start_rlp_addr, retdest DUP1 %increment // start_len_rlp_addr = prefix_start_rlp_addr + 1 %stack (start_len_rlp_addr, prefix_start_rlp_addr, len_of_len, payload_len, end_rlp_addr, start_rlp_addr, retdest) diff --git a/evm/src/cpu/kernel/asm/rlp/encode_rlp_scalar.asm b/evm/src/cpu/kernel/asm/rlp/encode_rlp_scalar.asm index 8196a452..d311a57e 100644 --- a/evm/src/cpu/kernel/asm/rlp/encode_rlp_scalar.asm +++ b/evm/src/cpu/kernel/asm/rlp/encode_rlp_scalar.asm @@ -85,6 +85,14 @@ encode_rlp_scalar_small: SWAP1 JUMP +// Convenience macro to call encode_rlp_scalar and return where we left off. +// It takes swapped inputs, i.e. `scalar, rlp_addr` instead of `rlp_addr, scalar`. +%macro encode_rlp_scalar_swapped_inputs + %stack (scalar, rlp_addr) -> (rlp_addr, scalar, %%after) + %jump(encode_rlp_scalar) +%%after: +%endmacro + // Convenience macro to call encode_rlp_scalar and return where we left off. %macro encode_rlp_scalar %stack (rlp_addr, scalar) -> (rlp_addr, scalar, %%after) diff --git a/evm/src/cpu/kernel/asm/rlp/num_bytes.asm b/evm/src/cpu/kernel/asm/rlp/num_bytes.asm index b2a1b9d6..de0a7ca9 100644 --- a/evm/src/cpu/kernel/asm/rlp/num_bytes.asm +++ b/evm/src/cpu/kernel/asm/rlp/num_bytes.asm @@ -5,8 +5,8 @@ global num_bytes: DUP1 ISZERO %jumpi(return_1) // Non-deterministically guess the number of bits PROVER_INPUT(num_bits) - %stack (num_bits, x) -> (num_bits, x, num_bits) - %decrement + %stack(num_bits, x) -> (num_bits, 1, x, num_bits) + SUB SHR // stack: 1, num_bits %assert_eq_const(1) @@ -17,7 +17,10 @@ global num_bytes: SWAP1 JUMP -return_1: POP PUSH 1 SWAP1 JUMP +return_1: + // stack: x, retdest + %stack(x, retdest) -> (retdest, 1) + JUMP // Convenience macro to call num_bytes and return where we left off. %macro num_bytes diff --git a/evm/src/cpu/kernel/asm/transactions/router.asm b/evm/src/cpu/kernel/asm/transactions/router.asm index 2ecccfe0..edabfbc4 100644 --- a/evm/src/cpu/kernel/asm/transactions/router.asm +++ b/evm/src/cpu/kernel/asm/transactions/router.asm @@ -5,10 +5,8 @@ global route_txn: // stack: txn_counter, num_nibbles, retdest // First load transaction data into memory, where it will be parsed. 
- PUSH read_txn_from_memory - SWAP2 SWAP1 - PUSH update_txn_trie - // stack: update_txn_trie, tx_counter, num_nibbles, read_txn_from_memory, retdest + %stack(txn_counter, num_nibbles) -> (update_txn_trie, txn_counter, num_nibbles, read_txn_from_memory) + // stack: update_txn_trie, txn_counter, num_nibbles, read_txn_from_memory, retdest %jump(read_rlp_to_memory) // At this point, the raw txn data is in memory. diff --git a/evm/src/cpu/kernel/asm/transactions/type_0.asm b/evm/src/cpu/kernel/asm/transactions/type_0.asm index 0cd22c10..a3f3bb0d 100644 --- a/evm/src/cpu/kernel/asm/transactions/type_0.asm +++ b/evm/src/cpu/kernel/asm/transactions/type_0.asm @@ -61,7 +61,7 @@ process_v_new_style: %sub_const(35) DUP1 // stack: v - 35, v - 35, rlp_addr, retdest - %div_const(2) + %div2 // stack: chain_id, v - 35, rlp_addr, retdest %mstore_txn_field(@TXN_FIELD_CHAIN_ID) @@ -94,11 +94,11 @@ type_0_compute_signed_data: // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_MAX_FEE_PER_GAS) - SWAP1 %encode_rlp_scalar + %encode_rlp_scalar_swapped_inputs // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_GAS_LIMIT) - SWAP1 %encode_rlp_scalar + %encode_rlp_scalar_swapped_inputs // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_TO) @@ -108,12 +108,12 @@ type_0_compute_signed_data: %jump(after_to) zero_to: // stack: to, rlp_addr, rlp_addr_start, retdest - SWAP1 %encode_rlp_scalar + %encode_rlp_scalar_swapped_inputs // stack: rlp_addr, rlp_addr_start, retdest after_to: %mload_txn_field(@TXN_FIELD_VALUE) - SWAP1 %encode_rlp_scalar + %encode_rlp_scalar_swapped_inputs // stack: rlp_addr, rlp_addr_start, retdest // Encode txn data. @@ -133,15 +133,15 @@ after_serializing_txn_data: // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_CHAIN_ID) - SWAP1 %encode_rlp_scalar + %encode_rlp_scalar_swapped_inputs // stack: rlp_addr, rlp_addr_start, retdest PUSH 0 - SWAP1 %encode_rlp_scalar + %encode_rlp_scalar_swapped_inputs // stack: rlp_addr, rlp_addr_start, retdest PUSH 0 - SWAP1 %encode_rlp_scalar + %encode_rlp_scalar_swapped_inputs // stack: rlp_addr, rlp_addr_start, retdest finish_rlp_list: diff --git a/evm/src/cpu/kernel/asm/transactions/type_1.asm b/evm/src/cpu/kernel/asm/transactions/type_1.asm index f9142bd4..e64a4aee 100644 --- a/evm/src/cpu/kernel/asm/transactions/type_1.asm +++ b/evm/src/cpu/kernel/asm/transactions/type_1.asm @@ -48,15 +48,15 @@ type_1_compute_signed_data: // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_NONCE) - SWAP1 %encode_rlp_scalar + %encode_rlp_scalar_swapped_inputs // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_MAX_FEE_PER_GAS) - SWAP1 %encode_rlp_scalar + %encode_rlp_scalar_swapped_inputs // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_GAS_LIMIT) - SWAP1 %encode_rlp_scalar + %encode_rlp_scalar_swapped_inputs // stack: rlp_addr, rlp_addr_start, retdest %mload_txn_field(@TXN_FIELD_TO) @@ -66,12 +66,12 @@ type_1_compute_signed_data: %jump(after_to) zero_to: // stack: to, rlp_addr, rlp_addr_start, retdest - SWAP1 %encode_rlp_scalar + %encode_rlp_scalar_swapped_inputs // stack: rlp_addr, rlp_addr_start, retdest after_to: %mload_txn_field(@TXN_FIELD_VALUE) - SWAP1 %encode_rlp_scalar + %encode_rlp_scalar_swapped_inputs // stack: rlp_addr, rlp_addr_start, retdest // Encode txn data. 
diff --git a/evm/src/cpu/kernel/asm/transactions/type_2.asm b/evm/src/cpu/kernel/asm/transactions/type_2.asm
index cd4f85e6..5074c579 100644
--- a/evm/src/cpu/kernel/asm/transactions/type_2.asm
+++ b/evm/src/cpu/kernel/asm/transactions/type_2.asm
@@ -51,19 +51,19 @@ type_2_compute_signed_data:
     // stack: rlp_addr, rlp_start, retdest
 
     %mload_txn_field(@TXN_FIELD_NONCE)
-    SWAP1 %encode_rlp_scalar
+    %encode_rlp_scalar_swapped_inputs
     // stack: rlp_addr, rlp_start, retdest
 
     %mload_txn_field(@TXN_FIELD_MAX_PRIORITY_FEE_PER_GAS)
-    SWAP1 %encode_rlp_scalar
+    %encode_rlp_scalar_swapped_inputs
     // stack: rlp_addr, rlp_start, retdest
 
     %mload_txn_field(@TXN_FIELD_MAX_FEE_PER_GAS)
-    SWAP1 %encode_rlp_scalar
+    %encode_rlp_scalar_swapped_inputs
     // stack: rlp_addr, rlp_start, retdest
 
     %mload_txn_field(@TXN_FIELD_GAS_LIMIT)
-    SWAP1 %encode_rlp_scalar
+    %encode_rlp_scalar_swapped_inputs
     // stack: rlp_addr, rlp_start, retdest
 
     %mload_txn_field(@TXN_FIELD_TO)
@@ -73,12 +73,12 @@
     %jump(after_to)
 zero_to:
     // stack: to, rlp_addr, rlp_start, retdest
-    SWAP1 %encode_rlp_scalar
+    %encode_rlp_scalar_swapped_inputs
     // stack: rlp_addr, rlp_start, retdest
 
 after_to:
     %mload_txn_field(@TXN_FIELD_VALUE)
-    SWAP1 %encode_rlp_scalar
+    %encode_rlp_scalar_swapped_inputs
     // stack: rlp_addr, rlp_start, retdest
 
     // Encode txn data.
diff --git a/evm/src/cpu/kernel/asm/util/basic_macros.asm b/evm/src/cpu/kernel/asm/util/basic_macros.asm
index 855eb4bb..76def0c7 100644
--- a/evm/src/cpu/kernel/asm/util/basic_macros.asm
+++ b/evm/src/cpu/kernel/asm/util/basic_macros.asm
@@ -284,9 +284,9 @@
 %macro ceil_div
     // stack: x, y
-    DUP2
-    // stack: y, x, y
-    %decrement
+    PUSH 1
+    DUP3
+    SUB // y - 1
     // stack: y - 1, x, y
     ADD
     DIV
@@ -346,7 +346,10 @@
 %endmacro
 
 %macro div2
-    %div_const(2)
+    // stack: x
+    PUSH 1
+    SHR
+    // stack: x >> 1
 %endmacro
 
 %macro iseven
diff --git a/evm/src/cpu/kernel/asm/util/math.asm b/evm/src/cpu/kernel/asm/util/math.asm
index 98f7b008..4bdf6902 100644
--- a/evm/src/cpu/kernel/asm/util/math.asm
+++ b/evm/src/cpu/kernel/asm/util/math.asm
@@ -5,7 +5,7 @@ log2_floor_helper:
     ISZERO
     %jumpi(end)
     // stack: val, counter, retdest
-    %div_const(2)
+    %div2
     // stack: val/2, counter, retdest
     SWAP1
     %increment
@@ -22,7 +22,7 @@ end:
 global log2_floor:
     // stack: val, retdest
-    %div_const(2)
+    %div2
     // stack: val/2, retdest
     PUSH 0
     // stack: 0, val/2, retdest
diff --git a/evm/src/cpu/kernel/stack/permutations.rs b/evm/src/cpu/kernel/stack/permutations.rs
index 50be605d..69d5f2e6 100644
--- a/evm/src/cpu/kernel/stack/permutations.rs
+++ b/evm/src/cpu/kernel/stack/permutations.rs
@@ -59,7 +59,7 @@ fn apply_perm<T: Eq + Clone + Debug>(permutation: Vec<Vec<usize>>, mut lst: Vec
 pub(crate) fn find_swaps<T: Eq + Clone + Debug>(lst_a: &[T], lst_b: &[T]) -> Vec<StackOp<T>> {
-    // We should check to ensure that A and B are indeed rearrangments of each other.
+    // We should check to ensure that A and B are indeed rearrangements of each other.
     assert!(is_permutation(lst_a, lst_b));
 
     let n = lst_a.len();
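`find_swaps` begins by asserting `is_permutation(lst_a, lst_b)`, i.e. multiset equality of the two stacks. A small self-contained sketch of that check (a hypothetical standalone version, not necessarily the kernel's own implementation):

```rust
use std::collections::HashMap;

/// Multiset equality: `a` is a permutation of `b` iff every element
/// occurs the same number of times in both slices.
fn is_permutation<T: std::hash::Hash + Eq>(a: &[T], b: &[T]) -> bool {
    if a.len() != b.len() {
        return false;
    }
    let mut counts: HashMap<&T, isize> = HashMap::new();
    for x in a {
        *counts.entry(x).or_insert(0) += 1; // count occurrences in `a`
    }
    for y in b {
        *counts.entry(y).or_insert(0) -= 1; // cancel them against `b`
    }
    counts.values().all(|&c| c == 0)
}

fn main() {
    assert!(is_permutation(&[1, 2, 2, 3], &[2, 3, 1, 2]));
    assert!(!is_permutation(&[1, 2, 2], &[1, 2, 3]));
}
```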
diff --git a/evm/src/cpu/memio.rs b/evm/src/cpu/memio.rs
index 2073e182..7cb4c388 100644
--- a/evm/src/cpu/memio.rs
+++ b/evm/src/cpu/memio.rs
@@ -204,7 +204,7 @@ fn eval_packed_store<P: PackedField>(
 }
 
 /// Circuit version of `eval_packed_store`.
-/// /// Evaluates constraints for MSTORE_GENERAL.
+/// Evaluates constraints for MSTORE_GENERAL.
 fn eval_ext_circuit_store<F: RichField + Extendable<D>, const D: usize>(
     builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
     lv: &CpuColumnsView<ExtensionTarget<D>>,
diff --git a/evm/src/cpu/syscalls_exceptions.rs b/evm/src/cpu/syscalls_exceptions.rs
index 501b114f..337a0a86 100644
--- a/evm/src/cpu/syscalls_exceptions.rs
+++ b/evm/src/cpu/syscalls_exceptions.rs
@@ -29,6 +29,12 @@ pub(crate) fn eval_packed<P: PackedField>(
     let filter_exception = lv.op.exception;
     let total_filter = filter_syscall + filter_exception;
 
+    // First, constrain filters to be boolean.
+    // Ensuring they are mutually exclusive is done in other modules
+    // through the `is_cpu_cycle` variable.
+    yield_constr.constraint(filter_syscall * (filter_syscall - P::ONES));
+    yield_constr.constraint(filter_exception * (filter_exception - P::ONES));
+
     // If exception, ensure we are not in kernel mode
     yield_constr.constraint(filter_exception * lv.is_kernel_mode);
 
@@ -132,6 +138,14 @@ pub(crate) fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     let filter_exception = lv.op.exception;
     let total_filter = builder.add_extension(filter_syscall, filter_exception);
 
+    // First, constrain filters to be boolean.
+    // Ensuring they are mutually exclusive is done in other modules
+    // through the `is_cpu_cycle` variable.
+    let constr = builder.mul_sub_extension(filter_syscall, filter_syscall, filter_syscall);
+    yield_constr.constraint(builder, constr);
+    let constr = builder.mul_sub_extension(filter_exception, filter_exception, filter_exception);
+    yield_constr.constraint(builder, constr);
+
     // Ensure that, if exception, we are not in kernel mode
     let constr = builder.mul_extension(filter_exception, lv.is_kernel_mode);
     yield_constr.constraint(builder, constr);
diff --git a/evm/src/cross_table_lookup.rs b/evm/src/cross_table_lookup.rs
index 21f94126..428279ba 100644
--- a/evm/src/cross_table_lookup.rs
+++ b/evm/src/cross_table_lookup.rs
@@ -28,11 +28,14 @@
 //! the current and next row values -- when computing the linear combinations.
 
 use std::borrow::Borrow;
+use std::cmp::min;
 use std::fmt::Debug;
 use std::iter::repeat;
 
 use anyhow::{ensure, Result};
+use hashbrown::HashMap;
 use itertools::Itertools;
+use plonky2::field::batch_util::{batch_add_inplace, batch_multiply_inplace};
 use plonky2::field::extension::{Extendable, FieldExtension};
 use plonky2::field::packed::PackedField;
 use plonky2::field::polynomial::PolynomialValues;
@@ -46,6 +49,7 @@ use plonky2::plonk::config::{AlgebraicHasher, GenericConfig, Hasher};
 use plonky2::plonk::plonk_common::{
     reduce_with_powers, reduce_with_powers_circuit, reduce_with_powers_ext_circuit,
 };
+use plonky2::util::ceil_div_usize;
 use plonky2::util::serialization::{Buffer, IoResult, Read, Write};
 
 use crate::all_stark::{Table, NUM_TABLES};
@@ -466,39 +470,66 @@ impl<F: Field> CrossTableLookup<F> {
         }
     }
 
-    /// Given a `Table` t and the number of challenges, returns the number of Cross-table lookup polynomials associated to t,
-    /// i.e. the number of looking and looked tables among all CTLs whose columns are taken from t.
-    pub(crate) fn num_ctl_zs(ctls: &[Self], table: Table, num_challenges: usize) -> usize {
+    /// Given a table, returns:
+    /// - the total number of helper columns for this table, over all Cross-table lookups,
+    /// - the total number of z polynomials for this table, over all Cross-table lookups,
+    /// - the number of helper columns for this table, for each Cross-table lookup.
+    pub(crate) fn num_ctl_helpers_zs_all(
+        ctls: &[Self],
+        table: Table,
+        num_challenges: usize,
+        constraint_degree: usize,
+    ) -> (usize, usize, Vec<usize>) {
+        let mut num_helpers = 0;
         let mut num_ctls = 0;
-        for ctl in ctls {
+        let mut num_helpers_by_ctl = vec![0; ctls.len()];
+        for (i, ctl) in ctls.iter().enumerate() {
             let all_tables = std::iter::once(&ctl.looked_table).chain(&ctl.looking_tables);
-            num_ctls += all_tables.filter(|twc| twc.table == table).count();
+            let num_appearances = all_tables.filter(|twc| twc.table == table).count();
+            let is_helpers = num_appearances > 2;
+            if is_helpers {
+                num_helpers_by_ctl[i] = ceil_div_usize(num_appearances, constraint_degree - 1);
+                num_helpers += num_helpers_by_ctl[i];
+            }
+
+            if num_appearances > 0 {
+                num_ctls += 1;
+            }
         }
-        num_ctls * num_challenges
+        (
+            num_helpers * num_challenges,
+            num_ctls * num_challenges,
+            num_helpers_by_ctl,
+        )
     }
 }
 
 /// Cross-table lookup data for one table.
 #[derive(Clone, Default)]
-pub(crate) struct CtlData<F: Field> {
+pub(crate) struct CtlData<'a, F: Field> {
     /// Data associated with all Z(x) polynomials for one table.
-    pub(crate) zs_columns: Vec<CtlZData<F>>,
+    pub(crate) zs_columns: Vec<CtlZData<'a, F>>,
 }
 
 /// Cross-table lookup data associated with one Z(x) polynomial.
+/// One Z(x) polynomial can be associated to multiple tables,
+/// built from the same STARK.
 #[derive(Clone)]
-pub(crate) struct CtlZData<F: Field> {
+pub(crate) struct CtlZData<'a, F: Field> {
+    /// Helper columns to verify the Z polynomial values.
+    pub(crate) helper_columns: Vec<PolynomialValues<F>>,
     /// Z polynomial values.
     pub(crate) z: PolynomialValues<F>,
     /// Cross-table lookup challenge.
     pub(crate) challenge: GrandProductChallenge<F>,
-    /// Column linear combination for the current table.
-    pub(crate) columns: Vec<Column<F>>,
-    /// Filter column for the current table. It evaluates to either 1 or 0.
-    pub(crate) filter: Option<Filter<F>>,
+    /// Vector of column linear combinations for the current tables.
+    pub(crate) columns: Vec<&'a [Column<F>]>,
+    /// Vector of filter columns for the current table.
+    /// Each filter evaluates to either 1 or 0.
+    pub(crate) filter: Vec<Option<Filter<F>>>,
 }
 
-impl<F: Field> CtlData<F> {
+impl<'a, F: Field> CtlData<'a, F> {
     /// Returns the number of cross-table lookup polynomials.
     pub(crate) fn len(&self) -> usize {
         self.zs_columns.len()
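For intuition on the counting rule in `num_ctl_helpers_zs_all`: with constraint degree 3, each helper column can absorb `constraint_degree - 1 = 2` looking (columns, filter) pairs, and helpers are only materialized once a table appears more than twice in a CTL. A standalone sketch of that arithmetic (the `ceil_div` helper here is a hypothetical stand-in for `plonky2::util::ceil_div_usize`):

```rust
// Hypothetical stand-in for plonky2's `ceil_div_usize`.
fn ceil_div(a: usize, b: usize) -> usize {
    (a + b - 1) / b
}

/// Number of helper columns for a table appearing `num_appearances`
/// times in one CTL, under the `> 2` threshold used above.
fn num_helpers(num_appearances: usize, constraint_degree: usize) -> usize {
    if num_appearances > 2 {
        ceil_div(num_appearances, constraint_degree - 1)
    } else {
        0 // two or fewer appearances are checked directly, without helpers
    }
}

fn main() {
    assert_eq!(num_helpers(2, 3), 0); // handled by the direct degree-2 constraint
    assert_eq!(num_helpers(3, 3), 2); // chunks of size at most 2: [2, 1]
    assert_eq!(num_helpers(5, 3), 3); // chunks of size at most 2: [2, 2, 1]
}
```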
@@ -509,12 +540,38 @@ impl<F: Field> CtlData<F> {
         self.zs_columns.is_empty()
     }
 
-    /// Returns all the cross-table lookup polynomials.
-    pub(crate) fn z_polys(&self) -> Vec<PolynomialValues<F>> {
-        self.zs_columns
+    /// Returns all the cross-table lookup helper polynomials.
+    pub(crate) fn ctl_helper_polys(&self) -> Vec<PolynomialValues<F>> {
+        let num_polys = self
+            .zs_columns
             .iter()
-            .map(|zs_columns| zs_columns.z.clone())
-            .collect()
+            .fold(0, |acc, z| acc + z.helper_columns.len());
+        let mut res = Vec::with_capacity(num_polys);
+        for z in &self.zs_columns {
+            res.extend(z.helper_columns.clone());
+        }
+
+        res
+    }
+
+    /// Returns all the Z cross-table-lookup polynomials.
+    pub(crate) fn ctl_z_polys(&self) -> Vec<PolynomialValues<F>> {
+        let mut res = Vec::with_capacity(self.zs_columns.len());
+        for z in &self.zs_columns {
+            res.push(z.z.clone());
+        }
+
+        res
+    }
+
+    /// Returns the number of helper columns for each STARK in each
+    /// `CtlZData`.
+    pub(crate) fn num_ctl_helper_polys(&self) -> Vec<usize> {
+        let mut res = Vec::with_capacity(self.zs_columns.len());
+        for z in &self.zs_columns {
+            res.push(z.helper_columns.len());
+        }
+
+        res
     }
 }
 
@@ -640,16 +697,46 @@ pub(crate) fn get_grand_product_challenge_set_target<
     GrandProductChallengeSet { challenges }
 }
 
+/// Returns the number of helper columns for each `Table`.
+pub(crate) fn num_ctl_helper_columns_by_table<F: Field>(
+    ctls: &[CrossTableLookup<F>],
+    constraint_degree: usize,
+) -> Vec<[usize; NUM_TABLES]> {
+    let mut res = vec![[0; NUM_TABLES]; ctls.len()];
+    for (i, ctl) in ctls.iter().enumerate() {
+        let CrossTableLookup {
+            looking_tables,
+            looked_table,
+        } = ctl;
+        let mut num_by_table = [0; NUM_TABLES];
+
+        let grouped_lookups = looking_tables.iter().group_by(|&a| a.table);
+
+        for (table, group) in grouped_lookups.into_iter() {
+            let sum = group.count();
+            if sum > 2 {
+                // We only need helper columns if there are more than 2 columns.
+                num_by_table[table as usize] = ceil_div_usize(sum, constraint_degree - 1);
+            }
+        }
+
+        res[i] = num_by_table;
+    }
+    res
+}
+
 /// Generates all the cross-table lookup data, for all tables.
 /// - `trace_poly_values` corresponds to the trace values for all tables.
 /// - `cross_table_lookups` corresponds to all the cross-table lookups, i.e. the looked and looking tables, as described in `CrossTableLookup`.
 /// - `ctl_challenges` corresponds to the challenges used for CTLs.
+/// - `constraint_degree` is the maximal constraint degree for the table.
 /// For each `CrossTableLookup`, and each looking/looked table, the partial products for the CTL are computed, and added to the said table's `CtlZData`.
-pub(crate) fn cross_table_lookup_data<F: RichField, const D: usize>(
+pub(crate) fn cross_table_lookup_data<'a, F: RichField, const D: usize>(
     trace_poly_values: &[Vec<PolynomialValues<F>>; NUM_TABLES],
-    cross_table_lookups: &[CrossTableLookup<F>],
+    cross_table_lookups: &'a [CrossTableLookup<F>],
     ctl_challenges: &GrandProductChallengeSet<F>,
-) -> [CtlData<F>; NUM_TABLES] {
+    constraint_degree: usize,
+) -> [CtlData<'a, F>; NUM_TABLES] {
     let mut ctl_data_per_table = [0; NUM_TABLES].map(|_| CtlData::default());
     for CrossTableLookup {
         looking_tables,
@@ -658,103 +745,232 @@ pub(crate) fn cross_table_lookup_data<F: RichField, const D: usize>(
         looked_table,
     } in cross_table_lookups
     {
         log::debug!("Processing CTL for {:?}", looked_table.table);
         for &challenge in &ctl_challenges.challenges {
-            let zs_looking = looking_tables.iter().map(|table| {
-                partial_sums(
-                    &trace_poly_values[table.table as usize],
-                    &table.columns,
-                    &table.filter,
-                    challenge,
-                )
-            });
-            let z_looked = partial_sums(
-                &trace_poly_values[looked_table.table as usize],
-                &looked_table.columns,
-                &looked_table.filter,
+            let helper_zs_looking = ctl_helper_zs_cols(
+                trace_poly_values,
+                looking_tables.clone(),
                 challenge,
+                constraint_degree,
             );
-            for (table, z) in looking_tables.iter().zip(zs_looking) {
-                ctl_data_per_table[table.table as usize]
-                    .zs_columns
-                    .push(CtlZData {
-                        z,
-                        challenge,
-                        columns: table.columns.clone(),
-                        filter: table.filter.clone(),
-                    });
+
+            let mut z_looked = partial_sums(
+                &trace_poly_values[looked_table.table as usize],
+                &[(&looked_table.columns, &looked_table.filter)],
+                challenge,
+                constraint_degree,
+            );
+
+            for (table, helpers_zs) in helper_zs_looking {
+                let num_helpers = helpers_zs.len() - 1;
+                let count = looking_tables
+                    .iter()
+                    .filter(|looking_table| looking_table.table as usize == table)
+                    .count();
+                let cols_filts = looking_tables.iter().filter_map(|looking_table| {
+                    if looking_table.table as usize == table {
+                        Some((&looking_table.columns, &looking_table.filter))
+                    } else {
+                        None
+                    }
+                });
+                let mut columns = Vec::with_capacity(count);
+                let mut filter = Vec::with_capacity(count);
+                for (col, filt) in cols_filts {
+                    columns.push(&col[..]);
+                    filter.push(filt.clone());
+                }
+                ctl_data_per_table[table].zs_columns.push(CtlZData {
+                    helper_columns: helpers_zs[..num_helpers].to_vec(),
+                    z: helpers_zs[num_helpers].clone(),
+                    challenge,
+                    columns,
+                    filter,
+                });
             }
+            // There is no helper column for the looking table.
+            let looked_poly = z_looked[0].clone();
             ctl_data_per_table[looked_table.table as usize]
                 .zs_columns
                 .push(CtlZData {
-                    z: z_looked,
+                    helper_columns: vec![],
+                    z: looked_poly,
                     challenge,
-                    columns: looked_table.columns.clone(),
-                    filter: looked_table.filter.clone(),
+                    columns: vec![&looked_table.columns[..]],
+                    filter: vec![looked_table.filter.clone()],
                 });
         }
     }
     ctl_data_per_table
 }
 
+type ColumnFilter<'a, F> = (&'a [Column<F>], &'a Option<Filter<F>>);
+
+/// Given a STARK's trace, and the data associated to one lookup (either CTL or range check),
+/// returns the associated helper polynomials.
+pub(crate) fn get_helper_cols<F: Field>(
+    trace: &[PolynomialValues<F>],
+    degree: usize,
+    columns_filters: &[ColumnFilter<F>],
+    challenge: GrandProductChallenge<F>,
+    constraint_degree: usize,
+) -> Vec<PolynomialValues<F>> {
+    let num_helper_columns = ceil_div_usize(columns_filters.len(), constraint_degree - 1);
+
+    let mut helper_columns = Vec::with_capacity(num_helper_columns);
+
+    let mut filter_index = 0;
+    for mut cols_filts in &columns_filters.iter().chunks(constraint_degree - 1) {
+        let (first_col, first_filter) = cols_filts.next().unwrap();
+
+        let mut filter_col = Vec::with_capacity(degree);
+        let first_combined = (0..degree)
+            .map(|d| {
+                let f = if let Some(filter) = first_filter {
+                    let f = filter.eval_table(trace, d);
+                    filter_col.push(f);
+                    f
+                } else {
+                    filter_col.push(F::ONE);
+                    F::ONE
+                };
+                if f.is_one() {
+                    let evals = first_col
+                        .iter()
+                        .map(|c| c.eval_table(trace, d))
+                        .collect::<Vec<F>>();
+                    challenge.combine(evals.iter())
+                } else {
+                    assert_eq!(f, F::ZERO, "Non-binary filter?");
+                    // Dummy value. Cannot be zero since it will be batch-inverted.
+                    F::ONE
+                }
+            })
+            .collect::<Vec<F>>();
+
+        let mut acc = F::batch_multiplicative_inverse(&first_combined);
+        for d in 0..degree {
+            if filter_col[d].is_zero() {
+                acc[d] = F::ZERO;
+            }
+        }
+
+        for (col, filt) in cols_filts {
+            let mut filter_col = Vec::with_capacity(degree);
+            let mut combined = (0..degree)
+                .map(|d| {
+                    let f = if let Some(filter) = filt {
+                        let f = filter.eval_table(trace, d);
+                        filter_col.push(f);
+                        f
+                    } else {
+                        filter_col.push(F::ONE);
+                        F::ONE
+                    };
+                    if f.is_one() {
+                        let evals = col
+                            .iter()
+                            .map(|c| c.eval_table(trace, d))
+                            .collect::<Vec<F>>();
+                        challenge.combine(evals.iter())
+                    } else {
+                        assert_eq!(f, F::ZERO, "Non-binary filter?");
+                        // Dummy value. Cannot be zero since it will be batch-inverted.
+                        F::ONE
+                    }
+                })
+                .collect::<Vec<F>>();
+
+            combined = F::batch_multiplicative_inverse(&combined);
+
+            for d in 0..degree {
+                if filter_col[d].is_zero() {
+                    combined[d] = F::ZERO;
+                }
+            }
+
+            batch_add_inplace(&mut acc, &combined);
+        }
+
+        helper_columns.push(acc.into());
+    }
+    assert_eq!(helper_columns.len(), num_helper_columns);
+
+    helper_columns
+}
+
+/// Computes helper columns and Z polynomials for all looking tables
+/// of one cross-table lookup (i.e. for one looked table).
+fn ctl_helper_zs_cols<F: Field>(
+    all_stark_traces: &[Vec<PolynomialValues<F>>; NUM_TABLES],
+    looking_tables: Vec<TableWithColumns<F>>,
+    challenge: GrandProductChallenge<F>,
+    constraint_degree: usize,
+) -> Vec<(usize, Vec<PolynomialValues<F>>)> {
+    let grouped_lookups = looking_tables.iter().group_by(|a| a.table);
+
+    grouped_lookups
+        .into_iter()
+        .map(|(table, group)| {
+            let degree = all_stark_traces[table as usize][0].len();
+            let columns_filters = group
+                .map(|table| (&table.columns[..], &table.filter))
+                .collect::<Vec<(&[Column<F>], &Option<Filter<F>>)>>();
+            (
+                table as usize,
+                partial_sums(
+                    &all_stark_traces[table as usize],
+                    &columns_filters,
+                    challenge,
+                    constraint_degree,
+                ),
+            )
+        })
+        .collect::<Vec<(usize, Vec<PolynomialValues<F>>)>>()
+}
+
 /// Computes the cross-table lookup partial sums for one table and given column linear combinations.
 /// `trace` represents the trace values for the given table.
-/// `columns` are all the column linear combinations to evaluate.
-/// `filter_column` is a column linear combination used to determine whether a row should be selected.
+/// `columns` is a vector of column linear combinations to evaluate. Each element in the vector represents columns that need to be combined.
+/// `filter_cols` are column linear combinations used to determine whether a row should be selected.
 /// `challenge` is a cross-table lookup challenge.
 /// The initial sum `s` is 0.
-/// For each row, if the `filter_column` evaluates to 1, then the rows is selected. All the column linear combinations are evaluated at said row. All those evaluations are combined using the challenge to get a value `v`.
-/// The sum is updated: `s += 1/v`, and is pushed to the vector of partial sums.
+/// For each row, if the `filter_column` evaluates to 1, then the row is selected. All the column linear combinations are evaluated at said row.
+/// The evaluations of each element of `columns` are then combined together to form a value `v`.
+/// The values `v` are grouped together, in groups of size `constraint_degree - 1` (2 in our case). For each group, we construct a helper
+/// column: `h = \sum_i 1/(v_i)`.
+///
+/// The sum is updated: `s += \sum h_i`, and is pushed to the vector of partial sums `z`.
+/// Returns the helper columns and `z`.
 fn partial_sums<F: Field>(
     trace: &[PolynomialValues<F>],
-    columns: &[Column<F>],
-    filter: &Option<Filter<F>>,
+    columns_filters: &[ColumnFilter<F>],
     challenge: GrandProductChallenge<F>,
-) -> PolynomialValues<F> {
-    let mut partial_sum = F::ZERO;
+    constraint_degree: usize,
+) -> Vec<PolynomialValues<F>> {
     let degree = trace[0].len();
-    let mut filters = Vec::with_capacity(degree);
-    let mut res = Vec::with_capacity(degree);
+    let mut z = Vec::with_capacity(degree);
 
-    for i in (0..degree).rev() {
-        if let Some(filter) = filter {
-            let filter_val = filter.eval_table(trace, i);
-            if filter_val.is_one() {
-                filters.push(true);
-            } else {
-                assert_eq!(filter_val, F::ZERO, "Non-binary filter?");
-                filters.push(false);
-            }
-        } else {
-            filters.push(false);
-        };
+    let mut helper_columns =
+        get_helper_cols(trace, degree, columns_filters, challenge, constraint_degree);
 
-        let combined = if filters[filters.len() - 1] {
-            let evals = columns
-                .iter()
-                .map(|c| c.eval_table(trace, i))
-                .collect::<Vec<F>>();
-            challenge.combine(evals.iter())
-        } else {
-            // Dummy value. Cannot be zero since it will be batch-inverted.
-            F::ONE
-        };
-        res.push(combined);
+    let x = helper_columns
+        .iter()
+        .map(|col| col.values[degree - 1])
+        .sum::<F>();
+    z.push(x);
+
+    for i in (0..degree - 1).rev() {
+        let x = helper_columns.iter().map(|col| col.values[i]).sum::<F>();
+
+        z.push(z[z.len() - 1] + x);
     }
-    res = F::batch_multiplicative_inverse(&res);
-
-    if !filters[0] {
-        res[0] = F::ZERO;
+    z.reverse();
+    if columns_filters.len() > 2 {
+        helper_columns.push(z.into());
+    } else {
+        helper_columns = vec![z.into()];
     }
-    for i in (1..degree) {
-        let mut cur_value = res[i - 1];
-        if filters[i] {
-            cur_value += res[i];
-        }
-        res[i] = cur_value;
-    }
-
-    res.reverse();
-    res.into()
+    helper_columns
 }
 
 /// Data necessary to check the cross-table lookups of a given table.
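In equations, the scheme implemented by `get_helper_cols` and `partial_sums` is a logUp-style running sum. The following is a sketch of the intended algebra (my notation, assuming constraint degree 3, so each chunk holds at most two (columns, filter) pairs); `v_i` is the challenge-combined value of pair `i` at a row and `f_i` its filter value:

```latex
% Each helper column packs one chunk of at most two pairs:
h_j \;=\; \frac{f_{2j}}{v_{2j}} \;+\; \frac{f_{2j+1}}{v_{2j+1}}
% which is enforced without division as the degree-3 identity
\quad\Longleftrightarrow\quad
v_{2j}\, v_{2j+1}\, h_j \;-\; f_{2j}\, v_{2j+1} \;-\; f_{2j+1}\, v_{2j} \;=\; 0 .
% The Z column is the running sum of the helpers over the rows:
Z(g^{n-1}) \;=\; \sum_j h_j(g^{n-1}), \qquad
Z(x) - Z(g x) \;=\; \sum_j h_j(x),
% so Z(1) accumulates \sum_{rows} \sum_i f_i / v_i, the quantity
% compared between looking and looked tables.
```

With two or fewer pairs there is no helper column and the same cleared-denominator identity is applied directly to `Z`, which is what the `columns.len() > 1` and single-column branches of `eval_cross_table_lookup_checks` below enforce.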
@@ -765,6 +981,9 @@ where
     FE: FieldExtension<D2, BaseField = F>,
     P: PackedField<Scalar = FE>,
 {
+    /// Helper columns to check that the Z polynomial
+    /// was constructed correctly.
+    pub(crate) helper_columns: Vec<P>,
     /// Evaluation of the trace polynomials at point `zeta`.
     pub(crate) local_z: P,
     /// Evaluation of the trace polynomials at point `g * zeta`
@@ -774,9 +993,9 @@ where
     /// Cross-table lookup challenges.
     pub(crate) challenges: GrandProductChallenge<F>,
     /// Column linear combinations of the `CrossTableLookup`s.
-    pub(crate) columns: &'a [Column<F>],
+    pub(crate) columns: Vec<&'a [Column<F>]>,
     /// Filter that evaluates to either 1 or 0.
-    pub(crate) filter: &'a Option<Filter<F>>,
+    pub(crate) filter: Vec<Option<Filter<F>>>,
 }
 
 impl<'a, F: RichField + Extendable<D>, const D: usize>
@@ -786,45 +1005,107 @@ impl<'a, F: RichField + Extendable<D>, const D: usize>
         cross_table_lookups: &'a [CrossTableLookup<F>],
         ctl_challenges: &'a GrandProductChallengeSet<F>,
         num_lookup_columns: &[usize; NUM_TABLES],
+        num_helper_ctl_columns: &Vec<[usize; NUM_TABLES]>,
     ) -> [Vec<Self>; NUM_TABLES] {
+        let mut total_num_helper_cols_by_table = [0; NUM_TABLES];
+        for p_ctls in num_helper_ctl_columns {
+            for j in 0..NUM_TABLES {
+                total_num_helper_cols_by_table[j] += p_ctls[j] * ctl_challenges.challenges.len();
+            }
+        }
+
         // Get all cross-table lookup polynomial openings for each STARK proof.
         let mut ctl_zs = proofs
             .iter()
             .zip(num_lookup_columns)
             .map(|(p, &num_lookup)| {
                 let openings = &p.proof.openings;
-                let ctl_zs = openings.auxiliary_polys.iter().skip(num_lookup);
-                let ctl_zs_next = openings.auxiliary_polys_next.iter().skip(num_lookup);
-                ctl_zs.zip(ctl_zs_next)
+
+                let ctl_zs = &openings.auxiliary_polys[num_lookup..];
+                let ctl_zs_next = &openings.auxiliary_polys_next[num_lookup..];
+                ctl_zs.iter().zip(ctl_zs_next).collect::<Vec<_>>()
             })
             .collect::<Vec<_>>();
 
         // Put each cross-table lookup polynomial into the correct table data: if a CTL polynomial is extracted from looking/looked table t, then we add it to the `CtlCheckVars` of table t.
+        let mut start_indices = [0; NUM_TABLES];
+        let mut z_indices = [0; NUM_TABLES];
         let mut ctl_vars_per_table = [0; NUM_TABLES].map(|_| vec![]);
-        for CrossTableLookup {
-            looking_tables,
-            looked_table,
-        } in cross_table_lookups
+        for (
+            CrossTableLookup {
+                looking_tables,
+                looked_table,
+            },
+            num_ctls,
+        ) in cross_table_lookups.iter().zip(num_helper_ctl_columns)
         {
             for &challenges in &ctl_challenges.challenges {
+                // Group looking tables by `Table`, since we bundle the looking tables taken from the same `Table` together thanks to helper columns.
+                // We want to only iterate on each `Table` once.
+                let mut filtered_looking_tables =
+                    Vec::with_capacity(min(looking_tables.len(), NUM_TABLES));
                 for table in looking_tables {
-                    let (looking_z, looking_z_next) = ctl_zs[table.table as usize].next().unwrap();
-                    ctl_vars_per_table[table.table as usize].push(Self {
+                    if !filtered_looking_tables.contains(&(table.table as usize)) {
+                        filtered_looking_tables.push(table.table as usize);
+                    }
+                }
+
+                for (i, &table) in filtered_looking_tables.iter().enumerate() {
+                    // We have first all the helper polynomials, then all the z polynomials.
+ let (looking_z, looking_z_next) = + ctl_zs[table][total_num_helper_cols_by_table[table] + z_indices[table]]; + + let count = looking_tables + .iter() + .filter(|looking_table| looking_table.table as usize == table) + .count(); + let cols_filts = looking_tables.iter().filter_map(|looking_table| { + if looking_table.table as usize == table { + Some((&looking_table.columns, &looking_table.filter)) + } else { + None + } + }); + let mut columns = Vec::with_capacity(count); + let mut filter = Vec::with_capacity(count); + for (col, filt) in cols_filts { + columns.push(&col[..]); + filter.push(filt.clone()); + } + let helper_columns = ctl_zs[table] + [start_indices[table]..start_indices[table] + num_ctls[table]] + .iter() + .map(|&(h, _)| *h) + .collect::>(); + + start_indices[table] += num_ctls[table]; + + z_indices[table] += 1; + ctl_vars_per_table[table].push(Self { + helper_columns, local_z: *looking_z, next_z: *looking_z_next, challenges, - columns: &table.columns, - filter: &table.filter, + columns, + filter, }); } - let (looked_z, looked_z_next) = ctl_zs[looked_table.table as usize].next().unwrap(); + let (looked_z, looked_z_next) = ctl_zs[looked_table.table as usize] + [total_num_helper_cols_by_table[looked_table.table as usize] + + z_indices[looked_table.table as usize]]; + + z_indices[looked_table.table as usize] += 1; + + let columns = vec![&looked_table.columns[..]]; + let filter = vec![looked_table.filter.clone()]; ctl_vars_per_table[looked_table.table as usize].push(Self { + helper_columns: vec![], local_z: *looked_z, next_z: *looked_z_next, challenges, - columns: &looked_table.columns, - filter: &looked_table.filter, + columns, + filter, }); } } @@ -832,6 +1113,61 @@ impl<'a, F: RichField + Extendable, const D: usize> } } +/// Given data associated to a lookup (either a CTL or a range-check), check the associated helper polynomials. +pub(crate) fn eval_helper_columns( + filter: &[Option>], + columns: &[Vec
<P>],
+    local_values: &[P],
+    next_values: &[P],
+    helper_columns: &[P],
+    constraint_degree: usize,
+    challenges: &GrandProductChallenge<F>,
+    consumer: &mut ConstraintConsumer<P>
, +) where + F: RichField + Extendable, + FE: FieldExtension, + P: PackedField, +{ + if !helper_columns.is_empty() { + for (j, chunk) in columns.chunks(constraint_degree - 1).enumerate() { + let fs = + &filter[(constraint_degree - 1) * j..(constraint_degree - 1) * j + chunk.len()]; + let h = helper_columns[j]; + + match chunk.len() { + 2 => { + let combin0 = challenges.combine(&chunk[0]); + let combin1 = challenges.combine(chunk[1].iter()); + + let f0 = if let Some(filter0) = &fs[0] { + filter0.eval_filter(local_values, next_values) + } else { + P::ONES + }; + let f1 = if let Some(filter1) = &fs[1] { + filter1.eval_filter(local_values, next_values) + } else { + P::ONES + }; + + consumer.constraint(combin1 * combin0 * h - f0 * combin1 - f1 * combin0); + } + 1 => { + let combin = challenges.combine(&chunk[0]); + let f0 = if let Some(filter1) = &fs[0] { + filter1.eval_filter(local_values, next_values) + } else { + P::ONES + }; + consumer.constraint(combin * h - f0); + } + + _ => todo!("Allow other constraint degrees"), + } + } + } +} + /// Checks the cross-table lookup Z polynomials for each table: /// - Checks that the CTL `Z` partial sums are correctly updated. /// - Checks that the final value of the CTL sum is the combination of all STARKs' CTL polynomials. @@ -843,6 +1179,7 @@ pub(crate) fn eval_cross_table_lookup_checks, ctl_vars: &[CtlCheckVars], consumer: &mut ConstraintConsumer
<P>
, + constraint_degree: usize, ) where F: RichField + Extendable, FE: FieldExtension, @@ -854,6 +1191,7 @@ pub(crate) fn eval_cross_table_lookup_checks>() + }) .collect::>(); - let combined = challenges.combine(evals.iter()); - let local_filter = if let Some(combin) = filter { - combin.eval_filter(local_values, next_values) - } else { - P::ONES - }; - // Check value of `Z(g^(n-1))` - consumer.constraint_last_row(*local_z * combined - local_filter); - // Check `Z(w) = Z(gw) * (filter / combination)` - consumer.constraint_transition((*local_z - *next_z) * combined - local_filter); + // Check helper columns. + eval_helper_columns( + filter, + &evals, + local_values, + next_values, + helper_columns, + constraint_degree, + challenges, + consumer, + ); + + if !helper_columns.is_empty() { + let h_sum = helper_columns.iter().fold(P::ZEROS, |acc, x| acc + *x); + // Check value of `Z(g^(n-1))` + consumer.constraint_last_row(*local_z - h_sum); + // Check `Z(w) = Z(gw) + \sum h_i` + consumer.constraint_transition(*local_z - *next_z - h_sum); + } else if columns.len() > 1 { + let combin0 = challenges.combine(&evals[0]); + let combin1 = challenges.combine(&evals[1]); + + let f0 = if let Some(filter0) = &filter[0] { + filter0.eval_filter(local_values, next_values) + } else { + P::ONES + }; + let f1 = if let Some(filter1) = &filter[1] { + filter1.eval_filter(local_values, next_values) + } else { + P::ONES + }; + + consumer + .constraint_last_row(combin0 * combin1 * *local_z - f0 * combin1 - f1 * combin0); + consumer.constraint_transition( + combin0 * combin1 * (*local_z - *next_z) - f0 * combin1 - f1 * combin0, + ); + } else { + let combin0 = challenges.combine(&evals[0]); + let f0 = if let Some(filter0) = &filter[0] { + filter0.eval_filter(local_values, next_values) + } else { + P::ONES + }; + consumer.constraint_last_row(combin0 * *local_z - f0); + consumer.constraint_transition(combin0 * (*local_z - *next_z) - f0); + } } } /// Circuit version of `CtlCheckVars`. Data necessary to check the cross-table lookups of a given table. #[derive(Clone)] -pub(crate) struct CtlCheckVarsTarget<'a, F: Field, const D: usize> { +pub(crate) struct CtlCheckVarsTarget { + ///Evaluation of the helper columns to check that the Z polyomial + /// was constructed correctly. + pub(crate) helper_columns: Vec>, /// Evaluation of the trace polynomials at point `zeta`. pub(crate) local_z: ExtensionTarget, /// Evaluation of the trace polynomials at point `g * zeta`. @@ -890,12 +1273,12 @@ pub(crate) struct CtlCheckVarsTarget<'a, F: Field, const D: usize> { /// Cross-table lookup challenges. pub(crate) challenges: GrandProductChallenge, /// Column linear combinations of the `CrossTableLookup`s. - pub(crate) columns: &'a [Column], + pub(crate) columns: Vec>>, /// Filter that evaluates to either 1 or 0. - pub(crate) filter: &'a Option>, + pub(crate) filter: Vec>>, } -impl<'a, F: Field, const D: usize> CtlCheckVarsTarget<'a, F, D> { +impl<'a, F: Field, const D: usize> CtlCheckVarsTarget { /// Circuit version of `from_proofs`. Extracts the `CtlCheckVarsTarget` for each STARK. pub(crate) fn from_proof( table: Table, @@ -903,6 +1286,8 @@ impl<'a, F: Field, const D: usize> CtlCheckVarsTarget<'a, F, D> { cross_table_lookups: &'a [CrossTableLookup], ctl_challenges: &'a GrandProductChallengeSet, num_lookup_columns: usize, + total_num_helper_columns: usize, + num_helper_ctl_columns: &[usize], ) -> Vec { // Get all cross-table lookup polynomial openings for each STARK proof. 
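+        // For a single proof these are the `auxiliary_polys` openings with the
+        // `num_lookup_columns` lookup helpers skipped: all CTL helper columns
+        // come first, followed by all CTL Z columns.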
let mut ctl_zs = { @@ -912,47 +1297,145 @@ impl<'a, F: Field, const D: usize> CtlCheckVarsTarget<'a, F, D> { .auxiliary_polys_next .iter() .skip(num_lookup_columns); - ctl_zs.zip(ctl_zs_next) + ctl_zs.zip(ctl_zs_next).collect::>() }; // Put each cross-table lookup polynomial into the correct table data: if a CTL polynomial is extracted from looking/looked table t, then we add it to the `CtlCheckVars` of table t. + let mut z_index = 0; + let mut start_index = 0; let mut ctl_vars = vec![]; - for CrossTableLookup { - looking_tables, - looked_table, - } in cross_table_lookups + for ( + i, + CrossTableLookup { + looking_tables, + looked_table, + }, + ) in cross_table_lookups.iter().enumerate() { for &challenges in &ctl_challenges.challenges { - for looking_table in looking_tables { + // Group looking tables by `Table`, since we bundle the looking tables taken from the same `Table` together thanks to helper columns. + + let count = looking_tables + .iter() + .filter(|looking_table| looking_table.table == table) + .count(); + let cols_filts = looking_tables.iter().filter_map(|looking_table| { if looking_table.table == table { - let (looking_z, looking_z_next) = ctl_zs.next().unwrap(); - ctl_vars.push(Self { - local_z: *looking_z, - next_z: *looking_z_next, - challenges, - columns: &looking_table.columns, - filter: &looking_table.filter, - }); + Some((&looking_table.columns, &looking_table.filter)) + } else { + None } + }); + if count > 0 { + let mut columns = Vec::with_capacity(count); + let mut filter = Vec::with_capacity(count); + for (col, filt) in cols_filts { + columns.push(col.clone()); + filter.push(filt.clone()); + } + let (looking_z, looking_z_next) = ctl_zs[total_num_helper_columns + z_index]; + let helper_columns = ctl_zs + [start_index..start_index + num_helper_ctl_columns[i]] + .iter() + .map(|(&h, _)| h) + .collect::>(); + + start_index += num_helper_ctl_columns[i]; + z_index += 1; + // let columns = group.0.clone(); + // let filter = group.1.clone(); + ctl_vars.push(Self { + helper_columns, + local_z: *looking_z, + next_z: *looking_z_next, + challenges, + columns, + filter, + }); } if looked_table.table == table { - let (looked_z, looked_z_next) = ctl_zs.next().unwrap(); + let (looked_z, looked_z_next) = ctl_zs[total_num_helper_columns + z_index]; + z_index += 1; + + let columns = vec![looked_table.columns.clone()]; + let filter = vec![looked_table.filter.clone()]; ctl_vars.push(Self { + helper_columns: vec![], local_z: *looked_z, next_z: *looked_z_next, challenges, - columns: &looked_table.columns, - filter: &looked_table.filter, + columns, + filter, }); } } } - assert!(ctl_zs.next().is_none()); + ctl_vars } } +/// Circuit version of `eval_helper_columns`. +/// Given data associated to a lookup (either a CTL or a range-check), check the associated helper polynomials. 
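+/// For a chunk of two columns whose challenge-combined values are `c_0` and `c_1`,
+/// with filters `f_0` and `f_1`, the helper column `h` holds `f_0/c_0 + f_1/c_1`;
+/// this is enforced without divisions as `h * c_0 * c_1 - f_0 * c_1 - f_1 * c_0 = 0`.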
+pub(crate) fn eval_helper_columns_circuit, const D: usize>( + builder: &mut CircuitBuilder, + filter: &[Option>], + columns: &[Vec>], + local_values: &[ExtensionTarget], + next_values: &[ExtensionTarget], + helper_columns: &[ExtensionTarget], + constraint_degree: usize, + challenges: &GrandProductChallenge, + consumer: &mut RecursiveConstraintConsumer, +) { + if !helper_columns.is_empty() { + for (j, chunk) in columns.chunks(constraint_degree - 1).enumerate() { + let fs = + &filter[(constraint_degree - 1) * j..(constraint_degree - 1) * j + chunk.len()]; + let h = helper_columns[j]; + + let one = builder.one_extension(); + match chunk.len() { + 2 => { + let combin0 = challenges.combine_circuit(builder, &chunk[0]); + let combin1 = challenges.combine_circuit(builder, &chunk[1]); + + let f0 = if let Some(filter0) = &fs[0] { + filter0.eval_filter_circuit(builder, local_values, next_values) + } else { + one + }; + let f1 = if let Some(filter1) = &fs[1] { + filter1.eval_filter_circuit(builder, local_values, next_values) + } else { + one + }; + + let constr = builder.mul_sub_extension(combin0, h, f0); + let constr = builder.mul_extension(constr, combin1); + let f1_constr = builder.mul_extension(f1, combin0); + let constr = builder.sub_extension(constr, f1_constr); + + consumer.constraint(builder, constr); + } + 1 => { + let combin = challenges.combine_circuit(builder, &chunk[0]); + let f0 = if let Some(filter1) = &fs[0] { + filter1.eval_filter_circuit(builder, local_values, next_values) + } else { + one + }; + let constr = builder.mul_sub_extension(combin, h, f0); + consumer.constraint(builder, constr); + } + + _ => todo!("Allow other constraint degrees"), + } + } + } +} + /// Circuit version of `eval_cross_table_lookup_checks`. Checks the cross-table lookup Z polynomials for each table: /// - Checks that the CTL `Z` partial sums are correctly updated. /// - Checks that the final value of the CTL sum is the combination of all STARKs' CTL polynomials. @@ -969,12 +1452,16 @@ pub(crate) fn eval_cross_table_lookup_checks_circuit< vars: &S::EvaluationFrameTarget, ctl_vars: &[CtlCheckVarsTarget], consumer: &mut RecursiveConstraintConsumer, + constraint_degree: usize, ) { let local_values = vars.get_local_values(); let next_values = vars.get_next_values(); + let one = builder.one_extension(); + for lookup_vars in ctl_vars { let CtlCheckVarsTarget { + helper_columns, local_z, next_z, challenges, @@ -982,29 +1469,77 @@ pub(crate) fn eval_cross_table_lookup_checks_circuit< filter, } = lookup_vars; - let one = builder.one_extension(); - let local_filter = if let Some(combin) = filter { - combin.eval_filter_circuit(builder, local_values, next_values) - } else { - one - }; - // Compute all linear combinations on the current table, and combine them using the challenge. let evals = columns .iter() - .map(|c| c.eval_with_next_circuit(builder, local_values, next_values)) + .map(|col| { + col.iter() + .map(|c| c.eval_with_next_circuit(builder, local_values, next_values)) + .collect::>() + }) .collect::>(); - let combined = challenges.combine_circuit(builder, &evals); + // Check helper columns. 
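+        // Each helper column batches up to `constraint_degree - 1` filtered
+        // `1/combination` terms, so the Z constraints below only need the row-wise
+        // sum of the helpers: `Z(g^(n-1)) = \sum h_i` on the last row, and
+        // `Z(w) - Z(gw) = \sum h_i(w)` on transitions.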
+        eval_helper_columns_circuit(
+            builder,
+            filter,
+            &evals,
+            local_values,
+            next_values,
+            helper_columns,
+            constraint_degree,
+            challenges,
+            consumer,
+        );
 
-        // Check value of `Z(g^(n-1))`
-        let last_row = builder.mul_sub_extension(*local_z, combined, local_filter);
-        consumer.constraint_last_row(builder, last_row);
-        // Check `Z(w) = Z(gw) * (filter / combination)`
         let z_diff = builder.sub_extension(*local_z, *next_z);
-        let lhs = builder.mul_extension(combined, z_diff);
-        let transition = builder.sub_extension(lhs, local_filter);
-        consumer.constraint_transition(builder, transition);
+        if !helper_columns.is_empty() {
+            // Check value of `Z(g^(n-1))`
+            let h_sum = builder.add_many_extension(helper_columns);
+
+            let last_row = builder.sub_extension(*local_z, h_sum);
+            consumer.constraint_last_row(builder, last_row);
+            // Check `Z(w) = Z(gw) + \sum h_i`
+
+            let transition = builder.sub_extension(z_diff, h_sum);
+            consumer.constraint_transition(builder, transition);
+        } else if columns.len() > 1 {
+            let combin0 = challenges.combine_circuit(builder, &evals[0]);
+            let combin1 = challenges.combine_circuit(builder, &evals[1]);
+
+            let f0 = if let Some(filter0) = &filter[0] {
+                filter0.eval_filter_circuit(builder, local_values, next_values)
+            } else {
+                one
+            };
+            let f1 = if let Some(filter1) = &filter[1] {
+                filter1.eval_filter_circuit(builder, local_values, next_values)
+            } else {
+                one
+            };
+
+            let combined = builder.mul_sub_extension(combin1, *local_z, f1);
+            let combined = builder.mul_extension(combined, combin0);
+            let constr = builder.arithmetic_extension(F::NEG_ONE, F::ONE, f0, combin1, combined);
+            consumer.constraint_last_row(builder, constr);
+
+            let combined = builder.mul_sub_extension(combin1, z_diff, f1);
+            let combined = builder.mul_extension(combined, combin0);
+            let constr = builder.arithmetic_extension(F::NEG_ONE, F::ONE, f0, combin1, combined);
+            consumer.constraint_transition(builder, constr);
+        } else {
+            let combin0 = challenges.combine_circuit(builder, &evals[0]);
+            let f0 = if let Some(filter0) = &filter[0] {
+                filter0.eval_filter_circuit(builder, local_values, next_values)
+            } else {
+                one
+            };
+
+            let constr = builder.mul_sub_extension(combin0, *local_z, f0);
+            consumer.constraint_last_row(builder, constr);
+            let constr = builder.mul_sub_extension(combin0, z_diff, f0);
+            consumer.constraint_transition(builder, constr);
+        }
     }
 }
 
@@ -1026,11 +1561,19 @@ pub(crate) fn verify_cross_table_lookups<F: RichField + Extendable<D>, const D:
     {
         // Get elements looking into `looked_table` that are not associated to any STARK.
         let extra_sum_vec = &ctl_extra_looking_sums[looked_table.table as usize];
+        // We want to iterate on each looking table only once.
+        let mut filtered_looking_tables = vec![];
+        for table in looking_tables {
+            if !filtered_looking_tables.contains(&(table.table as usize)) {
+                filtered_looking_tables.push(table.table as usize);
+            }
+        }
         for c in 0..config.num_challenges {
             // Compute the combination of all looking table CTL polynomial openings.
-            let looking_zs_sum = looking_tables
+
+            let looking_zs_sum = filtered_looking_tables
                 .iter()
-                .map(|table| *ctl_zs_openings[table.table as usize].next().unwrap())
+                .map(|&table| *ctl_zs_openings[table].next().unwrap())
                 .sum::<F>()
                 + extra_sum_vec[c];
 
@@ -1065,12 +1608,19 @@ pub(crate) fn verify_cross_table_lookups_circuit<F: RichField + Extendable<D>, c
     {
         // Get elements looking into `looked_table` that are not associated to any STARK.
         let extra_sum_vec = &ctl_extra_looking_sums[looked_table.table as usize];
+        // We want to iterate on each looking table only once.
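+        // Looking tables taken from the same `Table` are bundled into a single Z
+        // column per challenge, so summing over the raw `looking_tables` list
+        // would read the same opening several times.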
+ let mut filtered_looking_tables = vec![]; + for table in looking_tables { + if !filtered_looking_tables.contains(&(table.table as usize)) { + filtered_looking_tables.push(table.table as usize); + } + } for c in 0..inner_config.num_challenges { // Compute the combination of all looking table CTL polynomial openings. let mut looking_zs_sum = builder.add_many( - looking_tables + filtered_looking_tables .iter() - .map(|table| *ctl_zs_openings[table.table as usize].next().unwrap()), + .map(|&table| *ctl_zs_openings[table].next().unwrap()), ); looking_zs_sum = builder.add(looking_zs_sum, extra_sum_vec[c]); diff --git a/evm/src/keccak/keccak_stark.rs b/evm/src/keccak/keccak_stark.rs index b3e31093..63098e79 100644 --- a/evm/src/keccak/keccak_stark.rs +++ b/evm/src/keccak/keccak_stark.rs @@ -740,13 +740,14 @@ mod tests { // Fake CTL data. let ctl_z_data = CtlZData { + helper_columns: vec![PolynomialValues::zero(degree)], z: PolynomialValues::zero(degree), challenge: GrandProductChallenge { beta: F::ZERO, gamma: F::ZERO, }, columns: vec![], - filter: Some(Filter::new_simple(Column::constant(F::ZERO))), + filter: vec![Some(Filter::new_simple(Column::constant(F::ZERO)))], }; let ctl_data = CtlData { zs_columns: vec![ctl_z_data.clone(); config.num_challenges], diff --git a/evm/src/lookup.rs b/evm/src/lookup.rs index 8bf0c8f6..4b735c64 100644 --- a/evm/src/lookup.rs +++ b/evm/src/lookup.rs @@ -12,7 +12,10 @@ use plonky2::plonk::circuit_builder::CircuitBuilder; use plonky2_util::ceil_div_usize; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; -use crate::cross_table_lookup::{Column, Filter}; +use crate::cross_table_lookup::{ + eval_helper_columns, eval_helper_columns_circuit, get_helper_cols, Column, Filter, + GrandProductChallenge, +}; use crate::evaluation_frame::StarkEvaluationFrame; use crate::stark::Stark; @@ -64,48 +67,38 @@ pub(crate) fn lookup_helper_columns( let num_helper_columns = lookup.num_helper_columns(constraint_degree); let mut helper_columns: Vec> = Vec::with_capacity(num_helper_columns); + let looking_cols = lookup + .columns + .iter() + .map(|col| vec![col.clone()]) + .collect::>>>(); + + let grand_challenge = GrandProductChallenge { + beta: F::ONE, + gamma: challenge, + }; + + let columns_filters = looking_cols + .iter() + .zip(lookup.filter_columns.iter()) + .map(|(col, filter)| (&col[..], filter)) + .collect::>(); // For each batch of `constraint_degree-1` columns `fi`, compute `sum 1/(f_i+challenge)` and // add it to the helper columns. - // TODO: This does one batch inversion per column. It would also be possible to do one batch inversion - // for every group of columns, but that would require building a big vector of all the columns concatenated. - // Not sure which approach is better. // Note: these are the h_k(x) polynomials in the paper, with a few differences: // * Here, the first ratio m_0(x)/phi_0(x) is not included with the columns batched up to create the // h_k polynomials; instead there's a separate helper column for it (see below). // * Here, we use 1 instead of -1 as the numerator (and subtract later). // * Here, for now, the batch size (l) is always constraint_degree - 1 = 2. 
- for (i, mut col_inds) in (&lookup.columns.iter().chunks(constraint_degree - 1)) - .into_iter() - .enumerate() - { - let first = col_inds.next().unwrap(); - - let mut column = first.eval_all_rows(trace_poly_values); - let length = column.len(); - - for x in column.iter_mut() { - *x = challenge + *x; - } - let mut acc = F::batch_multiplicative_inverse(&column); - if let Some(filter) = &lookup.filter_columns[0] { - batch_multiply_inplace(&mut acc, &filter.eval_all_rows(trace_poly_values)); - } - - for (j, ind) in col_inds.enumerate() { - let mut column = ind.eval_all_rows(trace_poly_values); - for x in column.iter_mut() { - *x = challenge + *x; - } - column = F::batch_multiplicative_inverse(&column); - let filter_idx = (constraint_degree - 1) * i + j + 1; - if let Some(filter) = &lookup.filter_columns[filter_idx] { - batch_multiply_inplace(&mut column, &filter.eval_all_rows(trace_poly_values)); - } - batch_add_inplace(&mut acc, &column); - } - - helper_columns.push(acc.into()); - } + // * Here, there are filters for the columns, to only select some rows + // in a given column. + let mut helper_columns = get_helper_cols( + trace_poly_values, + trace_poly_values[0].len(), + &columns_filters, + grand_challenge, + constraint_degree, + ); // Add `1/(table+challenge)` to the helper columns. // This is 1/phi_0(x) = 1/(x + t(x)) from the paper. @@ -168,38 +161,30 @@ pub(crate) fn eval_packed_lookups_generic>>(); + // For each chunk, check that `h_i (x+f_2i) (x+f_{2i+1}) = (x+f_2i) * filter_{2i+1} + (x+f_{2i+1}) * filter_2i` if the chunk has length 2 // or if it has length 1, check that `h_i * (x+f_2i) = filter_2i`, where x is the challenge - for (j, chunk) in lookup.columns.chunks(degree - 1).enumerate() { - let mut x = lookup_vars.local_values[start + j]; - let mut y = P::ZEROS; - let col_values = chunk - .iter() - .map(|col| col.eval_with_next(local_values, next_values)); - let filters = lookup.filter_columns - [(degree - 1) * j..(degree - 1) * j + chunk.len()] - .iter() - .map(|maybe_filter| { - if let Some(filter) = maybe_filter { - filter.eval_filter(local_values, next_values) - } else { - P::ONES - } - }) - .rev() - .collect::>(); - let last_filter_value = filters[0]; - for (val, f) in col_values.zip_eq(filters) { - x *= val + challenge; - y += (val + challenge) * f; - } - match chunk.len() { - 2 => yield_constr.constraint(x - y), - 1 => yield_constr.constraint(x - last_filter_value), - _ => todo!("Allow other constraint degrees."), - } - } + eval_helper_columns( + &lookup.filter_columns, + &lookup_columns, + local_values, + next_values, + &lookup_vars.local_values[start..start + num_helper_columns - 1], + degree, + &grand_challenge, + yield_constr, + ); + + let challenge = FE::from_basefield(challenge); // Check the `Z` polynomial. 
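+            // Toy instance of the identity being accumulated: looking values
+            // (1, 2, 2) against table (1, 2) with multiplicities (1, 2) satisfy
+            //     1/(1+a) + 1/(2+a) + 1/(2+a) == 1/(1+a) + 2/(2+a)
+            // for any challenge a; `z` sums the left-hand side through the helper
+            // columns and the multiplicity-weighted table term is subtracted.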
let z = lookup_vars.local_values[start + num_helper_columns - 1]; @@ -245,45 +230,30 @@ pub(crate) fn eval_ext_lookups_circuit< let mut start = 0; for lookup in lookups { let num_helper_columns = lookup.num_helper_columns(degree); + let col_values = lookup + .columns + .iter() + .map(|col| vec![col.eval_with_next_circuit(builder, local_values, next_values)]) + .collect::>(); + for &challenge in &lookup_vars.challenges { + let grand_challenge = GrandProductChallenge { + beta: builder.one(), + gamma: challenge, + }; + + eval_helper_columns_circuit( + builder, + &lookup.filter_columns, + &col_values, + local_values, + next_values, + &lookup_vars.local_values[start..start + num_helper_columns - 1], + degree, + &grand_challenge, + yield_constr, + ); let challenge = builder.convert_to_ext(challenge); - for (j, chunk) in lookup.columns.chunks(degree - 1).enumerate() { - let mut x = lookup_vars.local_values[start + j]; - let mut y = builder.zero_extension(); - let col_values = chunk - .iter() - .map(|k| k.eval_with_next_circuit(builder, local_values, next_values)) - .collect::>(); - let filters = lookup.filter_columns - [(degree - 1) * j..(degree - 1) * j + chunk.len()] - .iter() - .map(|maybe_filter| { - if let Some(filter) = maybe_filter { - filter.eval_filter_circuit(builder, local_values, next_values) - } else { - one - } - }) - .rev() - .collect::>(); - let last_filter_value = filters[0]; - for (&val, f) in col_values.iter().zip_eq(filters) { - let tmp = builder.add_extension(val, challenge); - x = builder.mul_extension(x, tmp); - y = builder.mul_add_extension(f, tmp, y); - } - match chunk.len() { - 2 => { - let tmp = builder.sub_extension(x, y); - yield_constr.constraint(builder, tmp) - } - 1 => { - let tmp = builder.sub_extension(x, last_filter_value); - yield_constr.constraint(builder, tmp) - } - _ => todo!("Allow other constraint degrees."), - } - } let z = lookup_vars.local_values[start + num_helper_columns - 1]; let next_z = lookup_vars.next_values[start + num_helper_columns - 1]; diff --git a/evm/src/memory/memory_stark.rs b/evm/src/memory/memory_stark.rs index e596f421..1283965b 100644 --- a/evm/src/memory/memory_stark.rs +++ b/evm/src/memory/memory_stark.rs @@ -305,6 +305,10 @@ impl, const D: usize> Stark for MemoryStark, const D: usize> Stark for MemoryStark "SEGMENT_TRIE_DATA", Segment::ShiftTable => "SEGMENT_SHIFT_TABLE", Segment::JumpdestBits => "SEGMENT_JUMPDEST_BITS", - Segment::EcdsaTable => "SEGMENT_KERNEL_ECDSA_TABLE", + Segment::EcdsaTable => "SEGMENT_ECDSA_TABLE", Segment::BnWnafA => "SEGMENT_BN_WNAF_A", Segment::BnWnafB => "SEGMENT_BN_WNAF_B", Segment::BnTableQ => "SEGMENT_BN_TABLE_Q", - Segment::BnPairing => "SEGMENT_KERNEL_BN_PAIRING", + Segment::BnPairing => "SEGMENT_BN_PAIRING", Segment::AccessedAddresses => "SEGMENT_ACCESSED_ADDRESSES", Segment::AccessedStorageKeys => "SEGMENT_ACCESSED_STORAGE_KEYS", Segment::SelfDestructList => "SEGMENT_SELFDESTRUCT_LIST", diff --git a/evm/src/proof.rs b/evm/src/proof.rs index 3768f98f..4cc18677 100644 --- a/evm/src/proof.rs +++ b/evm/src/proof.rs @@ -974,7 +974,10 @@ impl, const D: usize> StarkOpeningSet { auxiliary_polys_commitment: &PolynomialBatch, quotient_commitment: &PolynomialBatch, num_lookup_columns: usize, + num_ctl_polys: &[usize], ) -> Self { + let total_num_helper_cols: usize = num_ctl_polys.iter().sum(); + // Batch evaluates polynomials on the LDE, at a point `z`. 
let eval_commitment = |z: F::Extension, c: &PolynomialBatch| { c.polynomials @@ -989,6 +992,9 @@ impl, const D: usize> StarkOpeningSet { .map(|p| p.eval(z)) .collect::>() }; + + let auxiliary_first = eval_commitment_base(F::ONE, auxiliary_polys_commitment); + let ctl_zs_first = auxiliary_first[num_lookup_columns + total_num_helper_cols..].to_vec(); // `g * zeta`. let zeta_next = zeta.scalar_mul(g); Self { @@ -996,9 +1002,7 @@ impl, const D: usize> StarkOpeningSet { next_values: eval_commitment(zeta_next, trace_commitment), auxiliary_polys: eval_commitment(zeta, auxiliary_polys_commitment), auxiliary_polys_next: eval_commitment(zeta_next, auxiliary_polys_commitment), - ctl_zs_first: eval_commitment_base(F::ONE, auxiliary_polys_commitment) - [num_lookup_columns..] - .to_vec(), + ctl_zs_first, quotient_polys: eval_commitment(zeta, quotient_commitment), } } @@ -1051,7 +1055,7 @@ pub(crate) struct StarkOpeningSetTarget { pub auxiliary_polys: Vec>, /// `ExtensionTarget`s for the opening of lookups and cross-table lookups `Z` polynomials at `g * zeta`. pub auxiliary_polys_next: Vec>, - /// /// `ExtensionTarget`s for the opening of lookups and cross-table lookups `Z` polynomials at 1. + /// `ExtensionTarget`s for the opening of lookups and cross-table lookups `Z` polynomials at 1. pub ctl_zs_first: Vec, /// `ExtensionTarget`s for the opening of quotient polynomials at `zeta`. pub quotient_polys: Vec>, diff --git a/evm/src/prover.rs b/evm/src/prover.rs index c90490b8..e6426023 100644 --- a/evm/src/prover.rs +++ b/evm/src/prover.rs @@ -135,6 +135,7 @@ where &trace_poly_values, &all_stark.cross_table_lookups, &ctl_challenges, + all_stark.arithmetic_stark.constraint_degree() ) ); @@ -375,9 +376,14 @@ where // We add CTLs to the permutation arguments so that we can batch commit to // all auxiliary polynomials. let auxiliary_polys = match lookup_helper_columns { - None => ctl_data.z_polys(), + None => { + let mut ctl_polys = ctl_data.ctl_helper_polys(); + ctl_polys.extend(ctl_data.ctl_z_polys()); + ctl_polys + } Some(mut lookup_columns) => { - lookup_columns.extend(ctl_data.z_polys()); + lookup_columns.extend(ctl_data.ctl_helper_polys()); + lookup_columns.extend(ctl_data.ctl_z_polys()); lookup_columns } }; @@ -402,6 +408,8 @@ where let alphas = challenger.get_n_challenges(config.num_challenges); + let num_ctl_polys = ctl_data.num_ctl_helper_polys(); + #[cfg(test)] { check_constraints( @@ -414,6 +422,7 @@ where alphas.clone(), degree_bits, num_lookup_columns, + &num_ctl_polys, ); } @@ -432,6 +441,7 @@ where alphas, degree_bits, num_lookup_columns, + &num_ctl_polys, config, ) ); @@ -486,6 +496,7 @@ where &auxiliary_polys_commitment, "ient_commitment, stark.num_lookup_helper_columns(config), + &num_ctl_polys, ); // Get the FRI openings and observe them. 
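+    // These include `ctl_zs_first`, the CTL Z columns opened at 1 (the first
+    // subgroup element), which `verify_cross_table_lookups` later sums across
+    // tables.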
challenger.observe_openings(&openings.to_fri_openings()); @@ -502,7 +513,7 @@ where timing, "compute openings proof", PolynomialBatch::prove_openings( - &stark.fri_instance(zeta, g, ctl_data.len(), config), + &stark.fri_instance(zeta, g, num_ctl_polys.iter().sum(), num_ctl_polys, config), &initial_merkle_trees, challenger, &fri_params, @@ -535,6 +546,7 @@ fn compute_quotient_polys<'a, F, P, C, S, const D: usize>( alphas: Vec, degree_bits: usize, num_lookup_columns: usize, + num_ctl_columns: &[usize], config: &StarkConfig, ) -> Vec> where @@ -545,6 +557,7 @@ where { let degree = 1 << degree_bits; let rate_bits = config.fri_config.rate_bits; + let total_num_helper_cols: usize = num_ctl_columns.iter().sum(); let quotient_degree_bits = log2_ceil(stark.quotient_degree_factor()); assert!( @@ -616,18 +629,35 @@ where // - for each CTL: // - the filter `Column` // - the `Column`s that form the looking/looked table. + + let mut start_index = 0; let ctl_vars = ctl_data .zs_columns .iter() .enumerate() - .map(|(i, zs_columns)| CtlCheckVars:: { - local_z: auxiliary_polys_commitment.get_lde_values_packed(i_start, step) - [num_lookup_columns + i], - next_z: auxiliary_polys_commitment.get_lde_values_packed(i_next_start, step) - [num_lookup_columns + i], - challenges: zs_columns.challenge, - columns: &zs_columns.columns, - filter: &zs_columns.filter, + .map(|(i, zs_columns)| { + let num_ctl_helper_cols = num_ctl_columns[i]; + let helper_columns = auxiliary_polys_commitment + .get_lde_values_packed(i_start, step)[num_lookup_columns + + start_index + ..num_lookup_columns + start_index + num_ctl_helper_cols] + .to_vec(); + + let ctl_vars = CtlCheckVars:: { + helper_columns, + local_z: auxiliary_polys_commitment.get_lde_values_packed(i_start, step) + [num_lookup_columns + total_num_helper_cols + i], + next_z: auxiliary_polys_commitment + .get_lde_values_packed(i_next_start, step) + [num_lookup_columns + total_num_helper_cols + i], + challenges: zs_columns.challenge, + columns: zs_columns.columns.clone(), + filter: zs_columns.filter.clone(), + }; + + start_index += num_ctl_helper_cols; + + ctl_vars }) .collect::>(); @@ -691,6 +721,7 @@ fn check_constraints<'a, F, C, S, const D: usize>( alphas: Vec, degree_bits: usize, num_lookup_columns: usize, + num_ctl_helper_cols: &[usize], ) where F: RichField + Extendable, C: GenericConfig, @@ -699,6 +730,8 @@ fn check_constraints<'a, F, C, S, const D: usize>( let degree = 1 << degree_bits; let rate_bits = 0; // Set this to higher value to check constraint degree. + let total_num_helper_cols: usize = num_ctl_helper_cols.iter().sum(); + let size = degree << rate_bits; let step = 1 << rate_bits; @@ -754,18 +787,34 @@ fn check_constraints<'a, F, C, S, const D: usize>( }); // Get the local and next row evaluations for the current STARK's CTL Z polynomials. 
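+        // The layout of `auxiliary_subgroup_evals` is assumed to be
+        // [lookup helpers | CTL helpers | CTL Zs], hence the
+        // `num_lookup_columns + total_num_helper_cols + iii` offset for the Zs.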
+ let mut start_index = 0; let ctl_vars = ctl_data .zs_columns .iter() .enumerate() - .map(|(iii, zs_columns)| CtlCheckVars:: { - local_z: auxiliary_subgroup_evals[i][num_lookup_columns + iii], - next_z: auxiliary_subgroup_evals[i_next][num_lookup_columns + iii], - challenges: zs_columns.challenge, - columns: &zs_columns.columns, - filter: &zs_columns.filter, + .map(|(iii, zs_columns)| { + let num_helper_cols = num_ctl_helper_cols[iii]; + let helper_columns = auxiliary_subgroup_evals[i][num_lookup_columns + + start_index + ..num_lookup_columns + start_index + num_helper_cols] + .to_vec(); + let ctl_vars = CtlCheckVars:: { + helper_columns, + local_z: auxiliary_subgroup_evals[i] + [num_lookup_columns + total_num_helper_cols + iii], + next_z: auxiliary_subgroup_evals[i_next] + [num_lookup_columns + total_num_helper_cols + iii], + challenges: zs_columns.challenge, + columns: zs_columns.columns.clone(), + filter: zs_columns.filter.clone(), + }; + + start_index += num_helper_cols; + + ctl_vars }) .collect::>(); + // Evaluate the polynomial combining all constraints, including those associated // to the permutation and CTL arguments. eval_vanishing_poly::( diff --git a/evm/src/recursive_verifier.rs b/evm/src/recursive_verifier.rs index 5c10e3b3..8d10b498 100644 --- a/evm/src/recursive_verifier.rs +++ b/evm/src/recursive_verifier.rs @@ -229,10 +229,24 @@ where let zero_target = builder.zero(); let num_lookup_columns = stark.num_lookup_helper_columns(inner_config); - let num_ctl_zs = - CrossTableLookup::num_ctl_zs(cross_table_lookups, table, inner_config.num_challenges); - let proof_target = - add_virtual_stark_proof(&mut builder, stark, inner_config, degree_bits, num_ctl_zs); + let (total_num_helpers, num_ctl_zs, num_helpers_by_ctl) = + CrossTableLookup::num_ctl_helpers_zs_all( + cross_table_lookups, + table, + inner_config.num_challenges, + stark.constraint_degree(), + ); + let num_ctl_helper_zs = num_ctl_zs + total_num_helpers; + + let proof_target = add_virtual_stark_proof( + &mut builder, + stark, + inner_config, + degree_bits, + num_ctl_helper_zs, + num_ctl_zs, + ); + builder.register_public_inputs( &proof_target .trace_cap @@ -257,6 +271,8 @@ where cross_table_lookups, &ctl_challenges_target, num_lookup_columns, + total_num_helpers, + &num_helpers_by_ctl, ); let init_challenger_state_target = @@ -330,6 +346,12 @@ fn verify_stark_proof_with_challenges_circuit< let zero = builder.zero(); let one = builder.one_extension(); + let num_ctl_polys = ctl_vars + .iter() + .map(|ctl| ctl.helper_columns.len()) + .sum::(); + let num_ctl_z_polys = ctl_vars.len(); + let StarkOpeningSetTarget { local_values, next_values, @@ -407,6 +429,7 @@ fn verify_stark_proof_with_challenges_circuit< builder, challenges.stark_zeta, F::primitive_root_of_unity(degree_bits), + num_ctl_polys, ctl_zs_first.len(), inner_config, ); @@ -759,6 +782,7 @@ pub(crate) fn add_virtual_stark_proof< stark: &S, config: &StarkConfig, degree_bits: usize, + num_ctl_helper_zs: usize, num_ctl_zs: usize, ) -> StarkProofTarget { let fri_params = config.fri_params(degree_bits); @@ -766,7 +790,7 @@ pub(crate) fn add_virtual_stark_proof< let num_leaves_per_oracle = vec![ S::COLUMNS, - stark.num_lookup_helper_columns(config) + num_ctl_zs, + stark.num_lookup_helper_columns(config) + num_ctl_helper_zs, stark.quotient_degree_factor() * config.num_challenges, ]; @@ -776,7 +800,13 @@ pub(crate) fn add_virtual_stark_proof< trace_cap: builder.add_virtual_cap(cap_height), auxiliary_polys_cap, quotient_polys_cap: builder.add_virtual_cap(cap_height), - 
openings: add_virtual_stark_opening_set::(builder, stark, num_ctl_zs, config), + openings: add_virtual_stark_opening_set::( + builder, + stark, + num_ctl_helper_zs, + num_ctl_zs, + config, + ), opening_proof: builder.add_virtual_fri_proof(&num_leaves_per_oracle, &fri_params), } } @@ -784,6 +814,7 @@ pub(crate) fn add_virtual_stark_proof< fn add_virtual_stark_opening_set, S: Stark, const D: usize>( builder: &mut CircuitBuilder, stark: &S, + num_ctl_helper_zs: usize, num_ctl_zs: usize, config: &StarkConfig, ) -> StarkOpeningSetTarget { @@ -791,10 +822,12 @@ fn add_virtual_stark_opening_set, S: Stark, c StarkOpeningSetTarget { local_values: builder.add_virtual_extension_targets(S::COLUMNS), next_values: builder.add_virtual_extension_targets(S::COLUMNS), - auxiliary_polys: builder - .add_virtual_extension_targets(stark.num_lookup_helper_columns(config) + num_ctl_zs), - auxiliary_polys_next: builder - .add_virtual_extension_targets(stark.num_lookup_helper_columns(config) + num_ctl_zs), + auxiliary_polys: builder.add_virtual_extension_targets( + stark.num_lookup_helper_columns(config) + num_ctl_helper_zs, + ), + auxiliary_polys_next: builder.add_virtual_extension_targets( + stark.num_lookup_helper_columns(config) + num_ctl_helper_zs, + ), ctl_zs_first: builder.add_virtual_targets(num_ctl_zs), quotient_polys: builder .add_virtual_extension_targets(stark.quotient_degree_factor() * num_challenges), diff --git a/evm/src/stark.rs b/evm/src/stark.rs index 6099abdd..6ee26f58 100644 --- a/evm/src/stark.rs +++ b/evm/src/stark.rs @@ -92,7 +92,8 @@ pub trait Stark, const D: usize>: Sync { &self, zeta: F::Extension, g: F, - num_ctl_zs: usize, + num_ctl_helpers: usize, + num_ctl_zs: Vec, config: &StarkConfig, ) -> FriInstanceInfo { let trace_oracle = FriOracleInfo { @@ -102,7 +103,7 @@ pub trait Stark, const D: usize>: Sync { let trace_info = FriPolynomialInfo::from_range(TRACE_ORACLE_INDEX, 0..Self::COLUMNS); let num_lookup_columns = self.num_lookup_helper_columns(config); - let num_auxiliary_polys = num_lookup_columns + num_ctl_zs; + let num_auxiliary_polys = num_lookup_columns + num_ctl_helpers + num_ctl_zs.len(); let auxiliary_oracle = FriOracleInfo { num_polys: num_auxiliary_polys, blinding: false, @@ -110,9 +111,10 @@ pub trait Stark, const D: usize>: Sync { let auxiliary_polys_info = FriPolynomialInfo::from_range(AUXILIARY_ORACLE_INDEX, 0..num_auxiliary_polys); + let mut start_index = num_lookup_columns; let ctl_zs_info = FriPolynomialInfo::from_range( AUXILIARY_ORACLE_INDEX, - num_lookup_columns..num_lookup_columns + num_ctl_zs, + num_lookup_columns + num_ctl_helpers..num_auxiliary_polys, ); let num_quotient_polys = self.num_quotient_polys(config); @@ -152,6 +154,7 @@ pub trait Stark, const D: usize>: Sync { builder: &mut CircuitBuilder, zeta: ExtensionTarget, g: F, + num_ctl_helper_polys: usize, num_ctl_zs: usize, inner_config: &StarkConfig, ) -> FriInstanceInfoTarget { @@ -162,7 +165,7 @@ pub trait Stark, const D: usize>: Sync { let trace_info = FriPolynomialInfo::from_range(TRACE_ORACLE_INDEX, 0..Self::COLUMNS); let num_lookup_columns = self.num_lookup_helper_columns(inner_config); - let num_auxiliary_polys = num_lookup_columns + num_ctl_zs; + let num_auxiliary_polys = num_lookup_columns + num_ctl_helper_polys + num_ctl_zs; let auxiliary_oracle = FriOracleInfo { num_polys: num_auxiliary_polys, blinding: false, @@ -172,7 +175,8 @@ pub trait Stark, const D: usize>: Sync { let ctl_zs_info = FriPolynomialInfo::from_range( AUXILIARY_ORACLE_INDEX, - num_lookup_columns..num_lookup_columns + num_ctl_zs, + 
num_lookup_columns + num_ctl_helper_polys + ..num_lookup_columns + num_ctl_helper_polys + num_ctl_zs, ); let num_quotient_polys = self.num_quotient_polys(inner_config); diff --git a/evm/src/vanishing_poly.rs b/evm/src/vanishing_poly.rs index 15444c7e..c1f2d0f9 100644 --- a/evm/src/vanishing_poly.rs +++ b/evm/src/vanishing_poly.rs @@ -42,7 +42,12 @@ pub(crate) fn eval_vanishing_poly( ); } // Evaluate the STARK constraints related to the cross-table lookups. - eval_cross_table_lookup_checks::(vars, ctl_vars, consumer); + eval_cross_table_lookup_checks::( + vars, + ctl_vars, + consumer, + stark.constraint_degree(), + ); } /// Circuit version of `eval_vanishing_poly`. @@ -66,5 +71,11 @@ pub(crate) fn eval_vanishing_poly_circuit( eval_ext_lookups_circuit::(builder, stark, vars, lookup_vars, consumer); } // Evaluate all of the STARK's constraints related to the cross-table lookups. - eval_cross_table_lookup_checks_circuit::(builder, vars, ctl_vars, consumer); + eval_cross_table_lookup_checks_circuit::( + builder, + vars, + ctl_vars, + consumer, + stark.constraint_degree(), + ); } diff --git a/evm/src/verifier.rs b/evm/src/verifier.rs index b69cd927..9eb12100 100644 --- a/evm/src/verifier.rs +++ b/evm/src/verifier.rs @@ -16,7 +16,8 @@ use crate::constraint_consumer::ConstraintConsumer; use crate::cpu::kernel::aggregator::KERNEL; use crate::cpu::kernel::constants::global_metadata::GlobalMetadata; use crate::cross_table_lookup::{ - verify_cross_table_lookups, CtlCheckVars, GrandProductChallenge, GrandProductChallengeSet, + num_ctl_helper_columns_by_table, verify_cross_table_lookups, CtlCheckVars, + GrandProductChallenge, GrandProductChallengeSet, }; use crate::evaluation_frame::StarkEvaluationFrame; use crate::lookup::LookupCheckVars; @@ -56,11 +57,17 @@ where cross_table_lookups, } = all_stark; + let num_ctl_helper_cols = num_ctl_helper_columns_by_table( + cross_table_lookups, + all_stark.arithmetic_stark.constraint_degree(), + ); + let ctl_vars_per_table = CtlCheckVars::from_proofs( &all_proof.stark_proofs, cross_table_lookups, &ctl_challenges, &num_lookup_columns, + &num_ctl_helper_cols, ); verify_stark_proof_with_challenges( @@ -300,7 +307,12 @@ pub(crate) fn verify_stark_proof_with_challenges< config: &StarkConfig, ) -> Result<()> { log::debug!("Checking proof: {}", type_name::()); - validate_proof_shape(stark, proof, config, ctl_vars.len())?; + let num_ctl_polys = ctl_vars + .iter() + .map(|ctl| ctl.helper_columns.len()) + .sum::(); + let num_ctl_z_polys = ctl_vars.len(); + validate_proof_shape(stark, proof, config, num_ctl_polys, num_ctl_z_polys)?; let StarkOpeningSet { local_values, next_values, @@ -374,11 +386,16 @@ pub(crate) fn verify_stark_proof_with_challenges< proof.quotient_polys_cap.clone(), ]; + let num_ctl_zs = ctl_vars + .iter() + .map(|ctl| ctl.helper_columns.len()) + .collect::>(); verify_fri_proof::( &stark.fri_instance( challenges.stark_zeta, F::primitive_root_of_unity(degree_bits), - ctl_zs_first.len(), + num_ctl_polys, + num_ctl_zs, config, ), &proof.openings.to_fri_openings(), @@ -395,6 +412,7 @@ fn validate_proof_shape( stark: &S, proof: &StarkProof, config: &StarkConfig, + num_ctl_helpers: usize, num_ctl_zs: usize, ) -> anyhow::Result<()> where @@ -424,7 +442,8 @@ where let degree_bits = proof.recover_degree_bits(config); let fri_params = config.fri_params(degree_bits); let cap_height = fri_params.config.cap_height; - let num_auxiliary = num_ctl_zs + stark.num_lookup_helper_columns(config); + + let num_auxiliary = num_ctl_helpers + stark.num_lookup_helper_columns(config) 
+ num_ctl_zs; ensure!(trace_cap.height() == cap_height); ensure!(auxiliary_polys_cap.height() == cap_height); diff --git a/field/src/extension/mod.rs b/field/src/extension/mod.rs index bbbaca25..3586055e 100644 --- a/field/src/extension/mod.rs +++ b/field/src/extension/mod.rs @@ -15,7 +15,7 @@ pub trait OEF: FieldExtension { // Element W of BaseField, such that `X^d - W` is irreducible over BaseField. const W: Self::BaseField; - // Element of BaseField such that DTH_ROOT^D == 1. Implementors + // Element of BaseField such that DTH_ROOT^D == 1. Implementers // should set this to W^((p - 1)/D), where W is as above and p is // the order of the BaseField. const DTH_ROOT: Self::BaseField; diff --git a/plonky2/src/gates/selectors.rs b/plonky2/src/gates/selectors.rs index 1018ba75..be9a9da8 100644 --- a/plonky2/src/gates/selectors.rs +++ b/plonky2/src/gates/selectors.rs @@ -40,7 +40,7 @@ pub enum LookupSelectors { } /// Returns selector polynomials for each LUT. We have two constraint domains (remember that gates are stored upside down): -/// - [last_lut_row, first_lut_row] (Sum and RE transition contraints), +/// - [last_lut_row, first_lut_row] (Sum and RE transition constraints), /// - [last_lu_row, last_lut_row - 1] (LDC column transition constraints). /// We also add two more: /// - {first_lut_row + 1} where we check the initial values of sum and RE (which are 0), diff --git a/plonky2/src/hash/poseidon_goldilocks.rs b/plonky2/src/hash/poseidon_goldilocks.rs index 3e089e64..1fd5af13 100644 --- a/plonky2/src/hash/poseidon_goldilocks.rs +++ b/plonky2/src/hash/poseidon_goldilocks.rs @@ -323,7 +323,7 @@ mod poseidon12_mds { let (u8, u9, u10) = fft4_real([s2, s5, s8, s11]); // This where the multiplication in frequency domain is done. More precisely, and with - // the appropriate permuations in between, the sequence of + // the appropriate permutations in between, the sequence of // 3-point FFTs --> multiplication by twiddle factors --> Hadamard multiplication --> // 3 point iFFTs --> multiplication by (inverse) twiddle factors // is "squashed" into one step composed of the functions "block1", "block2" and "block3".