diff --git a/evm/Cargo.toml b/evm/Cargo.toml
index 09a47ee2..848dff15 100644
--- a/evm/Cargo.toml
+++ b/evm/Cargo.toml
@@ -16,6 +16,7 @@ hex-literal = "0.3.4"
 itertools = "0.10.3"
 keccak-hash = "0.9.0"
 log = "0.4.14"
+num = "0.4.0"
 maybe_rayon = { path = "../maybe_rayon" }
 once_cell = "1.13.0"
 pest = "2.1.3"
diff --git a/evm/src/arithmetic/add.rs b/evm/src/arithmetic/add.rs
index e87566b6..d2520fb9 100644
--- a/evm/src/arithmetic/add.rs
+++ b/evm/src/arithmetic/add.rs
@@ -165,6 +165,8 @@ mod tests {
     use crate::arithmetic::columns::NUM_ARITH_COLUMNS;
     use crate::constraint_consumer::ConstraintConsumer;
 
+    const N_RND_TESTS: usize = 1000;
+
     // TODO: Should be able to refactor this test to apply to all operations.
     #[test]
     fn generate_eval_consistency_not_add() {
@@ -177,14 +179,14 @@ mod tests {
         // if all values are garbage.
         lv[IS_ADD] = F::ZERO;
 
-        let mut constrant_consumer = ConstraintConsumer::new(
+        let mut constraint_consumer = ConstraintConsumer::new(
             vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
             GoldilocksField::ONE,
             GoldilocksField::ONE,
             GoldilocksField::ONE,
         );
-        eval_packed_generic(&lv, &mut constrant_consumer);
-        for &acc in &constrant_consumer.constraint_accs {
+        eval_packed_generic(&lv, &mut constraint_consumer);
+        for &acc in &constraint_consumer.constraint_accs {
             assert_eq!(acc, GoldilocksField::ZERO);
         }
     }
@@ -198,23 +200,26 @@ mod tests {
 
         // set `IS_ADD == 1` and ensure all constraints are satisfied.
         lv[IS_ADD] = F::ONE;
-        // set inputs to random values
-        for (&ai, bi) in ADD_INPUT_0.iter().zip(ADD_INPUT_1) {
-            lv[ai] = F::from_canonical_u16(rng.gen());
-            lv[bi] = F::from_canonical_u16(rng.gen());
-        }
 
-        generate(&mut lv);
+        for _ in 0..N_RND_TESTS {
+            // set inputs to random values
+            for (&ai, bi) in ADD_INPUT_0.iter().zip(ADD_INPUT_1) {
+                lv[ai] = F::from_canonical_u16(rng.gen());
+                lv[bi] = F::from_canonical_u16(rng.gen());
+            }
 
-        let mut constrant_consumer = ConstraintConsumer::new(
-            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
-            GoldilocksField::ONE,
-            GoldilocksField::ONE,
-            GoldilocksField::ONE,
-        );
-        eval_packed_generic(&lv, &mut constrant_consumer);
-        for &acc in &constrant_consumer.constraint_accs {
-            assert_eq!(acc, GoldilocksField::ZERO);
+            generate(&mut lv);
+
+            let mut constraint_consumer = ConstraintConsumer::new(
+                vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+                GoldilocksField::ONE,
+                GoldilocksField::ONE,
+                GoldilocksField::ONE,
+            );
+            eval_packed_generic(&lv, &mut constraint_consumer);
+            for &acc in &constraint_consumer.constraint_accs {
+                assert_eq!(acc, GoldilocksField::ZERO);
+            }
         }
     }
 }
diff --git a/evm/src/arithmetic/arithmetic_stark.rs b/evm/src/arithmetic/arithmetic_stark.rs
index 58b8afff..fc168cee 100644
--- a/evm/src/arithmetic/arithmetic_stark.rs
+++ b/evm/src/arithmetic/arithmetic_stark.rs
@@ -9,6 +9,7 @@ use plonky2::hash::hash_types::RichField;
 use crate::arithmetic::add;
 use crate::arithmetic::columns;
 use crate::arithmetic::compare;
+use crate::arithmetic::modular;
 use crate::arithmetic::mul;
 use crate::arithmetic::sub;
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
@@ -50,6 +51,12 @@ impl<F: RichField, const D: usize> ArithmeticStark<F, D> {
             compare::generate(local_values, columns::IS_LT);
         } else if local_values[columns::IS_GT].is_one() {
             compare::generate(local_values, columns::IS_GT);
+        } else if local_values[columns::IS_ADDMOD].is_one() {
+            modular::generate(local_values, columns::IS_ADDMOD);
+        } else if local_values[columns::IS_MULMOD].is_one() {
+            modular::generate(local_values, columns::IS_MULMOD);
+        } else if local_values[columns::IS_MOD].is_one() {
+            modular::generate(local_values, columns::IS_MOD);
         } else {
             todo!("the requested operation has not yet been implemented");
         }
@@ -72,6 +79,7 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for ArithmeticSta
         sub::eval_packed_generic(lv, yield_constr);
         mul::eval_packed_generic(lv, yield_constr);
         compare::eval_packed_generic(lv, yield_constr);
+        modular::eval_packed_generic(lv, yield_constr);
     }
 
     fn eval_ext_circuit(
@@ -85,6 +93,7 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for ArithmeticSta
         sub::eval_ext_circuit(builder, lv, yield_constr);
         mul::eval_ext_circuit(builder, lv, yield_constr);
         compare::eval_ext_circuit(builder, lv, yield_constr);
+        modular::eval_ext_circuit(builder, lv, yield_constr);
     }
 
     fn constraint_degree(&self) -> usize {
diff --git a/evm/src/arithmetic/columns.rs b/evm/src/arithmetic/columns.rs
index 7b44adc1..ca8ba549 100644
--- a/evm/src/arithmetic/columns.rs
+++ b/evm/src/arithmetic/columns.rs
@@ -44,7 +44,7 @@ pub(crate) const ALL_OPERATIONS: [usize; 16] = [
 /// used by any arithmetic circuit, depending on which one is active
 /// this cycle.  Can be increased as needed as other operations are
 /// implemented.
-const NUM_SHARED_COLS: usize = 64;
+const NUM_SHARED_COLS: usize = 144; // only need 64 for add, sub, and mul
 
 const fn shared_col(i: usize) -> usize {
     assert!(i < NUM_SHARED_COLS);
@@ -64,7 +64,10 @@ const fn gen_input_cols<const N: usize>(start: usize) -> [usize; N] {
 const GENERAL_INPUT_0: [usize; N_LIMBS] = gen_input_cols::<N_LIMBS>(0);
 const GENERAL_INPUT_1: [usize; N_LIMBS] = gen_input_cols::<N_LIMBS>(N_LIMBS);
 const GENERAL_INPUT_2: [usize; N_LIMBS] = gen_input_cols::<N_LIMBS>(2 * N_LIMBS);
-const AUX_INPUT_0: [usize; N_LIMBS] = gen_input_cols::<N_LIMBS>(3 * N_LIMBS);
+const GENERAL_INPUT_3: [usize; N_LIMBS] = gen_input_cols::<N_LIMBS>(3 * N_LIMBS);
+const AUX_INPUT_0: [usize; 2 * N_LIMBS] = gen_input_cols::<{ 2 * N_LIMBS }>(4 * N_LIMBS);
+const AUX_INPUT_1: [usize; 2 * N_LIMBS] = gen_input_cols::<{ 2 * N_LIMBS }>(6 * N_LIMBS);
+const AUX_INPUT_2: [usize; N_LIMBS] = gen_input_cols::<N_LIMBS>(8 * N_LIMBS);
 
 pub(crate) const ADD_INPUT_0: [usize; N_LIMBS] = GENERAL_INPUT_0;
 pub(crate) const ADD_INPUT_1: [usize; N_LIMBS] = GENERAL_INPUT_1;
@@ -77,11 +80,21 @@ pub(crate) const SUB_OUTPUT: [usize; N_LIMBS] = GENERAL_INPUT_2;
 pub(crate) const MUL_INPUT_0: [usize; N_LIMBS] = GENERAL_INPUT_0;
 pub(crate) const MUL_INPUT_1: [usize; N_LIMBS] = GENERAL_INPUT_1;
 pub(crate) const MUL_OUTPUT: [usize; N_LIMBS] = GENERAL_INPUT_2;
-pub(crate) const MUL_AUX_INPUT: [usize; N_LIMBS] = AUX_INPUT_0;
+pub(crate) const MUL_AUX_INPUT: [usize; N_LIMBS] = GENERAL_INPUT_3;
 
 pub(crate) const CMP_INPUT_0: [usize; N_LIMBS] = GENERAL_INPUT_0;
 pub(crate) const CMP_INPUT_1: [usize; N_LIMBS] = GENERAL_INPUT_1;
 pub(crate) const CMP_OUTPUT: usize = GENERAL_INPUT_2[0];
-pub(crate) const CMP_AUX_INPUT: [usize; N_LIMBS] = AUX_INPUT_0;
+pub(crate) const CMP_AUX_INPUT: [usize; N_LIMBS] = GENERAL_INPUT_3;
+
+pub(crate) const MODULAR_INPUT_0: [usize; N_LIMBS] = GENERAL_INPUT_0;
+pub(crate) const MODULAR_INPUT_1: [usize; N_LIMBS] = GENERAL_INPUT_1;
+pub(crate) const MODULAR_MODULUS: [usize; N_LIMBS] = GENERAL_INPUT_2;
+pub(crate) const MODULAR_OUTPUT: [usize; N_LIMBS] = GENERAL_INPUT_3;
+pub(crate) const MODULAR_QUO_INPUT: [usize; 2 * N_LIMBS] = AUX_INPUT_0;
+// NB: Last value is not used in AUX, it is used in IS_ZERO
+pub(crate) const MODULAR_AUX_INPUT: [usize; 2 * N_LIMBS] = AUX_INPUT_1;
+pub(crate) const MODULAR_MOD_IS_ZERO: usize = AUX_INPUT_1[2 * N_LIMBS - 1];
+pub(crate) const MODULAR_OUT_AUX_RED: [usize; N_LIMBS] = AUX_INPUT_2;
 
 pub const NUM_ARITH_COLUMNS: usize = START_SHARED_COLS + NUM_SHARED_COLS;
diff --git a/evm/src/arithmetic/mod.rs b/evm/src/arithmetic/mod.rs
index 69fbda09..a6f59446 100644
--- a/evm/src/arithmetic/mod.rs
+++ b/evm/src/arithmetic/mod.rs
@@ -1,5 +1,6 @@
 mod add;
 mod compare;
+mod modular;
 mod mul;
 mod sub;
 mod utils;
diff --git a/evm/src/arithmetic/modular.rs b/evm/src/arithmetic/modular.rs
new file mode 100644
index 00000000..1fd31bb1
--- /dev/null
+++ b/evm/src/arithmetic/modular.rs
@@ -0,0 +1,593 @@
+//! Support for the EVM modular instructions ADDMOD, MULMOD and MOD.
+//!
+//! This crate verifies an EVM modular instruction, which takes three
+//! 256-bit inputs A, B and M, and produces a 256-bit output C satisfying
+//!
+//!    C = operation(A, B) (mod M).
+//!
+//! where operation can be addition, multiplication, or just return
+//! the first argument (for MOD).  Inputs A, B and M, and output C,
+//! are given as arrays of 16-bit limbs. For example, if the limbs of
+//! A are a[0]...a[15], then
+//!
+//!    A = \sum_{i=0}^15 a[i] β^i,
+//!
+//! where β = 2^16 = 2^LIMB_BITS. To verify that A, B, M and C satisfy
+//! the equation we proceed as follows. Define
+//!
+//!    a(x) = \sum_{i=0}^15 a[i] x^i
+//!
+//! (so A = a(β)) and similarly for b(x), m(x) and c(x). Then
+//! operation(A,B) = C (mod M) if and only if the polynomial
+//!
+//!    operation(a(x), b(x)) - c(x) - m(x) * q(x)
+//!
+//! is zero when evaluated at x = β, i.e. it is divisible by (x - β).
+//! Thus exists a polynomial s such that
+//!
+//!    operation(a(x), b(x)) - c(x) - m(x) * q(x) - (x - β) * s(x) == 0
+//!
+//! if and only if operation(A,B) = C (mod M). In the code below, this
+//! "constraint polynomial" is constructed in the variable
+//! `constr_poly`. It must be identically zero for the modular
+//! operation to be verified, or, equivalently, each of its
+//! coefficients must be zero. The variable names of the constituent
+//! polynomials are (writing N for N_LIMBS=16):
+//!
+//!   a(x) = \sum_{i=0}^{N-1} input0[i] * β^i
+//!   b(x) = \sum_{i=0}^{N-1} input1[i] * β^i
+//!   c(x) = \sum_{i=0}^{N-1} output[i] * β^i
+//!   m(x) = \sum_{i=0}^{N-1} modulus[i] * β^i
+//!   q(x) = \sum_{i=0}^{2N-1} quot[i] * β^i
+//!   s(x) = \sum_i^{2N-2} aux[i] * β^i
+//!
+//! Because A, B, M and C are 256-bit numbers, the degrees of a, b, m
+//! and c are (at most) N-1 = 15. If m = 1, then Q would be A*B which
+//! can be up to 2^512 - ε, so deg(q) can be up to 2*N-1 = 31. Note
+//! that, although for arbitrary m and q we might have deg(m*q) = 3*N-2,
+//! because the magnitude of M*Q must match that of operation(A,B), we
+//! always have deg(m*q) <= 2*N-1. Finally, in order for all the degrees
+//! to match, we have deg(s) <= 2*N-2 = 30.
+//!
+//! -*-
+//!
+//! To verify that the output is reduced, that is, output < modulus,
+//! the prover supplies the value `out_aux_red` which must satisfy
+//!
+//!    output - modulus = out_aux_red + 2^256
+//!
+//! and these values are passed to the "less than" operation.
+//!
+//! -*-
+//!
+//! The EVM defines division by zero as zero. We handle this as
+//! follows:
+//!
+//! The prover supplies a binary value `mod_is_zero` which is one if
+//! the modulus is zero and zero otherwise. This is verified, then
+//! added to the modulus (this can't overflow, as modulus[0] was
+//! range-checked and mod_is_zero is 0 or 1). The rest of the
+//! calculation proceeds as if modulus was actually 1; this correctly
+//! verifies that the output is zero, as required by the standard.
+//! To summarise:
+//!
+//! - mod_is_zero is 0 or 1
+//! - if mod_is_zero is 1, then
+//!    - given modulus is 0
+//!    - updated modulus is 1, which forces the correct output of 0
+//! - if mod_is_zero is 0, then
+//!    - given modulus can be 0 or non-zero
+//!    - updated modulus is same as given
+//!    - if modulus is non-zero, correct output is obtained
+//!    - if modulus is 0, then the test output < modulus, checking that
+//!      the output is reduced, will fail, because output is non-negative.
+
+use num::{BigUint, Zero};
+use plonky2::field::extension::Extendable;
+use plonky2::field::packed::PackedField;
+use plonky2::field::types::Field;
+use plonky2::hash::hash_types::RichField;
+use plonky2::iop::ext_target::ExtensionTarget;
+use plonky2::plonk::circuit_builder::CircuitBuilder;
+
+use super::columns;
+use crate::arithmetic::columns::*;
+use crate::arithmetic::compare::{eval_ext_circuit_lt, eval_packed_generic_lt};
+use crate::arithmetic::utils::*;
+use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
+use crate::range_check_error;
+
+/// Convert the base-2^16 representation of a number into a BigUint.
+///
+/// Given `N` unsigned 16-bit values in `limbs`, return the BigUint
+///
+///   \sum_{i=0}^{N-1} limbs[i] * β^i.
+///
+fn columns_to_biguint<const N: usize>(limbs: &[i64; N]) -> BigUint {
+    const BASE: i64 = 1i64 << LIMB_BITS;
+
+    // Although the input type is i64, the values must always be in
+    // [0, 2^16 + ε) because of the caller's range check on the inputs
+    // (the ε allows us to convert calculated output, which can be
+    // bigger than 2^16).
+    debug_assert!(limbs.iter().all(|&x| x >= 0));
+
+    let mut limbs_u32 = Vec::with_capacity(N / 2 + 1);
+    let mut cy = 0i64; // cy is necessary to handle ε > 0
+    for i in 0..(N / 2) {
+        let t = cy + limbs[2 * i] + BASE * limbs[2 * i + 1];
+        limbs_u32.push(t as u32);
+        cy = t >> 32;
+    }
+    if N & 1 != 0 {
+        // If N is odd we need to add the last limb on its own
+        let t = cy + limbs[N - 1];
+        limbs_u32.push(t as u32);
+        cy = t >> 32;
+    }
+    limbs_u32.push(cy as u32);
+
+    BigUint::from_slice(&limbs_u32)
+}
+
+/// Convert a BigUint into a base-2^16 representation.
+///
+/// Given a BigUint `num`, return an array of `N` unsigned 16-bit
+/// values, say `limbs`, such that
+///
+///   num = \sum_{i=0}^{N-1} limbs[i] * β^i.
+///
+/// Note that `N` must be at least ceil(log2(num)/16) in order to be
+/// big enough to hold `num`.
+fn biguint_to_columns<const N: usize>(num: &BigUint) -> [i64; N] {
+    assert!(num.bits() <= 16 * N as u64);
+    let mut output = [0i64; N];
+    for (i, limb) in num.iter_u32_digits().enumerate() {
+        output[2 * i] = limb as u16 as i64;
+        output[2 * i + 1] = (limb >> LIMB_BITS) as i64;
+    }
+    output
+}
+
+/// Generate the output and auxiliary values for given `operation`.
+///
+/// NB: `operation` can set the higher order elements in its result to
+/// zero if they are not used.
+fn generate_modular_op<F: RichField>(
+    lv: &mut [F; NUM_ARITH_COLUMNS],
+    operation: fn([i64; N_LIMBS], [i64; N_LIMBS]) -> [i64; 2 * N_LIMBS - 1],
+) {
+    // Inputs are all range-checked in [0, 2^16), so the "as i64"
+    // conversion is safe.
+    let input0_limbs = MODULAR_INPUT_0.map(|c| F::to_canonical_u64(&lv[c]) as i64);
+    let input1_limbs = MODULAR_INPUT_1.map(|c| F::to_canonical_u64(&lv[c]) as i64);
+    let mut modulus_limbs = MODULAR_MODULUS.map(|c| F::to_canonical_u64(&lv[c]) as i64);
+
+    // The use of BigUints is just to avoid having to implement
+    // modular reduction.
+    let mut modulus = columns_to_biguint(&modulus_limbs);
+
+    // constr_poly is initialised to the calculated input, and is
+    // used as such for the BigUint reduction; later, other values are
+    // added/subtracted, which is where its meaning as the "constraint
+    // polynomial" comes in.
+    let mut constr_poly = [0i64; 2 * N_LIMBS];
+    constr_poly[..2 * N_LIMBS - 1].copy_from_slice(&operation(input0_limbs, input1_limbs));
+
+    if modulus.is_zero() {
+        modulus += 1u32;
+        modulus_limbs[0] += 1i64;
+        lv[MODULAR_MOD_IS_ZERO] = F::ONE;
+    } else {
+        lv[MODULAR_MOD_IS_ZERO] = F::ZERO;
+    }
+
+    let input = columns_to_biguint(&constr_poly);
+
+    // modulus != 0 here, because, if the given modulus was zero, then
+    // we added 1 to it above.
+    let output = &input % &modulus;
+    let output_limbs = biguint_to_columns::<N_LIMBS>(&output);
+    let quot = (&input - &output) / &modulus; // exact division
+    let quot_limbs = biguint_to_columns::<{ 2 * N_LIMBS }>(&quot);
+
+    // two_exp_256 == 2^256
+    let mut two_exp_256 = BigUint::zero();
+    two_exp_256.set_bit(256, true);
+    // output < modulus here, so the proof requires (output - modulus) % 2^256:
+    let out_aux_red = biguint_to_columns::<N_LIMBS>(&(two_exp_256 + output - modulus));
+
+    // constr_poly is the array of coefficients of the polynomial
+    //
+    //   operation(a(x), b(x)) - c(x) - s(x)*m(x).
+    //
+    pol_sub_assign(&mut constr_poly, &output_limbs);
+    let prod = pol_mul_wide2(quot_limbs, modulus_limbs);
+    pol_sub_assign(&mut constr_poly, &prod[0..2 * N_LIMBS]);
+
+    // Higher order terms of the product must be zero for valid quot and modulus:
+    debug_assert!(&prod[2 * N_LIMBS..].iter().all(|&x| x == 0i64));
+
+    // constr_poly must be zero when evaluated at x = β :=
+    // 2^LIMB_BITS, hence it's divisible by (x - β). `aux_limbs` is
+    // the result of removing that root.
+    let aux_limbs = pol_remove_root_2exp::<LIMB_BITS, _>(constr_poly);
+
+    for deg in 0..N_LIMBS {
+        lv[MODULAR_OUTPUT[deg]] = F::from_canonical_i64(output_limbs[deg]);
+        lv[MODULAR_OUT_AUX_RED[deg]] = F::from_canonical_i64(out_aux_red[deg]);
+        lv[MODULAR_QUO_INPUT[deg]] = F::from_canonical_i64(quot_limbs[deg]);
+        lv[MODULAR_QUO_INPUT[deg + N_LIMBS]] = F::from_canonical_i64(quot_limbs[deg + N_LIMBS]);
+        lv[MODULAR_AUX_INPUT[deg]] = F::from_noncanonical_i64(aux_limbs[deg]);
+        // Don't overwrite MODULAR_MOD_IS_ZERO, which is at the last
+        // index of MODULAR_AUX_INPUT
+        if deg < N_LIMBS - 1 {
+            lv[MODULAR_AUX_INPUT[deg + N_LIMBS]] =
+                F::from_noncanonical_i64(aux_limbs[deg + N_LIMBS]);
+        }
+    }
+}
+
+/// Generate the output and auxiliary values for modular operations.
+///
+/// `filter` must be one of `columns::IS_{ADDMOD,MULMOD,MOD}`.
+pub(crate) fn generate<F: RichField>(lv: &mut [F; NUM_ARITH_COLUMNS], filter: usize) {
+    match filter {
+        columns::IS_ADDMOD => generate_modular_op(lv, pol_add),
+        columns::IS_MULMOD => generate_modular_op(lv, pol_mul_wide),
+        columns::IS_MOD => generate_modular_op(lv, |a, _| pol_extend(a)),
+        _ => panic!("generate modular operation called with unknown opcode"),
+    }
+}
+
+/// Build the part of the constraint polynomial that's common to all
+/// modular operations, and perform the common verifications.
+///
+/// Specifically, with the notation above, build the polynomial
+///
+///   c(x) + q(x) * m(x) + (x - β) * s(x)
+///
+/// and check consistency when m = 0, and that c is reduced.
+#[allow(clippy::needless_range_loop)]
+fn modular_constr_poly<P: PackedField>(
+    lv: &[P; NUM_ARITH_COLUMNS],
+    yield_constr: &mut ConstraintConsumer<P>,
+    filter: P,
+) -> [P; 2 * N_LIMBS] {
+    range_check_error!(MODULAR_INPUT_0, 16);
+    range_check_error!(MODULAR_INPUT_1, 16);
+    range_check_error!(MODULAR_MODULUS, 16);
+    range_check_error!(MODULAR_QUO_INPUT, 16);
+    range_check_error!(MODULAR_AUX_INPUT, 20, signed);
+    range_check_error!(MODULAR_OUTPUT, 16);
+
+    let mut modulus = MODULAR_MODULUS.map(|c| lv[c]);
+    let mod_is_zero = lv[MODULAR_MOD_IS_ZERO];
+
+    // Check that mod_is_zero is zero or one
+    yield_constr.constraint(filter * (mod_is_zero * mod_is_zero - mod_is_zero));
+
+    // Check that mod_is_zero is zero if modulus is not zero (they
+    // could both be zero)
+    let limb_sum = modulus.into_iter().sum::<P>();
+    yield_constr.constraint(filter * limb_sum * mod_is_zero);
+
+    // See the file documentation for why this suffices to handle
+    // modulus = 0.
+    modulus[0] += mod_is_zero;
+
+    let output = MODULAR_OUTPUT.map(|c| lv[c]);
+
+    // Verify that the output is reduced, i.e. output < modulus.
+    let out_aux_red = MODULAR_OUT_AUX_RED.map(|c| lv[c]);
+    let is_less_than = P::ONES;
+    eval_packed_generic_lt(
+        yield_constr,
+        filter,
+        output,
+        modulus,
+        out_aux_red,
+        is_less_than,
+    );
+
+    // prod = q(x) * m(x)
+    let quot = MODULAR_QUO_INPUT.map(|c| lv[c]);
+    let prod = pol_mul_wide2(quot, modulus);
+    // higher order terms must be zero
+    for &x in prod[2 * N_LIMBS..].iter() {
+        yield_constr.constraint(filter * x);
+    }
+
+    // constr_poly = c(x) + q(x) * m(x)
+    let mut constr_poly: [_; 2 * N_LIMBS] = prod[0..2 * N_LIMBS].try_into().unwrap();
+    pol_add_assign(&mut constr_poly, &output);
+
+    // constr_poly = c(x) + q(x) * m(x) + (x - β) * s(x)
+    let aux = MODULAR_AUX_INPUT.map(|c| lv[c]);
+    let base = P::Scalar::from_canonical_u64(1 << LIMB_BITS);
+    pol_add_assign(&mut constr_poly, &pol_adjoin_root(aux, base));
+
+    constr_poly
+}
+
+/// Add constraints for modular operations.
+pub(crate) fn eval_packed_generic<P: PackedField>(
+    lv: &[P; NUM_ARITH_COLUMNS],
+    yield_constr: &mut ConstraintConsumer<P>,
+) {
+    // NB: The CTL code guarantees that filter is 0 or 1, i.e. that
+    // only one of the operations below is "live".
+    let filter = lv[columns::IS_ADDMOD] + lv[columns::IS_MULMOD] + lv[columns::IS_MOD];
+
+    // constr_poly has 2*N_LIMBS limbs
+    let constr_poly = modular_constr_poly(lv, yield_constr, filter);
+
+    let input0 = MODULAR_INPUT_0.map(|c| lv[c]);
+    let input1 = MODULAR_INPUT_1.map(|c| lv[c]);
+
+    let add_input = pol_add(input0, input1);
+    let mul_input = pol_mul_wide(input0, input1);
+    let mod_input = pol_extend(input0);
+
+    for (input, &filter) in [
+        (&add_input, &lv[columns::IS_ADDMOD]),
+        (&mul_input, &lv[columns::IS_MULMOD]),
+        (&mod_input, &lv[columns::IS_MOD]),
+    ] {
+        // Need constr_poly_copy to be the first argument to
+        // pol_sub_assign, since it is the longer of the two
+        // arguments.
+        let mut constr_poly_copy = constr_poly;
+        pol_sub_assign(&mut constr_poly_copy, input);
+
+        // At this point constr_poly_copy holds the coefficients of
+        // the polynomial
+        //
+        //   operation(a(x), b(x)) - c(x) - q(x) * m(x) - (x - β) * s(x)
+        //
+        // where operation is add, mul or |a,b|->a.  The modular
+        // operation is valid if and only if all of those coefficients
+        // are zero.
+        for &c in constr_poly_copy.iter() {
+            yield_constr.constraint(filter * c);
+        }
+    }
+}
+
+fn modular_constr_poly_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
+    builder: &mut CircuitBuilder<F, D>,
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+    filter: ExtensionTarget<D>,
+) -> [ExtensionTarget<D>; 2 * N_LIMBS] {
+    let mut modulus = MODULAR_MODULUS.map(|c| lv[c]);
+    let mod_is_zero = lv[MODULAR_MOD_IS_ZERO];
+
+    let t = builder.mul_sub_extension(mod_is_zero, mod_is_zero, mod_is_zero);
+    let t = builder.mul_extension(filter, t);
+    yield_constr.constraint(builder, t);
+
+    let limb_sum = builder.add_many_extension(modulus);
+    let t = builder.mul_extension(limb_sum, mod_is_zero);
+    let t = builder.mul_extension(filter, t);
+    yield_constr.constraint(builder, t);
+
+    modulus[0] = builder.add_extension(modulus[0], mod_is_zero);
+
+    let output = MODULAR_OUTPUT.map(|c| lv[c]);
+    let out_aux_red = MODULAR_OUT_AUX_RED.map(|c| lv[c]);
+    let is_less_than = builder.one_extension();
+    eval_ext_circuit_lt(
+        builder,
+        yield_constr,
+        filter,
+        output,
+        modulus,
+        out_aux_red,
+        is_less_than,
+    );
+
+    let quot = MODULAR_QUO_INPUT.map(|c| lv[c]);
+    let prod = pol_mul_wide2_ext_circuit(builder, quot, modulus);
+    for &x in prod[2 * N_LIMBS..].iter() {
+        let t = builder.mul_extension(filter, x);
+        yield_constr.constraint(builder, t);
+    }
+
+    let mut constr_poly: [_; 2 * N_LIMBS] = prod[0..2 * N_LIMBS].try_into().unwrap();
+    pol_add_assign_ext_circuit(builder, &mut constr_poly, &output);
+
+    let aux = MODULAR_AUX_INPUT.map(|c| lv[c]);
+    let base = builder.constant_extension(F::Extension::from_canonical_u64(1u64 << LIMB_BITS));
+    let t = pol_adjoin_root_ext_circuit(builder, aux, base);
+    pol_add_assign_ext_circuit(builder, &mut constr_poly, &t);
+
+    constr_poly
+}
+
+pub(crate) fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut CircuitBuilder<F, D>,
+    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+) {
+    let filter = builder.add_many_extension([
+        lv[columns::IS_ADDMOD],
+        lv[columns::IS_MULMOD],
+        lv[columns::IS_MOD],
+    ]);
+
+    let constr_poly = modular_constr_poly_ext_circuit(lv, builder, yield_constr, filter);
+
+    let input0 = MODULAR_INPUT_0.map(|c| lv[c]);
+    let input1 = MODULAR_INPUT_1.map(|c| lv[c]);
+
+    let add_input = pol_add_ext_circuit(builder, input0, input1);
+    let mul_input = pol_mul_wide_ext_circuit(builder, input0, input1);
+    let mod_input = pol_extend_ext_circuit(builder, input0);
+
+    for (input, &filter) in [
+        (&add_input, &lv[columns::IS_ADDMOD]),
+        (&mul_input, &lv[columns::IS_MULMOD]),
+        (&mod_input, &lv[columns::IS_MOD]),
+    ] {
+        let mut constr_poly_copy = constr_poly;
+        pol_sub_assign_ext_circuit(builder, &mut constr_poly_copy, input);
+        for &c in constr_poly_copy.iter() {
+            let t = builder.mul_extension(filter, c);
+            yield_constr.constraint(builder, t);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use itertools::izip;
+    use plonky2::field::goldilocks_field::GoldilocksField;
+    use plonky2::field::types::Field;
+    use rand::{Rng, SeedableRng};
+    use rand_chacha::ChaCha8Rng;
+
+    use super::*;
+    use crate::arithmetic::columns::NUM_ARITH_COLUMNS;
+    use crate::constraint_consumer::ConstraintConsumer;
+
+    const N_RND_TESTS: usize = 1000;
+
+    // TODO: Should be able to refactor this test to apply to all operations.
+    #[test]
+    fn generate_eval_consistency_not_modular() {
+        type F = GoldilocksField;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
+        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::rand_from_rng(&mut rng));
+
+        // if `IS_ADDMOD == 0`, then the constraints should be met even
+        // if all values are garbage.
+        lv[IS_ADDMOD] = F::ZERO;
+        lv[IS_MULMOD] = F::ZERO;
+        lv[IS_MOD] = F::ZERO;
+
+        let mut constraint_consumer = ConstraintConsumer::new(
+            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+        );
+        eval_packed_generic(&lv, &mut constraint_consumer);
+        for &acc in &constraint_consumer.constraint_accs {
+            assert_eq!(acc, GoldilocksField::ZERO);
+        }
+    }
+
+    #[test]
+    fn generate_eval_consistency() {
+        type F = GoldilocksField;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
+        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::rand_from_rng(&mut rng));
+
+        for op_filter in [IS_ADDMOD, IS_MOD, IS_MULMOD] {
+            // Reset operation columns, then select one
+            lv[IS_ADDMOD] = F::ZERO;
+            lv[IS_MULMOD] = F::ZERO;
+            lv[IS_MOD] = F::ZERO;
+            lv[op_filter] = F::ONE;
+
+            for i in 0..N_RND_TESTS {
+                // set inputs to random values
+                for (&ai, &bi, &mi) in izip!(
+                    MODULAR_INPUT_0.iter(),
+                    MODULAR_INPUT_1.iter(),
+                    MODULAR_MODULUS.iter()
+                ) {
+                    lv[ai] = F::from_canonical_u16(rng.gen());
+                    lv[bi] = F::from_canonical_u16(rng.gen());
+                    lv[mi] = F::from_canonical_u16(rng.gen());
+                }
+
+                // For the second half of the tests, set the top 16 -
+                // start digits of the modulus to zero so it is much
+                // smaller than the inputs.
+                if i > N_RND_TESTS / 2 {
+                    // 1 <= start < N_LIMBS
+                    let start = (rng.gen::<usize>() % (N_LIMBS - 1)) + 1;
+                    for &mi in &MODULAR_MODULUS[start..N_LIMBS] {
+                        lv[mi] = F::ZERO;
+                    }
+                }
+
+                generate(&mut lv, op_filter);
+
+                let mut constraint_consumer = ConstraintConsumer::new(
+                    vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+                    GoldilocksField::ONE,
+                    GoldilocksField::ONE,
+                    GoldilocksField::ONE,
+                );
+                eval_packed_generic(&lv, &mut constraint_consumer);
+                for &acc in &constraint_consumer.constraint_accs {
+                    assert_eq!(acc, GoldilocksField::ZERO);
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn zero_modulus() {
+        type F = GoldilocksField;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
+        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::rand_from_rng(&mut rng));
+
+        for op_filter in [IS_ADDMOD, IS_MOD, IS_MULMOD] {
+            // Reset operation columns, then select one
+            lv[IS_ADDMOD] = F::ZERO;
+            lv[IS_MULMOD] = F::ZERO;
+            lv[IS_MOD] = F::ZERO;
+            lv[op_filter] = F::ONE;
+
+            for _i in 0..N_RND_TESTS {
+                // set inputs to random values and the modulus to zero;
+                // the output is defined to be zero when modulus is zero.
+                for (&ai, &bi, &mi) in izip!(
+                    MODULAR_INPUT_0.iter(),
+                    MODULAR_INPUT_1.iter(),
+                    MODULAR_MODULUS.iter()
+                ) {
+                    lv[ai] = F::from_canonical_u16(rng.gen());
+                    lv[bi] = F::from_canonical_u16(rng.gen());
+                    lv[mi] = F::ZERO;
+                }
+
+                generate(&mut lv, op_filter);
+
+                // check that the correct output was generated
+                assert!(MODULAR_OUTPUT.iter().all(|&oi| lv[oi] == F::ZERO));
+
+                let mut constraint_consumer = ConstraintConsumer::new(
+                    vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+                    GoldilocksField::ONE,
+                    GoldilocksField::ONE,
+                    GoldilocksField::ONE,
+                );
+                eval_packed_generic(&lv, &mut constraint_consumer);
+                assert!(constraint_consumer
+                    .constraint_accs
+                    .iter()
+                    .all(|&acc| acc == F::ZERO));
+
+                // Corrupt one output limb by setting it to a non-zero value
+                let random_oi = MODULAR_OUTPUT[rng.gen::<usize>() % N_LIMBS];
+                lv[random_oi] = F::from_canonical_u16(rng.gen_range(1..u16::MAX));
+
+                eval_packed_generic(&lv, &mut constraint_consumer);
+
+                // Check that at least one of the constraints was non-zero
+                assert!(constraint_consumer
+                    .constraint_accs
+                    .iter()
+                    .any(|&acc| acc != F::ZERO));
+            }
+        }
+    }
+}
diff --git a/evm/src/arithmetic/mul.rs b/evm/src/arithmetic/mul.rs
index 72270517..9d6638f1 100644
--- a/evm/src/arithmetic/mul.rs
+++ b/evm/src/arithmetic/mul.rs
@@ -15,7 +15,7 @@
 //! and similarly for b(x) and c(x). Then A*B = C (mod 2^256) if and only
 //! if there exist polynomials q and m such that
 //!
-//!    a(x)*b(x) - c(x) - m(x)*x^16 - (x - β)*q(x) == 0.
+//!    a(x)*b(x) - c(x) - m(x)*x^16 - (β - x)*q(x) == 0.
 //!
 //! Because A, B and C are 256-bit numbers, the degrees of a, b and c
 //! are (at most) 15. Thus deg(a*b) <= 30, so deg(m) <= 14 and deg(q)
@@ -24,7 +24,7 @@
 //! them evaluating at β gives a factor of β^16 = 2^256 which is 0.
 //!
 //! Hence, to verify the equality, we don't need m(x) at all, and we
-//! only need to know q(x) up to degree 14 (so that (x-β)*q(x) has
+//! only need to know q(x) up to degree 14 (so that (β - x)*q(x) has
 //! degree 15). On the other hand, the coefficients of q(x) can be as
 //! large as 16*(β-2) or 20 bits.
 
@@ -35,6 +35,7 @@ use plonky2::hash::hash_types::RichField;
 use plonky2::iop::ext_target::ExtensionTarget;
 
 use crate::arithmetic::columns::*;
+use crate::arithmetic::utils::{pol_mul_lo, pol_sub_assign};
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
 use crate::range_check_error;
 
@@ -48,26 +49,17 @@ pub fn generate<F: RichField>(lv: &mut [F; NUM_ARITH_COLUMNS]) {
     let mut aux_in_limbs = [0u64; N_LIMBS];
     let mut output_limbs = [0u64; N_LIMBS];
 
-    let mut unreduced_prod = [0u64; N_LIMBS];
-
     // Column-wise pen-and-paper long multiplication on 16-bit limbs.
-    // We have heaps of space at the top of each limb, so by
-    // calculating column-wise (instead of the usual row-wise) we
-    // avoid a bunch of carry propagation handling (at the expense of
-    // slightly worse cache coherency), and it makes it easy to
-    // calculate the coefficients of a(x)*b(x) (in unreduced_prod).
+    // First calculate the coefficients of a(x)*b(x) (in unreduced_prod),
+    // then do carry propagation to obtain C = c(β) = a(β)*b(β).
     let mut cy = 0u64;
+    let mut unreduced_prod = pol_mul_lo(input0_limbs, input1_limbs);
     for col in 0..N_LIMBS {
-        for i in 0..=col {
-            // Invariant: i + j = col
-            let j = col - i;
-            let ai_x_bj = input0_limbs[i] * input1_limbs[j];
-            unreduced_prod[col] += ai_x_bj;
-        }
         let t = unreduced_prod[col] + cy;
         cy = t >> LIMB_BITS;
         output_limbs[col] = t & MASK;
     }
+
     // In principle, the last cy could be dropped because this is
     // multiplication modulo 2^256. However, we need it below for
     // aux_in_limbs to handle the fact that unreduced_prod will
@@ -76,23 +68,22 @@ pub fn generate<F: RichField>(lv: &mut [F; NUM_ARITH_COLUMNS]) {
     for (&c, output_limb) in MUL_OUTPUT.iter().zip(output_limbs) {
         lv[c] = F::from_canonical_u64(output_limb);
     }
-    for deg in 0..N_LIMBS {
-        // deg'th element <- a*b - c
-        unreduced_prod[deg] -= output_limbs[deg];
-    }
+    pol_sub_assign(&mut unreduced_prod, &output_limbs);
 
     // unreduced_prod is the coefficients of the polynomial a(x)*b(x) - c(x).
-    // This must be zero when evaluated at x = B = 2^LIMB_BITS, hence it's
-    // divisible by (B - x). If we write unreduced_prod as
+    // This must be zero when evaluated at x = β = 2^LIMB_BITS, hence it's
+    // divisible by (β - x). If we write unreduced_prod as
     //
-    //   a(x)*b(x) - c(x) = \sum_{i=0}^n p_i x^i
-    //                    = (B - x) \sum_{i=0}^{n-1} q_i x^i
+    //   a(x)*b(x) - c(x) = \sum_{i=0}^n p_i x^i  + terms of degree > n
+    //                    = (β - x) \sum_{i=0}^{n-1} q_i x^i  + terms of degree > n
     //
     // then by comparing coefficients it is easy to see that
     //
-    //   q_0 = p_0 / B  and  q_i = (p_i + q_{i-1}) / B
+    //   q_0 = p_0 / β  and  q_i = (p_i + q_{i-1}) / β
     //
-    // for 0 < i < n-1 (and the divisions are exact).
+    // for 0 < i < n-1 (and the divisions are exact). Because we're
+    // only calculating the result modulo 2^256, we can ignore the
+    // terms of degree > n = 15.
     aux_in_limbs[0] = unreduced_prod[0] >> LIMB_BITS;
     for deg in 1..N_LIMBS - 1 {
         aux_in_limbs[deg] = (unreduced_prod[deg] + aux_in_limbs[deg - 1]) >> LIMB_BITS;
@@ -122,14 +113,10 @@ pub fn eval_packed_generic<P: PackedField>(
 
     // Constraint poly holds the coefficients of the polynomial that
     // must be identically zero for this multiplication to be
-    // verified. It is initialised to the /negative/ of the claimed
-    // output.
-    let mut constr_poly = [P::ZEROS; N_LIMBS];
-
-    assert_eq!(constr_poly.len(), N_LIMBS);
-
-    // After this loop constr_poly holds the coefficients of the
-    // polynomial A(x)B(x) - C(x), where A, B and C are the polynomials
+    // verified.
+    //
+    // These two lines set constr_poly to the polynomial A(x)B(x) - C(x),
+    // where A, B and C are the polynomials
     //
     //   A(x) = \sum_i input0_limbs[i] * 2^LIMB_BITS
     //   B(x) = \sum_i input1_limbs[i] * 2^LIMB_BITS
@@ -139,14 +126,8 @@ pub fn eval_packed_generic<P: PackedField>(
     //
     //   Q(x) = \sum_i aux_limbs[i] * 2^LIMB_BITS
     //
-    for col in 0..N_LIMBS {
-        // Invariant: i + j = col
-        for i in 0..=col {
-            let j = col - i;
-            constr_poly[col] += input0_limbs[i] * input1_limbs[j];
-        }
-        constr_poly[col] -= output_limbs[col];
-    }
+    let mut constr_poly = pol_mul_lo(input0_limbs, input1_limbs);
+    pol_sub_assign(&mut constr_poly, &output_limbs);
 
     // This subtracts (2^LIMB_BITS - x) * Q(x) from constr_poly.
     let base = P::Scalar::from_canonical_u64(1 << LIMB_BITS);
@@ -156,7 +137,7 @@ pub fn eval_packed_generic<P: PackedField>(
     }
 
     // At this point constr_poly holds the coefficients of the
-    // polynomial A(x)B(x) - C(x) - (x - 2^LIMB_BITS)*Q(x). The
+    // polynomial A(x)B(x) - C(x) - (2^LIMB_BITS - x)*Q(x). The
     // multiplication is valid if and only if all of those
     // coefficients are zero.
     for &c in &constr_poly {
@@ -189,12 +170,20 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     }
 
     let base = F::from_canonical_u64(1 << LIMB_BITS);
-    let t = builder.mul_const_extension(base, aux_in_limbs[0]);
-    constr_poly[0] = builder.sub_extension(constr_poly[0], t);
+    let one = builder.one_extension();
+    // constr_poly[0] = constr_poly[0] - base * aux_in_limbs[0]
+    constr_poly[0] =
+        builder.arithmetic_extension(F::ONE, -base, constr_poly[0], one, aux_in_limbs[0]);
     for deg in 1..N_LIMBS {
-        let t0 = builder.mul_const_extension(base, aux_in_limbs[deg]);
-        let t1 = builder.sub_extension(t0, aux_in_limbs[deg - 1]);
-        constr_poly[deg] = builder.sub_extension(constr_poly[deg], t1);
+        // constr_poly[deg] -= (base*aux_in_limbs[deg] - aux_in_limbs[deg-1])
+        let t = builder.arithmetic_extension(
+            base,
+            F::NEG_ONE,
+            aux_in_limbs[deg],
+            one,
+            aux_in_limbs[deg - 1],
+        );
+        constr_poly[deg] = builder.sub_extension(constr_poly[deg], t);
     }
 
     for &c in &constr_poly {
@@ -214,6 +203,8 @@ mod tests {
     use crate::arithmetic::columns::NUM_ARITH_COLUMNS;
     use crate::constraint_consumer::ConstraintConsumer;
 
+    const N_RND_TESTS: usize = 1000;
+
     // TODO: Should be able to refactor this test to apply to all operations.
     #[test]
     fn generate_eval_consistency_not_mul() {
@@ -226,14 +217,14 @@ mod tests {
         // if all values are garbage.
         lv[IS_MUL] = F::ZERO;
 
-        let mut constrant_consumer = ConstraintConsumer::new(
+        let mut constraint_consumer = ConstraintConsumer::new(
             vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
             GoldilocksField::ONE,
             GoldilocksField::ONE,
             GoldilocksField::ONE,
         );
-        eval_packed_generic(&lv, &mut constrant_consumer);
-        for &acc in &constrant_consumer.constraint_accs {
+        eval_packed_generic(&lv, &mut constraint_consumer);
+        for &acc in &constraint_consumer.constraint_accs {
             assert_eq!(acc, GoldilocksField::ZERO);
         }
     }
@@ -247,23 +238,26 @@ mod tests {
 
         // set `IS_MUL == 1` and ensure all constraints are satisfied.
         lv[IS_MUL] = F::ONE;
-        // set inputs to random values
-        for (&ai, bi) in MUL_INPUT_0.iter().zip(MUL_INPUT_1) {
-            lv[ai] = F::from_canonical_u16(rng.gen());
-            lv[bi] = F::from_canonical_u16(rng.gen());
-        }
 
-        generate(&mut lv);
+        for _i in 0..N_RND_TESTS {
+            // set inputs to random values
+            for (&ai, bi) in MUL_INPUT_0.iter().zip(MUL_INPUT_1) {
+                lv[ai] = F::from_canonical_u16(rng.gen());
+                lv[bi] = F::from_canonical_u16(rng.gen());
+            }
 
-        let mut constrant_consumer = ConstraintConsumer::new(
-            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
-            GoldilocksField::ONE,
-            GoldilocksField::ONE,
-            GoldilocksField::ONE,
-        );
-        eval_packed_generic(&lv, &mut constrant_consumer);
-        for &acc in &constrant_consumer.constraint_accs {
-            assert_eq!(acc, GoldilocksField::ZERO);
+            generate(&mut lv);
+
+            let mut constraint_consumer = ConstraintConsumer::new(
+                vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+                GoldilocksField::ONE,
+                GoldilocksField::ONE,
+                GoldilocksField::ONE,
+            );
+            eval_packed_generic(&lv, &mut constraint_consumer);
+            for &acc in &constraint_consumer.constraint_accs {
+                assert_eq!(acc, GoldilocksField::ZERO);
+            }
         }
     }
 }
diff --git a/evm/src/arithmetic/sub.rs b/evm/src/arithmetic/sub.rs
index c632eb94..25834406 100644
--- a/evm/src/arithmetic/sub.rs
+++ b/evm/src/arithmetic/sub.rs
@@ -96,6 +96,8 @@ mod tests {
     use crate::arithmetic::columns::NUM_ARITH_COLUMNS;
     use crate::constraint_consumer::ConstraintConsumer;
 
+    const N_RND_TESTS: usize = 1000;
+
     // TODO: Should be able to refactor this test to apply to all operations.
     #[test]
     fn generate_eval_consistency_not_sub() {
@@ -108,14 +110,14 @@ mod tests {
         // if all values are garbage.
         lv[IS_SUB] = F::ZERO;
 
-        let mut constrant_consumer = ConstraintConsumer::new(
+        let mut constraint_consumer = ConstraintConsumer::new(
             vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
             GoldilocksField::ONE,
             GoldilocksField::ONE,
             GoldilocksField::ONE,
         );
-        eval_packed_generic(&lv, &mut constrant_consumer);
-        for &acc in &constrant_consumer.constraint_accs {
+        eval_packed_generic(&lv, &mut constraint_consumer);
+        for &acc in &constraint_consumer.constraint_accs {
             assert_eq!(acc, GoldilocksField::ZERO);
         }
     }
@@ -129,23 +131,26 @@ mod tests {
 
         // set `IS_SUB == 1` and ensure all constraints are satisfied.
         lv[IS_SUB] = F::ONE;
-        // set inputs to random values
-        for (&ai, bi) in SUB_INPUT_0.iter().zip(SUB_INPUT_1) {
-            lv[ai] = F::from_canonical_u16(rng.gen());
-            lv[bi] = F::from_canonical_u16(rng.gen());
-        }
 
-        generate(&mut lv);
+        for _ in 0..N_RND_TESTS {
+            // set inputs to random values
+            for (&ai, bi) in SUB_INPUT_0.iter().zip(SUB_INPUT_1) {
+                lv[ai] = F::from_canonical_u16(rng.gen());
+                lv[bi] = F::from_canonical_u16(rng.gen());
+            }
 
-        let mut constrant_consumer = ConstraintConsumer::new(
-            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
-            GoldilocksField::ONE,
-            GoldilocksField::ONE,
-            GoldilocksField::ONE,
-        );
-        eval_packed_generic(&lv, &mut constrant_consumer);
-        for &acc in &constrant_consumer.constraint_accs {
-            assert_eq!(acc, GoldilocksField::ZERO);
+            generate(&mut lv);
+
+            let mut constraint_consumer = ConstraintConsumer::new(
+                vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+                GoldilocksField::ONE,
+                GoldilocksField::ONE,
+                GoldilocksField::ONE,
+            );
+            eval_packed_generic(&lv, &mut constraint_consumer);
+            for &acc in &constraint_consumer.constraint_accs {
+                assert_eq!(acc, GoldilocksField::ZERO);
+            }
         }
     }
 }
diff --git a/evm/src/arithmetic/utils.rs b/evm/src/arithmetic/utils.rs
index c50481f3..b5356a78 100644
--- a/evm/src/arithmetic/utils.rs
+++ b/evm/src/arithmetic/utils.rs
@@ -1,14 +1,28 @@
+use std::ops::{Add, AddAssign, Mul, Neg, Shr, Sub, SubAssign};
+
 use log::error;
+use plonky2::field::extension::Extendable;
+use plonky2::hash::hash_types::RichField;
+use plonky2::iop::ext_target::ExtensionTarget;
+use plonky2::plonk::circuit_builder::CircuitBuilder;
+
+use crate::arithmetic::columns::N_LIMBS;
 
 /// Emit an error message regarding unchecked range assumptions.
 /// Assumes the values in `cols` are `[cols[0], cols[0] + 1, ...,
 /// cols[0] + cols.len() - 1]`.
-pub(crate) fn _range_check_error<const RC_BITS: u32>(file: &str, line: u32, cols: &[usize]) {
+pub(crate) fn _range_check_error<const RC_BITS: u32>(
+    file: &str,
+    line: u32,
+    cols: &[usize],
+    signedness: &str,
+) {
     error!(
-        "{}:{}: arithmetic unit skipped {}-bit range-checks on columns {}--{}: not yet implemented",
+        "{}:{}: arithmetic unit skipped {}-bit {} range-checks on columns {}--{}: not yet implemented",
         line,
         file,
         RC_BITS,
+        signedness,
         cols[0],
         cols[0] + cols.len() - 1
     );
@@ -17,9 +31,297 @@ pub(crate) fn _range_check_error<const RC_BITS: u32>(file: &str, line: u32, cols
 #[macro_export]
 macro_rules! range_check_error {
     ($cols:ident, $rc_bits:expr) => {
-        $crate::arithmetic::utils::_range_check_error::<$rc_bits>(file!(), line!(), &$cols);
+        $crate::arithmetic::utils::_range_check_error::<$rc_bits>(
+            file!(),
+            line!(),
+            &$cols,
+            "unsigned",
+        );
+    };
+    ($cols:ident, $rc_bits:expr, signed) => {
+        $crate::arithmetic::utils::_range_check_error::<$rc_bits>(
+            file!(),
+            line!(),
+            &$cols,
+            "signed",
+        );
     };
     ([$cols:ident], $rc_bits:expr) => {
-        $crate::arithmetic::utils::_range_check_error::<$rc_bits>(file!(), line!(), &[$cols]);
+        $crate::arithmetic::utils::_range_check_error::<$rc_bits>(
+            file!(),
+            line!(),
+            &[$cols],
+            "unsigned",
+        );
     };
 }
+
+/// Return an array of `N` zeros of type T.
+pub(crate) fn pol_zero<T, const N: usize>() -> [T; N]
+where
+    T: Copy + Default,
+{
+    // TODO: This should really be T::zero() from num::Zero, because
+    // default() doesn't guarantee to initialise to zero (though in
+    // our case it always does). However I couldn't work out how to do
+    // that without touching half of the entire crate because it
+    // involves replacing Field::is_zero() with num::Zero::is_zero()
+    // which is used everywhere. Hence Default::default() it is.
+    [T::default(); N]
+}
+
+/// a(x) += b(x), but must have deg(a) >= deg(b).
+pub(crate) fn pol_add_assign<T>(a: &mut [T], b: &[T])
+where
+    T: AddAssign + Copy + Default,
+{
+    debug_assert!(a.len() >= b.len(), "expected {} >= {}", a.len(), b.len());
+    for (a_item, b_item) in a.iter_mut().zip(b) {
+        *a_item += *b_item;
+    }
+}
+
+pub(crate) fn pol_add_assign_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut CircuitBuilder<F, D>,
+    a: &mut [ExtensionTarget<D>],
+    b: &[ExtensionTarget<D>],
+) {
+    debug_assert!(a.len() >= b.len(), "expected {} >= {}", a.len(), b.len());
+    for (a_item, b_item) in a.iter_mut().zip(b) {
+        *a_item = builder.add_extension(*a_item, *b_item);
+    }
+}
+
+/// Return a(x) + b(x); returned array is bigger than necessary to
+/// make the interface consistent with `pol_mul_wide`.
+pub(crate) fn pol_add<T>(a: [T; N_LIMBS], b: [T; N_LIMBS]) -> [T; 2 * N_LIMBS - 1]
+where
+    T: Add<Output = T> + Copy + Default,
+{
+    let mut sum = pol_zero();
+    for i in 0..N_LIMBS {
+        sum[i] = a[i] + b[i];
+    }
+    sum
+}
+
+pub(crate) fn pol_add_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut CircuitBuilder<F, D>,
+    a: [ExtensionTarget<D>; N_LIMBS],
+    b: [ExtensionTarget<D>; N_LIMBS],
+) -> [ExtensionTarget<D>; 2 * N_LIMBS - 1] {
+    let zero = builder.zero_extension();
+    let mut sum = [zero; 2 * N_LIMBS - 1];
+    for i in 0..N_LIMBS {
+        sum[i] = builder.add_extension(a[i], b[i]);
+    }
+    sum
+}
+
+/// a(x) -= b(x), but must have deg(a) >= deg(b).
+pub(crate) fn pol_sub_assign<T>(a: &mut [T], b: &[T])
+where
+    T: SubAssign + Copy,
+{
+    debug_assert!(a.len() >= b.len(), "expected {} >= {}", a.len(), b.len());
+    for (a_item, b_item) in a.iter_mut().zip(b) {
+        *a_item -= *b_item;
+    }
+}
+
+pub(crate) fn pol_sub_assign_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut CircuitBuilder<F, D>,
+    a: &mut [ExtensionTarget<D>],
+    b: &[ExtensionTarget<D>],
+) {
+    debug_assert!(a.len() >= b.len(), "expected {} >= {}", a.len(), b.len());
+    for (a_item, b_item) in a.iter_mut().zip(b) {
+        *a_item = builder.sub_extension(*a_item, *b_item);
+    }
+}
+
+/// Given polynomials a(x) and b(x), return a(x)*b(x).
+///
+/// NB: The caller is responsible for ensuring that no undesired
+/// overflow occurs during the calculation of the coefficients of the
+/// product.
+pub(crate) fn pol_mul_wide<T>(a: [T; N_LIMBS], b: [T; N_LIMBS]) -> [T; 2 * N_LIMBS - 1]
+where
+    T: AddAssign + Copy + Mul<Output = T> + Default,
+{
+    let mut res = [T::default(); 2 * N_LIMBS - 1];
+    for (i, &ai) in a.iter().enumerate() {
+        for (j, &bj) in b.iter().enumerate() {
+            res[i + j] += ai * bj;
+        }
+    }
+    res
+}
+
+pub(crate) fn pol_mul_wide_ext_circuit<
+    F: RichField + Extendable<D>,
+    const D: usize,
+    const M: usize,
+    const N: usize,
+    const P: usize,
+>(
+    builder: &mut CircuitBuilder<F, D>,
+    a: [ExtensionTarget<D>; M],
+    b: [ExtensionTarget<D>; N],
+) -> [ExtensionTarget<D>; P] {
+    let zero = builder.zero_extension();
+    let mut res = [zero; P];
+    for (i, &ai) in a.iter().enumerate() {
+        for (j, &bj) in b.iter().enumerate() {
+            res[i + j] = builder.mul_add_extension(ai, bj, res[i + j]);
+        }
+    }
+    res
+}
+
+/// As for `pol_mul_wide` but the first argument has 2N elements and
+/// hence the result has 3N-1.
+pub(crate) fn pol_mul_wide2<T>(a: [T; 2 * N_LIMBS], b: [T; N_LIMBS]) -> [T; 3 * N_LIMBS - 1]
+where
+    T: AddAssign + Copy + Mul<Output = T> + Default,
+{
+    let mut res = [T::default(); 3 * N_LIMBS - 1];
+    for (i, &ai) in a.iter().enumerate() {
+        for (j, &bj) in b.iter().enumerate() {
+            res[i + j] += ai * bj;
+        }
+    }
+    res
+}
+
+pub(crate) fn pol_mul_wide2_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut CircuitBuilder<F, D>,
+    a: [ExtensionTarget<D>; 2 * N_LIMBS],
+    b: [ExtensionTarget<D>; N_LIMBS],
+) -> [ExtensionTarget<D>; 3 * N_LIMBS - 1] {
+    let zero = builder.zero_extension();
+    let mut res = [zero; 3 * N_LIMBS - 1];
+    for (i, &ai) in a.iter().enumerate() {
+        for (j, &bj) in b.iter().enumerate() {
+            res[i + j] = builder.mul_add_extension(ai, bj, res[i + j]);
+        }
+    }
+    res
+}
+
+/// Given a(x) and b(x), return a(x)*b(x) mod 2^256.
+pub(crate) fn pol_mul_lo<T, const N: usize>(a: [T; N], b: [T; N]) -> [T; N]
+where
+    T: AddAssign + Copy + Default + Mul<Output = T>,
+{
+    let mut res = pol_zero();
+    for deg in 0..N {
+        // Invariant: i + j = deg
+        for i in 0..=deg {
+            let j = deg - i;
+            res[deg] += a[i] * b[j];
+        }
+    }
+    res
+}
+
+/// Adjoin M - N zeros to a, returning [a[0], a[1], ..., a[N-1], 0, 0, ..., 0].
+pub(crate) fn pol_extend<T, const N: usize, const M: usize>(a: [T; N]) -> [T; M]
+where
+    T: Copy + Default,
+{
+    assert_eq!(M, 2 * N - 1);
+
+    let mut zero_extend = pol_zero();
+    zero_extend[..N].copy_from_slice(&a);
+    zero_extend
+}
+
+pub(crate) fn pol_extend_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut CircuitBuilder<F, D>,
+    a: [ExtensionTarget<D>; N_LIMBS],
+) -> [ExtensionTarget<D>; 2 * N_LIMBS - 1] {
+    let zero = builder.zero_extension();
+    let mut zero_extend = [zero; 2 * N_LIMBS - 1];
+
+    zero_extend[..N_LIMBS].copy_from_slice(&a);
+    zero_extend
+}
+
+/// Given polynomial a(x) = \sum_{i=0}^{2N-2} a[i] x^i and an element
+/// `root`, return b = (x - root) * a(x).
+///
+/// NB: Ignores element a[2 * N_LIMBS - 1], treating it as if it's 0.
+pub(crate) fn pol_adjoin_root<T, U>(a: [T; 2 * N_LIMBS], root: U) -> [T; 2 * N_LIMBS]
+where
+    T: Add<Output = T> + Copy + Default + Mul<Output = T> + Sub<Output = T>,
+    U: Copy + Mul<T, Output = T> + Neg<Output = U>,
+{
+    // \sum_i res[i] x^i = (x - root) \sum_i a[i] x^i. Comparing
+    // coefficients, res[0] = -root*a[0] and
+    // res[i] = a[i-1] - root * a[i]
+
+    let mut res = [T::default(); 2 * N_LIMBS];
+    res[0] = -root * a[0];
+    for deg in 1..(2 * N_LIMBS - 1) {
+        res[deg] = a[deg - 1] - (root * a[deg]);
+    }
+    // NB: We assume that a[2 * N_LIMBS - 1] = 0, so the last
+    // iteration has no "* root" term.
+    res[2 * N_LIMBS - 1] = a[2 * N_LIMBS - 2];
+    res
+}
+
+pub(crate) fn pol_adjoin_root_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut CircuitBuilder<F, D>,
+    a: [ExtensionTarget<D>; 2 * N_LIMBS],
+    root: ExtensionTarget<D>,
+) -> [ExtensionTarget<D>; 2 * N_LIMBS] {
+    let zero = builder.zero_extension();
+    let mut res = [zero; 2 * N_LIMBS];
+    // res[deg] = NEG_ONE * root * a[0] + ZERO * zero
+    res[0] = builder.arithmetic_extension(F::NEG_ONE, F::ZERO, root, a[0], zero);
+    for deg in 1..(2 * N_LIMBS - 1) {
+        // res[deg] = NEG_ONE * root * a[deg] + ONE * a[deg - 1]
+        res[deg] = builder.arithmetic_extension(F::NEG_ONE, F::ONE, root, a[deg], a[deg - 1]);
+    }
+    // NB: We assumes that a[2 * N_LIMBS - 1] = 0, so the last
+    // iteration has no "* root" term.
+    res[2 * N_LIMBS - 1] = a[2 * N_LIMBS - 2];
+    res
+}
+
+/// Given polynomial a(x) = \sum_{i=0}^{2N-1} a[i] x^i and a root of `a`
+/// of the form 2^EXP, return q(x) satisfying a(x) = (x - root) * q(x).
+///
+/// NB: We do not verify that a(2^EXP) = 0; if this doesn't hold the
+/// result is basically junk.
+///
+/// NB: The result could be returned in 2*N-1 elements, but we return
+/// 2*N and set the last element to zero since the calling code
+/// happens to require a result zero-extended to 2*N elements.
+pub(crate) fn pol_remove_root_2exp<const EXP: usize, T>(a: [T; 2 * N_LIMBS]) -> [T; 2 * N_LIMBS]
+where
+    T: Copy + Default + Neg<Output = T> + Shr<usize, Output = T> + Sub<Output = T>,
+{
+    // By assumption β := 2^EXP is a root of `a`, i.e. (x - β) divides
+    // `a`; if we write
+    //
+    //    a(x) = \sum_{i=0}^{2N-1} a[i] x^i
+    //         = (x - β) \sum_{i=0}^{2N-2} q[i] x^i
+    //
+    // then by comparing coefficients it is easy to see that
+    //
+    //   q[0] = -a[0] / β  and  q[i] = (q[i-1] - a[i]) / β
+    //
+    // for 0 < i <= 2N-1 (and the divisions are exact).
+
+    let mut q = [T::default(); 2 * N_LIMBS];
+    q[0] = -(a[0] >> EXP);
+
+    // NB: Last element of q is deliberately left equal to zero.
+    for deg in 1..2 * N_LIMBS - 1 {
+        q[deg] = (q[deg - 1] - a[deg]) >> EXP;
+    }
+    q
+}
diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs
index c1bb60b0..e036ab9b 100644
--- a/field/src/goldilocks_field.rs
+++ b/field/src/goldilocks_field.rs
@@ -130,6 +130,18 @@ impl Field64 for GoldilocksField {
         Self(n)
     }
 
+    #[inline]
+    fn from_noncanonical_i64(n: i64) -> Self {
+        Self::from_canonical_u64(if n < 0 {
+            // If n < 0, then this is guaranteed to overflow since
+            // both arguments have their high bit set, so the result
+            // is in the canonical range.
+            Self::ORDER.wrapping_add(n as u64)
+        } else {
+            n as u64
+        })
+    }
+
     #[inline]
     unsafe fn add_canonical_u64(&self, rhs: u64) -> Self {
         let (res_wrapped, carry) = self.0.overflowing_add(rhs);
diff --git a/field/src/types.rs b/field/src/types.rs
index 7130b7f5..b112fde2 100644
--- a/field/src/types.rs
+++ b/field/src/types.rs
@@ -490,6 +490,18 @@ pub trait Field64: Field {
     // TODO: Move to `Field`.
     fn from_noncanonical_u64(n: u64) -> Self;
 
+    /// Returns `n` as an element of this field.
+    // TODO: Move to `Field`.
+    fn from_noncanonical_i64(n: i64) -> Self;
+
+    /// Returns `n` as an element of this field. Assumes that `0 <= n < Self::ORDER`.
+    // TODO: Move to `Field`.
+    // TODO: Should probably be unsafe.
+    #[inline]
+    fn from_canonical_i64(n: i64) -> Self {
+        Self::from_canonical_u64(n as u64)
+    }
+
     #[inline]
     // TODO: Move to `Field`.
     fn add_one(&self) -> Self {