diff --git a/evm/src/arithmetic/modular.rs b/evm/src/arithmetic/modular.rs
index 1fd31bb1..fd2a2e28 100644
--- a/evm/src/arithmetic/modular.rs
+++ b/evm/src/arithmetic/modular.rs
@@ -18,12 +18,13 @@
 //!    a(x) = \sum_{i=0}^15 a[i] x^i
 //!
 //! (so A = a(β)) and similarly for b(x), m(x) and c(x). Then
-//! operation(A,B) = C (mod M) if and only if the polynomial
+//! operation(A,B) = C (mod M) if and only if there exists q such that
+//! the polynomial
 //!
 //!    operation(a(x), b(x)) - c(x) - m(x) * q(x)
 //!
-//! is zero when evaluated at x = β, i.e. it is divisible by (x - β).
-//! Thus exists a polynomial s such that
+//! is zero when evaluated at x = β, i.e. it is divisible by (x - β);
+//! equivalently, there exists a polynomial s such that
 //!
 //!    operation(a(x), b(x)) - c(x) - m(x) * q(x) - (x - β) * s(x) == 0
 //!
@@ -34,12 +35,12 @@
 //! coefficients must be zero. The variable names of the constituent
 //! polynomials are (writing N for N_LIMBS=16):
 //!
-//!   a(x) = \sum_{i=0}^{N-1} input0[i] * β^i
-//!   b(x) = \sum_{i=0}^{N-1} input1[i] * β^i
-//!   c(x) = \sum_{i=0}^{N-1} output[i] * β^i
-//!   m(x) = \sum_{i=0}^{N-1} modulus[i] * β^i
-//!   q(x) = \sum_{i=0}^{2N-1} quot[i] * β^i
-//!   s(x) = \sum_i^{2N-2} aux[i] * β^i
+//!   a(x) = \sum_{i=0}^{N-1} input0[i] * x^i
+//!   b(x) = \sum_{i=0}^{N-1} input1[i] * x^i
+//!   c(x) = \sum_{i=0}^{N-1} output[i] * x^i
+//!   m(x) = \sum_{i=0}^{N-1} modulus[i] * x^i
+//!   q(x) = \sum_{i=0}^{2N-1} quot[i] * x^i
+//!   s(x) = \sum_i^{2N-2} aux[i] * x^i
 //!
 //! Because A, B, M and C are 256-bit numbers, the degrees of a, b, m
 //! and c are (at most) N-1 = 15. If m = 1, then Q would be A*B which
@@ -211,7 +212,7 @@ fn generate_modular_op<F: RichField>(
     // constr_poly must be zero when evaluated at x = β :=
     // 2^LIMB_BITS, hence it's divisible by (x - β). `aux_limbs` is
     // the result of removing that root.
-    let aux_limbs = pol_remove_root_2exp::<LIMB_BITS, _>(constr_poly);
+    let aux_limbs = pol_remove_root_2exp::<LIMB_BITS, _, { 2 * N_LIMBS }>(constr_poly);
 
     for deg in 0..N_LIMBS {
         lv[MODULAR_OUTPUT[deg]] = F::from_canonical_i64(output_limbs[deg]);
@@ -303,7 +304,8 @@ fn modular_constr_poly<P: PackedField>(
     pol_add_assign(&mut constr_poly, &output);
 
     // constr_poly = c(x) + q(x) * m(x) + (x - β) * s(x)
-    let aux = MODULAR_AUX_INPUT.map(|c| lv[c]);
+    let mut aux = MODULAR_AUX_INPUT.map(|c| lv[c]);
+    aux[2 * N_LIMBS - 1] = P::ZEROS; // zero out the MOD_IS_ZERO flag
     let base = P::Scalar::from_canonical_u64(1 << LIMB_BITS);
     pol_add_assign(&mut constr_poly, &pol_adjoin_root(aux, base));
 
@@ -397,7 +399,8 @@ fn modular_constr_poly_ext_circuit<F: RichField + Extendable<D>, const D: usize>
     let mut constr_poly: [_; 2 * N_LIMBS] = prod[0..2 * N_LIMBS].try_into().unwrap();
     pol_add_assign_ext_circuit(builder, &mut constr_poly, &output);
 
-    let aux = MODULAR_AUX_INPUT.map(|c| lv[c]);
+    let mut aux = MODULAR_AUX_INPUT.map(|c| lv[c]);
+    aux[2 * N_LIMBS - 1] = builder.zero_extension();
     let base = builder.constant_extension(F::Extension::from_canonical_u64(1u64 << LIMB_BITS));
     let t = pol_adjoin_root_ext_circuit(builder, aux, base);
     pol_add_assign_ext_circuit(builder, &mut constr_poly, &t);
diff --git a/evm/src/arithmetic/mul.rs b/evm/src/arithmetic/mul.rs
index 9d6638f1..c98b9af8 100644
--- a/evm/src/arithmetic/mul.rs
+++ b/evm/src/arithmetic/mul.rs
@@ -3,30 +3,57 @@
 //! This crate verifies an EVM MUL instruction, which takes two
 //! 256-bit inputs A and B, and produces a 256-bit output C satisfying
 //!
-//!    C = A*B (mod 2^256).
+//!    C = A*B (mod 2^256),
 //!
-//! Inputs A and B, and output C, are given as arrays of 16-bit
+//! i.e. C is the lower half of the usual long multiplication
+//! A*B. Inputs A and B, and output C, are given as arrays of 16-bit
 //! limbs. For example, if the limbs of A are a[0]...a[15], then
 //!
 //!    A = \sum_{i=0}^15 a[i] β^i,
 //!
-//! where β = 2^16. To verify that A, B and C satisfy the equation we
-//! proceed as follows. Define a(x) = \sum_{i=0}^15 a[i] x^i (so A = a(β))
-//! and similarly for b(x) and c(x). Then A*B = C (mod 2^256) if and only
-//! if there exist polynomials q and m such that
+//! where β = 2^16 = 2^LIMB_BITS. To verify that A, B and C satisfy
+//! the equation we proceed as follows. Define
 //!
-//!    a(x)*b(x) - c(x) - m(x)*x^16 - (β - x)*q(x) == 0.
+//!    a(x) = \sum_{i=0}^15 a[i] x^i
+//!
+//! (so A = a(β)) and similarly for b(x) and c(x). Then A*B = C (mod
+//! 2^256) if and only if there exists q such that the polynomial
+//!
+//!    a(x) * b(x) - c(x) - x^16 * q(x)
+//!
+//! is zero when evaluated at x = β, i.e. it is divisible by (x - β);
+//! equivalently, there exists a polynomial s (representing the
+//! carries from the long multiplication) such that
+//!
+//!    a(x) * b(x) - c(x) - x^16 * q(x) - (x - β) * s(x) == 0
+//!
+//! As we only need the lower half of the product, we can omit q(x)
+//! since it is multiplied by the modulus β^16 = 2^256. Thus we only
+//! need to verify
+//!
+//!    a(x) * b(x) - c(x) - (x - β) * s(x) == 0
+//!
+//! In the code below, this "constraint polynomial" is constructed in
+//! the variable `constr_poly`. It must be identically zero for the
+//! multiplication operation to be verified, or, equivalently, each of
+//! its coefficients must be zero. The variable names of the
+//! constituent polynomials are (writing N for N_LIMBS=16):
+//!
+//!   a(x) = \sum_{i=0}^{N-1} input0[i] * x^i
+//!   b(x) = \sum_{i=0}^{N-1} input1[i] * x^i
+//!   c(x) = \sum_{i=0}^{N-1} output[i] * x^i
+//!   s(x) = \sum_i^{2N-3} aux[i] * x^i
 //!
 //! Because A, B and C are 256-bit numbers, the degrees of a, b and c
-//! are (at most) 15. Thus deg(a*b) <= 30, so deg(m) <= 14 and deg(q)
-//! <= 29. However, the fact that we're verifying the equality modulo
-//! 2^256 means that we can ignore terms of degree >= 16, since for
-//! them evaluating at β gives a factor of β^16 = 2^256 which is 0.
+//! are (at most) 15. Thus deg(a*b) <= 30 and deg(s) <= 29; however,
+//! as we're only verifying the lower half of A*B, we only need to
+//! know s(x) up to degree 14 (so that (x - β)*s(x) has degree 15). On
+//! the other hand, the coefficients of s(x) can be as large as
+//! 16*(β-2) or 20 bits.
 //!
-//! Hence, to verify the equality, we don't need m(x) at all, and we
-//! only need to know q(x) up to degree 14 (so that (β - x)*q(x) has
-//! degree 15). On the other hand, the coefficients of q(x) can be as
-//! large as 16*(β-2) or 20 bits.
+//! Note that, unlike for the general modular multiplication (see the
+//! file `modular.rs`), we don't need to check that output is reduced,
+//! since any value of output is less than β^16 and is hence reduced.
 
 use plonky2::field::extension::Extendable;
 use plonky2::field::packed::PackedField;
@@ -35,64 +62,42 @@ use plonky2::hash::hash_types::RichField;
 use plonky2::iop::ext_target::ExtensionTarget;
 
 use crate::arithmetic::columns::*;
-use crate::arithmetic::utils::{pol_mul_lo, pol_sub_assign};
+use crate::arithmetic::utils::*;
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
 use crate::range_check_error;
 
 pub fn generate<F: RichField>(lv: &mut [F; NUM_ARITH_COLUMNS]) {
-    let input0_limbs = MUL_INPUT_0.map(|c| lv[c].to_canonical_u64());
-    let input1_limbs = MUL_INPUT_1.map(|c| lv[c].to_canonical_u64());
+    let input0_limbs = MUL_INPUT_0.map(|c| lv[c].to_canonical_u64() as i64);
+    let input1_limbs = MUL_INPUT_1.map(|c| lv[c].to_canonical_u64() as i64);
 
-    const MASK: u64 = (1u64 << LIMB_BITS) - 1u64;
+    const MASK: i64 = (1i64 << LIMB_BITS) - 1i64;
 
     // Input and output have 16-bit limbs
-    let mut aux_in_limbs = [0u64; N_LIMBS];
-    let mut output_limbs = [0u64; N_LIMBS];
+    let mut output_limbs = [0i64; N_LIMBS];
 
     // Column-wise pen-and-paper long multiplication on 16-bit limbs.
     // First calculate the coefficients of a(x)*b(x) (in unreduced_prod),
     // then do carry propagation to obtain C = c(β) = a(β)*b(β).
-    let mut cy = 0u64;
+    let mut cy = 0i64;
     let mut unreduced_prod = pol_mul_lo(input0_limbs, input1_limbs);
     for col in 0..N_LIMBS {
         let t = unreduced_prod[col] + cy;
         cy = t >> LIMB_BITS;
         output_limbs[col] = t & MASK;
     }
-
     // In principle, the last cy could be dropped because this is
     // multiplication modulo 2^256. However, we need it below for
-    // aux_in_limbs to handle the fact that unreduced_prod will
-    // inevitably contain a one digit's worth that is > 2^256.
+    // aux_limbs to handle the fact that unreduced_prod will
+    // inevitably contain one digit's worth that is > 2^256.
 
-    for (&c, output_limb) in MUL_OUTPUT.iter().zip(output_limbs) {
-        lv[c] = F::from_canonical_u64(output_limb);
-    }
     pol_sub_assign(&mut unreduced_prod, &output_limbs);
 
-    // unreduced_prod is the coefficients of the polynomial a(x)*b(x) - c(x).
-    // This must be zero when evaluated at x = β = 2^LIMB_BITS, hence it's
-    // divisible by (β - x). If we write unreduced_prod as
-    //
-    //   a(x)*b(x) - c(x) = \sum_{i=0}^n p_i x^i  + terms of degree > n
-    //                    = (β - x) \sum_{i=0}^{n-1} q_i x^i  + terms of degree > n
-    //
-    // then by comparing coefficients it is easy to see that
-    //
-    //   q_0 = p_0 / β  and  q_i = (p_i + q_{i-1}) / β
-    //
-    // for 0 < i < n-1 (and the divisions are exact). Because we're
-    // only calculating the result modulo 2^256, we can ignore the
-    // terms of degree > n = 15.
-    aux_in_limbs[0] = unreduced_prod[0] >> LIMB_BITS;
-    for deg in 1..N_LIMBS - 1 {
-        aux_in_limbs[deg] = (unreduced_prod[deg] + aux_in_limbs[deg - 1]) >> LIMB_BITS;
-    }
-    aux_in_limbs[N_LIMBS - 1] = cy;
+    let mut aux_limbs = pol_remove_root_2exp::<LIMB_BITS, _, N_LIMBS>(unreduced_prod);
+    aux_limbs[N_LIMBS - 1] = -cy;
 
     for deg in 0..N_LIMBS {
-        let c = MUL_AUX_INPUT[deg];
-        lv[c] = F::from_canonical_u64(aux_in_limbs[deg]);
+        lv[MUL_OUTPUT[deg]] = F::from_canonical_i64(output_limbs[deg]);
+        lv[MUL_AUX_INPUT[deg]] = F::from_noncanonical_i64(aux_limbs[deg]);
     }
 }
 
@@ -115,29 +120,26 @@ pub fn eval_packed_generic<P: PackedField>(
     // must be identically zero for this multiplication to be
     // verified.
     //
-    // These two lines set constr_poly to the polynomial A(x)B(x) - C(x),
-    // where A, B and C are the polynomials
+    // These two lines set constr_poly to the polynomial a(x)b(x) - c(x),
+    // where a, b and c are the polynomials
     //
-    //   A(x) = \sum_i input0_limbs[i] * 2^LIMB_BITS
-    //   B(x) = \sum_i input1_limbs[i] * 2^LIMB_BITS
-    //   C(x) = \sum_i output_limbs[i] * 2^LIMB_BITS
+    //   a(x) = \sum_i input0_limbs[i] * β^i
+    //   b(x) = \sum_i input1_limbs[i] * β^i
+    //   c(x) = \sum_i output_limbs[i] * β^i
     //
-    // This polynomial should equal (2^LIMB_BITS - x) * Q(x) where Q is
+    // This polynomial should equal  where s is
     //
-    //   Q(x) = \sum_i aux_limbs[i] * 2^LIMB_BITS
+    //   s(x) = \sum_i aux_limbs[i] * β^i
     //
     let mut constr_poly = pol_mul_lo(input0_limbs, input1_limbs);
     pol_sub_assign(&mut constr_poly, &output_limbs);
 
-    // This subtracts (2^LIMB_BITS - x) * Q(x) from constr_poly.
+    // This subtracts (x - β) * s(x) from constr_poly.
     let base = P::Scalar::from_canonical_u64(1 << LIMB_BITS);
-    constr_poly[0] -= base * aux_limbs[0];
-    for deg in 1..N_LIMBS {
-        constr_poly[deg] -= (base * aux_limbs[deg]) - aux_limbs[deg - 1];
-    }
+    pol_sub_assign(&mut constr_poly, &pol_adjoin_root(aux_limbs, base));
 
     // At this point constr_poly holds the coefficients of the
-    // polynomial A(x)B(x) - C(x) - (2^LIMB_BITS - x)*Q(x). The
+    // polynomial a(x)b(x) - c(x) - (x - β)*s(x). The
     // multiplication is valid if and only if all of those
     // coefficients are zero.
     for &c in &constr_poly {
@@ -154,37 +156,14 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     let input0_limbs = MUL_INPUT_0.map(|c| lv[c]);
     let input1_limbs = MUL_INPUT_1.map(|c| lv[c]);
     let output_limbs = MUL_OUTPUT.map(|c| lv[c]);
-    let aux_in_limbs = MUL_AUX_INPUT.map(|c| lv[c]);
+    let aux_limbs = MUL_AUX_INPUT.map(|c| lv[c]);
 
-    let zero = builder.zero_extension();
-    let mut constr_poly = [zero; N_LIMBS]; // pointless init
+    let mut constr_poly = pol_mul_lo_ext_circuit(builder, input0_limbs, input1_limbs);
+    pol_sub_assign_ext_circuit(builder, &mut constr_poly, &output_limbs);
 
-    // Invariant: i + j = deg
-    for col in 0..N_LIMBS {
-        let mut acc = zero;
-        for i in 0..=col {
-            let j = col - i;
-            acc = builder.mul_add_extension(input0_limbs[i], input1_limbs[j], acc);
-        }
-        constr_poly[col] = builder.sub_extension(acc, output_limbs[col]);
-    }
-
-    let base = F::from_canonical_u64(1 << LIMB_BITS);
-    let one = builder.one_extension();
-    // constr_poly[0] = constr_poly[0] - base * aux_in_limbs[0]
-    constr_poly[0] =
-        builder.arithmetic_extension(F::ONE, -base, constr_poly[0], one, aux_in_limbs[0]);
-    for deg in 1..N_LIMBS {
-        // constr_poly[deg] -= (base*aux_in_limbs[deg] - aux_in_limbs[deg-1])
-        let t = builder.arithmetic_extension(
-            base,
-            F::NEG_ONE,
-            aux_in_limbs[deg],
-            one,
-            aux_in_limbs[deg - 1],
-        );
-        constr_poly[deg] = builder.sub_extension(constr_poly[deg], t);
-    }
+    let base = builder.constant_extension(F::Extension::from_canonical_u64(1 << LIMB_BITS));
+    let rhs = pol_adjoin_root_ext_circuit(builder, aux_limbs, base);
+    pol_sub_assign_ext_circuit(builder, &mut constr_poly, &rhs);
 
     for &c in &constr_poly {
         let filter = builder.mul_extension(is_mul, c);
diff --git a/evm/src/arithmetic/utils.rs b/evm/src/arithmetic/utils.rs
index b5356a78..ccb8bc0a 100644
--- a/evm/src/arithmetic/utils.rs
+++ b/evm/src/arithmetic/utils.rs
@@ -225,6 +225,22 @@ where
     res
 }
 
+pub(crate) fn pol_mul_lo_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut CircuitBuilder<F, D>,
+    a: [ExtensionTarget<D>; N_LIMBS],
+    b: [ExtensionTarget<D>; N_LIMBS],
+) -> [ExtensionTarget<D>; N_LIMBS] {
+    let zero = builder.zero_extension();
+    let mut res = [zero; N_LIMBS];
+    for deg in 0..N_LIMBS {
+        for i in 0..=deg {
+            let j = deg - i;
+            res[deg] = builder.mul_add_extension(a[i], b[j], res[deg]);
+        }
+    }
+    res
+}
+
 /// Adjoin M - N zeros to a, returning [a[0], a[1], ..., a[N-1], 0, 0, ..., 0].
 pub(crate) fn pol_extend<T, const N: usize, const M: usize>(a: [T; N]) -> [T; M]
 where
@@ -248,11 +264,9 @@ pub(crate) fn pol_extend_ext_circuit<F: RichField + Extendable<D>, const D: usiz
     zero_extend
 }
 
-/// Given polynomial a(x) = \sum_{i=0}^{2N-2} a[i] x^i and an element
+/// Given polynomial a(x) = \sum_{i=0}^{N-2} a[i] x^i and an element
 /// `root`, return b = (x - root) * a(x).
-///
-/// NB: Ignores element a[2 * N_LIMBS - 1], treating it as if it's 0.
-pub(crate) fn pol_adjoin_root<T, U>(a: [T; 2 * N_LIMBS], root: U) -> [T; 2 * N_LIMBS]
+pub(crate) fn pol_adjoin_root<T, U, const N: usize>(a: [T; N], root: U) -> [T; N]
 where
     T: Add<Output = T> + Copy + Default + Mul<Output = T> + Sub<Output = T>,
     U: Copy + Mul<T, Output = T> + Neg<Output = U>,
@@ -261,66 +275,64 @@ where
     // coefficients, res[0] = -root*a[0] and
     // res[i] = a[i-1] - root * a[i]
 
-    let mut res = [T::default(); 2 * N_LIMBS];
+    let mut res = [T::default(); N];
     res[0] = -root * a[0];
-    for deg in 1..(2 * N_LIMBS - 1) {
+    for deg in 1..N {
         res[deg] = a[deg - 1] - (root * a[deg]);
     }
-    // NB: We assume that a[2 * N_LIMBS - 1] = 0, so the last
-    // iteration has no "* root" term.
-    res[2 * N_LIMBS - 1] = a[2 * N_LIMBS - 2];
     res
 }
 
-pub(crate) fn pol_adjoin_root_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+pub(crate) fn pol_adjoin_root_ext_circuit<
+    F: RichField + Extendable<D>,
+    const D: usize,
+    const N: usize,
+>(
     builder: &mut CircuitBuilder<F, D>,
-    a: [ExtensionTarget<D>; 2 * N_LIMBS],
+    a: [ExtensionTarget<D>; N],
     root: ExtensionTarget<D>,
-) -> [ExtensionTarget<D>; 2 * N_LIMBS] {
+) -> [ExtensionTarget<D>; N] {
     let zero = builder.zero_extension();
-    let mut res = [zero; 2 * N_LIMBS];
+    let mut res = [zero; N];
     // res[deg] = NEG_ONE * root * a[0] + ZERO * zero
     res[0] = builder.arithmetic_extension(F::NEG_ONE, F::ZERO, root, a[0], zero);
-    for deg in 1..(2 * N_LIMBS - 1) {
+    for deg in 1..N {
         // res[deg] = NEG_ONE * root * a[deg] + ONE * a[deg - 1]
         res[deg] = builder.arithmetic_extension(F::NEG_ONE, F::ONE, root, a[deg], a[deg - 1]);
     }
-    // NB: We assumes that a[2 * N_LIMBS - 1] = 0, so the last
-    // iteration has no "* root" term.
-    res[2 * N_LIMBS - 1] = a[2 * N_LIMBS - 2];
     res
 }
 
-/// Given polynomial a(x) = \sum_{i=0}^{2N-1} a[i] x^i and a root of `a`
+/// Given polynomial a(x) = \sum_{i=0}^{N-1} a[i] x^i and a root of `a`
 /// of the form 2^EXP, return q(x) satisfying a(x) = (x - root) * q(x).
 ///
 /// NB: We do not verify that a(2^EXP) = 0; if this doesn't hold the
 /// result is basically junk.
 ///
-/// NB: The result could be returned in 2*N-1 elements, but we return
-/// 2*N and set the last element to zero since the calling code
-/// happens to require a result zero-extended to 2*N elements.
-pub(crate) fn pol_remove_root_2exp<const EXP: usize, T>(a: [T; 2 * N_LIMBS]) -> [T; 2 * N_LIMBS]
+/// NB: The result could be returned in N-1 elements, but we return
+/// N and set the last element to zero since the calling code
+/// happens to require a result zero-extended to N elements.
+pub(crate) fn pol_remove_root_2exp<const EXP: usize, T, const N: usize>(a: [T; N]) -> [T; N]
 where
     T: Copy + Default + Neg<Output = T> + Shr<usize, Output = T> + Sub<Output = T>,
 {
     // By assumption β := 2^EXP is a root of `a`, i.e. (x - β) divides
     // `a`; if we write
     //
-    //    a(x) = \sum_{i=0}^{2N-1} a[i] x^i
-    //         = (x - β) \sum_{i=0}^{2N-2} q[i] x^i
+    //    a(x) = \sum_{i=0}^{N-1} a[i] x^i
+    //         = (x - β) \sum_{i=0}^{N-2} q[i] x^i
     //
     // then by comparing coefficients it is easy to see that
     //
     //   q[0] = -a[0] / β  and  q[i] = (q[i-1] - a[i]) / β
     //
-    // for 0 < i <= 2N-1 (and the divisions are exact).
+    // for 0 < i <= N-1 (and the divisions are exact).
 
-    let mut q = [T::default(); 2 * N_LIMBS];
+    let mut q = [T::default(); N];
     q[0] = -(a[0] >> EXP);
 
     // NB: Last element of q is deliberately left equal to zero.
-    for deg in 1..2 * N_LIMBS - 1 {
+    for deg in 1..N - 1 {
         q[deg] = (q[deg - 1] - a[deg]) >> EXP;
     }
     q