diff --git a/evm/src/arithmetic/add.rs b/evm/src/arithmetic/add.rs
new file mode 100644
index 00000000..80f03d63
--- /dev/null
+++ b/evm/src/arithmetic/add.rs
@@ -0,0 +1,211 @@
+use itertools::izip;
+use plonky2::field::extension::Extendable;
+use plonky2::field::packed::PackedField;
+use plonky2::field::types::Field;
+use plonky2::hash::hash_types::RichField;
+use plonky2::iop::ext_target::ExtensionTarget;
+
+use crate::arithmetic::columns::*;
+use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
+use crate::range_check_error;
+
+/// Given two sequences `larger` and `smaller` of equal length (not
+/// checked), verifies that \sum_i larger[i] 2^(LIMB_BITS * i) ==
+/// \sum_i smaller[i] 2^(LIMB_BITS * i), taking care of carry propagation.
+///
+/// The sequences must have been produced by `{add,sub}::eval_packed_generic()`.
+pub(crate) fn eval_packed_generic_are_equal<P, I, J>(
+    yield_constr: &mut ConstraintConsumer<P>,
+    is_op: P,
+    larger: I,
+    smaller: J,
+) where
+    P: PackedField,
+    I: Iterator<Item = P>,
+    J: Iterator<Item = P>,
+{
+    let overflow = P::Scalar::from_canonical_u64(1 << LIMB_BITS);
+    let overflow_inv = overflow.inverse();
+    let mut cy = P::ZEROS;
+    for (a, b) in larger.zip(smaller) {
+        // t should be either 0 or 2^LIMB_BITS
+        let t = cy + a - b;
+        yield_constr.constraint(is_op * t * (overflow - t));
+        // cy <-- 0 or 1
+        // NB: this is multiplication by a constant, so doesn't
+        // increase the degree of the constraint.
+        cy = t * overflow_inv;
+    }
+}
+
+pub(crate) fn eval_ext_circuit_are_equal<F, const D: usize, I, J>(
+    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+    is_op: ExtensionTarget<D>,
+    larger: I,
+    smaller: J,
+) where
+    F: RichField + Extendable<D>,
+    I: Iterator<Item = ExtensionTarget<D>>,
+    J: Iterator<Item = ExtensionTarget<D>>,
+{
+    // 2^LIMB_BITS in the base field
+    let overflow_base = F::from_canonical_u64(1 << LIMB_BITS);
+    // 2^LIMB_BITS in the extension field as an ExtensionTarget
+    let overflow = builder.constant_extension(F::Extension::from(overflow_base));
+    // 2^-LIMB_BITS in the base field.
+    let overflow_inv = F::inverse_2exp(LIMB_BITS);
+
+    let mut cy = builder.zero_extension();
+    for (a, b) in larger.zip(smaller) {
+        // t0 = cy + a
+        let t0 = builder.add_extension(cy, a);
+        // t  = t0 - b
+        let t = builder.sub_extension(t0, b);
+        // t1 = overflow - t
+        let t1 = builder.sub_extension(overflow, t);
+        // t2 = t * t1
+        let t2 = builder.mul_extension(t, t1);
+
+        let filtered_limb_constraint = builder.mul_extension(is_op, t2);
+        yield_constr.constraint(builder, filtered_limb_constraint);
+
+        cy = builder.mul_const_extension(overflow_inv, t);
+    }
+}
+
+pub fn generate<F: RichField>(lv: &mut [F; NUM_ARITH_COLUMNS]) {
+    let input0_limbs = ADD_INPUT_0.map(|c| lv[c].to_canonical_u64());
+    let input1_limbs = ADD_INPUT_1.map(|c| lv[c].to_canonical_u64());
+
+    // Input and output have 16-bit limbs
+    let mut output_limbs = [0u64; N_LIMBS];
+
+    const MASK: u64 = (1u64 << LIMB_BITS) - 1u64;
+    let mut cy = 0u64;
+    for (i, a, b) in izip!(0.., input0_limbs, input1_limbs) {
+        let s = a + b + cy;
+        cy = s >> LIMB_BITS;
+        assert!(cy <= 1u64, "input limbs were larger than 16 bits");
+        output_limbs[i] = s & MASK;
+    }
+    // last carry is dropped because this is addition modulo 2^256.
+
+    for (&c, output_limb) in ADD_OUTPUT.iter().zip(output_limbs) {
+        lv[c] = F::from_canonical_u64(output_limb);
+    }
+}
+
+pub fn eval_packed_generic<P: PackedField>(
+    lv: &[P; NUM_ARITH_COLUMNS],
+    yield_constr: &mut ConstraintConsumer<P>,
+) {
+    range_check_error!(ADD_INPUT_0, 16);
+    range_check_error!(ADD_INPUT_1, 16);
+    range_check_error!(ADD_OUTPUT, 16);
+
+    let is_add = lv[IS_ADD];
+    let input0_limbs = ADD_INPUT_0.iter().map(|&c| lv[c]);
+    let input1_limbs = ADD_INPUT_1.iter().map(|&c| lv[c]);
+    let output_limbs = ADD_OUTPUT.iter().map(|&c| lv[c]);
+
+    // This computed output is not yet reduced; i.e. some limbs may be
+    // more than 16 bits.
+    let output_computed = input0_limbs.zip(input1_limbs).map(|(a, b)| a + b);
+
+    eval_packed_generic_are_equal(yield_constr, is_add, output_computed, output_limbs);
+}
+
+#[allow(clippy::needless_collect)]
+pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
+    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+) {
+    let is_add = lv[IS_ADD];
+    let input0_limbs = ADD_INPUT_0.iter().map(|&c| lv[c]);
+    let input1_limbs = ADD_INPUT_1.iter().map(|&c| lv[c]);
+    let output_limbs = ADD_OUTPUT.iter().map(|&c| lv[c]);
+
+    // Since `map` is lazy and the closure passed to it borrows
+    // `builder`, we can't then borrow builder again below in the call
+    // to `eval_ext_circuit_are_equal`. The solution is to force
+    // evaluation with `collect`.
+    let output_computed = input0_limbs
+        .zip(input1_limbs)
+        .map(|(a, b)| builder.add_extension(a, b))
+        .collect::<Vec<ExtensionTarget<D>>>();
+
+    eval_ext_circuit_are_equal(
+        builder,
+        yield_constr,
+        is_add,
+        output_computed.into_iter(),
+        output_limbs,
+    );
+}
+
+#[cfg(test)]
+mod tests {
+    use plonky2::field::goldilocks_field::GoldilocksField;
+    use plonky2::field::types::Field;
+    use rand::{Rng, SeedableRng};
+    use rand_chacha::ChaCha8Rng;
+
+    use super::*;
+    use crate::arithmetic::columns::NUM_ARITH_COLUMNS;
+    use crate::constraint_consumer::ConstraintConsumer;
+
+    // TODO: Should be able to refactor this test to apply to all operations.
+    #[test]
+    fn generate_eval_consistency_not_add() {
+        type F = GoldilocksField;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
+        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::rand_from_rng(&mut rng));
+
+        // if `IS_ADD == 0`, then the constraints should be met even
+        // if all values are garbage.
+        lv[IS_ADD] = F::ZERO;
+
+        let mut constrant_consumer = ConstraintConsumer::new(
+            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+        );
+        eval_packed_generic(&lv, &mut constrant_consumer);
+        for &acc in &constrant_consumer.constraint_accs {
+            assert_eq!(acc, GoldilocksField::ZERO);
+        }
+    }
+
+    #[test]
+    fn generate_eval_consistency_add() {
+        type F = GoldilocksField;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
+        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::rand_from_rng(&mut rng));
+
+        // set `IS_ADD == 1` and ensure all constraints are satisfied.
+        lv[IS_ADD] = F::ONE;
+        // set inputs to random values
+        for (&ai, bi) in ADD_INPUT_0.iter().zip(ADD_INPUT_1) {
+            lv[ai] = F::from_canonical_u16(rng.gen());
+            lv[bi] = F::from_canonical_u16(rng.gen());
+        }
+
+        generate(&mut lv);
+
+        let mut constrant_consumer = ConstraintConsumer::new(
+            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+        );
+        eval_packed_generic(&lv, &mut constrant_consumer);
+        for &acc in &constrant_consumer.constraint_accs {
+            assert_eq!(acc, GoldilocksField::ZERO);
+        }
+    }
+}
diff --git a/evm/src/arithmetic/arithmetic_stark.rs b/evm/src/arithmetic/arithmetic_stark.rs
new file mode 100644
index 00000000..ce8c7528
--- /dev/null
+++ b/evm/src/arithmetic/arithmetic_stark.rs
@@ -0,0 +1,87 @@
+use std::marker::PhantomData;
+use std::ops::Add;
+
+use itertools::Itertools;
+use plonky2::field::extension::{Extendable, FieldExtension};
+use plonky2::field::packed::PackedField;
+use plonky2::hash::hash_types::RichField;
+
+use crate::arithmetic::add;
+use crate::arithmetic::columns;
+use crate::arithmetic::mul;
+use crate::arithmetic::sub;
+use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
+use crate::stark::Stark;
+use crate::vars::{StarkEvaluationTargets, StarkEvaluationVars};
+
+#[derive(Copy, Clone)]
+pub struct ArithmeticStark<F, const D: usize> {
+    pub f: PhantomData<F>,
+}
+
+impl<F: RichField, const D: usize> ArithmeticStark<F, D> {
+    pub fn generate(&self, local_values: &mut [F; columns::NUM_ARITH_COLUMNS]) {
+        // Check that at most one operation column is "one" and that the
+        // rest are "zero".
+        assert_eq!(
+            columns::ALL_OPERATIONS
+                .iter()
+                .map(|&c| {
+                    if local_values[c] == F::ONE {
+                        Ok(1u64)
+                    } else if local_values[c] == F::ZERO {
+                        Ok(0u64)
+                    } else {
+                        Err("column was not 0 nor 1")
+                    }
+                })
+                .fold_ok(0u64, Add::add),
+            Ok(1)
+        );
+
+        if local_values[columns::IS_ADD].is_one() {
+            add::generate(local_values);
+        } else if local_values[columns::IS_SUB].is_one() {
+            sub::generate(local_values);
+        } else if local_values[columns::IS_MUL].is_one() {
+            mul::generate(local_values);
+        } else {
+            todo!("the requested operation has not yet been implemented");
+        }
+    }
+}
+
+impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for ArithmeticStark<F, D> {
+    const COLUMNS: usize = columns::NUM_ARITH_COLUMNS;
+    const PUBLIC_INPUTS: usize = 0;
+
+    fn eval_packed_generic<FE, P, const D2: usize>(
+        &self,
+        vars: StarkEvaluationVars<FE, P, { Self::COLUMNS }, { Self::PUBLIC_INPUTS }>,
+        yield_constr: &mut ConstraintConsumer<P>,
+    ) where
+        FE: FieldExtension<D2, BaseField = F>,
+        P: PackedField<Scalar = FE>,
+    {
+        let lv = vars.local_values;
+        add::eval_packed_generic(lv, yield_constr);
+        sub::eval_packed_generic(lv, yield_constr);
+        mul::eval_packed_generic(lv, yield_constr);
+    }
+
+    fn eval_ext_circuit(
+        &self,
+        builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
+        vars: StarkEvaluationTargets<D, { Self::COLUMNS }, { Self::PUBLIC_INPUTS }>,
+        yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+    ) {
+        let lv = vars.local_values;
+        add::eval_ext_circuit(builder, lv, yield_constr);
+        sub::eval_ext_circuit(builder, lv, yield_constr);
+        mul::eval_ext_circuit(builder, lv, yield_constr);
+    }
+
+    fn constraint_degree(&self) -> usize {
+        3
+    }
+}
diff --git a/evm/src/arithmetic/columns.rs b/evm/src/arithmetic/columns.rs
new file mode 100644
index 00000000..e51419a8
--- /dev/null
+++ b/evm/src/arithmetic/columns.rs
@@ -0,0 +1,82 @@
+//! Arithmetic unit
+
+pub const LIMB_BITS: usize = 16;
+const EVM_REGISTER_BITS: usize = 256;
+
+/// Return the number of LIMB_BITS limbs that are in an EVM
+/// register-sized number, panicking if LIMB_BITS doesn't divide in
+/// the EVM register size.
+const fn n_limbs() -> usize {
+    if EVM_REGISTER_BITS % LIMB_BITS != 0 {
+        panic!("limb size must divide EVM register size");
+    }
+    EVM_REGISTER_BITS / LIMB_BITS
+}
+
+/// Number of LIMB_BITS limbs that are in on EVM register-sized number.
+pub const N_LIMBS: usize = n_limbs();
+
+pub const IS_ADD: usize = 0;
+pub const IS_MUL: usize = IS_ADD + 1;
+pub const IS_SUB: usize = IS_MUL + 1;
+pub const IS_DIV: usize = IS_SUB + 1;
+pub const IS_SDIV: usize = IS_DIV + 1;
+pub const IS_MOD: usize = IS_SDIV + 1;
+pub const IS_SMOD: usize = IS_MOD + 1;
+pub const IS_ADDMOD: usize = IS_SMOD + 1;
+pub const IS_MULMOD: usize = IS_ADDMOD + 1;
+pub const IS_LT: usize = IS_MULMOD + 1;
+pub const IS_GT: usize = IS_LT + 1;
+pub const IS_SLT: usize = IS_GT + 1;
+pub const IS_SGT: usize = IS_SLT + 1;
+pub const IS_SHL: usize = IS_SGT + 1;
+pub const IS_SHR: usize = IS_SHL + 1;
+pub const IS_SAR: usize = IS_SHR + 1;
+
+const START_SHARED_COLS: usize = IS_SAR + 1;
+
+pub(crate) const ALL_OPERATIONS: [usize; 16] = [
+    IS_ADD, IS_MUL, IS_SUB, IS_DIV, IS_SDIV, IS_MOD, IS_SMOD, IS_ADDMOD, IS_MULMOD, IS_LT, IS_GT,
+    IS_SLT, IS_SGT, IS_SHL, IS_SHR, IS_SAR,
+];
+
+/// Within the Arithmetic Unit, there are shared columns which can be
+/// used by any arithmetic circuit, depending on which one is active
+/// this cycle.  Can be increased as needed as other operations are
+/// implemented.
+const NUM_SHARED_COLS: usize = 64;
+
+const fn shared_col(i: usize) -> usize {
+    assert!(i < NUM_SHARED_COLS);
+    START_SHARED_COLS + i
+}
+
+const fn gen_input_cols<const N: usize>(start: usize) -> [usize; N] {
+    let mut cols = [0usize; N];
+    let mut i = 0;
+    while i < N {
+        cols[i] = shared_col(start + i);
+        i += 1;
+    }
+    cols
+}
+
+const GENERAL_INPUT_0: [usize; N_LIMBS] = gen_input_cols::<N_LIMBS>(0);
+const GENERAL_INPUT_1: [usize; N_LIMBS] = gen_input_cols::<N_LIMBS>(N_LIMBS);
+const GENERAL_INPUT_2: [usize; N_LIMBS] = gen_input_cols::<N_LIMBS>(2 * N_LIMBS);
+const AUX_INPUT_0: [usize; N_LIMBS] = gen_input_cols::<N_LIMBS>(3 * N_LIMBS);
+
+pub(crate) const ADD_INPUT_0: [usize; N_LIMBS] = GENERAL_INPUT_0;
+pub(crate) const ADD_INPUT_1: [usize; N_LIMBS] = GENERAL_INPUT_1;
+pub(crate) const ADD_OUTPUT: [usize; N_LIMBS] = GENERAL_INPUT_2;
+
+pub(crate) const SUB_INPUT_0: [usize; N_LIMBS] = GENERAL_INPUT_0;
+pub(crate) const SUB_INPUT_1: [usize; N_LIMBS] = GENERAL_INPUT_1;
+pub(crate) const SUB_OUTPUT: [usize; N_LIMBS] = GENERAL_INPUT_2;
+
+pub(crate) const MUL_INPUT_0: [usize; N_LIMBS] = GENERAL_INPUT_0;
+pub(crate) const MUL_INPUT_1: [usize; N_LIMBS] = GENERAL_INPUT_1;
+pub(crate) const MUL_OUTPUT: [usize; N_LIMBS] = GENERAL_INPUT_2;
+pub(crate) const MUL_AUX_INPUT: [usize; N_LIMBS] = AUX_INPUT_0;
+
+pub const NUM_ARITH_COLUMNS: usize = START_SHARED_COLS + NUM_SHARED_COLS;
diff --git a/evm/src/arithmetic/mod.rs b/evm/src/arithmetic/mod.rs
new file mode 100644
index 00000000..07c4c5a9
--- /dev/null
+++ b/evm/src/arithmetic/mod.rs
@@ -0,0 +1,7 @@
+mod add;
+mod mul;
+mod sub;
+mod utils;
+
+pub mod arithmetic_stark;
+pub(crate) mod columns;
diff --git a/evm/src/arithmetic/mul.rs b/evm/src/arithmetic/mul.rs
new file mode 100644
index 00000000..72270517
--- /dev/null
+++ b/evm/src/arithmetic/mul.rs
@@ -0,0 +1,269 @@
+//! Support for the EVM MUL instruction.
+//!
+//! This crate verifies an EVM MUL instruction, which takes two
+//! 256-bit inputs A and B, and produces a 256-bit output C satisfying
+//!
+//!    C = A*B (mod 2^256).
+//!
+//! Inputs A and B, and output C, are given as arrays of 16-bit
+//! limbs. For example, if the limbs of A are a[0]...a[15], then
+//!
+//!    A = \sum_{i=0}^15 a[i] β^i,
+//!
+//! where β = 2^16. To verify that A, B and C satisfy the equation we
+//! proceed as follows. Define a(x) = \sum_{i=0}^15 a[i] x^i (so A = a(β))
+//! and similarly for b(x) and c(x). Then A*B = C (mod 2^256) if and only
+//! if there exist polynomials q and m such that
+//!
+//!    a(x)*b(x) - c(x) - m(x)*x^16 - (x - β)*q(x) == 0.
+//!
+//! Because A, B and C are 256-bit numbers, the degrees of a, b and c
+//! are (at most) 15. Thus deg(a*b) <= 30, so deg(m) <= 14 and deg(q)
+//! <= 29. However, the fact that we're verifying the equality modulo
+//! 2^256 means that we can ignore terms of degree >= 16, since for
+//! them evaluating at β gives a factor of β^16 = 2^256 which is 0.
+//!
+//! Hence, to verify the equality, we don't need m(x) at all, and we
+//! only need to know q(x) up to degree 14 (so that (x-β)*q(x) has
+//! degree 15). On the other hand, the coefficients of q(x) can be as
+//! large as 16*(β-2) or 20 bits.
+
+use plonky2::field::extension::Extendable;
+use plonky2::field::packed::PackedField;
+use plonky2::field::types::Field;
+use plonky2::hash::hash_types::RichField;
+use plonky2::iop::ext_target::ExtensionTarget;
+
+use crate::arithmetic::columns::*;
+use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
+use crate::range_check_error;
+
+pub fn generate<F: RichField>(lv: &mut [F; NUM_ARITH_COLUMNS]) {
+    let input0_limbs = MUL_INPUT_0.map(|c| lv[c].to_canonical_u64());
+    let input1_limbs = MUL_INPUT_1.map(|c| lv[c].to_canonical_u64());
+
+    const MASK: u64 = (1u64 << LIMB_BITS) - 1u64;
+
+    // Input and output have 16-bit limbs
+    let mut aux_in_limbs = [0u64; N_LIMBS];
+    let mut output_limbs = [0u64; N_LIMBS];
+
+    let mut unreduced_prod = [0u64; N_LIMBS];
+
+    // Column-wise pen-and-paper long multiplication on 16-bit limbs.
+    // We have heaps of space at the top of each limb, so by
+    // calculating column-wise (instead of the usual row-wise) we
+    // avoid a bunch of carry propagation handling (at the expense of
+    // slightly worse cache coherency), and it makes it easy to
+    // calculate the coefficients of a(x)*b(x) (in unreduced_prod).
+    let mut cy = 0u64;
+    for col in 0..N_LIMBS {
+        for i in 0..=col {
+            // Invariant: i + j = col
+            let j = col - i;
+            let ai_x_bj = input0_limbs[i] * input1_limbs[j];
+            unreduced_prod[col] += ai_x_bj;
+        }
+        let t = unreduced_prod[col] + cy;
+        cy = t >> LIMB_BITS;
+        output_limbs[col] = t & MASK;
+    }
+    // In principle, the last cy could be dropped because this is
+    // multiplication modulo 2^256. However, we need it below for
+    // aux_in_limbs to handle the fact that unreduced_prod will
+    // inevitably contain a one digit's worth that is > 2^256.
+
+    for (&c, output_limb) in MUL_OUTPUT.iter().zip(output_limbs) {
+        lv[c] = F::from_canonical_u64(output_limb);
+    }
+    for deg in 0..N_LIMBS {
+        // deg'th element <- a*b - c
+        unreduced_prod[deg] -= output_limbs[deg];
+    }
+
+    // unreduced_prod is the coefficients of the polynomial a(x)*b(x) - c(x).
+    // This must be zero when evaluated at x = B = 2^LIMB_BITS, hence it's
+    // divisible by (B - x). If we write unreduced_prod as
+    //
+    //   a(x)*b(x) - c(x) = \sum_{i=0}^n p_i x^i
+    //                    = (B - x) \sum_{i=0}^{n-1} q_i x^i
+    //
+    // then by comparing coefficients it is easy to see that
+    //
+    //   q_0 = p_0 / B  and  q_i = (p_i + q_{i-1}) / B
+    //
+    // for 0 < i < n-1 (and the divisions are exact).
+    aux_in_limbs[0] = unreduced_prod[0] >> LIMB_BITS;
+    for deg in 1..N_LIMBS - 1 {
+        aux_in_limbs[deg] = (unreduced_prod[deg] + aux_in_limbs[deg - 1]) >> LIMB_BITS;
+    }
+    aux_in_limbs[N_LIMBS - 1] = cy;
+
+    for deg in 0..N_LIMBS {
+        let c = MUL_AUX_INPUT[deg];
+        lv[c] = F::from_canonical_u64(aux_in_limbs[deg]);
+    }
+}
+
+pub fn eval_packed_generic<P: PackedField>(
+    lv: &[P; NUM_ARITH_COLUMNS],
+    yield_constr: &mut ConstraintConsumer<P>,
+) {
+    range_check_error!(MUL_INPUT_0, 16);
+    range_check_error!(MUL_INPUT_1, 16);
+    range_check_error!(MUL_OUTPUT, 16);
+    range_check_error!(MUL_AUX_INPUT, 20);
+
+    let is_mul = lv[IS_MUL];
+    let input0_limbs = MUL_INPUT_0.map(|c| lv[c]);
+    let input1_limbs = MUL_INPUT_1.map(|c| lv[c]);
+    let output_limbs = MUL_OUTPUT.map(|c| lv[c]);
+    let aux_limbs = MUL_AUX_INPUT.map(|c| lv[c]);
+
+    // Constraint poly holds the coefficients of the polynomial that
+    // must be identically zero for this multiplication to be
+    // verified. It is initialised to the /negative/ of the claimed
+    // output.
+    let mut constr_poly = [P::ZEROS; N_LIMBS];
+
+    assert_eq!(constr_poly.len(), N_LIMBS);
+
+    // After this loop constr_poly holds the coefficients of the
+    // polynomial A(x)B(x) - C(x), where A, B and C are the polynomials
+    //
+    //   A(x) = \sum_i input0_limbs[i] * 2^LIMB_BITS
+    //   B(x) = \sum_i input1_limbs[i] * 2^LIMB_BITS
+    //   C(x) = \sum_i output_limbs[i] * 2^LIMB_BITS
+    //
+    // This polynomial should equal (2^LIMB_BITS - x) * Q(x) where Q is
+    //
+    //   Q(x) = \sum_i aux_limbs[i] * 2^LIMB_BITS
+    //
+    for col in 0..N_LIMBS {
+        // Invariant: i + j = col
+        for i in 0..=col {
+            let j = col - i;
+            constr_poly[col] += input0_limbs[i] * input1_limbs[j];
+        }
+        constr_poly[col] -= output_limbs[col];
+    }
+
+    // This subtracts (2^LIMB_BITS - x) * Q(x) from constr_poly.
+    let base = P::Scalar::from_canonical_u64(1 << LIMB_BITS);
+    constr_poly[0] -= base * aux_limbs[0];
+    for deg in 1..N_LIMBS {
+        constr_poly[deg] -= (base * aux_limbs[deg]) - aux_limbs[deg - 1];
+    }
+
+    // At this point constr_poly holds the coefficients of the
+    // polynomial A(x)B(x) - C(x) - (x - 2^LIMB_BITS)*Q(x). The
+    // multiplication is valid if and only if all of those
+    // coefficients are zero.
+    for &c in &constr_poly {
+        yield_constr.constraint(is_mul * c);
+    }
+}
+
+pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
+    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+) {
+    let is_mul = lv[IS_MUL];
+    let input0_limbs = MUL_INPUT_0.map(|c| lv[c]);
+    let input1_limbs = MUL_INPUT_1.map(|c| lv[c]);
+    let output_limbs = MUL_OUTPUT.map(|c| lv[c]);
+    let aux_in_limbs = MUL_AUX_INPUT.map(|c| lv[c]);
+
+    let zero = builder.zero_extension();
+    let mut constr_poly = [zero; N_LIMBS]; // pointless init
+
+    // Invariant: i + j = deg
+    for col in 0..N_LIMBS {
+        let mut acc = zero;
+        for i in 0..=col {
+            let j = col - i;
+            acc = builder.mul_add_extension(input0_limbs[i], input1_limbs[j], acc);
+        }
+        constr_poly[col] = builder.sub_extension(acc, output_limbs[col]);
+    }
+
+    let base = F::from_canonical_u64(1 << LIMB_BITS);
+    let t = builder.mul_const_extension(base, aux_in_limbs[0]);
+    constr_poly[0] = builder.sub_extension(constr_poly[0], t);
+    for deg in 1..N_LIMBS {
+        let t0 = builder.mul_const_extension(base, aux_in_limbs[deg]);
+        let t1 = builder.sub_extension(t0, aux_in_limbs[deg - 1]);
+        constr_poly[deg] = builder.sub_extension(constr_poly[deg], t1);
+    }
+
+    for &c in &constr_poly {
+        let filter = builder.mul_extension(is_mul, c);
+        yield_constr.constraint(builder, filter);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use plonky2::field::goldilocks_field::GoldilocksField;
+    use plonky2::field::types::Field;
+    use rand::{Rng, SeedableRng};
+    use rand_chacha::ChaCha8Rng;
+
+    use super::*;
+    use crate::arithmetic::columns::NUM_ARITH_COLUMNS;
+    use crate::constraint_consumer::ConstraintConsumer;
+
+    // TODO: Should be able to refactor this test to apply to all operations.
+    #[test]
+    fn generate_eval_consistency_not_mul() {
+        type F = GoldilocksField;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
+        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::rand_from_rng(&mut rng));
+
+        // if `IS_MUL == 0`, then the constraints should be met even
+        // if all values are garbage.
+        lv[IS_MUL] = F::ZERO;
+
+        let mut constrant_consumer = ConstraintConsumer::new(
+            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+        );
+        eval_packed_generic(&lv, &mut constrant_consumer);
+        for &acc in &constrant_consumer.constraint_accs {
+            assert_eq!(acc, GoldilocksField::ZERO);
+        }
+    }
+
+    #[test]
+    fn generate_eval_consistency_mul() {
+        type F = GoldilocksField;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
+        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::rand_from_rng(&mut rng));
+
+        // set `IS_MUL == 1` and ensure all constraints are satisfied.
+        lv[IS_MUL] = F::ONE;
+        // set inputs to random values
+        for (&ai, bi) in MUL_INPUT_0.iter().zip(MUL_INPUT_1) {
+            lv[ai] = F::from_canonical_u16(rng.gen());
+            lv[bi] = F::from_canonical_u16(rng.gen());
+        }
+
+        generate(&mut lv);
+
+        let mut constrant_consumer = ConstraintConsumer::new(
+            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+        );
+        eval_packed_generic(&lv, &mut constrant_consumer);
+        for &acc in &constrant_consumer.constraint_accs {
+            assert_eq!(acc, GoldilocksField::ZERO);
+        }
+    }
+}
diff --git a/evm/src/arithmetic/sub.rs b/evm/src/arithmetic/sub.rs
new file mode 100644
index 00000000..ce7932e2
--- /dev/null
+++ b/evm/src/arithmetic/sub.rs
@@ -0,0 +1,148 @@
+use itertools::izip;
+use plonky2::field::extension::Extendable;
+use plonky2::field::packed::PackedField;
+use plonky2::hash::hash_types::RichField;
+use plonky2::iop::ext_target::ExtensionTarget;
+
+use crate::arithmetic::add::{eval_ext_circuit_are_equal, eval_packed_generic_are_equal};
+use crate::arithmetic::columns::*;
+use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
+use crate::range_check_error;
+
+pub fn generate<F: RichField>(lv: &mut [F; NUM_ARITH_COLUMNS]) {
+    let input0_limbs = SUB_INPUT_0.map(|c| lv[c].to_canonical_u64());
+    let input1_limbs = SUB_INPUT_1.map(|c| lv[c].to_canonical_u64());
+
+    // Input and output have 16-bit limbs
+    let mut output_limbs = [0u64; N_LIMBS];
+
+    const LIMB_BOUNDARY: u64 = 1 << LIMB_BITS;
+    const MASK: u64 = LIMB_BOUNDARY - 1u64;
+
+    let mut br = 0u64;
+    for (i, a, b) in izip!(0.., input0_limbs, input1_limbs) {
+        let d = LIMB_BOUNDARY + a - b - br;
+        // if a < b, then d < 2^16 so br = 1
+        // if a >= b, then d >= 2^16 so br = 0
+        br = 1u64 - (d >> LIMB_BITS);
+        assert!(br <= 1u64, "input limbs were larger than 16 bits");
+        output_limbs[i] = d & MASK;
+    }
+    // last borrow is dropped because this is subtraction modulo 2^256.
+
+    for (&c, output_limb) in SUB_OUTPUT.iter().zip(output_limbs) {
+        lv[c] = F::from_canonical_u64(output_limb);
+    }
+}
+
+pub fn eval_packed_generic<P: PackedField>(
+    lv: &[P; NUM_ARITH_COLUMNS],
+    yield_constr: &mut ConstraintConsumer<P>,
+) {
+    range_check_error!(SUB_INPUT_0, 16);
+    range_check_error!(SUB_INPUT_1, 16);
+    range_check_error!(SUB_OUTPUT, 16);
+
+    let is_sub = lv[IS_SUB];
+    let input0_limbs = SUB_INPUT_0.iter().map(|&c| lv[c]);
+    let input1_limbs = SUB_INPUT_1.iter().map(|&c| lv[c]);
+    let output_limbs = SUB_OUTPUT.iter().map(|&c| lv[c]);
+
+    let output_computed = input0_limbs.zip(input1_limbs).map(|(a, b)| a - b);
+
+    eval_packed_generic_are_equal(yield_constr, is_sub, output_limbs, output_computed);
+}
+
+#[allow(clippy::needless_collect)]
+pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
+    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+) {
+    let is_sub = lv[IS_SUB];
+    let input0_limbs = SUB_INPUT_0.iter().map(|&c| lv[c]);
+    let input1_limbs = SUB_INPUT_1.iter().map(|&c| lv[c]);
+    let output_limbs = SUB_OUTPUT.iter().map(|&c| lv[c]);
+
+    // Since `map` is lazy and the closure passed to it borrows
+    // `builder`, we can't then borrow builder again below in the call
+    // to `eval_ext_circuit_are_equal`. The solution is to force
+    // evaluation with `collect`.
+    let output_computed = input0_limbs
+        .zip(input1_limbs)
+        .map(|(a, b)| builder.sub_extension(a, b))
+        .collect::<Vec<ExtensionTarget<D>>>();
+
+    eval_ext_circuit_are_equal(
+        builder,
+        yield_constr,
+        is_sub,
+        output_limbs,
+        output_computed.into_iter(),
+    );
+}
+
+#[cfg(test)]
+mod tests {
+    use plonky2::field::goldilocks_field::GoldilocksField;
+    use plonky2::field::types::Field;
+    use rand::{Rng, SeedableRng};
+    use rand_chacha::ChaCha8Rng;
+
+    use super::*;
+    use crate::arithmetic::columns::NUM_ARITH_COLUMNS;
+    use crate::constraint_consumer::ConstraintConsumer;
+
+    // TODO: Should be able to refactor this test to apply to all operations.
+    #[test]
+    fn generate_eval_consistency_not_sub() {
+        type F = GoldilocksField;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
+        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::rand_from_rng(&mut rng));
+
+        // if `IS_SUB == 0`, then the constraints should be met even
+        // if all values are garbage.
+        lv[IS_SUB] = F::ZERO;
+
+        let mut constrant_consumer = ConstraintConsumer::new(
+            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+        );
+        eval_packed_generic(&lv, &mut constrant_consumer);
+        for &acc in &constrant_consumer.constraint_accs {
+            assert_eq!(acc, GoldilocksField::ZERO);
+        }
+    }
+
+    #[test]
+    fn generate_eval_consistency_sub() {
+        type F = GoldilocksField;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
+        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::rand_from_rng(&mut rng));
+
+        // set `IS_SUB == 1` and ensure all constraints are satisfied.
+        lv[IS_SUB] = F::ONE;
+        // set inputs to random values
+        for (&ai, bi) in SUB_INPUT_0.iter().zip(SUB_INPUT_1) {
+            lv[ai] = F::from_canonical_u16(rng.gen());
+            lv[bi] = F::from_canonical_u16(rng.gen());
+        }
+
+        generate(&mut lv);
+
+        let mut constrant_consumer = ConstraintConsumer::new(
+            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+        );
+        eval_packed_generic(&lv, &mut constrant_consumer);
+        for &acc in &constrant_consumer.constraint_accs {
+            assert_eq!(acc, GoldilocksField::ZERO);
+        }
+    }
+}
diff --git a/evm/src/arithmetic/utils.rs b/evm/src/arithmetic/utils.rs
new file mode 100644
index 00000000..dc9a0a2f
--- /dev/null
+++ b/evm/src/arithmetic/utils.rs
@@ -0,0 +1,22 @@
+use log::error;
+
+/// Emit an error message regarding unchecked range assumptions.
+/// Assumes the values in `cols` are `[cols[0], cols[0] + 1, ...,
+/// cols[0] + cols.len() - 1]`.
+pub(crate) fn _range_check_error<const RC_BITS: u32>(file: &str, line: u32, cols: &[usize]) {
+    error!(
+        "{}:{}: arithmetic unit skipped {}-bit range-checks on columns {}--{}: not yet implemented",
+        line,
+        file,
+        RC_BITS,
+        cols[0],
+        cols[0] + cols.len() - 1
+    );
+}
+
+#[macro_export]
+macro_rules! range_check_error {
+    ($cols:ident, $rc_bits:expr) => {
+        $crate::arithmetic::utils::_range_check_error::<$rc_bits>(file!(), line!(), &$cols);
+    };
+}
diff --git a/evm/src/lib.rs b/evm/src/lib.rs
index 9ded1431..1d91e8e0 100644
--- a/evm/src/lib.rs
+++ b/evm/src/lib.rs
@@ -5,6 +5,7 @@
 #![feature(generic_const_exprs)]
 
 pub mod all_stark;
+pub mod arithmetic;
 pub mod config;
 pub mod constraint_consumer;
 pub mod cpu;