From 69228491d8e2b8e37a8262fe28767d82d2c64a83 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <426294+unzvfu@users.noreply.github.com>
Date: Tue, 7 Feb 2023 23:52:58 +1100
Subject: [PATCH] Unify generation and verification of ADD/SUB/LT/GT operations
 (#872)

* Unify handling of ADD, SUB, LT, GT under the formula x+y=z+cy*2^256.

* Rename general column ranges to "registers" instead of "inputs".

* Rename 'compare' module to 'addcc'.

* Update comments.

* Enforce length equality in iteration.

* Address William's PR comments.
---
 evm/src/arithmetic/add.rs              | 236 ----------------
 evm/src/arithmetic/addcc.rs            | 355 +++++++++++++++++++++++++
 evm/src/arithmetic/arithmetic_stark.rs |  12 +-
 evm/src/arithmetic/columns.rs          |  72 +++--
 evm/src/arithmetic/compare.rs          | 264 ------------------
 evm/src/arithmetic/mod.rs              |   4 +-
 evm/src/arithmetic/modular.rs          |  51 ++--
 evm/src/arithmetic/mul.rs              |   3 +-
 evm/src/arithmetic/operations.rs       |  42 ++-
 evm/src/arithmetic/sub.rs              | 158 -----------
 10 files changed, 434 insertions(+), 763 deletions(-)
 delete mode 100644 evm/src/arithmetic/add.rs
 create mode 100644 evm/src/arithmetic/addcc.rs
 delete mode 100644 evm/src/arithmetic/compare.rs
 delete mode 100644 evm/src/arithmetic/sub.rs
diff --git a/evm/src/arithmetic/add.rs b/evm/src/arithmetic/add.rs
deleted file mode 100644
index f97b709d..00000000
--- a/evm/src/arithmetic/add.rs
+++ /dev/null
@@ -1,236 +0,0 @@
-use itertools::izip;
-use plonky2::field::extension::Extendable;
-use plonky2::field::packed::PackedField;
-use plonky2::field::types::Field;
-use plonky2::hash::hash_types::RichField;
-use plonky2::iop::ext_target::ExtensionTarget;
-
-use crate::arithmetic::columns::*;
-use crate::arithmetic::utils::read_value_u64_limbs;
-use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
-
-pub(crate) fn u256_add_cc(input0: [u64; N_LIMBS], input1: [u64; N_LIMBS]) -> ([u64; N_LIMBS], u64) {
-    // Input and output have 16-bit limbs
-    let mut output = [0u64; N_LIMBS];
-
-    const MASK: u64 = (1u64 << LIMB_BITS) - 1u64;
-    let mut cy = 0u64;
-    for (i, a, b) in izip!(0.., input0, input1) {
-        let s = a + b + cy;
-        cy = s >> LIMB_BITS;
-        assert!(cy <= 1u64, "input limbs were larger than 16 bits");
-        output[i] = s & MASK;
-    }
-    (output, cy)
-}
-
-/// Given two sequences `larger` and `smaller` of equal length (not
-/// checked), verifies that \sum_i larger[i] 2^(LIMB_BITS * i) ==
-/// \sum_i smaller[i] 2^(LIMB_BITS * i), taking care of carry propagation.
-///
-/// The sequences must have been produced by `{add,sub}::eval_packed_generic()`.
-pub(crate) fn eval_packed_generic_are_equal<P, I, J>(
-    yield_constr: &mut ConstraintConsumer<P>,
-    is_op: P,
-    larger: I,
-    smaller: J,
-    is_two_row_op: bool,
-) -> P
-where
-    P: PackedField,
-    I: Iterator<Item = P>,
-    J: Iterator<Item = P>,
-{
-    let overflow = P::Scalar::from_canonical_u64(1 << LIMB_BITS);
-    let overflow_inv = overflow.inverse();
-    let mut cy = P::ZEROS;
-    for (a, b) in larger.zip(smaller) {
-        // t should be either 0 or 2^LIMB_BITS
-        let t = cy + a - b;
-        if is_two_row_op {
-            yield_constr.constraint_transition(is_op * t * (overflow - t));
-        } else {
-            yield_constr.constraint(is_op * t * (overflow - t));
-        }
-        // cy <-- 0 or 1
-        // NB: this is multiplication by a constant, so doesn't
-        // increase the degree of the constraint.
-        cy = t * overflow_inv;
-    }
-    cy
-}
-
-pub(crate) fn eval_ext_circuit_are_equal<F, const D: usize, I, J>(
-    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
-    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
-    is_op: ExtensionTarget<D>,
-    larger: I,
-    smaller: J,
-    is_two_row_op: bool,
-) -> ExtensionTarget<D>
-where
-    F: RichField + Extendable<D>,
-    I: Iterator<Item = ExtensionTarget<D>>,
-    J: Iterator<Item = ExtensionTarget<D>>,
-{
-    // 2^LIMB_BITS in the base field
-    let overflow_base = F::from_canonical_u64(1 << LIMB_BITS);
-    // 2^LIMB_BITS in the extension field as an ExtensionTarget
-    let overflow = builder.constant_extension(F::Extension::from(overflow_base));
-    // 2^-LIMB_BITS in the base field.
-    let overflow_inv = F::inverse_2exp(LIMB_BITS);
-
-    let mut cy = builder.zero_extension();
-    for (a, b) in larger.zip(smaller) {
-        // t0 = cy + a
-        let t0 = builder.add_extension(cy, a);
-        // t  = t0 - b
-        let t = builder.sub_extension(t0, b);
-        // t1 = overflow - t
-        let t1 = builder.sub_extension(overflow, t);
-        // t2 = t * t1
-        let t2 = builder.mul_extension(t, t1);
-
-        let filtered_limb_constraint = builder.mul_extension(is_op, t2);
-        if is_two_row_op {
-            yield_constr.constraint_transition(builder, filtered_limb_constraint);
-        } else {
-            yield_constr.constraint(builder, filtered_limb_constraint);
-        }
-
-        cy = builder.mul_const_extension(overflow_inv, t);
-    }
-    cy
-}
-
-pub fn generate<F: RichField>(lv: &mut [F]) {
-    let input0 = read_value_u64_limbs(lv, ADD_INPUT_0);
-    let input1 = read_value_u64_limbs(lv, ADD_INPUT_1);
-
-    // Input and output have 16-bit limbs
-    let (output_limbs, _) = u256_add_cc(input0, input1);
-    lv[ADD_OUTPUT].copy_from_slice(&output_limbs.map(|c| F::from_canonical_u64(c)));
-}
-
-pub fn eval_packed_generic<P: PackedField>(
-    lv: &[P; NUM_ARITH_COLUMNS],
-    yield_constr: &mut ConstraintConsumer<P>,
-) {
-    let is_add = lv[IS_ADD];
-    let input0_limbs = &lv[ADD_INPUT_0];
-    let input1_limbs = &lv[ADD_INPUT_1];
-    let output_limbs = &lv[ADD_OUTPUT];
-
-    // This computed output is not yet reduced; i.e. some limbs may be
-    // more than 16 bits.
-    let output_computed = input0_limbs.iter().zip(input1_limbs).map(|(&a, &b)| a + b);
-
-    eval_packed_generic_are_equal(
-        yield_constr,
-        is_add,
-        output_computed,
-        output_limbs.iter().copied(),
-        false,
-    );
-}
-
-#[allow(clippy::needless_collect)]
-pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
-    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
-    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
-    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
-) {
-    let is_add = lv[IS_ADD];
-    let input0_limbs = &lv[ADD_INPUT_0];
-    let input1_limbs = &lv[ADD_INPUT_1];
-    let output_limbs = &lv[ADD_OUTPUT];
-
-    // Since `map` is lazy and the closure passed to it borrows
-    // `builder`, we can't then borrow builder again below in the call
-    // to `eval_ext_circuit_are_equal`. The solution is to force
-    // evaluation with `collect`.
-    let output_computed = input0_limbs
-        .iter()
-        .zip(input1_limbs)
-        .map(|(&a, &b)| builder.add_extension(a, b))
-        .collect::<Vec<ExtensionTarget<D>>>();
-
-    eval_ext_circuit_are_equal(
-        builder,
-        yield_constr,
-        is_add,
-        output_computed.into_iter(),
-        output_limbs.iter().copied(),
-        false,
-    );
-}
-
-#[cfg(test)]
-mod tests {
-    use plonky2::field::goldilocks_field::GoldilocksField;
-    use plonky2::field::types::{Field, Sample};
-    use rand::{Rng, SeedableRng};
-    use rand_chacha::ChaCha8Rng;
-
-    use super::*;
-    use crate::arithmetic::columns::NUM_ARITH_COLUMNS;
-    use crate::constraint_consumer::ConstraintConsumer;
-
-    const N_RND_TESTS: usize = 1000;
-
-    // TODO: Should be able to refactor this test to apply to all operations.
-    #[test]
-    fn generate_eval_consistency_not_add() {
-        type F = GoldilocksField;
-
-        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
-        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
-
-        // if `IS_ADD == 0`, then the constraints should be met even
-        // if all values are garbage.
-        lv[IS_ADD] = F::ZERO;
-
-        let mut constraint_consumer = ConstraintConsumer::new(
-            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
-            GoldilocksField::ONE,
-            GoldilocksField::ONE,
-            GoldilocksField::ONE,
-        );
-        eval_packed_generic(&lv, &mut constraint_consumer);
-        for &acc in &constraint_consumer.constraint_accs {
-            assert_eq!(acc, GoldilocksField::ZERO);
-        }
-    }
-
-    #[test]
-    fn generate_eval_consistency_add() {
-        type F = GoldilocksField;
-
-        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
-        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
-
-        // set `IS_ADD == 1` and ensure all constraints are satisfied.
-        lv[IS_ADD] = F::ONE;
-
-        for _ in 0..N_RND_TESTS {
-            // set inputs to random values
-            for (ai, bi) in ADD_INPUT_0.zip(ADD_INPUT_1) {
-                lv[ai] = F::from_canonical_u16(rng.gen());
-                lv[bi] = F::from_canonical_u16(rng.gen());
-            }
-
-            generate(&mut lv);
-
-            let mut constraint_consumer = ConstraintConsumer::new(
-                vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
-                GoldilocksField::ONE,
-                GoldilocksField::ONE,
-                GoldilocksField::ONE,
-            );
-            eval_packed_generic(&lv, &mut constraint_consumer);
-            for &acc in &constraint_consumer.constraint_accs {
-                assert_eq!(acc, GoldilocksField::ZERO);
-            }
-        }
-    }
-}
diff --git a/evm/src/arithmetic/addcc.rs b/evm/src/arithmetic/addcc.rs
new file mode 100644
index 00000000..ed173ec7
--- /dev/null
+++ b/evm/src/arithmetic/addcc.rs
@@ -0,0 +1,355 @@
+//! Support for EVM instructions ADD, SUB, LT and GT
+//!
+//! This crate verifies EVM instructions ADD, SUB, LT and GT (i.e. for
+//! unsigned inputs). Each of these instructions can be verified using
+//! the "add with carry out" equation
+//!
+//!   X + Y = Z + CY * 2^256
+//!
+//! by an appropriate assignment of "inputs" and "outputs" to the
+//! variables X, Y, Z and CY. Specifically,
+//!
+//! ADD: X + Y, inputs X, Y, output Z, ignore CY
+//! SUB: Z - X, inputs X, Z, output Y, ignore CY
+//!  GT: X > Z, inputs X, Z, output CY, auxiliary output Y
+//!  LT: Z < X, inputs Z, X, output CY, auxiliary output Y
+
+use itertools::{izip, Itertools};
+use plonky2::field::extension::Extendable;
+use plonky2::field::packed::PackedField;
+use plonky2::field::types::Field;
+use plonky2::hash::hash_types::RichField;
+use plonky2::iop::ext_target::ExtensionTarget;
+use plonky2::plonk::circuit_builder::CircuitBuilder;
+
+use crate::arithmetic::columns::*;
+use crate::arithmetic::utils::read_value_u64_limbs;
+use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
+
+fn u256_add_cc(input0: [u64; N_LIMBS], input1: [u64; N_LIMBS]) -> ([u64; N_LIMBS], u64) {
+    // Input and output have 16-bit limbs
+    let mut output = [0u64; N_LIMBS];
+
+    const MASK: u64 = (1u64 << LIMB_BITS) - 1u64;
+    let mut cy = 0u64;
+    for (i, a, b) in izip!(0.., input0, input1) {
+        let s = a + b + cy;
+        cy = s >> LIMB_BITS;
+        assert!(cy <= 1u64, "input limbs were larger than 16 bits");
+        output[i] = s & MASK;
+    }
+    (output, cy)
+}
+
+fn u256_sub_br(input0: [u64; N_LIMBS], input1: [u64; N_LIMBS]) -> ([u64; N_LIMBS], u64) {
+    const LIMB_BOUNDARY: u64 = 1 << LIMB_BITS;
+    const MASK: u64 = LIMB_BOUNDARY - 1u64;
+
+    let mut output = [0u64; N_LIMBS];
+    let mut br = 0u64;
+    for (i, a, b) in izip!(0.., input0, input1) {
+        let d = LIMB_BOUNDARY + a - b - br;
+        // if a < b, then d < 2^16 so br = 1
+        // if a >= b, then d >= 2^16 so br = 0
+        br = 1u64 - (d >> LIMB_BITS);
+        assert!(br <= 1u64, "input limbs were larger than 16 bits");
+        output[i] = d & MASK;
+    }
+
+    (output, br)
+}
+
+/// Generate row for ADD, SUB, GT and LT operations.
+///
+/// A row consists of four values, GENERAL_REGISTER_[012] and
+/// GENERAL_REGISTER_BIT. The interpretation of these values for each
+/// operation is as follows:
+///
+/// ADD: REGISTER_0 + REGISTER_1, output in REGISTER_2, ignore REGISTER_BIT
+/// SUB: REGISTER_2 - REGISTER_0, output in REGISTER_1, ignore REGISTER_BIT
+///  GT: REGISTER_0 > REGISTER_2, output in REGISTER_BIT, auxiliary output in REGISTER_1
+///  LT: REGISTER_2 < REGISTER_0, output in REGISTER_BIT, auxiliary output in REGISTER_1
+pub(crate) fn generate<F: RichField>(lv: &mut [F], filter: usize) {
+    match filter {
+        IS_ADD => {
+            let x = read_value_u64_limbs(lv, GENERAL_REGISTER_0);
+            let y = read_value_u64_limbs(lv, GENERAL_REGISTER_1);
+
+            // x + y == z + cy*2^256
+            let (z, cy) = u256_add_cc(x, y);
+
+            lv[GENERAL_REGISTER_2].copy_from_slice(&z.map(F::from_canonical_u64));
+            lv[GENERAL_REGISTER_BIT] = F::from_canonical_u64(cy);
+        }
+        IS_SUB | IS_GT | IS_LT => {
+            let x = read_value_u64_limbs(lv, GENERAL_REGISTER_0);
+            let z = read_value_u64_limbs(lv, GENERAL_REGISTER_2);
+
+            // y == z - x + cy*2^256
+            let (y, cy) = u256_sub_br(z, x);
+
+            lv[GENERAL_REGISTER_1].copy_from_slice(&y.map(F::from_canonical_u64));
+            lv[GENERAL_REGISTER_BIT] = F::from_canonical_u64(cy);
+        }
+        _ => panic!("unexpected operation filter"),
+    };
+}
+
+fn eval_packed_generic_check_is_one_bit<P: PackedField>(
+    yield_constr: &mut ConstraintConsumer<P>,
+    filter: P,
+    x: P,
+) {
+    yield_constr.constraint(filter * x * (x - P::ONES));
+}
+
+fn eval_ext_circuit_check_is_one_bit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut CircuitBuilder<F, D>,
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+    filter: ExtensionTarget<D>,
+    x: ExtensionTarget<D>,
+) {
+    let constr = builder.mul_sub_extension(x, x, x);
+    let filtered_constr = builder.mul_extension(filter, constr);
+    yield_constr.constraint(builder, filtered_constr);
+}
+
+/// 2^-16 mod (2^64 - 2^32 + 1)
+const GOLDILOCKS_INVERSE_65536: u64 = 18446462594437939201;
+
+/// Constrains x + y == z + cy*2^256, assuming filter != 0.
+///
+/// NB: This function DOES NOT verify that cy is 0 or 1; the caller
+/// must do that.
+///
+/// Set `is_two_row_op=true` to allow the code to be called from the
+/// two-row `modular` code (for checking that the modular output is
+/// reduced).
+///
+/// Note that the digits of `x + y` are in `[0, 2*(2^16-1)]`
+/// (i.e. they are the sums of two 16-bit numbers), whereas the digits
+/// of `z` can only be in `[0, 2^16-1]`. In the function we check that:
+///
+/// \sum_i (x_i + y_i) * 2^(16*i) = \sum_i z_i * 2^(16*i) + given_cy*2^256.
+///
+/// If `N_LIMBS = 1`, then this amounts to verifying that either `x_0
+/// + y_0 = z_0` or `x_0 + y_0 == z_0 + cy*2^16` (this is `t` on line
+/// 127ff). Ok. Now assume the constraints are valid for `N_LIMBS =
+/// n-1`. Then by induction,
+///
+/// \sum_{i=0}^{n-1} (x_i + y_i) * 2^(16*i) + (x_n + y_n)*2^(16*n) ==
+/// \sum_{i=0}^{n-1} z_i * 2^(16*i) + cy_{n-1}*2^(16*n) + z_n*2^(16*n)
+/// + cy_n*2^(16*n)
+///
+/// is true if `(x_n + y_n)*2^(16*n) == cy_{n-1}*2^(16*n) +
+/// z_n*2^(16*n) + cy_n*2^(16*n)` (again, this is `t` on line 127ff)
+/// with the last `cy_n` checked against the `given_cy` given as input.
+pub(crate) fn eval_packed_generic_add_cc<P: PackedField>(
+    yield_constr: &mut ConstraintConsumer<P>,
+    filter: P,
+    x: &[P],
+    y: &[P],
+    z: &[P],
+    given_cy: P,
+    is_two_row_op: bool,
+) {
+    debug_assert!(x.len() == N_LIMBS && y.len() == N_LIMBS && z.len() == N_LIMBS);
+
+    let overflow = P::Scalar::from_canonical_u64(1u64 << LIMB_BITS);
+    let overflow_inv = P::Scalar::from_canonical_u64(GOLDILOCKS_INVERSE_65536);
+    debug_assert!(
+        overflow * overflow_inv == P::Scalar::ONE,
+        "only works with LIMB_BITS=16 and F=Goldilocks"
+    );
+
+    let mut cy = P::ZEROS;
+    for ((&xi, &yi), &zi) in x.iter().zip_eq(y).zip_eq(z) {
+        // Verify that (xi + yi) - zi is either 0 or 2^LIMB_BITS
+        let t = cy + xi + yi - zi;
+        if is_two_row_op {
+            yield_constr.constraint_transition(filter * t * (overflow - t));
+        } else {
+            yield_constr.constraint(filter * t * (overflow - t));
+        }
+        // cy <-- 0 or 1
+        // NB: this is multiplication by a constant, so doesn't
+        // increase the degree of the constraint.
+        cy = t * overflow_inv;
+    }
+
+    if is_two_row_op {
+        yield_constr.constraint_transition(filter * (cy - given_cy));
+    } else {
+        yield_constr.constraint(filter * (cy - given_cy));
+    }
+}
+
+pub fn eval_packed_generic<P: PackedField>(
+    lv: &[P; NUM_ARITH_COLUMNS],
+    yield_constr: &mut ConstraintConsumer<P>,
+) {
+    let is_add = lv[IS_ADD];
+    let is_sub = lv[IS_SUB];
+    let is_lt = lv[IS_LT];
+    let is_gt = lv[IS_GT];
+
+    let x = &lv[GENERAL_REGISTER_0];
+    let y = &lv[GENERAL_REGISTER_1];
+    let z = &lv[GENERAL_REGISTER_2];
+    let cy = lv[GENERAL_REGISTER_BIT];
+
+    let op_filter = is_add + is_sub + is_lt + is_gt;
+    eval_packed_generic_check_is_one_bit(yield_constr, op_filter, cy);
+
+    // x + y = z + cy*2^256
+    eval_packed_generic_add_cc(yield_constr, op_filter, x, y, z, cy, false);
+}
+
+#[allow(clippy::needless_collect)]
+pub(crate) fn eval_ext_circuit_add_cc<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+    filter: ExtensionTarget<D>,
+    x: &[ExtensionTarget<D>],
+    y: &[ExtensionTarget<D>],
+    z: &[ExtensionTarget<D>],
+    given_cy: ExtensionTarget<D>,
+    is_two_row_op: bool,
+) {
+    debug_assert!(x.len() == N_LIMBS && y.len() == N_LIMBS && z.len() == N_LIMBS);
+
+    // 2^LIMB_BITS in the base field
+    let overflow_base = F::from_canonical_u64(1 << LIMB_BITS);
+    // 2^LIMB_BITS in the extension field as an ExtensionTarget
+    let overflow = builder.constant_extension(F::Extension::from(overflow_base));
+    // 2^-LIMB_BITS in the base field.
+    let overflow_inv = F::from_canonical_u64(GOLDILOCKS_INVERSE_65536);
+
+    let mut cy = builder.zero_extension();
+    for ((&xi, &yi), &zi) in x.iter().zip_eq(y).zip_eq(z) {
+        // t0 = cy + xi + yi
+        let t0 = builder.add_many_extension([cy, xi, yi]);
+        // t  = t0 - zi
+        let t = builder.sub_extension(t0, zi);
+        // t1 = overflow - t
+        let t1 = builder.sub_extension(overflow, t);
+        // t2 = t * t1
+        let t2 = builder.mul_extension(t, t1);
+
+        let filtered_limb_constraint = builder.mul_extension(filter, t2);
+        if is_two_row_op {
+            yield_constr.constraint_transition(builder, filtered_limb_constraint);
+        } else {
+            yield_constr.constraint(builder, filtered_limb_constraint);
+        }
+
+        cy = builder.mul_const_extension(overflow_inv, t);
+    }
+
+    let good_cy = builder.sub_extension(cy, given_cy);
+    let filter = builder.mul_extension(filter, good_cy);
+    if is_two_row_op {
+        yield_constr.constraint_transition(builder, filter);
+    } else {
+        yield_constr.constraint(builder, filter);
+    }
+}
+
+pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
+    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+) {
+    let is_add = lv[IS_ADD];
+    let is_sub = lv[IS_SUB];
+    let is_lt = lv[IS_LT];
+    let is_gt = lv[IS_GT];
+
+    let x = &lv[GENERAL_REGISTER_0];
+    let y = &lv[GENERAL_REGISTER_1];
+    let z = &lv[GENERAL_REGISTER_2];
+    let cy = lv[GENERAL_REGISTER_BIT];
+
+    let op_filter = builder.add_many_extension([is_add, is_sub, is_lt, is_gt]);
+    eval_ext_circuit_check_is_one_bit(builder, yield_constr, op_filter, cy);
+    eval_ext_circuit_add_cc(builder, yield_constr, op_filter, x, y, z, cy, false);
+}
+
+#[cfg(test)]
+mod tests {
+    use plonky2::field::goldilocks_field::GoldilocksField;
+    use plonky2::field::types::{Field, Sample};
+    use rand::{Rng, SeedableRng};
+    use rand_chacha::ChaCha8Rng;
+
+    use super::*;
+    use crate::arithmetic::columns::NUM_ARITH_COLUMNS;
+    use crate::constraint_consumer::ConstraintConsumer;
+
+    // TODO: Should be able to refactor this test to apply to all operations.
+    #[test]
+    fn generate_eval_consistency_not_addcc() {
+        type F = GoldilocksField;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
+        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
+
+        // if the operation filters are all zero, then the constraints
+        // should be met even if all values are
+        // garbage.
+        lv[IS_ADD] = F::ZERO;
+        lv[IS_SUB] = F::ZERO;
+        lv[IS_LT] = F::ZERO;
+        lv[IS_GT] = F::ZERO;
+
+        let mut constrant_consumer = ConstraintConsumer::new(
+            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+            F::ONE,
+            F::ONE,
+            F::ONE,
+        );
+        eval_packed_generic(&lv, &mut constrant_consumer);
+        for &acc in &constrant_consumer.constraint_accs {
+            assert_eq!(acc, F::ZERO);
+        }
+    }
+
+    #[test]
+    fn generate_eval_consistency_addcc() {
+        type F = GoldilocksField;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
+        const N_ITERS: usize = 1000;
+
+        for _ in 0..N_ITERS {
+            for op_filter in [IS_ADD, IS_SUB, IS_LT, IS_GT] {
+                // set entire row to random 16-bit values
+                let mut lv = [F::default(); NUM_ARITH_COLUMNS]
+                    .map(|_| F::from_canonical_u16(rng.gen::<u16>()));
+
+                // set operation filter and ensure all constraints are
+                // satisfied.  we have to explicitly set the other
+                // operation filters to zero since all are treated by
+                // the call.
+                lv[IS_ADD] = F::ZERO;
+                lv[IS_SUB] = F::ZERO;
+                lv[IS_LT] = F::ZERO;
+                lv[IS_GT] = F::ZERO;
+                lv[op_filter] = F::ONE;
+
+                generate(&mut lv, op_filter);
+
+                let mut constrant_consumer = ConstraintConsumer::new(
+                    vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+                    F::ONE,
+                    F::ONE,
+                    F::ONE,
+                );
+                eval_packed_generic(&lv, &mut constrant_consumer);
+                for &acc in &constrant_consumer.constraint_accs {
+                    assert_eq!(acc, F::ZERO);
+                }
+            }
+        }
+    }
+}
diff --git a/evm/src/arithmetic/arithmetic_stark.rs b/evm/src/arithmetic/arithmetic_stark.rs
index 80742cd7..9af1a9c3 100644
--- a/evm/src/arithmetic/arithmetic_stark.rs
+++ b/evm/src/arithmetic/arithmetic_stark.rs
@@ -8,7 +8,7 @@ use plonky2::hash::hash_types::RichField;
 use plonky2::util::transpose;
 
 use crate::arithmetic::operations::Operation;
-use crate::arithmetic::{add, columns, compare, modular, mul, sub};
+use crate::arithmetic::{addcc, columns, modular, mul};
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
 use crate::lookup::{eval_lookups, eval_lookups_circuit, permuted_cols};
 use crate::permutation::PermutationPair;
@@ -96,10 +96,8 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for ArithmeticSta
         let lv = vars.local_values;
         let nv = vars.next_values;
 
-        add::eval_packed_generic(lv, yield_constr);
-        sub::eval_packed_generic(lv, yield_constr);
         mul::eval_packed_generic(lv, yield_constr);
-        compare::eval_packed_generic(lv, yield_constr);
+        addcc::eval_packed_generic(lv, yield_constr);
         modular::eval_packed_generic(lv, nv, yield_constr);
     }
 
@@ -116,10 +114,8 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for ArithmeticSta
 
         let lv = vars.local_values;
         let nv = vars.next_values;
-        add::eval_ext_circuit(builder, lv, yield_constr);
-        sub::eval_ext_circuit(builder, lv, yield_constr);
         mul::eval_ext_circuit(builder, lv, yield_constr);
-        compare::eval_ext_circuit(builder, lv, yield_constr);
+        addcc::eval_ext_circuit(builder, lv, yield_constr);
         modular::eval_ext_circuit(builder, lv, nv, yield_constr);
     }
 
@@ -235,7 +231,7 @@ mod tests {
         // Each operation has a single word answer that we can check
         let expected_output = [
             // Row (some ops take two rows), col, expected
-            (0, columns::ADD_OUTPUT, 579),
+            (0, columns::GENERAL_REGISTER_2, 579), // ADD_OUTPUT
             (1, columns::MODULAR_OUTPUT, 703),
             (3, columns::MODULAR_OUTPUT, 674),
             (5, columns::MUL_OUTPUT, 56088),
diff --git a/evm/src/arithmetic/columns.rs b/evm/src/arithmetic/columns.rs
index aafc22a8..e05a4070 100644
--- a/evm/src/arithmetic/columns.rs
+++ b/evm/src/arithmetic/columns.rs
@@ -46,51 +46,41 @@ pub(crate) const START_SHARED_COLS: usize = IS_GT + 1;
 pub(crate) const NUM_SHARED_COLS: usize = 6 * N_LIMBS;
 pub(crate) const SHARED_COLS: Range<usize> = START_SHARED_COLS..START_SHARED_COLS + NUM_SHARED_COLS;
 
-pub(crate) const GENERAL_INPUT_0: Range<usize> = START_SHARED_COLS..START_SHARED_COLS + N_LIMBS;
-pub(crate) const GENERAL_INPUT_1: Range<usize> = GENERAL_INPUT_0.end..GENERAL_INPUT_0.end + N_LIMBS;
-const GENERAL_INPUT_2: Range<usize> = GENERAL_INPUT_1.end..GENERAL_INPUT_1.end + N_LIMBS;
-const GENERAL_INPUT_3: Range<usize> = GENERAL_INPUT_2.end..GENERAL_INPUT_2.end + N_LIMBS;
+pub(crate) const GENERAL_REGISTER_0: Range<usize> = START_SHARED_COLS..START_SHARED_COLS + N_LIMBS;
+pub(crate) const GENERAL_REGISTER_1: Range<usize> =
+    GENERAL_REGISTER_0.end..GENERAL_REGISTER_0.end + N_LIMBS;
+pub(crate) const GENERAL_REGISTER_2: Range<usize> =
+    GENERAL_REGISTER_1.end..GENERAL_REGISTER_1.end + N_LIMBS;
+const GENERAL_REGISTER_3: Range<usize> = GENERAL_REGISTER_2.end..GENERAL_REGISTER_2.end + N_LIMBS;
+// NB: Uses first slot of the GENERAL_REGISTER_3 register.
+pub(crate) const GENERAL_REGISTER_BIT: usize = GENERAL_REGISTER_3.start;
+
 // NB: Only one of these two sets of columns will be used for a given operation
-const GENERAL_INPUT_4: Range<usize> = GENERAL_INPUT_3.end..GENERAL_INPUT_3.end + N_LIMBS;
-const GENERAL_INPUT_4_DBL: Range<usize> = GENERAL_INPUT_3.end..GENERAL_INPUT_3.end + 2 * N_LIMBS;
+const GENERAL_REGISTER_4: Range<usize> = GENERAL_REGISTER_3.end..GENERAL_REGISTER_3.end + N_LIMBS;
+const GENERAL_REGISTER_4_DBL: Range<usize> =
+    GENERAL_REGISTER_3.end..GENERAL_REGISTER_3.end + 2 * N_LIMBS;
 
 // The auxiliary input columns overlap the general input columns
 // because they correspond to the values in the second row for modular
 // operations.
-const AUX_INPUT_0: Range<usize> = START_SHARED_COLS..START_SHARED_COLS + N_LIMBS;
-const AUX_INPUT_1: Range<usize> = AUX_INPUT_0.end..AUX_INPUT_0.end + 2 * N_LIMBS;
+const AUX_REGISTER_0: Range<usize> = START_SHARED_COLS..START_SHARED_COLS + N_LIMBS;
+const AUX_REGISTER_1: Range<usize> = AUX_REGISTER_0.end..AUX_REGISTER_0.end + 2 * N_LIMBS;
 // These auxiliary input columns are awkwardly split across two rows,
 // with the first half after the general input columns and the second
 // half after the auxiliary input columns.
-const AUX_INPUT_2: Range<usize> = AUX_INPUT_1.end..AUX_INPUT_1.end + 2 * N_LIMBS - 1;
+const AUX_REGISTER_2: Range<usize> = AUX_REGISTER_1.end..AUX_REGISTER_1.end + 2 * N_LIMBS - 1;
 
-// Each element c of {MUL,MODULAR}_AUX_INPUT is -2^20 <= c <= 2^20;
+// Each element c of {MUL,MODULAR}_AUX_REGISTER is -2^20 <= c <= 2^20;
 // this value is used as an offset so that everything is positive in
 // the range checks.
 pub(crate) const AUX_COEFF_ABS_MAX: i64 = 1 << 20;
 
-// ADD takes 3 * N_LIMBS = 48 columns
-pub(crate) const ADD_INPUT_0: Range<usize> = GENERAL_INPUT_0;
-pub(crate) const ADD_INPUT_1: Range<usize> = GENERAL_INPUT_1;
-pub(crate) const ADD_OUTPUT: Range<usize> = GENERAL_INPUT_2;
-
-// SUB takes 3 * N_LIMBS = 48 columns
-pub(crate) const SUB_INPUT_0: Range<usize> = GENERAL_INPUT_0;
-pub(crate) const SUB_INPUT_1: Range<usize> = GENERAL_INPUT_1;
-pub(crate) const SUB_OUTPUT: Range<usize> = GENERAL_INPUT_2;
-
 // MUL takes 5 * N_LIMBS = 80 columns
-pub(crate) const MUL_INPUT_0: Range<usize> = GENERAL_INPUT_0;
-pub(crate) const MUL_INPUT_1: Range<usize> = GENERAL_INPUT_1;
-pub(crate) const MUL_OUTPUT: Range<usize> = GENERAL_INPUT_2;
-pub(crate) const MUL_AUX_INPUT_LO: Range<usize> = GENERAL_INPUT_3;
-pub(crate) const MUL_AUX_INPUT_HI: Range<usize> = GENERAL_INPUT_4;
-
-// LT and GT take 4 * N_LIMBS = 64 columns
-pub(crate) const CMP_INPUT_0: Range<usize> = GENERAL_INPUT_0;
-pub(crate) const CMP_INPUT_1: Range<usize> = GENERAL_INPUT_1;
-pub(crate) const CMP_OUTPUT: usize = GENERAL_INPUT_2.start;
-pub(crate) const CMP_AUX_INPUT: Range<usize> = GENERAL_INPUT_3;
+pub(crate) const MUL_INPUT_0: Range<usize> = GENERAL_REGISTER_0;
+pub(crate) const MUL_INPUT_1: Range<usize> = GENERAL_REGISTER_1;
+pub(crate) const MUL_OUTPUT: Range<usize> = GENERAL_REGISTER_2;
+pub(crate) const MUL_AUX_INPUT_LO: Range<usize> = GENERAL_REGISTER_3;
+pub(crate) const MUL_AUX_INPUT_HI: Range<usize> = GENERAL_REGISTER_4;
 
 // MULMOD takes 4 * N_LIMBS + 3 * 2*N_LIMBS + N_LIMBS = 176 columns
 // but split over two rows of 96 columns and 80 columns.
@@ -98,18 +88,18 @@ pub(crate) const CMP_AUX_INPUT: Range<usize> = GENERAL_INPUT_3;
 // ADDMOD, SUBMOD, MOD and DIV are currently implemented in terms of
 // the general modular code, so they also take 144 columns (also split
 // over two rows).
-pub(crate) const MODULAR_INPUT_0: Range<usize> = GENERAL_INPUT_0;
-pub(crate) const MODULAR_INPUT_1: Range<usize> = GENERAL_INPUT_1;
-pub(crate) const MODULAR_MODULUS: Range<usize> = GENERAL_INPUT_2;
-pub(crate) const MODULAR_OUTPUT: Range<usize> = GENERAL_INPUT_3;
-pub(crate) const MODULAR_QUO_INPUT: Range<usize> = GENERAL_INPUT_4_DBL;
-pub(crate) const MODULAR_OUT_AUX_RED: Range<usize> = AUX_INPUT_0;
+pub(crate) const MODULAR_INPUT_0: Range<usize> = GENERAL_REGISTER_0;
+pub(crate) const MODULAR_INPUT_1: Range<usize> = GENERAL_REGISTER_1;
+pub(crate) const MODULAR_MODULUS: Range<usize> = GENERAL_REGISTER_2;
+pub(crate) const MODULAR_OUTPUT: Range<usize> = GENERAL_REGISTER_3;
+pub(crate) const MODULAR_QUO_INPUT: Range<usize> = GENERAL_REGISTER_4_DBL;
+pub(crate) const MODULAR_OUT_AUX_RED: Range<usize> = AUX_REGISTER_0;
 // NB: Last value is not used in AUX, it is used in MOD_IS_ZERO
-pub(crate) const MODULAR_MOD_IS_ZERO: usize = AUX_INPUT_1.start;
-pub(crate) const MODULAR_AUX_INPUT_LO: Range<usize> = AUX_INPUT_1.start + 1..AUX_INPUT_1.end;
-pub(crate) const MODULAR_AUX_INPUT_HI: Range<usize> = AUX_INPUT_2;
+pub(crate) const MODULAR_MOD_IS_ZERO: usize = AUX_REGISTER_1.start;
+pub(crate) const MODULAR_AUX_INPUT_LO: Range<usize> = AUX_REGISTER_1.start + 1..AUX_REGISTER_1.end;
+pub(crate) const MODULAR_AUX_INPUT_HI: Range<usize> = AUX_REGISTER_2;
 // Must be set to MOD_IS_ZERO for DIV operation i.e. MOD_IS_ZERO * lv[IS_DIV]
-pub(crate) const MODULAR_DIV_DENOM_IS_ZERO: usize = AUX_INPUT_2.end;
+pub(crate) const MODULAR_DIV_DENOM_IS_ZERO: usize = AUX_REGISTER_2.end;
 
 pub(crate) const DIV_NUMERATOR: Range<usize> = MODULAR_INPUT_0;
 pub(crate) const DIV_DENOMINATOR: Range<usize> = MODULAR_MODULUS;
diff --git a/evm/src/arithmetic/compare.rs b/evm/src/arithmetic/compare.rs
deleted file mode 100644
index 318b197f..00000000
--- a/evm/src/arithmetic/compare.rs
+++ /dev/null
@@ -1,264 +0,0 @@
-//! Support for EVM LT and GT instructions
-//!
-//! This crate verifies EVM LT and GT instructions (i.e. for unsigned
-//! inputs). The difference between LT and GT is of course just a
-//! matter of the order of the inputs. The verification is essentially
-//! identical to the SUB instruction: For both SUB and LT we have values
-//!
-//!   - `input0`
-//!   - `input1`
-//!   - `difference` (mod 2^256)
-//!   - `borrow` (= 0 or 1)
-//!
-//! satisfying `input0 - input1 = difference + borrow * 2^256`. Where
-//! SUB verifies `difference` and ignores `borrow`, LT verifies
-//! `borrow` (and uses `difference` as an auxiliary input).
-
-use plonky2::field::extension::Extendable;
-use plonky2::field::packed::PackedField;
-use plonky2::hash::hash_types::RichField;
-use plonky2::iop::ext_target::ExtensionTarget;
-
-use crate::arithmetic::add::{eval_ext_circuit_are_equal, eval_packed_generic_are_equal};
-use crate::arithmetic::columns::*;
-use crate::arithmetic::sub::u256_sub_br;
-use crate::arithmetic::utils::read_value_u64_limbs;
-use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
-
-pub(crate) fn generate<F: RichField>(lv: &mut [F], filter: usize) {
-    let input0 = read_value_u64_limbs(lv, CMP_INPUT_0);
-    let input1 = read_value_u64_limbs(lv, CMP_INPUT_1);
-
-    let (diff, br) = match filter {
-        // input0 - input1 == diff + br*2^256
-        IS_LT => u256_sub_br(input0, input1),
-        // input1 - input0 == diff + br*2^256
-        IS_GT => u256_sub_br(input1, input0),
-        _ => panic!("op code not a comparison"),
-    };
-
-    lv[CMP_AUX_INPUT].copy_from_slice(&diff.map(|c| F::from_canonical_u64(c)));
-    lv[CMP_OUTPUT] = F::from_canonical_u64(br);
-}
-
-fn eval_packed_generic_check_is_one_bit<P: PackedField>(
-    yield_constr: &mut ConstraintConsumer<P>,
-    filter: P,
-    x: P,
-) {
-    yield_constr.constraint(filter * x * (x - P::ONES));
-}
-
-pub(crate) fn eval_packed_generic_lt<P: PackedField>(
-    yield_constr: &mut ConstraintConsumer<P>,
-    is_op: P,
-    input0: &[P],
-    input1: &[P],
-    aux: &[P],
-    output: P,
-    is_two_row_op: bool,
-) {
-    debug_assert!(input0.len() == N_LIMBS && input1.len() == N_LIMBS && aux.len() == N_LIMBS);
-
-    // Verify (input0 < input1) == output by providing aux such that
-    // input0 - input1 == aux + output*2^256.
-    let lhs_limbs = input0.iter().zip(input1).map(|(&a, &b)| a - b);
-    let cy = eval_packed_generic_are_equal(
-        yield_constr,
-        is_op,
-        aux.iter().copied(),
-        lhs_limbs,
-        is_two_row_op,
-    );
-    // We don't need to check that cy is 0 or 1, since output has
-    // already been checked to be 0 or 1.
-    if is_two_row_op {
-        yield_constr.constraint_transition(is_op * (cy - output));
-    } else {
-        yield_constr.constraint(is_op * (cy - output));
-    }
-}
-
-pub fn eval_packed_generic<P: PackedField>(
-    lv: &[P; NUM_ARITH_COLUMNS],
-    yield_constr: &mut ConstraintConsumer<P>,
-) {
-    let is_lt = lv[IS_LT];
-    let is_gt = lv[IS_GT];
-
-    let input0 = &lv[CMP_INPUT_0];
-    let input1 = &lv[CMP_INPUT_1];
-    let aux = &lv[CMP_AUX_INPUT];
-    let output = lv[CMP_OUTPUT];
-
-    let is_cmp = is_lt + is_gt;
-    eval_packed_generic_check_is_one_bit(yield_constr, is_cmp, output);
-
-    eval_packed_generic_lt(yield_constr, is_lt, input0, input1, aux, output, false);
-    eval_packed_generic_lt(yield_constr, is_gt, input1, input0, aux, output, false);
-}
-
-fn eval_ext_circuit_check_is_one_bit<F: RichField + Extendable<D>, const D: usize>(
-    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
-    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
-    filter: ExtensionTarget<D>,
-    x: ExtensionTarget<D>,
-) {
-    let constr = builder.mul_sub_extension(x, x, x);
-    let filtered_constr = builder.mul_extension(filter, constr);
-    yield_constr.constraint(builder, filtered_constr);
-}
-
-#[allow(clippy::needless_collect)]
-pub(crate) fn eval_ext_circuit_lt<F: RichField + Extendable<D>, const D: usize>(
-    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
-    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
-    is_op: ExtensionTarget<D>,
-    input0: &[ExtensionTarget<D>],
-    input1: &[ExtensionTarget<D>],
-    aux: &[ExtensionTarget<D>],
-    output: ExtensionTarget<D>,
-    is_two_row_op: bool,
-) {
-    debug_assert!(input0.len() == N_LIMBS && input1.len() == N_LIMBS && aux.len() == N_LIMBS);
-
-    // Since `map` is lazy and the closure passed to it borrows
-    // `builder`, we can't then borrow builder again below in the call
-    // to `eval_ext_circuit_are_equal`. The solution is to force
-    // evaluation with `collect`.
-    let lhs_limbs = input0
-        .iter()
-        .zip(input1)
-        .map(|(&a, &b)| builder.sub_extension(a, b))
-        .collect::<Vec<ExtensionTarget<D>>>();
-
-    let cy = eval_ext_circuit_are_equal(
-        builder,
-        yield_constr,
-        is_op,
-        aux.iter().copied(),
-        lhs_limbs.into_iter(),
-        is_two_row_op,
-    );
-    let good_output = builder.sub_extension(cy, output);
-    let filter = builder.mul_extension(is_op, good_output);
-    if is_two_row_op {
-        yield_constr.constraint_transition(builder, filter);
-    } else {
-        yield_constr.constraint(builder, filter);
-    }
-}
-
-pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
-    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
-    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
-    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
-) {
-    let is_lt = lv[IS_LT];
-    let is_gt = lv[IS_GT];
-
-    let input0 = &lv[CMP_INPUT_0];
-    let input1 = &lv[CMP_INPUT_1];
-    let aux = &lv[CMP_AUX_INPUT];
-    let output = lv[CMP_OUTPUT];
-
-    let is_cmp = builder.add_extension(is_lt, is_gt);
-    eval_ext_circuit_check_is_one_bit(builder, yield_constr, is_cmp, output);
-
-    eval_ext_circuit_lt(
-        builder,
-        yield_constr,
-        is_lt,
-        input0,
-        input1,
-        aux,
-        output,
-        false,
-    );
-    eval_ext_circuit_lt(
-        builder,
-        yield_constr,
-        is_gt,
-        input1,
-        input0,
-        aux,
-        output,
-        false,
-    );
-}
-
-#[cfg(test)]
-mod tests {
-    use plonky2::field::goldilocks_field::GoldilocksField;
-    use plonky2::field::types::{Field, Sample};
-    use rand::{Rng, SeedableRng};
-    use rand_chacha::ChaCha8Rng;
-
-    use super::*;
-    use crate::arithmetic::columns::NUM_ARITH_COLUMNS;
-    use crate::constraint_consumer::ConstraintConsumer;
-
-    // TODO: Should be able to refactor this test to apply to all operations.
-    #[test]
-    fn generate_eval_consistency_not_compare() {
-        type F = GoldilocksField;
-
-        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
-        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
-
-        // if `IS_LT == 0`, then the constraints should be met even if
-        // all values are garbage. `eval_packed_generic` handles IS_GT
-        // at the same time, so we check both at once.
-        lv[IS_LT] = F::ZERO;
-        lv[IS_GT] = F::ZERO;
-
-        let mut constrant_consumer = ConstraintConsumer::new(
-            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
-            F::ONE,
-            F::ONE,
-            F::ONE,
-        );
-        eval_packed_generic(&lv, &mut constrant_consumer);
-        for &acc in &constrant_consumer.constraint_accs {
-            assert_eq!(acc, F::ZERO);
-        }
-    }
-
-    #[test]
-    fn generate_eval_consistency_compare() {
-        type F = GoldilocksField;
-
-        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
-        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
-        const N_ITERS: usize = 1000;
-
-        for _ in 0..N_ITERS {
-            for (op, other_op) in [(IS_LT, IS_GT), (IS_GT, IS_LT)] {
-                // set op == 1 and ensure all constraints are satisfied.
-                // we have to explicitly set the other op to zero since both
-                // are treated by the call.
-                lv[op] = F::ONE;
-                lv[other_op] = F::ZERO;
-
-                // set inputs to random values
-                for (ai, bi) in CMP_INPUT_0.zip(CMP_INPUT_1) {
-                    lv[ai] = F::from_canonical_u16(rng.gen());
-                    lv[bi] = F::from_canonical_u16(rng.gen());
-                }
-
-                generate(&mut lv, op);
-
-                let mut constrant_consumer = ConstraintConsumer::new(
-                    vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
-                    F::ONE,
-                    F::ONE,
-                    F::ONE,
-                );
-                eval_packed_generic(&lv, &mut constrant_consumer);
-                for &acc in &constrant_consumer.constraint_accs {
-                    assert_eq!(acc, F::ZERO);
-                }
-            }
-        }
-    }
-}
diff --git a/evm/src/arithmetic/mod.rs b/evm/src/arithmetic/mod.rs
index ee6399da..d214fa3b 100644
--- a/evm/src/arithmetic/mod.rs
+++ b/evm/src/arithmetic/mod.rs
@@ -4,11 +4,9 @@ use ethereum_types::U256;
 
 use crate::util::{addmod, mulmod, submod};
 
-mod add;
-mod compare;
+mod addcc;
 mod modular;
 mod mul;
-mod sub;
 mod utils;
 
 pub mod arithmetic_stark;
diff --git a/evm/src/arithmetic/modular.rs b/evm/src/arithmetic/modular.rs
index 73e8acb6..799a4ad4 100644
--- a/evm/src/arithmetic/modular.rs
+++ b/evm/src/arithmetic/modular.rs
@@ -118,8 +118,8 @@ use plonky2::iop::ext_target::ExtensionTarget;
 use plonky2::plonk::circuit_builder::CircuitBuilder;
 
 use super::columns;
+use crate::arithmetic::addcc::{eval_ext_circuit_add_cc, eval_packed_generic_add_cc};
 use crate::arithmetic::columns::*;
-use crate::arithmetic::compare::{eval_ext_circuit_lt, eval_packed_generic_lt};
 use crate::arithmetic::utils::*;
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
 
@@ -213,18 +213,12 @@ fn generate_modular_op<F: RichField>(
     let mut constr_poly = [0i64; 2 * N_LIMBS];
     constr_poly[..2 * N_LIMBS - 1].copy_from_slice(&operation(input0_limbs, input1_limbs));
 
-    // two_exp_256 == 2^256
-    let two_exp_256 = {
-        let mut t = BigInt::zero();
-        t.set_bit(256, true);
-        t
-    };
-
     let mut mod_is_zero = F::ZERO;
     if modulus.is_zero() {
         if filter == columns::IS_DIV {
-            // set modulus = 2^256
-            modulus = two_exp_256.clone();
+            // set modulus = 2^256; the condition above means we know
+            // it's zero at this point, so we can just set bit 256.
+            modulus.set_bit(256, true);
             // modulus_limbs don't play a role below
         } else {
             // set modulus = 1
@@ -248,8 +242,8 @@ fn generate_modular_op<F: RichField>(
     let quot = (&input - &output) / &modulus; // exact division; can be -ve
     let quot_limbs = bigint_to_columns::<{ 2 * N_LIMBS }>(&quot);
 
-    // output < modulus here, so the proof requires (output - modulus) % 2^256:
-    let out_aux_red = bigint_to_columns::<N_LIMBS>(&(two_exp_256 + output - modulus));
+    // output < modulus here, so the proof requires (modulus - output).
+    let out_aux_red = bigint_to_columns::<N_LIMBS>(&(modulus - output));
 
     // constr_poly is the array of coefficients of the polynomial
     //
@@ -338,29 +332,30 @@ fn modular_constr_poly<P: PackedField>(
     yield_constr.constraint_transition(filter * (mod_is_zero * lv[IS_DIV] - div_denom_is_zero));
 
     // Needed to compensate for adding mod_is_zero to modulus above,
-    // since the call eval_packed_generic_lt() below subtracts modulus
+    // since the call eval_packed_generic_add_cc() below subtracts modulus
     // to verify in the case of a DIV.
     output[0] += div_denom_is_zero;
 
     // Verify that the output is reduced, i.e. output < modulus.
     let out_aux_red = &nv[MODULAR_OUT_AUX_RED];
-    // This sets is_less_than to 1 unless we get mod_is_zero when
-    // doing a DIV; in that case, we need is_less_than=0, since the
-    // function checks
+    // This sets is_greater_than to 0 unless we get mod_is_zero when
+    // doing a DIV; in that case, we need is_greater_than=1, since
+    // eval_packed_generic_add_cc checks
     //
-    //   output - modulus == out_aux_red + is_less_than*2^256
+    //   output + out_aux_red == modulus + is_greater_than*2^256
     //
     // and we were given output = out_aux_red
-    let is_less_than = P::ONES - mod_is_zero * lv[IS_DIV];
-    // NB: output and modulus in lv while out_aux_red and is_less_than
-    // (via mod_is_zero) depend on nv.
-    eval_packed_generic_lt(
+    let is_greater_than = mod_is_zero * lv[IS_DIV];
+    // NB: output and modulus in lv while out_aux_red and
+    // is_greater_than (via mod_is_zero) depend on nv, hence the
+    // 'is_two_row_op' argument is set to 'true'.
+    eval_packed_generic_add_cc(
         yield_constr,
         filter,
         &output,
-        &modulus,
         out_aux_red,
-        is_less_than,
+        &modulus,
+        is_greater_than,
         true,
     );
     // restore output[0]
@@ -488,18 +483,16 @@ fn modular_constr_poly_ext_circuit<F: RichField + Extendable<D>, const D: usize>
     output[0] = builder.add_extension(output[0], div_denom_is_zero);
 
     let out_aux_red = &nv[MODULAR_OUT_AUX_RED];
-    let one = builder.one_extension();
-    let is_less_than =
-        builder.arithmetic_extension(F::NEG_ONE, F::ONE, mod_is_zero, lv[IS_DIV], one);
+    let is_greater_than = builder.mul_extension(mod_is_zero, lv[IS_DIV]);
 
-    eval_ext_circuit_lt(
+    eval_ext_circuit_add_cc(
         builder,
         yield_constr,
         filter,
         &output,
-        &modulus,
         out_aux_red,
-        is_less_than,
+        &modulus,
+        is_greater_than,
         true,
     );
     output[0] = builder.sub_extension(output[0], div_denom_is_zero);
diff --git a/evm/src/arithmetic/mul.rs b/evm/src/arithmetic/mul.rs
index ea93990e..f12b407a 100644
--- a/evm/src/arithmetic/mul.rs
+++ b/evm/src/arithmetic/mul.rs
@@ -60,6 +60,7 @@ use plonky2::field::packed::PackedField;
 use plonky2::field::types::Field;
 use plonky2::hash::hash_types::RichField;
 use plonky2::iop::ext_target::ExtensionTarget;
+use plonky2::plonk::circuit_builder::CircuitBuilder;
 
 use crate::arithmetic::columns::*;
 use crate::arithmetic::utils::*;
@@ -161,7 +162,7 @@ pub fn eval_packed_generic<P: PackedField>(
 }
 
 pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
-    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
+    builder: &mut CircuitBuilder<F, D>,
     lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
diff --git a/evm/src/arithmetic/operations.rs b/evm/src/arithmetic/operations.rs
index 8c89a45a..f8aed566 100644
--- a/evm/src/arithmetic/operations.rs
+++ b/evm/src/arithmetic/operations.rs
@@ -3,7 +3,7 @@ use plonky2::hash::hash_types::RichField;
 use static_assertions::const_assert;
 
 use crate::arithmetic::columns::*;
-use crate::arithmetic::{add, compare, modular, mul, sub};
+use crate::arithmetic::{addcc, modular, mul};
 
 #[inline]
 fn u64_to_array<F: RichField>(out: &mut [F], x: u64) {
@@ -64,30 +64,26 @@ impl<F: RichField> Operation<F> for SimpleBinaryOp {
         let mut row = vec![F::ZERO; NUM_ARITH_COLUMNS];
         row[self.op_filter] = F::ONE;
 
-        // Each of these operations uses the same columns for input; the
-        // asserts ensure no-one changes this.
-        // TODO: This is ugly; should just remove the other
-        // *_INPUT_[01] variables and remove this.
-        debug_assert!([ADD_INPUT_0, SUB_INPUT_0, MUL_INPUT_0, CMP_INPUT_0,]
-            .iter()
-            .all(|x| *x == GENERAL_INPUT_0));
-        debug_assert!([ADD_INPUT_1, SUB_INPUT_1, MUL_INPUT_1, CMP_INPUT_1,]
-            .iter()
-            .all(|x| *x == GENERAL_INPUT_1));
-
-        u256_to_array(&mut row[GENERAL_INPUT_0], self.input0);
-        u256_to_array(&mut row[GENERAL_INPUT_1], self.input1);
-
-        // This is ugly, but it avoids the huge amount of boilerplate
-        // required to dispatch directly to each add/sub/etc. operation.
-        match self.op_filter {
-            IS_ADD => add::generate(&mut row),
-            IS_SUB => sub::generate(&mut row),
-            IS_MUL => mul::generate(&mut row),
-            IS_LT | IS_GT => compare::generate(&mut row, self.op_filter),
-            _ => panic!("unrecognised operation"),
+        if self.op_filter == IS_SUB || self.op_filter == IS_GT {
+            u256_to_array(&mut row[GENERAL_REGISTER_2], self.input0);
+            u256_to_array(&mut row[GENERAL_REGISTER_0], self.input1);
+        } else if self.op_filter == IS_LT {
+            u256_to_array(&mut row[GENERAL_REGISTER_0], self.input0);
+            u256_to_array(&mut row[GENERAL_REGISTER_2], self.input1);
+        } else {
+            assert!(
+                self.op_filter == IS_ADD || self.op_filter == IS_MUL,
+                "unrecognised operation"
+            );
+            u256_to_array(&mut row[GENERAL_REGISTER_0], self.input0);
+            u256_to_array(&mut row[GENERAL_REGISTER_1], self.input1);
         }
 
+        if self.op_filter == IS_MUL {
+            mul::generate(&mut row);
+        } else {
+            addcc::generate(&mut row, self.op_filter);
+        }
         (row, None)
     }
 }
diff --git a/evm/src/arithmetic/sub.rs b/evm/src/arithmetic/sub.rs
deleted file mode 100644
index e94e2d5c..00000000
--- a/evm/src/arithmetic/sub.rs
+++ /dev/null
@@ -1,158 +0,0 @@
-use itertools::izip;
-use plonky2::field::extension::Extendable;
-use plonky2::field::packed::PackedField;
-use plonky2::hash::hash_types::RichField;
-use plonky2::iop::ext_target::ExtensionTarget;
-
-use crate::arithmetic::add::{eval_ext_circuit_are_equal, eval_packed_generic_are_equal};
-use crate::arithmetic::columns::*;
-use crate::arithmetic::utils::read_value_u64_limbs;
-use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
-
-pub(crate) fn u256_sub_br(input0: [u64; N_LIMBS], input1: [u64; N_LIMBS]) -> ([u64; N_LIMBS], u64) {
-    const LIMB_BOUNDARY: u64 = 1 << LIMB_BITS;
-    const MASK: u64 = LIMB_BOUNDARY - 1u64;
-
-    let mut output = [0u64; N_LIMBS];
-    let mut br = 0u64;
-    for (i, a, b) in izip!(0.., input0, input1) {
-        let d = LIMB_BOUNDARY + a - b - br;
-        // if a < b, then d < 2^16 so br = 1
-        // if a >= b, then d >= 2^16 so br = 0
-        br = 1u64 - (d >> LIMB_BITS);
-        assert!(br <= 1u64, "input limbs were larger than 16 bits");
-        output[i] = d & MASK;
-    }
-
-    (output, br)
-}
-
-pub fn generate<F: RichField>(lv: &mut [F]) {
-    let input0 = read_value_u64_limbs(lv, SUB_INPUT_0);
-    let input1 = read_value_u64_limbs(lv, SUB_INPUT_1);
-
-    let (output_limbs, _) = u256_sub_br(input0, input1);
-
-    lv[SUB_OUTPUT].copy_from_slice(&output_limbs.map(|c| F::from_canonical_u64(c)));
-}
-
-pub fn eval_packed_generic<P: PackedField>(
-    lv: &[P; NUM_ARITH_COLUMNS],
-    yield_constr: &mut ConstraintConsumer<P>,
-) {
-    let is_sub = lv[IS_SUB];
-    let input0_limbs = &lv[SUB_INPUT_0];
-    let input1_limbs = &lv[SUB_INPUT_1];
-    let output_limbs = &lv[SUB_OUTPUT];
-
-    let output_computed = input0_limbs.iter().zip(input1_limbs).map(|(&a, &b)| a - b);
-
-    eval_packed_generic_are_equal(
-        yield_constr,
-        is_sub,
-        output_limbs.iter().copied(),
-        output_computed,
-        false,
-    );
-}
-
-#[allow(clippy::needless_collect)]
-pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
-    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
-    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
-    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
-) {
-    let is_sub = lv[IS_SUB];
-    let input0_limbs = &lv[SUB_INPUT_0];
-    let input1_limbs = &lv[SUB_INPUT_1];
-    let output_limbs = &lv[SUB_OUTPUT];
-
-    // Since `map` is lazy and the closure passed to it borrows
-    // `builder`, we can't then borrow builder again below in the call
-    // to `eval_ext_circuit_are_equal`. The solution is to force
-    // evaluation with `collect`.
-    let output_computed = input0_limbs
-        .iter()
-        .zip(input1_limbs)
-        .map(|(&a, &b)| builder.sub_extension(a, b))
-        .collect::<Vec<ExtensionTarget<D>>>();
-
-    eval_ext_circuit_are_equal(
-        builder,
-        yield_constr,
-        is_sub,
-        output_limbs.iter().copied(),
-        output_computed.into_iter(),
-        false,
-    );
-}
-
-#[cfg(test)]
-mod tests {
-    use plonky2::field::goldilocks_field::GoldilocksField;
-    use plonky2::field::types::{Field, Sample};
-    use rand::{Rng, SeedableRng};
-    use rand_chacha::ChaCha8Rng;
-
-    use super::*;
-    use crate::arithmetic::columns::NUM_ARITH_COLUMNS;
-    use crate::constraint_consumer::ConstraintConsumer;
-
-    const N_RND_TESTS: usize = 1000;
-
-    // TODO: Should be able to refactor this test to apply to all operations.
-    #[test]
-    fn generate_eval_consistency_not_sub() {
-        type F = GoldilocksField;
-
-        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
-        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
-
-        // if `IS_SUB == 0`, then the constraints should be met even
-        // if all values are garbage.
-        lv[IS_SUB] = F::ZERO;
-
-        let mut constraint_consumer = ConstraintConsumer::new(
-            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
-            GoldilocksField::ONE,
-            GoldilocksField::ONE,
-            GoldilocksField::ONE,
-        );
-        eval_packed_generic(&lv, &mut constraint_consumer);
-        for &acc in &constraint_consumer.constraint_accs {
-            assert_eq!(acc, GoldilocksField::ZERO);
-        }
-    }
-
-    #[test]
-    fn generate_eval_consistency_sub() {
-        type F = GoldilocksField;
-
-        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
-        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
-
-        // set `IS_SUB == 1` and ensure all constraints are satisfied.
-        lv[IS_SUB] = F::ONE;
-
-        for _ in 0..N_RND_TESTS {
-            // set inputs to random values
-            for (ai, bi) in SUB_INPUT_0.zip(SUB_INPUT_1) {
-                lv[ai] = F::from_canonical_u16(rng.gen());
-                lv[bi] = F::from_canonical_u16(rng.gen());
-            }
-
-            generate(&mut lv);
-
-            let mut constraint_consumer = ConstraintConsumer::new(
-                vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
-                GoldilocksField::ONE,
-                GoldilocksField::ONE,
-                GoldilocksField::ONE,
-            );
-            eval_packed_generic(&lv, &mut constraint_consumer);
-            for &acc in &constraint_consumer.constraint_accs {
-                assert_eq!(acc, GoldilocksField::ZERO);
-            }
-        }
-    }
-}