diff --git a/evm/src/arithmetic/arithmetic_stark.rs b/evm/src/arithmetic/arithmetic_stark.rs
index 4695798a..5441cf27 100644
--- a/evm/src/arithmetic/arithmetic_stark.rs
+++ b/evm/src/arithmetic/arithmetic_stark.rs
@@ -27,10 +27,17 @@ use crate::vars::{StarkEvaluationTargets, StarkEvaluationVars};
 /// This is done by taking pairs of columns (x, y) of the arithmetic
 /// table and combining them as x + y*2^16 to ensure they equal the
 /// corresponding 32-bit number in the CPU table.
-fn cpu_arith_data_link<F: Field>(ops: &[usize], regs: &[Range<usize>]) -> Vec<Column<F>> {
+fn cpu_arith_data_link<F: Field>(
+    combined_ops: &[(usize, u8)],
+    regs: &[Range<usize>],
+) -> Vec<Column<F>> {
     let limb_base = F::from_canonical_u64(1 << columns::LIMB_BITS);
 
-    let mut res = Column::singles(ops).collect_vec();
+    let mut res = vec![Column::linear_combination(
+        combined_ops
+            .iter()
+            .map(|&(col, code)| (col, F::from_canonical_u8(code))),
+    )];
 
     // The inner for loop below assumes N_LIMBS is even.
     const_assert!(columns::N_LIMBS % 2 == 0);
@@ -49,21 +56,27 @@ fn cpu_arith_data_link<F: Field>(ops: &[usize], regs: &[Range<usize>]) -> Vec<Co
 }
 
 pub fn ctl_arithmetic_rows<F: Field>() -> TableWithColumns<F> {
-    const ARITH_OPS: [usize; 14] = [
-        columns::IS_ADD,
-        columns::IS_SUB,
-        columns::IS_MUL,
-        columns::IS_LT,
-        columns::IS_GT,
-        columns::IS_ADDFP254,
-        columns::IS_MULFP254,
-        columns::IS_SUBFP254,
-        columns::IS_ADDMOD,
-        columns::IS_MULMOD,
-        columns::IS_SUBMOD,
-        columns::IS_DIV,
-        columns::IS_MOD,
-        columns::IS_BYTE,
+    // We scale each filter flag with the associated opcode value.
+    // If an arithmetic operation is happening on the CPU side,
+    // the CTL will enforce that the reconstructed opcode value
+    // from the opcode bits matches.
+    const COMBINED_OPS: [(usize, u8); 16] = [
+        (columns::IS_ADD, 0x01),
+        (columns::IS_MUL, 0x02),
+        (columns::IS_SUB, 0x03),
+        (columns::IS_DIV, 0x04),
+        (columns::IS_MOD, 0x06),
+        (columns::IS_ADDMOD, 0x08),
+        (columns::IS_MULMOD, 0x09),
+        (columns::IS_ADDFP254, 0x0c),
+        (columns::IS_MULFP254, 0x0d),
+        (columns::IS_SUBFP254, 0x0e),
+        (columns::IS_SUBMOD, 0x0f),
+        (columns::IS_LT, 0x10),
+        (columns::IS_GT, 0x11),
+        (columns::IS_BYTE, 0x1a),
+        (columns::IS_SHL, 0x1b),
+        (columns::IS_SHR, 0x1c),
     ];
 
     const REGISTER_MAP: [Range<usize>; 4] = [
@@ -73,6 +86,8 @@ pub fn ctl_arithmetic_rows<F: Field>() -> TableWithColumns<F> {
         columns::OUTPUT_REGISTER,
     ];
 
+    let filter_column = Some(Column::sum(COMBINED_OPS.iter().map(|(c, _v)| *c)));
+
     // Create the Arithmetic Table whose columns are those of the
     // operations listed in `ops` whose inputs and outputs are given
     // by `regs`, where each element of `regs` is a range of columns
@@ -80,8 +95,8 @@ pub fn ctl_arithmetic_rows<F: Field>() -> TableWithColumns<F> {
     // is used as the operation filter).
     TableWithColumns::new(
         Table::Arithmetic,
-        cpu_arith_data_link(&ARITH_OPS, &REGISTER_MAP),
-        Some(Column::sum(ARITH_OPS)),
+        cpu_arith_data_link(&COMBINED_OPS, &REGISTER_MAP),
+        filter_column,
     )
 }
 
diff --git a/evm/src/arithmetic/columns.rs b/evm/src/arithmetic/columns.rs
index afdd5832..48e00f8e 100644
--- a/evm/src/arithmetic/columns.rs
+++ b/evm/src/arithmetic/columns.rs
@@ -36,8 +36,10 @@ pub(crate) const IS_SUBMOD: usize = IS_SUBFP254 + 1;
 pub(crate) const IS_LT: usize = IS_SUBMOD + 1;
 pub(crate) const IS_GT: usize = IS_LT + 1;
 pub(crate) const IS_BYTE: usize = IS_GT + 1;
+pub(crate) const IS_SHL: usize = IS_BYTE + 1;
+pub(crate) const IS_SHR: usize = IS_SHL + 1;
 
-pub(crate) const START_SHARED_COLS: usize = IS_BYTE + 1;
+pub(crate) const START_SHARED_COLS: usize = IS_SHR + 1;
 
 /// Within the Arithmetic Unit, there are shared columns which can be
 /// used by any arithmetic circuit, depending on which one is active
diff --git a/evm/src/arithmetic/divmod.rs b/evm/src/arithmetic/divmod.rs
index 4f2dd748..258c131f 100644
--- a/evm/src/arithmetic/divmod.rs
+++ b/evm/src/arithmetic/divmod.rs
@@ -45,7 +45,7 @@ pub(crate) fn generate<F: PrimeField64>(
     }
 
     match filter {
-        IS_DIV => {
+        IS_DIV | IS_SHR => {
             debug_assert!(
                 lv[OUTPUT_REGISTER]
                     .iter()
@@ -104,11 +104,14 @@ pub(crate) fn eval_packed<P: PackedField>(
     nv: &[P; NUM_ARITH_COLUMNS],
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
+    // Constrain IS_SHR independently, so that it doesn't impact the
+    // constraints when combining the flag with IS_DIV.
+    yield_constr.constraint_last_row(lv[IS_SHR]);
     eval_packed_divmod_helper(
         lv,
         nv,
         yield_constr,
-        lv[IS_DIV],
+        lv[IS_DIV] + lv[IS_SHR],
         OUTPUT_REGISTER,
         AUX_INPUT_REGISTER_0,
     );
@@ -161,12 +164,14 @@ pub(crate) fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     nv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
+    yield_constr.constraint_last_row(builder, lv[IS_SHR]);
+    let div_shr_flag = builder.add_extension(lv[IS_DIV], lv[IS_SHR]);
     eval_ext_circuit_divmod_helper(
         builder,
         lv,
         nv,
         yield_constr,
-        lv[IS_DIV],
+        div_shr_flag,
         OUTPUT_REGISTER,
         AUX_INPUT_REGISTER_0,
     );
@@ -209,6 +214,8 @@ mod tests {
         for op in MODULAR_OPS {
             lv[op] = F::ZERO;
         }
+        // Deactivate the SHR flag so that a DIV operation is not triggered.
+        lv[IS_SHR] = F::ZERO;
 
         let mut constraint_consumer = ConstraintConsumer::new(
             vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
@@ -240,6 +247,7 @@ mod tests {
                 for op in MODULAR_OPS {
                     lv[op] = F::ZERO;
                 }
+                lv[IS_SHR] = F::ZERO;
                 lv[op_filter] = F::ONE;
 
                 let input0 = U256::from(rng.gen::<[u8; 32]>());
@@ -300,6 +308,7 @@ mod tests {
                 for op in MODULAR_OPS {
                     lv[op] = F::ZERO;
                 }
+                lv[IS_SHR] = F::ZERO;
                 lv[op_filter] = F::ONE;
 
                 let input0 = U256::from(rng.gen::<[u8; 32]>());
diff --git a/evm/src/arithmetic/mod.rs b/evm/src/arithmetic/mod.rs
index d9d63a0b..bd6d56e8 100644
--- a/evm/src/arithmetic/mod.rs
+++ b/evm/src/arithmetic/mod.rs
@@ -27,15 +27,17 @@ pub(crate) enum BinaryOperator {
     MulFp254,
     SubFp254,
     Byte,
+    Shl, // simulated with MUL
+    Shr, // simulated with DIV
 }
 
 impl BinaryOperator {
     pub(crate) fn result(&self, input0: U256, input1: U256) -> U256 {
         match self {
             BinaryOperator::Add => input0.overflowing_add(input1).0,
-            BinaryOperator::Mul => input0.overflowing_mul(input1).0,
+            BinaryOperator::Mul | BinaryOperator::Shl => input0.overflowing_mul(input1).0,
             BinaryOperator::Sub => input0.overflowing_sub(input1).0,
-            BinaryOperator::Div => {
+            BinaryOperator::Div | BinaryOperator::Shr => {
                 if input1.is_zero() {
                     U256::zero()
                 } else {
@@ -77,6 +79,8 @@ impl BinaryOperator {
             BinaryOperator::MulFp254 => columns::IS_MULFP254,
             BinaryOperator::SubFp254 => columns::IS_SUBFP254,
             BinaryOperator::Byte => columns::IS_BYTE,
+            BinaryOperator::Shl => columns::IS_SHL,
+            BinaryOperator::Shr => columns::IS_SHR,
         }
     }
 }
@@ -107,6 +111,7 @@ impl TernaryOperator {
     }
 }
 
+/// An enum representing arithmetic operations that can be either binary or ternary.
 #[derive(Debug)]
 pub(crate) enum Operation {
     BinaryOperation {
@@ -125,6 +130,21 @@ pub(crate) enum Operation {
 }
 
 impl Operation {
+    /// Create a binary operator with given inputs.
+    ///
+    /// NB: This works as you would expect, EXCEPT for SHL and SHR,
+    /// whose inputs need a small amount of preprocessing. Specifically,
+    /// to create `SHL(shift, value)`, call (note the reversal of
+    /// argument order):
+    ///
+    ///    `Operation::binary(BinaryOperator::Shl, value, 1 << shift)`
+    ///
+    /// Similarly, to create `SHR(shift, value)`, call
+    ///
+    ///    `Operation::binary(BinaryOperator::Shr, value, 1 << shift)`
+    ///
+    /// See witness/operation.rs::append_shift() for an example (indeed
+    /// the only call site for such inputs).
     pub(crate) fn binary(operator: BinaryOperator, input0: U256, input1: U256) -> Self {
         let result = operator.result(input0, input1);
         Self::BinaryOperation {
@@ -164,6 +184,10 @@ impl Operation {
     /// use vectors because that's what utils::transpose (who consumes
     /// the result of this function as part of the range check code)
     /// expects.
+    ///
+    /// SHL and SHR have no dedicated row generator: their rows are produced
+    /// by the MUL and DIV code paths respectively, and they differ from native
+    /// MUL and DIV rows in the filter column selected by `row_filter`.
     fn to_rows<F: PrimeField64>(&self) -> (Vec<F>, Option<Vec<F>>) {
         match *self {
             Operation::BinaryOperation {
@@ -214,11 +238,11 @@ fn binary_op_to_rows<F: PrimeField64>(
             addcy::generate(&mut row, op.row_filter(), input0, input1);
             (row, None)
         }
-        BinaryOperator::Mul => {
+        BinaryOperator::Mul | BinaryOperator::Shl => {
             mul::generate(&mut row, input0, input1);
             (row, None)
         }
-        BinaryOperator::Div | BinaryOperator::Mod => {
+        BinaryOperator::Div | BinaryOperator::Mod | BinaryOperator::Shr => {
             let mut nv = vec![F::ZERO; columns::NUM_ARITH_COLUMNS];
             divmod::generate(&mut row, &mut nv, op.row_filter(), input0, input1, result);
             (row, Some(nv))
diff --git a/evm/src/arithmetic/mul.rs b/evm/src/arithmetic/mul.rs
index 597d4051..efb4d822 100644
--- a/evm/src/arithmetic/mul.rs
+++ b/evm/src/arithmetic/mul.rs
@@ -121,7 +121,7 @@ pub fn eval_packed_generic<P: PackedField>(
 ) {
     let base = P::Scalar::from_canonical_u64(1 << LIMB_BITS);
 
-    let is_mul = lv[IS_MUL];
+    let is_mul = lv[IS_MUL] + lv[IS_SHL];
     let input0_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_0);
     let input1_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
     let output_limbs = read_value::<N_LIMBS, _>(lv, OUTPUT_REGISTER);
@@ -173,7 +173,7 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
-    let is_mul = lv[IS_MUL];
+    let is_mul = builder.add_extension(lv[IS_MUL], lv[IS_SHL]);
     let input0_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_0);
     let input1_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
     let output_limbs = read_value::<N_LIMBS, _>(lv, OUTPUT_REGISTER);
@@ -229,6 +229,8 @@ mod tests {
         // if `IS_MUL == 0`, then the constraints should be met even
         // if all values are garbage.
         lv[IS_MUL] = F::ZERO;
+        // Deactivate the SHL flag so that a MUL operation is not triggered.
+        lv[IS_SHL] = F::ZERO;
 
         let mut constraint_consumer = ConstraintConsumer::new(
             vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
diff --git a/evm/src/cpu/columns/ops.rs b/evm/src/cpu/columns/ops.rs
index 6c68a183..b8a4d8a6 100644
--- a/evm/src/cpu/columns/ops.rs
+++ b/evm/src/cpu/columns/ops.rs
@@ -7,33 +7,17 @@ use crate::util::{indices_arr, transmute_no_compile_time_size_checks};
 #[repr(C)]
 #[derive(Clone, Copy, Eq, PartialEq, Debug)]
 pub struct OpsColumnsView<T: Copy> {
-    // TODO: combine ADD, MUL, SUB, DIV, MOD, ADDFP254, MULFP254, SUBFP254, LT, and GT into one flag
-    pub add: T,
-    pub mul: T,
-    pub sub: T,
-    pub div: T,
-    pub mod_: T,
-    // TODO: combine ADDMOD, MULMOD and SUBMOD into one flag
-    pub addmod: T,
-    pub mulmod: T,
-    pub addfp254: T,
-    pub mulfp254: T,
-    pub subfp254: T,
-    pub submod: T,
-    pub lt: T,
-    pub gt: T,
-    pub eq_iszero: T, // Combines EQ and ISZERO flags.
-    pub logic_op: T,  // Combines AND, OR and XOR flags.
+    pub binary_op: T,  // Combines ADD, MUL, SUB, DIV, MOD, LT, GT and BYTE flags.
+    pub ternary_op: T, // Combines ADDMOD, MULMOD and SUBMOD flags.
+    pub fp254_op: T,   // Combines ADD_FP254, MUL_FP254 and SUB_FP254 flags.
+    pub eq_iszero: T,  // Combines EQ and ISZERO flags.
+    pub logic_op: T,   // Combines AND, OR and XOR flags.
     pub not: T,
-    pub byte: T,
-    // TODO: combine SHL and SHR into one flag
-    pub shl: T,
-    pub shr: T,
+    pub shift: T, // Combines SHL and SHR flags.
     pub keccak_general: T,
     pub prover_input: T,
     pub pop: T,
-    // TODO: combine JUMP and JUMPI into one flag
-    pub jumps: T, // Note: This column must be 0 when is_cpu_cycle = 0.
+    pub jumps: T, // Combines JUMP and JUMPI flags.
     pub pc: T,
     pub jumpdest: T,
     pub push0: T,
diff --git a/evm/src/cpu/control_flow.rs b/evm/src/cpu/control_flow.rs
index 0bea5c7c..8d0ee264 100644
--- a/evm/src/cpu/control_flow.rs
+++ b/evm/src/cpu/control_flow.rs
@@ -8,24 +8,14 @@ use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer
 use crate::cpu::columns::{CpuColumnsView, COL_MAP};
 use crate::cpu::kernel::aggregator::KERNEL;
 
-const NATIVE_INSTRUCTIONS: [usize; 28] = [
-    COL_MAP.op.add,
-    COL_MAP.op.mul,
-    COL_MAP.op.sub,
-    COL_MAP.op.div,
-    COL_MAP.op.mod_,
-    COL_MAP.op.addmod,
-    COL_MAP.op.mulmod,
-    COL_MAP.op.addfp254,
-    COL_MAP.op.mulfp254,
-    COL_MAP.op.subfp254,
-    COL_MAP.op.lt,
-    COL_MAP.op.gt,
+const NATIVE_INSTRUCTIONS: [usize; 18] = [
+    COL_MAP.op.binary_op,
+    COL_MAP.op.ternary_op,
+    COL_MAP.op.fp254_op,
     COL_MAP.op.eq_iszero,
     COL_MAP.op.logic_op,
     COL_MAP.op.not,
-    COL_MAP.op.shl,
-    COL_MAP.op.shr,
+    COL_MAP.op.shift,
     COL_MAP.op.keccak_general,
     COL_MAP.op.prover_input,
     COL_MAP.op.pop,
diff --git a/evm/src/cpu/cpu_stark.rs b/evm/src/cpu/cpu_stark.rs
index 25e7cc6b..820ccd3d 100644
--- a/evm/src/cpu/cpu_stark.rs
+++ b/evm/src/cpu/cpu_stark.rs
@@ -48,9 +48,8 @@ pub fn ctl_filter_keccak_sponge<F: Field>() -> Column<F> {
 
 /// Create the vector of Columns corresponding to the two inputs and
 /// one output of a binary operation.
-fn ctl_data_binops<F: Field>(ops: &[usize]) -> Vec<Column<F>> {
-    let mut res = Column::singles(ops).collect_vec();
-    res.extend(Column::singles(COL_MAP.mem_channels[0].value));
+fn ctl_data_binops<F: Field>() -> Vec<Column<F>> {
+    let mut res = Column::singles(COL_MAP.mem_channels[0].value).collect_vec();
     res.extend(Column::singles(COL_MAP.mem_channels[1].value));
     res.extend(Column::singles(
         COL_MAP.mem_channels[NUM_GP_CHANNELS - 1].value,
@@ -70,10 +69,9 @@ fn ctl_data_binops<F: Field>(ops: &[usize]) -> Vec<Column<F>> {
 /// case of shift operations, which will skip the first memory channel and use the
 /// next three as ternary inputs. Because both `MUL` and `DIV` are binary operations,
 /// the last memory channel used for the inputs will be safely ignored.
-fn ctl_data_ternops<F: Field>(ops: &[usize], is_shift: bool) -> Vec<Column<F>> {
+fn ctl_data_ternops<F: Field>(is_shift: bool) -> Vec<Column<F>> {
     let offset = is_shift as usize;
-    let mut res = Column::singles(ops).collect_vec();
-    res.extend(Column::singles(COL_MAP.mem_channels[offset].value));
+    let mut res = Column::singles(COL_MAP.mem_channels[offset].value).collect_vec();
     res.extend(Column::singles(COL_MAP.mem_channels[offset + 1].value));
     res.extend(Column::singles(COL_MAP.mem_channels[offset + 2].value));
     res.extend(Column::singles(
@@ -85,7 +83,7 @@ fn ctl_data_ternops<F: Field>(ops: &[usize], is_shift: bool) -> Vec<Column<F>> {
 pub fn ctl_data_logic<F: Field>() -> Vec<Column<F>> {
     // Instead of taking single columns, we reconstruct the entire opcode value directly.
     let mut res = vec![Column::le_bits(COL_MAP.opcode_bits)];
-    res.extend(ctl_data_binops(&[]));
+    res.extend(ctl_data_binops());
     res
 }
 
@@ -94,22 +92,9 @@ pub fn ctl_filter_logic<F: Field>() -> Column<F> {
 }
 
 pub fn ctl_arithmetic_base_rows<F: Field>() -> TableWithColumns<F> {
-    const OPS: [usize; 14] = [
-        COL_MAP.op.add,
-        COL_MAP.op.sub,
-        COL_MAP.op.mul,
-        COL_MAP.op.lt,
-        COL_MAP.op.gt,
-        COL_MAP.op.addfp254,
-        COL_MAP.op.mulfp254,
-        COL_MAP.op.subfp254,
-        COL_MAP.op.addmod,
-        COL_MAP.op.mulmod,
-        COL_MAP.op.submod,
-        COL_MAP.op.div,
-        COL_MAP.op.mod_,
-        COL_MAP.op.byte,
-    ];
+    // Instead of taking single columns, we reconstruct the entire opcode value directly.
+    let mut columns = vec![Column::le_bits(COL_MAP.opcode_bits)];
+    columns.extend(ctl_data_ternops(false));
     // Create the CPU Table whose columns are those with the three
     // inputs and one output of the ternary operations listed in `ops`
     // (also `ops` is used as the operation filter). The list of
@@ -117,40 +102,25 @@ pub fn ctl_arithmetic_base_rows<F: Field>() -> TableWithColumns<F> {
     // the third input.
     TableWithColumns::new(
         Table::Cpu,
-        ctl_data_ternops(&OPS, false),
-        Some(Column::sum(OPS)),
+        columns,
+        Some(Column::sum([
+            COL_MAP.op.binary_op,
+            COL_MAP.op.fp254_op,
+            COL_MAP.op.ternary_op,
+        ])),
     )
 }
 
 pub fn ctl_arithmetic_shift_rows<F: Field>() -> TableWithColumns<F> {
-    const OPS: [usize; 14] = [
-        COL_MAP.op.add,
-        COL_MAP.op.sub,
-        // SHL is interpreted as MUL on the arithmetic side
-        COL_MAP.op.shl,
-        COL_MAP.op.lt,
-        COL_MAP.op.gt,
-        COL_MAP.op.addfp254,
-        COL_MAP.op.mulfp254,
-        COL_MAP.op.subfp254,
-        COL_MAP.op.addmod,
-        COL_MAP.op.mulmod,
-        COL_MAP.op.submod,
-        // SHR is interpreted as DIV on the arithmetic side
-        COL_MAP.op.shr,
-        COL_MAP.op.mod_,
-        COL_MAP.op.byte,
-    ];
+    // Instead of taking single columns, we reconstruct the entire opcode value directly.
+    let mut columns = vec![Column::le_bits(COL_MAP.opcode_bits)];
+    columns.extend(ctl_data_ternops(true));
     // Create the CPU Table whose columns are those with the three
     // inputs and one output of the ternary operations listed in `ops`
     // (also `ops` is used as the operation filter). The list of
     // operations includes binary operations which will simply ignore
     // the third input.
-    TableWithColumns::new(
-        Table::Cpu,
-        ctl_data_ternops(&OPS, true),
-        Some(Column::sum([COL_MAP.op.shl, COL_MAP.op.shr])),
-    )
+    TableWithColumns::new(Table::Cpu, columns, Some(Column::single(COL_MAP.op.shift)))
 }
 
 pub fn ctl_data_byte_packing<F: Field>() -> Vec<Column<F>> {
diff --git a/evm/src/cpu/decode.rs b/evm/src/cpu/decode.rs
index 9a9c5723..cc87281c 100644
--- a/evm/src/cpu/decode.rs
+++ b/evm/src/cpu/decode.rs
@@ -22,26 +22,15 @@ use crate::cpu::columns::{CpuColumnsView, COL_MAP};
 /// behavior.
 /// Note: invalid opcodes are not represented here. _Any_ opcode is permitted to decode to
 /// `is_invalid`. The kernel then verifies that the opcode was _actually_ invalid.
-const OPCODES: [(u8, usize, bool, usize); 33] = [
+const OPCODES: [(u8, usize, bool, usize); 18] = [
     // (start index of block, number of top bits to check (log2), kernel-only, flag column)
-    (0x01, 0, false, COL_MAP.op.add),
-    (0x02, 0, false, COL_MAP.op.mul),
-    (0x03, 0, false, COL_MAP.op.sub),
-    (0x04, 0, false, COL_MAP.op.div),
-    (0x06, 0, false, COL_MAP.op.mod_),
-    (0x08, 0, false, COL_MAP.op.addmod),
-    (0x09, 0, false, COL_MAP.op.mulmod),
-    (0x0c, 0, true, COL_MAP.op.addfp254),
-    (0x0d, 0, true, COL_MAP.op.mulfp254),
-    (0x0e, 0, true, COL_MAP.op.subfp254),
-    (0x10, 0, false, COL_MAP.op.lt),
-    (0x11, 0, false, COL_MAP.op.gt),
+    // ADD, MUL, SUB, DIV, MOD, LT, GT and BYTE flags are handled partly manually here, and partly through the Arithmetic table CTL.
+    // ADDMOD, MULMOD and SUBMOD flags are handled partly manually here, and partly through the Arithmetic table CTL.
+    // FP254 operation flags are handled partly manually here, and partly through the Arithmetic table CTL.
     (0x14, 1, false, COL_MAP.op.eq_iszero),
     // AND, OR and XOR flags are handled partly manually here, and partly through the Logic table CTL.
     (0x19, 0, false, COL_MAP.op.not),
-    (0x1a, 0, false, COL_MAP.op.byte),
-    (0x1b, 0, false, COL_MAP.op.shl),
-    (0x1c, 0, false, COL_MAP.op.shr),
+    // SHL and SHR flags are handled partly manually here, and partly through the Arithmetic table CTL.
     (0x21, 0, true, COL_MAP.op.keccak_general),
     (0x49, 0, true, COL_MAP.op.prover_input),
     (0x50, 0, false, COL_MAP.op.pop),
@@ -60,6 +49,17 @@ const OPCODES: [(u8, usize, bool, usize); 33] = [
     (0xfc, 0, true, COL_MAP.op.mstore_general),
 ];
 
+/// List of combined opcodes requiring a special handling.
+/// Each entry is the column index of a flag that stands for a group
+/// of opcodes, as defined in evm/src/cpu/columns/ops.rs.
+const COMBINED_OPCODES: [usize; 5] = [
+    COL_MAP.op.logic_op,
+    COL_MAP.op.fp254_op,
+    COL_MAP.op.binary_op,
+    COL_MAP.op.ternary_op,
+    COL_MAP.op.shift,
+];
+
 pub fn generate<F: RichField>(lv: &mut CpuColumnsView<F>) {
     let cycle_filter: F = COL_MAP.op.iter().map(|&col_i| lv[col_i]).sum();
 
@@ -134,17 +134,17 @@ pub fn eval_packed_generic<P: PackedField>(
         let flag = lv[flag_col];
         yield_constr.constraint(flag * (flag - P::ONES));
     }
-    // Manually check the logic_op flag combining AND, OR and XOR.
-    let flag = lv.op.logic_op;
-    yield_constr.constraint(flag * (flag - P::ONES));
+    // Also check that the combined instruction flags are valid.
+    for flag_idx in COMBINED_OPCODES {
+        yield_constr.constraint(lv[flag_idx] * (lv[flag_idx] - P::ONES));
+    }
 
-    // Now check that they sum to 0 or 1.
-    // Includes the logic_op flag encompassing AND, OR and XOR opcodes.
+    // Now check that they sum to 0 or 1, including the combined flags.
     let flag_sum: P = OPCODES
         .into_iter()
         .map(|(_, _, _, flag_col)| lv[flag_col])
-        .sum::<P>()
-        + lv.op.logic_op;
+        .chain(COMBINED_OPCODES.map(|op| lv[op]))
+        .sum::<P>();
     yield_constr.constraint(flag_sum * (flag_sum - P::ONES));
 
     // Finally, classify all opcodes, together with the kernel flag, into blocks
@@ -204,15 +204,16 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
         let constr = builder.mul_sub_extension(flag, flag, flag);
         yield_constr.constraint(builder, constr);
     }
-    // Manually check the logic_op flag combining AND, OR and XOR.
-    let flag = lv.op.logic_op;
-    let constr = builder.mul_sub_extension(flag, flag, flag);
-    yield_constr.constraint(builder, constr);
+    // Also check that the combined instruction flags are valid.
+    for flag_idx in COMBINED_OPCODES {
+        let constr = builder.mul_sub_extension(lv[flag_idx], lv[flag_idx], lv[flag_idx]);
+        yield_constr.constraint(builder, constr);
+    }
 
-    // Now check that they sum to 0 or 1.
-    // Includes the logic_op flag encompassing AND, OR and XOR opcodes.
+    // Now check that they sum to 0 or 1, including the combined flags.
     {
-        let mut flag_sum = lv.op.logic_op;
+        let mut flag_sum =
+            builder.add_many_extension(COMBINED_OPCODES.into_iter().map(|idx| lv[idx]));
         for (_, _, _, flag_col) in OPCODES {
             let flag = lv[flag_col];
             flag_sum = builder.add_extension(flag_sum, flag);
diff --git a/evm/src/cpu/gas.rs b/evm/src/cpu/gas.rs
index e967c07e..a4a499ad 100644
--- a/evm/src/cpu/gas.rs
+++ b/evm/src/cpu/gas.rs
@@ -19,25 +19,13 @@ const G_MID: Option<u32> = Some(8);
 const G_HIGH: Option<u32> = Some(10);
 
 const SIMPLE_OPCODES: OpsColumnsView<Option<u32>> = OpsColumnsView {
-    add: G_VERYLOW,
-    mul: G_LOW,
-    sub: G_VERYLOW,
-    div: G_LOW,
-    mod_: G_LOW,
-    addmod: G_MID,
-    mulmod: G_MID,
-    addfp254: KERNEL_ONLY_INSTR,
-    mulfp254: KERNEL_ONLY_INSTR,
-    subfp254: KERNEL_ONLY_INSTR,
-    submod: KERNEL_ONLY_INSTR,
-    lt: G_VERYLOW,
-    gt: G_VERYLOW,
+    binary_op: None,  // This is handled manually below
+    ternary_op: None, // This is handled manually below
+    fp254_op: KERNEL_ONLY_INSTR,
     eq_iszero: G_VERYLOW,
     logic_op: G_VERYLOW,
     not: G_VERYLOW,
-    byte: G_VERYLOW,
-    shl: G_VERYLOW,
-    shr: G_VERYLOW,
+    shift: G_VERYLOW,
     keccak_general: KERNEL_ONLY_INSTR,
     prover_input: KERNEL_ONLY_INSTR,
     pop: G_BASE,
@@ -97,6 +85,21 @@ fn eval_packed_accumulate<P: PackedField>(
     let jump_gas_cost = P::Scalar::from_canonical_u32(G_MID.unwrap())
         + lv.opcode_bits[0] * P::Scalar::from_canonical_u32(G_HIGH.unwrap() - G_MID.unwrap());
     yield_constr.constraint_transition(lv.op.jumps * (nv.gas - lv.gas - jump_gas_cost));
+
+    // For binary_ops.
+    // MUL, DIV and MOD (G_LOW) are the binary ops whose opcode bits 0 and 4 are both 0; ADD, SUB, LT, GT and BYTE (G_VERYLOW) set at least one.
+    let cost_filter = lv.opcode_bits[0] + lv.opcode_bits[4] - lv.opcode_bits[0] * lv.opcode_bits[4];
+    let binary_op_cost = P::Scalar::from_canonical_u32(G_LOW.unwrap())
+        + cost_filter
+            * (P::Scalar::from_canonical_u32(G_VERYLOW.unwrap())
+                - P::Scalar::from_canonical_u32(G_LOW.unwrap()));
+    yield_constr.constraint_transition(lv.op.binary_op * (nv.gas - lv.gas - binary_op_cost));
+
+    // For ternary_ops.
+    // SUBMOD is differentiated by its second bit set to 1.
+    let ternary_op_cost = P::Scalar::from_canonical_u32(G_MID.unwrap())
+        - lv.opcode_bits[1] * P::Scalar::from_canonical_u32(G_MID.unwrap());
+    yield_constr.constraint_transition(lv.op.ternary_op * (nv.gas - lv.gas - ternary_op_cost));
 }
 
 fn eval_packed_init<P: PackedField>(
@@ -186,6 +189,41 @@ fn eval_ext_circuit_accumulate<F: RichField + Extendable<D>, const D: usize>(
     let gas_diff = builder.sub_extension(nv_lv_diff, jump_gas_cost);
     let constr = builder.mul_extension(filter, gas_diff);
     yield_constr.constraint_transition(builder, constr);
+
+    // For binary_ops.
+    // MUL, DIV and MOD (G_LOW) are the binary ops whose opcode bits 0 and 4 are both 0; ADD, SUB, LT, GT and BYTE (G_VERYLOW) set at least one.
+    let filter = lv.op.binary_op;
+    let cost_filter = {
+        let a = builder.add_extension(lv.opcode_bits[0], lv.opcode_bits[4]);
+        let b = builder.mul_extension(lv.opcode_bits[0], lv.opcode_bits[4]);
+        builder.sub_extension(a, b)
+    };
+    let binary_op_cost = builder.mul_const_extension(
+        F::from_canonical_u32(G_VERYLOW.unwrap()) - F::from_canonical_u32(G_LOW.unwrap()),
+        cost_filter,
+    );
+    let binary_op_cost =
+        builder.add_const_extension(binary_op_cost, F::from_canonical_u32(G_LOW.unwrap()));
+
+    let nv_lv_diff = builder.sub_extension(nv.gas, lv.gas);
+    let gas_diff = builder.sub_extension(nv_lv_diff, binary_op_cost);
+    let constr = builder.mul_extension(filter, gas_diff);
+    yield_constr.constraint_transition(builder, constr);
+
+    // For ternary_ops.
+    // SUBMOD is differentiated by its second bit set to 1.
+    let filter = lv.op.ternary_op;
+    let ternary_op_cost = builder.mul_const_extension(
+        F::from_canonical_u32(G_MID.unwrap()).neg(),
+        lv.opcode_bits[1],
+    );
+    let ternary_op_cost =
+        builder.add_const_extension(ternary_op_cost, F::from_canonical_u32(G_MID.unwrap()));
+
+    let nv_lv_diff = builder.sub_extension(nv.gas, lv.gas);
+    let gas_diff = builder.sub_extension(nv_lv_diff, ternary_op_cost);
+    let constr = builder.mul_extension(filter, gas_diff);
+    yield_constr.constraint_transition(builder, constr);
 }
 
 fn eval_ext_circuit_init<F: RichField + Extendable<D>, const D: usize>(
diff --git a/evm/src/cpu/modfp254.rs b/evm/src/cpu/modfp254.rs
index e6a2815d..86f08052 100644
--- a/evm/src/cpu/modfp254.rs
+++ b/evm/src/cpu/modfp254.rs
@@ -19,7 +19,7 @@ pub fn eval_packed<P: PackedField>(
     lv: &CpuColumnsView<P>,
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
-    let filter = lv.op.addfp254 + lv.op.mulfp254 + lv.op.subfp254;
+    let filter = lv.op.fp254_op;
 
     // We want to use all the same logic as the usual mod operations, but without needing to read
     // the modulus from the stack. We simply constrain `mem_channels[2]` to be our prime (that's
@@ -36,7 +36,7 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     lv: &CpuColumnsView<ExtensionTarget<D>>,
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
-    let filter = builder.add_many_extension([lv.op.addfp254, lv.op.mulfp254, lv.op.subfp254]);
+    let filter = lv.op.fp254_op;
 
     // We want to use all the same logic as the usual mod operations, but without needing to read
     // the modulus from the stack. We simply constrain `mem_channels[2]` to be our prime (that's
diff --git a/evm/src/cpu/shift.rs b/evm/src/cpu/shift.rs
index a8acf5d4..a4249297 100644
--- a/evm/src/cpu/shift.rs
+++ b/evm/src/cpu/shift.rs
@@ -13,7 +13,7 @@ pub(crate) fn eval_packed<P: PackedField>(
     lv: &CpuColumnsView<P>,
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
-    let is_shift = lv.op.shl + lv.op.shr;
+    let is_shift = lv.op.shift;
     let displacement = lv.mem_channels[0]; // holds the shift displacement d
     let two_exp = lv.mem_channels[2]; // holds 2^d
 
@@ -64,7 +64,7 @@ pub(crate) fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     lv: &CpuColumnsView<ExtensionTarget<D>>,
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
-    let is_shift = builder.add_extension(lv.op.shl, lv.op.shr);
+    let is_shift = lv.op.shift;
     let displacement = lv.mem_channels[0];
     let two_exp = lv.mem_channels[2];
 
diff --git a/evm/src/cpu/stack.rs b/evm/src/cpu/stack.rs
index cfeaa1b0..a0c8df5c 100644
--- a/evm/src/cpu/stack.rs
+++ b/evm/src/cpu/stack.rs
@@ -50,29 +50,13 @@ pub(crate) const JUMPI_OP: Option<StackBehavior> = Some(StackBehavior {
 // except the first `num_pops` and the last `pushes as usize` channels have their read flag and
 // address constrained automatically in this file.
 const STACK_BEHAVIORS: OpsColumnsView<Option<StackBehavior>> = OpsColumnsView {
-    add: BASIC_BINARY_OP,
-    mul: BASIC_BINARY_OP,
-    sub: BASIC_BINARY_OP,
-    div: BASIC_BINARY_OP,
-    mod_: BASIC_BINARY_OP,
-    addmod: BASIC_TERNARY_OP,
-    mulmod: BASIC_TERNARY_OP,
-    addfp254: BASIC_BINARY_OP,
-    mulfp254: BASIC_BINARY_OP,
-    subfp254: BASIC_BINARY_OP,
-    submod: BASIC_TERNARY_OP,
-    lt: BASIC_BINARY_OP,
-    gt: BASIC_BINARY_OP,
+    binary_op: BASIC_BINARY_OP,
+    ternary_op: BASIC_TERNARY_OP,
+    fp254_op: BASIC_BINARY_OP,
     eq_iszero: None, // EQ is binary, IS_ZERO is unary.
     logic_op: BASIC_BINARY_OP,
     not: BASIC_UNARY_OP,
-    byte: BASIC_BINARY_OP,
-    shl: Some(StackBehavior {
-        num_pops: 2,
-        pushes: true,
-        disable_other_channels: false,
-    }),
-    shr: Some(StackBehavior {
+    shift: Some(StackBehavior {
         num_pops: 2,
         pushes: true,
         disable_other_channels: false,
diff --git a/evm/src/witness/gas.rs b/evm/src/witness/gas.rs
index 3a46c044..aa312078 100644
--- a/evm/src/witness/gas.rs
+++ b/evm/src/witness/gas.rs
@@ -25,8 +25,8 @@ pub(crate) fn gas_to_charge(op: Operation) -> u64 {
         BinaryArithmetic(Lt) => G_VERYLOW,
         BinaryArithmetic(Gt) => G_VERYLOW,
         BinaryArithmetic(Byte) => G_VERYLOW,
-        Shl => G_VERYLOW,
-        Shr => G_VERYLOW,
+        BinaryArithmetic(Shl) => G_VERYLOW,
+        BinaryArithmetic(Shr) => G_VERYLOW,
         BinaryArithmetic(AddFp254) => KERNEL_ONLY_INSTR,
         BinaryArithmetic(MulFp254) => KERNEL_ONLY_INSTR,
         BinaryArithmetic(SubFp254) => KERNEL_ONLY_INSTR,
diff --git a/evm/src/witness/operation.rs b/evm/src/witness/operation.rs
index 7d07576d..23d64be4 100644
--- a/evm/src/witness/operation.rs
+++ b/evm/src/witness/operation.rs
@@ -29,8 +29,6 @@ use crate::{arithmetic, logic};
 pub(crate) enum Operation {
     Iszero,
     Not,
-    Shl,
-    Shr,
     Syscall(u8, usize, bool), // (syscall number, minimum stack length, increases stack length)
     Eq,
     BinaryLogic(logic::Op),
@@ -473,6 +471,7 @@ pub(crate) fn generate_iszero<F: Field>(
 fn append_shift<F: Field>(
     state: &mut GenerationState<F>,
     mut row: CpuColumnsView<F>,
+    is_shl: bool,
     input0: U256,
     input1: U256,
     log_in0: MemoryOp,
@@ -500,10 +499,10 @@ fn append_shift<F: Field>(
     } else {
         U256::one() << input0
     };
-    let operator = if row.op.shl.is_one() {
-        BinaryOperator::Mul
+    let operator = if is_shl {
+        BinaryOperator::Shl
     } else {
-        BinaryOperator::Div
+        BinaryOperator::Shr
     };
     let operation = arithmetic::Operation::binary(operator, input1, input0);
 
@@ -527,7 +526,7 @@ pub(crate) fn generate_shl<F: Field>(
     } else {
         input1 << input0
     };
-    append_shift(state, row, input0, input1, log_in0, log_in1, result)
+    append_shift(state, row, true, input0, input1, log_in0, log_in1, result)
 }
 
 pub(crate) fn generate_shr<F: Field>(
@@ -542,7 +541,7 @@ pub(crate) fn generate_shr<F: Field>(
     } else {
         input1 >> input0
     };
-    append_shift(state, row, input0, input1, log_in0, log_in1, result)
+    append_shift(state, row, false, input0, input1, log_in0, log_in1, result)
 }
 
 pub(crate) fn generate_syscall<F: Field>(
diff --git a/evm/src/witness/transition.rs b/evm/src/witness/transition.rs
index 6e279cdf..9532aa33 100644
--- a/evm/src/witness/transition.rs
+++ b/evm/src/witness/transition.rs
@@ -70,8 +70,8 @@ fn decode(registers: RegistersState, opcode: u8) -> Result<Operation, ProgramErr
         (0x1a, _) => Ok(Operation::BinaryArithmetic(
             arithmetic::BinaryOperator::Byte,
         )),
-        (0x1b, _) => Ok(Operation::Shl),
-        (0x1c, _) => Ok(Operation::Shr),
+        (0x1b, _) => Ok(Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shl)),
+        (0x1c, _) => Ok(Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shr)),
         (0x1d, _) => Ok(Operation::Syscall(opcode, 2, false)), // SAR
         (0x20, _) => Ok(Operation::Syscall(opcode, 2, false)), // KECCAK256
         (0x21, true) => Ok(Operation::KeccakGeneral),
@@ -162,22 +162,13 @@ fn fill_op_flag<F: Field>(op: Operation, row: &mut CpuColumnsView<F>) {
         Operation::Not => &mut flags.not,
         Operation::Syscall(_, _, _) => &mut flags.syscall,
         Operation::BinaryLogic(_) => &mut flags.logic_op,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Add) => &mut flags.add,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Mul) => &mut flags.mul,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Sub) => &mut flags.sub,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Div) => &mut flags.div,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Mod) => &mut flags.mod_,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Lt) => &mut flags.lt,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Gt) => &mut flags.gt,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Byte) => &mut flags.byte,
-        Operation::Shl => &mut flags.shl,
-        Operation::Shr => &mut flags.shr,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::AddFp254) => &mut flags.addfp254,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::MulFp254) => &mut flags.mulfp254,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::SubFp254) => &mut flags.subfp254,
-        Operation::TernaryArithmetic(arithmetic::TernaryOperator::AddMod) => &mut flags.addmod,
-        Operation::TernaryArithmetic(arithmetic::TernaryOperator::MulMod) => &mut flags.mulmod,
-        Operation::TernaryArithmetic(arithmetic::TernaryOperator::SubMod) => &mut flags.submod,
+        Operation::BinaryArithmetic(arithmetic::BinaryOperator::AddFp254)
+        | Operation::BinaryArithmetic(arithmetic::BinaryOperator::MulFp254)
+        | Operation::BinaryArithmetic(arithmetic::BinaryOperator::SubFp254) => &mut flags.fp254_op,
+        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shl)
+        | Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shr) => &mut flags.shift,
+        Operation::BinaryArithmetic(_) => &mut flags.binary_op,
+        Operation::TernaryArithmetic(_) => &mut flags.ternary_op,
         Operation::KeccakGeneral => &mut flags.keccak_general,
         Operation::ProverInput => &mut flags.prover_input,
         Operation::Pop => &mut flags.pop,
@@ -204,8 +195,8 @@ fn perform_op<F: Field>(
         Operation::Swap(n) => generate_swap(n, state, row)?,
         Operation::Iszero => generate_iszero(state, row)?,
         Operation::Not => generate_not(state, row)?,
-        Operation::Shl => generate_shl(state, row)?,
-        Operation::Shr => generate_shr(state, row)?,
+        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shl) => generate_shl(state, row)?,
+        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shr) => generate_shr(state, row)?,
         Operation::Syscall(opcode, stack_values_read, stack_len_increased) => {
             generate_syscall(opcode, stack_values_read, stack_len_increased, state, row)?
         }