diff --git a/evm/src/cpu/kernel/aggregator.rs b/evm/src/cpu/kernel/aggregator.rs
index 53787e48..53584a1f 100644
--- a/evm/src/cpu/kernel/aggregator.rs
+++ b/evm/src/cpu/kernel/aggregator.rs
@@ -12,6 +12,7 @@ pub static KERNEL: Lazy<Kernel> = Lazy::new(combined_kernel);
 pub(crate) fn combined_kernel() -> Kernel {
     let files = vec![
         include_str!("asm/bignum/add.asm"),
+        include_str!("asm/bignum/addmul.asm"),
         include_str!("asm/bignum/ge.asm"),
         include_str!("asm/bignum/iszero.asm"),
         include_str!("asm/bignum/mul.asm"),
diff --git a/evm/src/cpu/kernel/asm/bignum/add.asm b/evm/src/cpu/kernel/asm/bignum/add.asm
index 508078ce..d38c9701 100644
--- a/evm/src/cpu/kernel/asm/bignum/add.asm
+++ b/evm/src/cpu/kernel/asm/bignum/add.asm
@@ -5,7 +5,7 @@
 global add_bignum:
     // stack: len, a_start_loc, b_start_loc, retdest
     PUSH 0
-    // stack: carry=0, i=len, a_start_loc, b_start_loc, retdest
+    // stack: carry=0, i=len, a_cur_loc=a_start_loc, b_cur_loc=b_start_loc, retdest
 add_loop:
     // stack: carry, i, a_cur_loc, b_cur_loc, retdest
     DUP4
diff --git a/evm/src/cpu/kernel/asm/bignum/addmul.asm b/evm/src/cpu/kernel/asm/bignum/addmul.asm
new file mode 100644
index 00000000..742bb829
--- /dev/null
+++ b/evm/src/cpu/kernel/asm/bignum/addmul.asm
@@ -0,0 +1,99 @@
+// Arithmetic on little-endian integers represented with 128-bit limbs.
+// All integers must be under a given length bound, and are padded with leading zeroes.
+
+// Sets a[0:len] += b[0:len] * val and leaves the final carry limb on the stack for retdest; needs len >= 1 (i is tested only after its decrement) and assumes limbs and val are below 2^128.
+global addmul_bignum:
+    // stack: len, a_start_loc, b_start_loc, val, retdest
+    PUSH 0
+    // stack: carry=0, i=len, a_cur_loc=a_start_loc, b_cur_loc=b_start_loc, val, retdest
+addmul_loop:
+    // stack: carry, i, a_cur_loc, b_cur_loc, val, retdest
+    DUP4
+    // stack: b_cur_loc, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    %mload_kernel_general
+    // stack: b[cur], carry, i, a_cur_loc, b_cur_loc, val, retdest
+    DUP6
+    // stack: val, b[cur], carry, i, a_cur_loc, b_cur_loc, val, retdest
+    MUL
+    // stack: val * b[cur], carry, i, a_cur_loc, b_cur_loc, val, retdest
+    DUP1
+    // stack: val * b[cur], val * b[cur], carry, i, a_cur_loc, b_cur_loc, val, retdest
+    %shr_const(128)
+    // stack: (val * b[cur]) // 2^128, val * b[cur], carry, i, a_cur_loc, b_cur_loc, val, retdest
+    SWAP1
+    // stack: val * b[cur], (val * b[cur]) // 2^128, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    %shl_const(128)
+    %shr_const(128)
+    // stack: prod_lo = val * b[cur] % 2^128, prod_hi = (val * b[cur]) // 2^128, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    DUP5
+    // stack: a_cur_loc, prod_lo, prod_hi, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    %mload_kernel_general
+    // stack: a[cur], prod_lo, prod_hi, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    DUP1
+    // stack: a[cur], a[cur], prod_lo, prod_hi, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    SWAP2
+    // stack: prod_lo, a[cur], a[cur], prod_hi, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    ADD
+    %shl_const(128)
+    %shr_const(128)
+    // stack: prod_lo' = (prod_lo + a[cur]) % 2^128, a[cur], prod_hi, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    DUP1
+    // stack: prod_lo', prod_lo', a[cur], prod_hi, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    SWAP2
+    // stack: a[cur], prod_lo', prod_lo', prod_hi, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    GT
+    // stack: prod_lo_carry = a[cur] > prod_lo', prod_lo', prod_hi, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    SWAP1
+    // stack: prod_lo', prod_lo_carry, prod_hi, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    SWAP2
+    // stack: prod_hi, prod_lo_carry, prod_lo', carry, i, a_cur_loc, b_cur_loc, val, retdest
+    ADD
+    // stack: prod_hi' = prod_hi + prod_lo_carry, prod_lo', carry, i, a_cur_loc, b_cur_loc, val, retdest
+    DUP3
+    // stack: carry, prod_hi', prod_lo', carry, i, a_cur_loc, b_cur_loc, val, retdest
+    DUP3
+    // stack: prod_lo', carry, prod_hi', prod_lo', carry, i, a_cur_loc, b_cur_loc, val, retdest
+    ADD
+    %shl_const(128)
+    %shr_const(128)
+    // stack: to_write = (prod_lo' + carry) % 2^128, prod_hi', prod_lo', carry, i, a_cur_loc, b_cur_loc, val, retdest
+    SWAP2
+    // stack: prod_lo', prod_hi', to_write, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    DUP3
+    // stack: to_write, prod_lo', prod_hi', to_write, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    LT
+    // stack: carry_new = to_write < prod_lo', prod_hi', to_write, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    %stack (cn, ph, tw, c) -> (cn, ph, tw)
+    // stack: carry_new, prod_hi', to_write, i, a_cur_loc, b_cur_loc, val, retdest
+    ADD
+    // stack: carry = carry_new + prod_hi', to_write, i, a_cur_loc, b_cur_loc, val, retdest
+    SWAP1
+    // stack: to_write, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    DUP4
+    // stack: a_cur_loc, to_write, carry, i, a_cur_loc, b_cur_loc, val, retdest
+    %mstore_kernel_general
+    // stack: carry, i, a_cur_loc, b_cur_loc, val, retdest
+    SWAP1
+    // stack: i, carry, a_cur_loc, b_cur_loc, val, retdest
+    %decrement
+    // stack: i-1, carry, a_cur_loc, b_cur_loc, val, retdest
+    SWAP2
+    // stack: a_cur_loc, carry, i-1, b_cur_loc, val, retdest
+    %increment
+    // stack: a_cur_loc+1, carry, i-1, b_cur_loc, val, retdest
+    SWAP3
+    // stack: b_cur_loc, carry, i-1, a_cur_loc+1, val, retdest
+    %increment
+    // stack: b_cur_loc+1, carry, i-1, a_cur_loc+1, val, retdest
+    %stack (b, c, i, a) -> (c, i, a, b)
+    // stack: carry, i-1, a_cur_loc+1, b_cur_loc+1, val, retdest
+    DUP2
+    // stack: i-1, carry, i-1, a_cur_loc+1, b_cur_loc+1, val, retdest
+    %jumpi(addmul_loop)
+addmul_end:
+    // stack: carry_new, i-1, a_cur_loc+1, b_cur_loc+1, val, retdest
+    %stack (c, i, a, b, v) -> (c)
+    // stack: carry_new, retdest
+    SWAP1
+    // stack: retdest, carry_new
+    JUMP
diff --git a/evm/src/cpu/kernel/asm/bignum/mul.asm b/evm/src/cpu/kernel/asm/bignum/mul.asm
index d96518fe..1b947c23 100644
--- a/evm/src/cpu/kernel/asm/bignum/mul.asm
+++ b/evm/src/cpu/kernel/asm/bignum/mul.asm
@@ -1,101 +1,6 @@
 // Arithmetic on little-endian integers represented with 128-bit limbs.
 // All integers must be under a given length bound, and are padded with leading zeroes.
 
-// Multiplies a bignum by a single-limb value. Resulting limbs may be larger than 128 bits.
-// This is a naive multiplication algorithm (BasecaseMultiply from Modern Computer Arithmetic).
-mul_bignum_helper:
-    // stack: len, start_loc, val, retdest
-    DUP2
-    // stack: start_loc, len, start_loc, val, retdest
-    ADD
-    // stack: end_loc, start_loc, val, retdest
-    SWAP2
-    SWAP1
-    // stack: i=start_loc, val, end_loc, retdest
-mul_helper_loop:
-    // stack: i, val, end_loc, retdest
-    DUP1
-    // stack: i, i, val, end_loc, retdest
-    %mload_kernel_general
-    // stack: bignum[i], i, val, end_loc, retdest
-    DUP3
-    // stack: val, bignum[i], i, val, end_loc, retdest
-    MUL
-    // stack: val * bignum[i], i, val, end_loc, retdest
-    DUP2
-    // stack: i, val * bignum[i], i, val, end_loc, retdest
-    %mstore_kernel_general
-    // stack: i, val, end_loc, retdest
-    %increment
-    // stack: i + 1, val, end_loc, retdest
-    DUP1
-    // stack: i + 1, i + 1, val, end_loc, retdest
-    DUP4
-    // stack: end_loc, i + 1, i + 1, val, end_loc, retdest
-    GT
-    %jumpi(mul_helper_loop)
-    // stack: n = 0, i, val, retdest
-    %pop3
-    // stack: retdest
-    JUMP
-
-// Reduces a bignum with limbs possibly greater than 128 bits to a normalized bignum with length len + 1.
-// Used after `mul_bignum_helper` to complete the process of multiplying a bignum by a constant value.
-mul_bignum_reduce_helper:
-    // stack: len, start_loc, retdest
-    DUP2
-    // stack: start_loc, len, start_loc, retdest
-    ADD
-    // stack: end_loc, start_loc, retdest
-    SWAP1
-    // stack: i=start_loc, end_loc, retdest
-reduce_loop:
-    // stack: i, end_loc, retdest
-    DUP1
-    // stack: i, i, end_loc, retdest
-    %mload_kernel_general
-    // stack: bignum[i], i, end_loc, retdest
-    DUP1
-    // stack: bignum[i], bignum[i], i, end_loc, retdest
-    %shl_const(128)
-    %shr_const(128)
-    // stack: bignum[i] % 2^128, bignum[i], i, end_loc, retdest
-    SWAP1
-    // stack: bignum[i], bignum[i] % 2^128, i, end_loc, retdest
-    %shr_const(128)
-    // stack: bignum[i] // 2^128, bignum[i] % 2^128, i, end_loc, retdest
-    DUP3
-    // stack: i, bignum[i] // 2^128, bignum[i] % 2^128, i, end_loc, retdest
-    %increment
-    // stack: i+1, bignum[i] // 2^128, bignum[i] % 2^128, i, end_loc, retdest
-    SWAP1
-    // stack: bignum[i] // 2^128, i+1, bignum[i] % 2^128, i, end_loc, retdest
-    DUP2
-    // stack: i+1, bignum[i] // 2^128, i+1, bignum[i] % 2^128, i, end_loc, retdest
-    %mload_kernel_general
-    // stack: bignum[i+1], bignum[i] // 2^128, i+1, bignum[i] % 2^128, i, end_loc, retdest
-    ADD
-    // stack: bignum[i+1] + bignum[i] // 2^128, i+1, bignum[i] % 2^128, i, end_loc, retdest
-    SWAP1
-    // stack: i+1, bignum[i+1] + bignum[i] // 2^128, bignum[i] % 2^128, i, end_loc, retdest
-    %mstore_kernel_general
-    // stack: bignum[i] % 2^128, i, end_loc, retdest
-    DUP2
-    // stack: i, bignum[i] % 2^128, i, end_loc, retdest
-    %mstore_kernel_general
-    // stack: i, end_loc, retdest
-    %increment
-    // stack: i + 1, end_loc, retdest
-    %stack (vals: 2) -> (vals, vals)
-    // stack: i + 1, end_loc, i + 1, end_loc, retdest
-    EQ
-    %jumpi(reduce_loop)
-reduce_end:
-    // stack: n = 0, i, retdest
-    %pop2
-    // stack: retdest
-    JUMP
-
 // Stores a * b in output_loc, leaving a and b unchanged.
 // Both a and b have length len; a * b will have length 2 * len.
 // Both output_loc and scratch_space must be initialized as zeroes (2 * len of them in the case
@@ -130,43 +35,21 @@ mul_loop:
     %mstore_kernel_general
     // stack: len, n, a_start_loc, bi, output_cur, scratch_space, retdest
 
-    // Use scratch_space to multiply a by b[i].
-    PUSH mul_return_1
-    // stack: mul_return_1, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
+    // Multiply a by b[i] and add into output_cur.
+    PUSH mul_return
+    // stack: mul_return, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
     DUP5
-    // stack: bi, mul_return_1, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
+    // stack: bi, mul_return, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
     %mload_kernel_general
-    // stack: b[i], mul_return_1, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
+    // stack: b[i], mul_return, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
+    DUP5
+    // stack: a_start_loc, b[i], mul_return, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
     DUP8
-    // stack: scratch_space, b[i], mul_return_1, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
-    DUP4
-    // stack: len, scratch_space, b[i], mul_return_1, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
-    %jump(mul_bignum_helper)
-mul_return_1:
-    // stack: len, n, a_start_loc, bi, output_cur, scratch_space, retdest
-    PUSH mul_return_2
-    // stack: mul_return_2, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
-    DUP7
-    // stack: scratch_space, mul_return_2, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
-    DUP3
-    // stack: len, scratch_space, mul_return_2, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
-    %jump(mul_bignum_reduce_helper)
-mul_return_2:
-    // stack: len, n, a_start_loc, bi, output_cur, scratch_space, retdest
-
-    // Add the multiplication result into output_cur = output_len[i].
-    PUSH mul_return_3
-    // stack: mul_return_3, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
-    DUP7
-    // stack: scratch_space, mul_return_3, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
-    DUP7
-    // stack: output_cur, scratch_space, mul_return_3, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
-    DUP4
-    // stack: len, output_cur, scratch_space, mul_return_3, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
-    %increment
-    // stack: len + 1, output_cur, scratch_space, mul_return_3, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
-    %jump(add_bignum)
-mul_return_3:
+    // stack: output_cur, a_start_loc, b[i], mul_return, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
+    DUP5
+    // stack: len, output_cur, a_start_loc, b[i], mul_return, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
+    %jump(addmul_bignum)
+mul_return:
     // stack: carry, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
     DUP6
     // stack: output_cur, carry, len, n, a_start_loc, bi, output_cur, scratch_space, retdest
diff --git a/evm/src/cpu/kernel/tests/bignum.rs b/evm/src/cpu/kernel/tests/bignum.rs
index f866caad..49d28342 100644
--- a/evm/src/cpu/kernel/tests/bignum.rs
+++ b/evm/src/cpu/kernel/tests/bignum.rs
@@ -210,6 +210,8 @@ fn test_mul_bignum() -> Result<()> {
     // Run mul function.
     interpreter.run()?;
 
+    dbg!(interpreter.stack());
+
     // Determine actual product.
     let new_memory = interpreter.get_kernel_general_memory();
     let output_location: usize = output_loc.try_into().unwrap();