Merge branch 'main' into glv

# Conflicts:
#	ecdsa/src/curve/ecdsa.rs
#	plonky2/src/gadgets/ecdsa.rs
wborgeaud 2022-03-18 07:19:28 +01:00
commit c472afe12e
57 changed files with 2528 additions and 1168 deletions

View File

@ -1,8 +1,8 @@
use plonky2_field::field_types::Field;
use serde::{Deserialize, Serialize};
use crate::curve::curve_msm::msm_parallel;
use crate::curve::curve_types::{base_to_scalar, AffinePoint, Curve, CurveScalar};
use crate::field::field_types::Field;
#[derive(Copy, Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
pub struct ECDSASignature<C: Curve> {
@ -16,6 +16,10 @@ pub struct ECDSASecretKey<C: Curve>(pub C::ScalarField);
#[derive(Copy, Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
pub struct ECDSAPublicKey<C: Curve>(pub AffinePoint<C>);
pub fn secret_to_public<C: Curve>(sk: ECDSASecretKey<C>) -> ECDSAPublicKey<C> {
ECDSAPublicKey((CurveScalar(sk.0) * C::GENERATOR_PROJECTIVE).to_affine())
}
pub fn sign_message<C: Curve>(msg: C::ScalarField, sk: ECDSASecretKey<C>) -> ECDSASignature<C> {
let (k, rr) = {
let mut k = C::ScalarField::rand();
@ -57,20 +61,18 @@ pub fn verify_message<C: Curve>(
#[cfg(test)]
mod tests {
use plonky2_field::field_types::Field;
use plonky2_field::secp256k1_scalar::Secp256K1Scalar;
use crate::curve::curve_types::{Curve, CurveScalar};
use crate::curve::ecdsa::{sign_message, verify_message, ECDSAPublicKey, ECDSASecretKey};
use crate::curve::ecdsa::{secret_to_public, sign_message, verify_message, ECDSASecretKey};
use crate::curve::secp256k1::Secp256K1;
use crate::field::field_types::Field;
use crate::field::secp256k1_scalar::Secp256K1Scalar;
#[test]
fn test_ecdsa_native() {
type C = Secp256K1;
let msg = Secp256K1Scalar::rand();
let sk = ECDSASecretKey(Secp256K1Scalar::rand());
let pk = ECDSAPublicKey((CurveScalar(sk.0) * C::GENERATOR_PROJECTIVE).to_affine());
let sk = ECDSASecretKey::<C>(Secp256K1Scalar::rand());
let pk = secret_to_public(sk);
let sig = sign_message(msg, sk);
let result = verify_message(msg, sig, pk);

View File

@ -12,3 +12,4 @@ num = { version = "0.4", features = [ "rand" ] }
rand = "0.8.4"
serde = { version = "1.0", features = ["derive"] }
unroll = "0.1.5"
static_assertions = "1.1.0"

View File

@ -170,7 +170,7 @@ impl<F: Extendable<2>> Mul for QuadraticExtension<F> {
type Output = Self;
#[inline]
fn mul(self, rhs: Self) -> Self {
default fn mul(self, rhs: Self) -> Self {
let Self([a0, a1]) = self;
let Self([b0, b1]) = rhs;

View File

@ -201,7 +201,7 @@ impl<F: Extendable<4>> Mul for QuarticExtension<F> {
type Output = Self;
#[inline]
fn mul(self, rhs: Self) -> Self {
default fn mul(self, rhs: Self) -> Self {
let Self([a0, a1, a2, a3]) = self;
let Self([b0, b1, b2, b3]) = rhs;

View File

@ -201,7 +201,7 @@ impl<F: Extendable<5>> Mul for QuinticExtension<F> {
type Output = Self;
#[inline]
fn mul(self, rhs: Self) -> Self {
default fn mul(self, rhs: Self) -> Self {
let Self([a0, a1, a2, a3, a4]) = self;
let Self([b0, b1, b2, b3, b4]) = rhs;
let w = <Self as OEF<5>>::W;

View File

@ -462,6 +462,11 @@ pub trait PrimeField64: PrimeField + Field64 {
fn to_canonical_u64(&self) -> u64;
fn to_noncanonical_u64(&self) -> u64;
#[inline(always)]
fn to_canonical(&self) -> Self {
Self::from_canonical_u64(self.to_canonical_u64())
}
}
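The new `to_canonical` default simply round-trips through the canonical `u64` representation. A minimal test-style sketch for the Goldilocks field (the test name and values are illustrative, assuming the usual `GoldilocksField`/`Field64` items are in scope):
#[test]
fn to_canonical_normalizes_representation() {
    // ORDER + 1 is a noncanonical representation of 1.
    let x = GoldilocksField(GoldilocksField::ORDER + 1);
    assert_eq!(x.to_canonical_u64(), 1);
    assert_eq!(x.to_canonical(), GoldilocksField(1));
}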
/// An iterator over the powers of a certain base element `b`: `b^0, b^1, b^2, ...`.

View File

@ -0,0 +1,495 @@
use std::ops::Mul;
use static_assertions::const_assert;
use crate::extension_field::quadratic::QuadraticExtension;
use crate::extension_field::quartic::QuarticExtension;
use crate::extension_field::quintic::QuinticExtension;
use crate::extension_field::{Extendable, Frobenius};
use crate::field_types::Field;
use crate::goldilocks_field::{reduce160, GoldilocksField};
impl Frobenius<1> for GoldilocksField {}
impl Extendable<2> for GoldilocksField {
type Extension = QuadraticExtension<Self>;
// Verifiable in Sage with
// `R.<x> = GF(p)[]; assert (x^2 - 7).is_irreducible()`.
const W: Self = Self(7);
// DTH_ROOT = W^((ORDER - 1)/2)
const DTH_ROOT: Self = Self(18446744069414584320);
const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] =
[Self(18081566051660590251), Self(16121475356294670766)];
const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)];
}
impl Mul for QuadraticExtension<GoldilocksField> {
#[inline]
fn mul(self, rhs: Self) -> Self {
let Self([a0, a1]) = self;
let Self([b0, b1]) = rhs;
let c = ext2_mul([a0.0, a1.0], [b0.0, b1.0]);
Self(c)
}
}
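This impl, together with the quartic and quintic ones below, is the concrete specialization that the `default fn mul` changes in the generic extension-field files above make possible: the generic `Mul` stays as the default, and the Goldilocks-specific versions here override it with the delayed-reduction routines defined later in this file.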
impl Extendable<4> for GoldilocksField {
type Extension = QuarticExtension<Self>;
const W: Self = Self(7);
// DTH_ROOT = W^((ORDER - 1)/4)
const DTH_ROOT: Self = Self(281474976710656);
const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [
Self(5024755240244648895),
Self(13227474371289740625),
Self(3912887029498544536),
Self(3900057112666848848),
];
const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] =
[Self(0), Self(0), Self(0), Self(12587610116473453104)];
}
impl Mul for QuarticExtension<GoldilocksField> {
#[inline]
fn mul(self, rhs: Self) -> Self {
let Self([a0, a1, a2, a3]) = self;
let Self([b0, b1, b2, b3]) = rhs;
let c = ext4_mul([a0.0, a1.0, a2.0, a3.0], [b0.0, b1.0, b2.0, b3.0]);
Self(c)
}
}
impl Extendable<5> for GoldilocksField {
type Extension = QuinticExtension<Self>;
const W: Self = Self(3);
// DTH_ROOT = W^((ORDER - 1)/5)
const DTH_ROOT: Self = Self(1041288259238279555);
const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [
Self(2899034827742553394),
Self(13012057356839176729),
Self(14593811582388663055),
Self(7722900811313895436),
Self(4557222484695340057),
];
const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [
Self::POWER_OF_TWO_GENERATOR,
Self(0),
Self(0),
Self(0),
Self(0),
];
}
impl Mul for QuinticExtension<GoldilocksField> {
#[inline]
fn mul(self, rhs: Self) -> Self {
let Self([a0, a1, a2, a3, a4]) = self;
let Self([b0, b1, b2, b3, b4]) = rhs;
let c = ext5_mul(
[a0.0, a1.0, a2.0, a3.0, a4.0],
[b0.0, b1.0, b2.0, b3.0, b4.0],
);
Self(c)
}
}
/*
* The functions extD_add_prods[0-4] are helper functions for
* computing products for extensions of degree D over the Goldilocks
* field. They are faster than the generic method because all
* reductions are delayed until the end, which means only one
* reduction per result coefficient is necessary.
*/
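In general, for a degree-D extension defined by x^D = W, coefficient k of a product is the sum of a_i*b_j over i + j = k, plus W times the sum of a_i*b_j over i + j = k + D; the helpers below are unrolled instances of this for D = 2, 4 and 5. For contrast, a minimal field-level sketch of the D = 2 case that reduces after every operation rather than once per output coefficient (the helper name is illustrative):
fn ext2_mul_naive(a: [GoldilocksField; 2], b: [GoldilocksField; 2]) -> [GoldilocksField; 2] {
    // (a0 + a1*x) * (b0 + b1*x) mod (x^2 - W), with W = 7 for this extension.
    let w = GoldilocksField(7);
    let c0 = a[0] * b[0] + w * a[1] * b[1];
    let c1 = a[0] * b[1] + a[1] * b[0];
    [c0, c1]
}
Every multiplication and addition above performs its own modular reduction, whereas `ext2_mul` below reduces once per output coefficient.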
/// Return a, b such that a + b*2^128 = 3*x with a < 2^128 and b < 2^32.
#[inline(always)]
fn u160_times_3(x: u128, y: u32) -> (u128, u32) {
let (s, cy) = x.overflowing_add(x << 1);
(s, 3 * y + (x >> 127) as u32 + cy as u32)
}
/// Return a, b such that a + b*2^128 = 7*x with a < 2^128 and b < 2^32.
#[inline(always)]
fn u160_times_7(x: u128, y: u32) -> (u128, u32) {
let (d, br) = (x << 3).overflowing_sub(x);
// NB: subtracting the borrow can't underflow
(d, 7 * y + (x >> (128 - 3)) as u32 - br as u32)
}
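A small test-style sketch of the contract shared by these two helpers, namely that the returned pair (lo, hi) represents 3 (resp. 7) times the input value x + y * 2^128 (the test name is illustrative):
#[test]
fn u160_small_multiples_handle_wraparound() {
    // 3 * (2^128 - 1) = 3 * 2^128 - 3 = (2^128 - 3) + 2 * 2^128.
    assert_eq!(u160_times_3(u128::MAX, 0), (u128::MAX - 2, 2));
    // 7 * (2^128 - 1) = 7 * 2^128 - 7 = (2^128 - 7) + 6 * 2^128.
    assert_eq!(u160_times_7(u128::MAX, 0), (u128::MAX - 6, 6));
}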
/*
* Quadratic multiplication and squaring
*/
#[inline(always)]
fn ext2_add_prods0(a: &[u64; 2], b: &[u64; 2]) -> GoldilocksField {
// Computes a0 * b0 + W * a1 * b1;
let [a0, a1] = *a;
let [b0, b1] = *b;
let cy;
// W * a1 * b1
let (mut cumul_lo, mut cumul_hi) = u160_times_7((a1 as u128) * (b1 as u128), 0u32);
// a0 * b0
(cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b0 as u128));
cumul_hi += cy as u32;
unsafe { reduce160(cumul_lo, cumul_hi) }
}
#[inline(always)]
fn ext2_add_prods1(a: &[u64; 2], b: &[u64; 2]) -> GoldilocksField {
// Computes a0 * b1 + a1 * b0;
let [a0, a1] = *a;
let [b0, b1] = *b;
let cy;
// a0 * b1
let mut cumul_lo = (a0 as u128) * (b1 as u128);
// a1 * b0
(cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b0 as u128));
let cumul_hi = cy as u32;
unsafe { reduce160(cumul_lo, cumul_hi) }
}
/// Multiply a and b considered as elements of GF(p^2).
#[inline(always)]
pub(crate) fn ext2_mul(a: [u64; 2], b: [u64; 2]) -> [GoldilocksField; 2] {
// The code in ext2_add_prods[01] assumes the quadratic extension
// generator is 7.
const_assert!(<GoldilocksField as Extendable<2>>::W.0 == 7u64);
let c0 = ext2_add_prods0(&a, &b);
let c1 = ext2_add_prods1(&a, &b);
[c0, c1]
}
/*
* Quartic multiplication and squaring
*/
#[inline(always)]
fn ext4_add_prods0(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField {
// Computes c0 = a0 * b0 + W * (a1 * b3 + a2 * b2 + a3 * b1)
let [a0, a1, a2, a3] = *a;
let [b0, b1, b2, b3] = *b;
let mut cy;
// a1 * b3
let mut cumul_lo = (a1 as u128) * (b3 as u128);
// a2 * b2
(cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b2 as u128));
let mut cumul_hi = cy as u32;
// a3 * b1
(cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b1 as u128));
cumul_hi += cy as u32;
// * W
(cumul_lo, cumul_hi) = u160_times_7(cumul_lo, cumul_hi);
// a0 * b0
(cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b0 as u128));
cumul_hi += cy as u32;
unsafe { reduce160(cumul_lo, cumul_hi) }
}
#[inline(always)]
fn ext4_add_prods1(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField {
// Computes c1 = a0 * b1 + a1 * b0 + W * (a2 * b3 + a3 * b2);
let [a0, a1, a2, a3] = *a;
let [b0, b1, b2, b3] = *b;
let mut cy;
// a2 * b3
let mut cumul_lo = (a2 as u128) * (b3 as u128);
// a3 * b2
(cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b2 as u128));
let mut cumul_hi = cy as u32;
// * W
(cumul_lo, cumul_hi) = u160_times_7(cumul_lo, cumul_hi);
// a0 * b1
(cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b1 as u128));
cumul_hi += cy as u32;
// a1 * b0
(cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b0 as u128));
cumul_hi += cy as u32;
unsafe { reduce160(cumul_lo, cumul_hi) }
}
#[inline(always)]
fn ext4_add_prods2(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField {
// Computes c2 = a0 * b2 + a1 * b1 + a2 * b0 + W * a3 * b3;
let [a0, a1, a2, a3] = *a;
let [b0, b1, b2, b3] = *b;
let mut cy;
// W * a3 * b3
let (mut cumul_lo, mut cumul_hi) = u160_times_7((a3 as u128) * (b3 as u128), 0u32);
// a0 * b2
(cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b2 as u128));
cumul_hi += cy as u32;
// a1 * b1
(cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b1 as u128));
cumul_hi += cy as u32;
// a2 * b0
(cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b0 as u128));
cumul_hi += cy as u32;
unsafe { reduce160(cumul_lo, cumul_hi) }
}
#[inline(always)]
fn ext4_add_prods3(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField {
// Computes c3 = a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
let [a0, a1, a2, a3] = *a;
let [b0, b1, b2, b3] = *b;
let mut cy;
// a0 * b3
let mut cumul_lo = (a0 as u128) * (b3 as u128);
// a1 * b2
(cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b2 as u128));
let mut cumul_hi = cy as u32;
// a2 * b1
(cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b1 as u128));
cumul_hi += cy as u32;
// a3 * b0
(cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b0 as u128));
cumul_hi += cy as u32;
unsafe { reduce160(cumul_lo, cumul_hi) }
}
/// Multiply a and b considered as elements of GF(p^4).
#[inline(always)]
pub(crate) fn ext4_mul(a: [u64; 4], b: [u64; 4]) -> [GoldilocksField; 4] {
// The code in ext4_add_prods[0-3] assumes the quartic extension
// generator is 7.
const_assert!(<GoldilocksField as Extendable<4>>::W.0 == 7u64);
let c0 = ext4_add_prods0(&a, &b);
let c1 = ext4_add_prods1(&a, &b);
let c2 = ext4_add_prods2(&a, &b);
let c3 = ext4_add_prods3(&a, &b);
[c0, c1, c2, c3]
}
/*
* Quintic multiplication and squaring
*/
#[inline(always)]
fn ext5_add_prods0(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField {
// Computes c0 = a0 * b0 + W * (a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1)
let [a0, a1, a2, a3, a4] = *a;
let [b0, b1, b2, b3, b4] = *b;
let mut cy;
// a1 * b4
let mut cumul_lo = (a1 as u128) * (b4 as u128);
// a2 * b3
(cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b3 as u128));
let mut cumul_hi = cy as u32;
// a3 * b2
(cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b2 as u128));
cumul_hi += cy as u32;
// a4 * b1
(cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b1 as u128));
cumul_hi += cy as u32;
// * W
(cumul_lo, cumul_hi) = u160_times_3(cumul_lo, cumul_hi);
// a0 * b0
(cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b0 as u128));
cumul_hi += cy as u32;
unsafe { reduce160(cumul_lo, cumul_hi) }
}
#[inline(always)]
fn ext5_add_prods1(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField {
// Computes c1 = a0 * b1 + a1 * b0 + W * (a2 * b4 + a3 * b3 + a4 * b2);
let [a0, a1, a2, a3, a4] = *a;
let [b0, b1, b2, b3, b4] = *b;
let mut cy;
// a2 * b4
let mut cumul_lo = (a2 as u128) * (b4 as u128);
// a3 * b3
(cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b3 as u128));
let mut cumul_hi = cy as u32;
// a4 * b2
(cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b2 as u128));
cumul_hi += cy as u32;
// * W
(cumul_lo, cumul_hi) = u160_times_3(cumul_lo, cumul_hi);
// a0 * b1
(cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b1 as u128));
cumul_hi += cy as u32;
// a1 * b0
(cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b0 as u128));
cumul_hi += cy as u32;
unsafe { reduce160(cumul_lo, cumul_hi) }
}
#[inline(always)]
fn ext5_add_prods2(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField {
// Computes c2 = a0 * b2 + a1 * b1 + a2 * b0 + W * (a3 * b4 + a4 * b3);
let [a0, a1, a2, a3, a4] = *a;
let [b0, b1, b2, b3, b4] = *b;
let mut cy;
// a3 * b4
let mut cumul_lo = (a3 as u128) * (b4 as u128);
// a4 * b3
(cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b3 as u128));
let mut cumul_hi = cy as u32;
// * W
(cumul_lo, cumul_hi) = u160_times_3(cumul_lo, cumul_hi);
// a0 * b2
(cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b2 as u128));
cumul_hi += cy as u32;
// a1 * b1
(cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b1 as u128));
cumul_hi += cy as u32;
// a2 * b0
(cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b0 as u128));
cumul_hi += cy as u32;
unsafe { reduce160(cumul_lo, cumul_hi) }
}
#[inline(always)]
fn ext5_add_prods3(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField {
// Computes c3 = a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0 + W * a4 * b4;
let [a0, a1, a2, a3, a4] = *a;
let [b0, b1, b2, b3, b4] = *b;
let mut cy;
// W * a4 * b4
let (mut cumul_lo, mut cumul_hi) = u160_times_3((a4 as u128) * (b4 as u128), 0u32);
// a0 * b3
(cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b3 as u128));
cumul_hi += cy as u32;
// a1 * b2
(cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b2 as u128));
cumul_hi += cy as u32;
// a2 * b1
(cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b1 as u128));
cumul_hi += cy as u32;
// a3 * b0
(cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b0 as u128));
cumul_hi += cy as u32;
unsafe { reduce160(cumul_lo, cumul_hi) }
}
#[inline(always)]
fn ext5_add_prods4(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField {
// Computes c4 = a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0;
let [a0, a1, a2, a3, a4] = *a;
let [b0, b1, b2, b3, b4] = *b;
let mut cy;
// a0 * b4
let mut cumul_lo = (a0 as u128) * (b4 as u128);
// a1 * b3
(cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b3 as u128));
let mut cumul_hi = cy as u32;
// a2 * b2
(cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b2 as u128));
cumul_hi += cy as u32;
// a3 * b1
(cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b1 as u128));
cumul_hi += cy as u32;
// a4 * b0
(cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b0 as u128));
cumul_hi += cy as u32;
unsafe { reduce160(cumul_lo, cumul_hi) }
}
/// Multiply a and b considered as elements of GF(p^5).
#[inline(always)]
pub(crate) fn ext5_mul(a: [u64; 5], b: [u64; 5]) -> [GoldilocksField; 5] {
// The code in ext5_add_prods[0-4] assumes the quintic extension
// generator is 3.
const_assert!(<GoldilocksField as Extendable<5>>::W.0 == 3u64);
let c0 = ext5_add_prods0(&a, &b);
let c1 = ext5_add_prods1(&a, &b);
let c2 = ext5_add_prods2(&a, &b);
let c3 = ext5_add_prods3(&a, &b);
let c4 = ext5_add_prods4(&a, &b);
[c0, c1, c2, c3, c4]
}

View File

@ -9,10 +9,6 @@ use plonky2_util::{assume, branch_hint};
use rand::Rng;
use serde::{Deserialize, Serialize};
use crate::extension_field::quadratic::QuadraticExtension;
use crate::extension_field::quartic::QuarticExtension;
use crate::extension_field::quintic::QuinticExtension;
use crate::extension_field::{Extendable, Frobenius};
use crate::field_types::{Field, Field64, PrimeField, PrimeField64};
use crate::inversion::try_inverse_u64;
@ -99,7 +95,7 @@ impl Field for GoldilocksField {
Self(n.mod_floor(&Self::order()).to_u64_digits()[0])
}
#[inline]
#[inline(always)]
fn from_canonical_u64(n: u64) -> Self {
debug_assert!(n < Self::ORDER);
Self(n)
@ -160,6 +156,7 @@ impl PrimeField64 for GoldilocksField {
c
}
#[inline(always)]
fn to_noncanonical_u64(&self) -> u64 {
self.0
}
@ -283,66 +280,6 @@ impl DivAssign for GoldilocksField {
}
}
impl Extendable<2> for GoldilocksField {
type Extension = QuadraticExtension<Self>;
// Verifiable in Sage with
// `R.<x> = GF(p)[]; assert (x^2 - 7).is_irreducible()`.
const W: Self = Self(7);
// DTH_ROOT = W^((ORDER - 1)/2)
const DTH_ROOT: Self = Self(18446744069414584320);
const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] =
[Self(18081566051660590251), Self(16121475356294670766)];
const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)];
}
impl Extendable<4> for GoldilocksField {
type Extension = QuarticExtension<Self>;
const W: Self = Self(7);
// DTH_ROOT = W^((ORDER - 1)/4)
const DTH_ROOT: Self = Self(281474976710656);
const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [
Self(5024755240244648895),
Self(13227474371289740625),
Self(3912887029498544536),
Self(3900057112666848848),
];
const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] =
[Self(0), Self(0), Self(0), Self(12587610116473453104)];
}
impl Extendable<5> for GoldilocksField {
type Extension = QuinticExtension<Self>;
const W: Self = Self(3);
// DTH_ROOT = W^((ORDER - 1)/5)
const DTH_ROOT: Self = Self(1041288259238279555);
const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [
Self(2899034827742553394),
Self(13012057356839176729),
Self(14593811582388663055),
Self(7722900811313895436),
Self(4557222484695340057),
];
const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [
Self::POWER_OF_TWO_GENERATOR,
Self(0),
Self(0),
Self(0),
Self(0),
];
}
/// Fast addition modulo ORDER for x86-64.
/// This function is marked unsafe for the following reasons:
/// - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001.
@ -407,7 +344,34 @@ fn split(x: u128) -> (u64, u64) {
(x as u64, (x >> 64) as u64)
}
impl Frobenius<1> for GoldilocksField {}
/// Reduce the value x_lo + x_hi * 2^128 to an element in the
/// Goldilocks field.
///
/// This function is marked 'unsafe' because correctness relies on the
/// unchecked assumption that x < 2^160 - 2^128 + 2^96. Further,
/// performance may degrade as x_hi increases beyond 2**40 or so.
#[inline(always)]
pub(crate) unsafe fn reduce160(x_lo: u128, x_hi: u32) -> GoldilocksField {
let x_hi = (x_lo >> 96) as u64 + ((x_hi as u64) << 32); // shld to form x_hi
let x_mid = (x_lo >> 64) as u32; // shr to form x_mid
let x_lo = x_lo as u64;
// sub + jc (should fuse)
let (mut t0, borrow) = x_lo.overflowing_sub(x_hi);
if borrow {
// The maximum possible value of x is (2^64 - 1)^2 * 4 * 7 < 2^133,
// so x_hi < 2^37. A borrow will happen roughly one in 134 million
// times, so it's best to branch.
branch_hint();
// NB: this assumes that x < 2^160 - 2^128 + 2^96.
t0 -= EPSILON; // Cannot underflow if x_hi is canonical.
}
// imul
let t1 = (x_mid as u64) * EPSILON;
// add, sbb, add
let t2 = add_no_canonicalize_trashing_input(t0, t1);
GoldilocksField(t2)
}
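The splitting above relies on the Goldilocks congruences 2^64 ≡ 2^32 - 1 = EPSILON and 2^96 ≡ -1 (mod p), so x = lo + mid*2^64 + hi*2^96 reduces to lo - hi + mid*EPSILON. A slow but straightforward reference sketch of the same identity using wide integer arithmetic (the function name is illustrative, not part of this file):
fn reduce160_reference(x_lo: u128, x_hi: u32) -> u64 {
    const P: u128 = 0xffff_ffff_0000_0001; // Goldilocks order, 2^64 - 2^32 + 1
    const EPS: u128 = 0xffff_ffff; // 2^32 - 1 == 2^64 mod P
    let lo = (x_lo as u64) as u128; // bits 0..64
    let mid = ((x_lo >> 64) as u32) as u128; // bits 64..96
    let hi = ((x_lo >> 96) as u64 + ((x_hi as u64) << 32)) as u128; // bits 96..160
    // lo + mid*2^64 + hi*2^96 == lo + mid*EPS - hi (mod P); the sum below stays far below 2^128.
    ((lo + mid * EPS + P - hi % P) % P) as u64
}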
#[cfg(test)]
mod tests {

View File

@ -15,6 +15,7 @@ pub mod cosets;
pub mod extension_field;
pub mod fft;
pub mod field_types;
pub mod goldilocks_extensions;
pub mod goldilocks_field;
pub mod interpolation;
mod inversion;

View File

@ -1,4 +1,5 @@
use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
use plonky2::field::extension_field::quadratic::QuadraticExtension;
use plonky2::field::extension_field::quartic::QuarticExtension;
use plonky2::field::extension_field::quintic::QuinticExtension;
use plonky2::field::field_types::Field;
@ -175,6 +176,7 @@ pub(crate) fn bench_field<F: Field>(c: &mut Criterion) {
fn criterion_benchmark(c: &mut Criterion) {
bench_field::<GoldilocksField>(c);
bench_field::<QuadraticExtension<GoldilocksField>>(c);
bench_field::<QuarticExtension<GoldilocksField>>(c);
bench_field::<QuinticExtension<GoldilocksField>>(c);
}

View File

@ -7,22 +7,15 @@ use plonky2_field::goldilocks_field::GoldilocksField;
use rand::{Rng, SeedableRng};
use rand_chacha::ChaCha8Rng;
// For historical reasons, we sample from 0..0xffffffff70000001, which is slightly larger than the
// range of GoldilocksField, then verify that each constant also fits in GoldilocksField.
const SAMPLE_RANGE_END: u64 = 0xffffffff70000001;
const SAMPLE_RANGE_END: u64 = GoldilocksField::ORDER;
// const N: usize = 8 * 30; // For Poseidon-8
const N: usize = 12 * 30; // For Posiedon-12
const N: usize = 12 * 30; // For Poseidon-12
pub(crate) fn main() {
let mut rng = ChaCha8Rng::seed_from_u64(0);
let mut constants = [0u64; N];
for i in 0..N {
constants[i] = rng.gen_range(0..SAMPLE_RANGE_END);
// Make sure the constant fits in Goldilocks. If so, we also have random numbers in
// GoldilocksField::ORDER. This may be viewed as rejection sampling, except that we never
// encounter a rejection in practice, so we don't bother handling it.
assert!(constants[i] < GoldilocksField::ORDER);
}
// Print the constants in the format we prefer in our code.

View File

@ -35,6 +35,7 @@ impl FriConfig {
let reduction_arity_bits = self.reduction_strategy.reduction_arity_bits(
degree_bits,
self.rate_bits,
self.cap_height,
self.num_query_rounds,
);
FriParams {
@ -67,7 +68,7 @@ pub struct FriParams {
}
impl FriParams {
pub(crate) fn total_arities(&self) -> usize {
pub fn total_arities(&self) -> usize {
self.reduction_arity_bits.iter().sum()
}

View File

@ -9,9 +9,10 @@ pub enum FriReductionStrategy {
Fixed(Vec<usize>),
/// `ConstantArityBits(arity_bits, final_poly_bits)` applies reductions of arity `2^arity_bits`
/// until the polynomial degree is `2^final_poly_bits` or less. This tends to work well in the
/// recursive setting, as it avoids needing multiple configurations of gates used in FRI
/// verification, such as `InterpolationGate`.
/// until the polynomial degree is less than or equal to `2^final_poly_bits` or until any further
/// `arity_bits`-reduction makes the last FRI tree have height less than `cap_height`.
/// This tends to work well in the recursive setting, as it avoids needing multiple configurations
/// of gates used in FRI verification, such as `InterpolationGate`.
ConstantArityBits(usize, usize),
/// `MinSize(opt_max_arity_bits)` searches for an optimal sequence of reduction arities, with an
@ -26,17 +27,20 @@ impl FriReductionStrategy {
&self,
mut degree_bits: usize,
rate_bits: usize,
cap_height: usize,
num_queries: usize,
) -> Vec<usize> {
match self {
FriReductionStrategy::Fixed(reduction_arity_bits) => reduction_arity_bits.to_vec(),
FriReductionStrategy::ConstantArityBits(arity_bits, final_poly_bits) => {
&FriReductionStrategy::ConstantArityBits(arity_bits, final_poly_bits) => {
let mut result = Vec::new();
while degree_bits > *final_poly_bits {
result.push(*arity_bits);
assert!(degree_bits >= *arity_bits);
degree_bits -= *arity_bits;
while degree_bits > final_poly_bits
&& degree_bits + rate_bits - arity_bits >= cap_height
{
result.push(arity_bits);
assert!(degree_bits >= arity_bits);
degree_bits -= arity_bits;
}
result.shrink_to_fit();
result
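An illustrative worked example of the new `cap_height` cutoff (parameter values chosen for illustration only):
// ConstantArityBits(arity_bits = 3, final_poly_bits = 1), degree_bits = 6, rate_bits = 1, cap_height = 4:
//   pass 1: 6 > 1 and 6 + 1 - 3 = 4 >= 4  -> push 3, degree_bits becomes 3
//   pass 2: 3 > 1 but 3 + 1 - 3 = 1 < 4   -> stop; the cap-height condition, not final_poly_bits, ends the loop
let arities = FriReductionStrategy::ConstantArityBits(3, 1).reduction_arity_bits(6, 1, 4, 28);
assert_eq!(arities, vec![3]); // num_queries (28 here) is not consulted by this strategy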

View File

View File

@ -51,9 +51,13 @@ impl<F: RichField + Extendable<D> + Poseidon, const D: usize> PoseidonMdsGate<F,
let mut res = ExtensionAlgebra::ZERO;
for i in 0..SPONGE_WIDTH {
let coeff = F::Extension::from_canonical_u64(1 << <F as Poseidon>::MDS_MATRIX_EXPS[i]);
let coeff = F::Extension::from_canonical_u64(<F as Poseidon>::MDS_MATRIX_CIRC[i]);
res += v[(i + r) % SPONGE_WIDTH].scalar_mul(coeff);
}
{
let coeff = F::Extension::from_canonical_u64(<F as Poseidon>::MDS_MATRIX_DIAG[r]);
res += v[r].scalar_mul(coeff);
}
res
}
@ -69,10 +73,16 @@ impl<F: RichField + Extendable<D> + Poseidon, const D: usize> PoseidonMdsGate<F,
for i in 0..SPONGE_WIDTH {
let coeff = builder.constant_extension(F::Extension::from_canonical_u64(
1 << <F as Poseidon>::MDS_MATRIX_EXPS[i],
<F as Poseidon>::MDS_MATRIX_CIRC[i],
));
res = builder.scalar_mul_add_ext_algebra(coeff, v[(i + r) % SPONGE_WIDTH], res);
}
{
let coeff = builder.constant_extension(F::Extension::from_canonical_u64(
<F as Poseidon>::MDS_MATRIX_DIAG[r],
));
res = builder.scalar_mul_add_ext_algebra(coeff, v[r], res);
}
res
}
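The gate now evaluates row `r` of an MDS matrix given as a circulant part (`MDS_MATRIX_CIRC`) plus a diagonal part (`MDS_MATRIX_DIAG`), replacing the old power-of-two exponent form. A scalar sketch of the same row formula over plain field elements (the helper name and signature are illustrative):
fn mds_row<F: Field>(v: &[F; SPONGE_WIDTH], r: usize, circ: &[u64; SPONGE_WIDTH], diag: &[u64; SPONGE_WIDTH]) -> F {
    let mut res = F::ZERO;
    for i in 0..SPONGE_WIDTH {
        // Circulant part: row r uses the constant vector rotated by r.
        res += F::from_canonical_u64(circ[i]) * v[(i + r) % SPONGE_WIDTH];
    }
    // The diagonal part contributes only the r-th input.
    res + F::from_canonical_u64(diag[r]) * v[r]
}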

View File

@ -2,37 +2,24 @@
use std::arch::aarch64::*;
use std::arch::asm;
use std::mem::transmute;
use plonky2_field::field_types::Field64;
use plonky2_field::goldilocks_field::GoldilocksField;
use plonky2_util::branch_hint;
use static_assertions::const_assert;
use unroll::unroll_for_loops;
use crate::hash::poseidon::{
Poseidon, ALL_ROUND_CONSTANTS, HALF_N_FULL_ROUNDS, N_PARTIAL_ROUNDS, N_ROUNDS,
};
use crate::hash::poseidon::Poseidon;
// ========================================== CONSTANTS ===========================================
const WIDTH: usize = 12;
// The order below is arbitrary. Repeated coefficients have been removed so these constants fit in
// two registers.
// TODO: ensure this is aligned to 16 bytes (for vector loads), ideally on the same cacheline
const MDS_CONSTS: [u32; 8] = [
0xffffffff,
1 << 1,
1 << 3,
1 << 5,
1 << 8,
1 << 10,
1 << 12,
1 << 16,
];
const EPSILON: u64 = 0xffffffff;
// The round constants to be applied by the second set of full rounds. These are just the usual round constants,
// shifted by one round, with zeros shifted in.
// The round constants to be applied by the second set of full rounds. These are just the usual
// round constants, shifted by one round, with zeros shifted in.
/*
const fn make_final_round_constants() -> [u64; WIDTH * HALF_N_FULL_ROUNDS] {
let mut res = [0; WIDTH * HALF_N_FULL_ROUNDS];
let mut i: usize = 0;
@ -43,6 +30,7 @@ const fn make_final_round_constants() -> [u64; WIDTH * HALF_N_FULL_ROUNDS] {
res
}
const FINAL_ROUND_CONSTANTS: [u64; WIDTH * HALF_N_FULL_ROUNDS] = make_final_round_constants();
*/
// ===================================== COMPILE-TIME CHECKS ======================================
@ -52,9 +40,12 @@ const FINAL_ROUND_CONSTANTS: [u64; WIDTH * HALF_N_FULL_ROUNDS] = make_final_roun
const fn check_mds_matrix() -> bool {
// Can't == two arrays in a const_assert! (:
let mut i = 0;
let wanted_matrix_exps = [0, 0, 1, 0, 3, 5, 1, 8, 12, 3, 16, 10];
let wanted_matrix_circ = [17, 15, 41, 16, 2, 28, 13, 13, 39, 18, 34, 20];
let wanted_matrix_diag = [8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
while i < WIDTH {
if <GoldilocksField as Poseidon>::MDS_MATRIX_EXPS[i] != wanted_matrix_exps[i] {
if <GoldilocksField as Poseidon>::MDS_MATRIX_CIRC[i] != wanted_matrix_circ[i]
|| <GoldilocksField as Poseidon>::MDS_MATRIX_DIAG[i] != wanted_matrix_diag[i]
{
return false;
}
i += 1;
@ -63,37 +54,10 @@ const fn check_mds_matrix() -> bool {
}
const_assert!(check_mds_matrix());
/// The maximum amount by which the MDS matrix will multiply the input.
/// i.e. max(MDS(state)) <= mds_matrix_inf_norm() * max(state).
const fn mds_matrix_inf_norm() -> u64 {
let mut cumul = 0;
let mut i = 0;
while i < WIDTH {
cumul += 1 << <GoldilocksField as Poseidon>::MDS_MATRIX_EXPS[i];
i += 1;
}
cumul
}
/// Ensure that adding round constants to the low result of the MDS multiplication can never
/// overflow.
#[allow(dead_code)]
const fn check_round_const_bounds_mds() -> bool {
let max_mds_res = mds_matrix_inf_norm() * (u32::MAX as u64);
let mut i = WIDTH; // First const layer is handled specially.
while i < WIDTH * N_ROUNDS {
if ALL_ROUND_CONSTANTS[i].overflowing_add(max_mds_res).1 {
return false;
}
i += 1;
}
true
}
const_assert!(check_round_const_bounds_mds());
/// Ensure that the first WIDTH round constants are in canonical* form. This is required because
/// the first constant layer does not handle double overflow.
/// *: round_const == GoldilocksField::ORDER is safe.
/*
#[allow(dead_code)]
const fn check_round_const_bounds_init() -> bool {
let mut i = 0;
@ -106,11 +70,9 @@ const fn check_round_const_bounds_init() -> bool {
true
}
const_assert!(check_round_const_bounds_init());
*/
// ====================================== SCALAR ARITHMETIC =======================================
const EPSILON: u64 = 0xffffffff;
/// Addition modulo ORDER accounting for wraparound. Correct only when a + b < 2**64 + ORDER.
#[inline(always)]
unsafe fn add_with_wraparound(a: u64, b: u64) -> u64 {
@ -133,7 +95,16 @@ unsafe fn add_with_wraparound(a: u64, b: u64) -> u64 {
/// Subtraction of a and (b >> 32) modulo ORDER accounting for wraparound.
#[inline(always)]
unsafe fn sub_with_wraparound_lsr32(a: u64, b: u64) -> u64 {
let b_hi = b >> 32;
let mut b_hi = b >> 32;
// Make sure that LLVM emits two separate instructions for the shift and the subtraction. This
// reduces pressure on the execution units with access to the flags, as they are no longer
// responsible for the shift. The hack is to insert a fake computation between the two
// instructions with an `asm` block to make LLVM think that they can't be merged.
asm!(
"/* {0} */", // Make Rust think we're using the register.
inlateout(reg) b_hi,
options(nomem, nostack, preserves_flags, pure),
);
// This could be done with a.overflowing_add(b_hi), but `checked_sub` signals to the compiler
// that overflow is unlikely (note: this is a standard library implementation detail, not part
// of the spec).
@ -153,7 +124,8 @@ unsafe fn sub_with_wraparound_lsr32(a: u64, b: u64) -> u64 {
unsafe fn mul_epsilon(x: u64) -> u64 {
let res;
asm!(
// Use UMULL to save one instruction. The compiler emits two: extract the low word and then multiply.
// Use UMULL to save one instruction. The compiler emits two: extract the low word and then
// multiply.
"umull {res}, {x:w}, {epsilon:w}",
x = in(reg) x,
epsilon = in(reg) EPSILON,
@ -179,8 +151,9 @@ unsafe fn multiply(x: u64, y: u64) -> u64 {
// ==================================== STANDALONE CONST LAYER =====================================
/// Standalone const layer. Run only once, at the start of round 1. Remaining const layers are fused with the preceding
/// MDS matrix multiplication.
/// Standalone const layer. Run only once, at the start of round 1. Remaining const layers are fused
/// with the preceding MDS matrix multiplication.
/*
#[inline(always)]
#[unroll_for_loops]
unsafe fn const_layer_full(
@ -195,15 +168,15 @@ unsafe fn const_layer_full(
}
state
}
*/
// ========================================== FULL ROUNDS ==========================================
/// Full S-box.
#[inline(always)]
#[unroll_for_loops]
unsafe fn sbox_layer_full(state: [u64; WIDTH]) -> [u64; WIDTH] {
// This is done in scalar. S-boxes in vector are only slightly slower throughput-wise but have an insane latency
// (~100 cycles) on the M1.
// This is done in scalar. S-boxes in vector are only slightly slower throughput-wise but have
// an insane latency (~100 cycles) on the M1.
let mut state2 = [0u64; WIDTH];
assert!(WIDTH == 12);
@ -228,297 +201,227 @@ unsafe fn sbox_layer_full(state: [u64; WIDTH]) -> [u64; WIDTH] {
state7
}
// Aliases for readability. E.g. MDS[5] can be found in mdsv5[MDSI5].
const MDSI2: i32 = 1; // MDS[2] == 1
const MDSI4: i32 = 2; // MDS[4] == 3
const MDSI5: i32 = 3; // MDS[5] == 5
const MDSI6: i32 = 1; // MDS[6] == 1
const MDSI7: i32 = 0; // MDS[7] == 8
const MDSI8: i32 = 2; // MDS[8] == 12
const MDSI9: i32 = 2; // MDS[9] == 3
const MDSI10: i32 = 3; // MDS[10] == 16
const MDSI11: i32 = 1; // MDS[11] == 10
#[inline(always)]
unsafe fn mds_reduce(
[[cumul0_a, cumul0_b], [cumul1_a, cumul1_b]]: [[uint64x2_t; 2]; 2],
// `cumul_a` and `cumul_b` represent two separate field elements. We take advantage of
// vectorization by reducing them simultaneously.
[cumul_a, cumul_b]: [uint32x4_t; 2],
) -> uint64x2_t {
// mds_consts0 == [0xffffffff, 1 << 1, 1 << 3, 1 << 5]
let mds_consts0: uint32x4_t = vld1q_u32((&MDS_CONSTS[0..4]).as_ptr().cast::<u32>());
// Merge accumulators
let cumul0 = vaddq_u64(cumul0_a, cumul0_b);
let cumul1 = vaddq_u64(cumul1_a, cumul1_b);
// Swizzle
let res_lo = vzip1q_u64(cumul0, cumul1);
let res_hi = vzip2q_u64(cumul0, cumul1);
// Reduce from u96
let res_hi = vsraq_n_u64::<32>(res_hi, res_lo);
let res_lo = vsliq_n_u64::<32>(res_lo, res_hi);
// Extract high 32-bits.
let res_hi_hi = vget_low_u32(vuzp2q_u32(
vreinterpretq_u32_u64(res_hi),
vreinterpretq_u32_u64(res_hi),
));
// Multiply by EPSILON and accumulate.
let res_unadj = vmlal_laneq_u32::<0>(res_lo, res_hi_hi, mds_consts0);
let res_adj = vcgtq_u64(res_lo, res_unadj);
vsraq_n_u64::<32>(res_unadj, res_adj)
// Form:
// `lo = [cumul_a[0] + cumul_a[2] * 2**32, cumul_b[0] + cumul_b[2] * 2**32]`
// `hi = [cumul_a[1] + cumul_a[3] * 2**32, cumul_b[1] + cumul_b[3] * 2**32]`
// Observe that the result `== lo + hi * 2**16 (mod Goldilocks)`.
let mut lo = vreinterpretq_u64_u32(vuzp1q_u32(cumul_a, cumul_b));
let mut hi = vreinterpretq_u64_u32(vuzp2q_u32(cumul_a, cumul_b));
// Add the high 48 bits of `lo` to `hi`. This cannot overflow.
hi = vsraq_n_u64::<16>(hi, lo);
// Now, result `== lo.bits[0..16] + hi * 2**16 (mod Goldilocks)`.
// Set the high 48 bits of `lo` to the low 48 bits of `hi`.
lo = vsliq_n_u64::<16>(lo, hi);
// At this point, result `== lo + hi.bits[48..64] * 2**64 (mod Goldilocks)`.
// It remains to fold `hi.bits[48..64]` into `lo`.
let top = {
// Extract the top 16 bits of `hi` as a `u32`.
// Interpret `hi` as a vector of bytes, so we can use a table lookup instruction.
let hi_u8 = vreinterpretq_u8_u64(hi);
// Indices defining the permutation. `0xff` is out of bounds, producing `0`.
let top_idx =
transmute::<[u8; 8], uint8x8_t>([0x06, 0x07, 0xff, 0xff, 0x0e, 0x0f, 0xff, 0xff]);
let top_u8 = vqtbl1_u8(hi_u8, top_idx);
vreinterpret_u32_u8(top_u8)
};
// result `== lo + top * 2**64 (mod Goldilocks)`.
let adj_lo = vmlal_n_u32(lo, top, EPSILON as u32);
let wraparound_mask = vcgtq_u64(lo, adj_lo);
vsraq_n_u64::<32>(adj_lo, wraparound_mask) // Add epsilon on overflow.
}
#[inline(always)]
unsafe fn mds_const_layers_full(
state: [u64; WIDTH],
round_constants: &[u64; WIDTH],
) -> [u64; WIDTH] {
// mds_consts0 == [0xffffffff, 1 << 1, 1 << 3, 1 << 5]
// mds_consts1 == [1 << 8, 1 << 10, 1 << 12, 1 << 16]
let mds_consts0: uint32x4_t = vld1q_u32((&MDS_CONSTS[0..4]).as_ptr().cast::<u32>());
let mds_consts1: uint32x4_t = vld1q_u32((&MDS_CONSTS[4..8]).as_ptr().cast::<u32>());
unsafe fn mds_layer_full(state: [u64; WIDTH]) -> [u64; WIDTH] {
// This function performs an MDS multiplication in complex FFT space.
// However, instead of performing a width-12 FFT, we perform three width-4 FFTs, which is
// cheaper. The 12x12 matrix-vector multiplication (a convolution) becomes two 3x3 real
// matrix-vector multiplications and one 3x3 complex matrix-vector multiplication.
// Aliases for readability. E.g. MDS[5] can be found in mdsv5[mdsi5]. MDS[0], MDS[1], and
// MDS[3] are 0, so they are not needed.
let mdsv2 = mds_consts0; // MDS[2] == 1
let mdsv4 = mds_consts0; // MDS[4] == 3
let mdsv5 = mds_consts0; // MDS[5] == 5
let mdsv6 = mds_consts0; // MDS[6] == 1
let mdsv7 = mds_consts1; // MDS[7] == 8
let mdsv8 = mds_consts1; // MDS[8] == 12
let mdsv9 = mds_consts0; // MDS[9] == 3
let mdsv10 = mds_consts1; // MDS[10] == 16
let mdsv11 = mds_consts1; // MDS[11] == 10
// We split each 64-bit element into four 16-bit chunks. To prevent overflow, each chunk is stored
// in 32 bits. Each NEON vector below represents one field element and consists of four 32-bit chunks:
// `elem == vector[0] + vector[1] * 2**16 + vector[2] * 2**32 + vector[3] * 2**48`.
// For i even, we combine state[i] and state[i + 1] into one vector to save on registers.
// Thus, state1 actually contains state0 and state1 but is only used in the intrinsics that
// access the high high doubleword.
let state1: uint32x4_t =
vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(state[0]), vcreate_u64(state[1])));
let state3: uint32x4_t =
vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(state[2]), vcreate_u64(state[3])));
let state5: uint32x4_t =
vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(state[4]), vcreate_u64(state[5])));
let state7: uint32x4_t =
vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(state[6]), vcreate_u64(state[7])));
let state9: uint32x4_t =
vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(state[8]), vcreate_u64(state[9])));
let state11: uint32x4_t =
vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(state[10]), vcreate_u64(state[11])));
// state0 is an alias to the low doubleword of state1. The compiler should use one register for both.
let state0: uint32x2_t = vget_low_u32(state1);
let state2: uint32x2_t = vget_low_u32(state3);
let state4: uint32x2_t = vget_low_u32(state5);
let state6: uint32x2_t = vget_low_u32(state7);
let state8: uint32x2_t = vget_low_u32(state9);
let state10: uint32x2_t = vget_low_u32(state11);
// Constants that we multiply by.
let mut consts: uint32x4_t = transmute::<[u32; 4], _>([2, 4, 8, 16]);
// Two accumulators per output to hide latency. Each accumulator is a vector of two u64s,
// containing the result for the low 32 bits and the high 32 bits. Thus, the final result at
// index i is (cumuli_a[0] + cumuli_b[0]) + (cumuli_a[1] + cumuli_b[1]) * 2**32.
// Prevent LLVM from turning fused multiply (by power of 2)-add (1 instruction) into shift and
// add (two instructions). This fake `asm` block means that LLVM no longer knows the contents of
// `consts`.
asm!("/* {0:v} */", // Make Rust think the register is being used.
inout(vreg) consts,
options(pure, nomem, nostack, preserves_flags),
);
// Start by loading the round constants.
let mut cumul0_a = vcombine_u64(vld1_u64(&round_constants[0]), vcreate_u64(0));
let mut cumul1_a = vcombine_u64(vld1_u64(&round_constants[1]), vcreate_u64(0));
let mut cumul2_a = vcombine_u64(vld1_u64(&round_constants[2]), vcreate_u64(0));
let mut cumul3_a = vcombine_u64(vld1_u64(&round_constants[3]), vcreate_u64(0));
let mut cumul4_a = vcombine_u64(vld1_u64(&round_constants[4]), vcreate_u64(0));
let mut cumul5_a = vcombine_u64(vld1_u64(&round_constants[5]), vcreate_u64(0));
let mut cumul6_a = vcombine_u64(vld1_u64(&round_constants[6]), vcreate_u64(0));
let mut cumul7_a = vcombine_u64(vld1_u64(&round_constants[7]), vcreate_u64(0));
let mut cumul8_a = vcombine_u64(vld1_u64(&round_constants[8]), vcreate_u64(0));
let mut cumul9_a = vcombine_u64(vld1_u64(&round_constants[9]), vcreate_u64(0));
let mut cumul10_a = vcombine_u64(vld1_u64(&round_constants[10]), vcreate_u64(0));
let mut cumul11_a = vcombine_u64(vld1_u64(&round_constants[11]), vcreate_u64(0));
// Four length-3 complex FFTs.
let mut state_fft = [vdupq_n_u32(0); 12];
for i in 0..3 {
// Interpret each field element as a 4-vector of `u16`s.
let x0 = vcreate_u16(state[i]);
let x1 = vcreate_u16(state[i + 3]);
let x2 = vcreate_u16(state[i + 6]);
let x3 = vcreate_u16(state[i + 9]);
// Now the matrix multiplication.
// MDS exps: [0, 0, 1, 0, 3, 5, 1, 8, 12, 3, 16, 10]
// out[i] += in[j] << mds[j - i]
// `vaddl_u16` and `vsubl_u16` yield 4-vectors of `u32`s.
let y0 = vaddl_u16(x0, x2);
let y1 = vaddl_u16(x1, x3);
let y2 = vsubl_u16(x0, x2);
let y3 = vsubl_u16(x1, x3);
let mut cumul0_b = vshll_n_u32::<0>(state0); // MDS[0]
let mut cumul1_b = vshll_n_u32::<10>(state0); // MDS[11]
let mut cumul2_b = vshll_n_u32::<16>(state0); // MDS[10]
let mut cumul3_b = vshll_n_u32::<3>(state0); // MDS[9]
let mut cumul4_b = vshll_n_u32::<12>(state0); // MDS[8]
let mut cumul5_b = vshll_n_u32::<8>(state0); // MDS[7]
let mut cumul6_b = vshll_n_u32::<1>(state0); // MDS[6]
let mut cumul7_b = vshll_n_u32::<5>(state0); // MDS[5]
let mut cumul8_b = vshll_n_u32::<3>(state0); // MDS[4]
let mut cumul9_b = vshll_n_u32::<0>(state0); // MDS[3]
let mut cumul10_b = vshll_n_u32::<1>(state0); // MDS[2]
let mut cumul11_b = vshll_n_u32::<0>(state0); // MDS[1]
let z0 = vaddq_u32(y0, y1);
let z1 = vsubq_u32(y0, y1);
let z2 = y2;
let z3 = y3;
cumul0_a = vaddw_high_u32(cumul0_a, state1); // MDS[1]
cumul1_a = vaddw_high_u32(cumul1_a, state1); // MDS[0]
cumul2_a = vmlal_high_laneq_u32::<MDSI11>(cumul2_a, state1, mdsv11); // MDS[11]
cumul3_a = vmlal_high_laneq_u32::<MDSI10>(cumul3_a, state1, mdsv10); // MDS[10]
cumul4_a = vmlal_high_laneq_u32::<MDSI9>(cumul4_a, state1, mdsv9); // MDS[9]
cumul5_a = vmlal_high_laneq_u32::<MDSI8>(cumul5_a, state1, mdsv8); // MDS[8]
cumul6_a = vmlal_high_laneq_u32::<MDSI7>(cumul6_a, state1, mdsv7); // MDS[7]
cumul7_a = vmlal_high_laneq_u32::<MDSI6>(cumul7_a, state1, mdsv6); // MDS[6]
cumul8_a = vmlal_high_laneq_u32::<MDSI5>(cumul8_a, state1, mdsv5); // MDS[5]
cumul9_a = vmlal_high_laneq_u32::<MDSI4>(cumul9_a, state1, mdsv4); // MDS[4]
cumul10_a = vaddw_high_u32(cumul10_a, state1); // MDS[3]
cumul11_a = vmlal_high_laneq_u32::<MDSI2>(cumul11_a, state1, mdsv2); // MDS[2]
// The FFT is `[z0, z2 + z3 i, z1, z2 - z3 i]`.
cumul0_b = vmlal_laneq_u32::<MDSI2>(cumul0_b, state2, mdsv2); // MDS[2]
cumul1_b = vaddw_u32(cumul1_b, state2); // MDS[1]
cumul2_b = vaddw_u32(cumul2_b, state2); // MDS[0]
cumul3_b = vmlal_laneq_u32::<MDSI11>(cumul3_b, state2, mdsv11); // MDS[11]
cumul4_b = vmlal_laneq_u32::<MDSI10>(cumul4_b, state2, mdsv10); // MDS[10]
cumul5_b = vmlal_laneq_u32::<MDSI9>(cumul5_b, state2, mdsv9); // MDS[9]
cumul6_b = vmlal_laneq_u32::<MDSI8>(cumul6_b, state2, mdsv8); // MDS[8]
cumul7_b = vmlal_laneq_u32::<MDSI7>(cumul7_b, state2, mdsv7); // MDS[7]
cumul8_b = vmlal_laneq_u32::<MDSI6>(cumul8_b, state2, mdsv6); // MDS[6]
cumul9_b = vmlal_laneq_u32::<MDSI5>(cumul9_b, state2, mdsv5); // MDS[5]
cumul10_b = vmlal_laneq_u32::<MDSI4>(cumul10_b, state2, mdsv4); // MDS[4]
cumul11_b = vaddw_u32(cumul11_b, state2); // MDS[3]
state_fft[i] = z0;
state_fft[i + 3] = z1;
state_fft[i + 6] = z2;
state_fft[i + 9] = z3;
}
cumul0_a = vaddw_high_u32(cumul0_a, state3); // MDS[3]
cumul1_a = vmlal_high_laneq_u32::<MDSI2>(cumul1_a, state3, mdsv2); // MDS[2]
cumul2_a = vaddw_high_u32(cumul2_a, state3); // MDS[1]
cumul3_a = vaddw_high_u32(cumul3_a, state3); // MDS[0]
cumul4_a = vmlal_high_laneq_u32::<MDSI11>(cumul4_a, state3, mdsv11); // MDS[11]
cumul5_a = vmlal_high_laneq_u32::<MDSI10>(cumul5_a, state3, mdsv10); // MDS[10]
cumul6_a = vmlal_high_laneq_u32::<MDSI9>(cumul6_a, state3, mdsv9); // MDS[9]
cumul7_a = vmlal_high_laneq_u32::<MDSI8>(cumul7_a, state3, mdsv8); // MDS[8]
cumul8_a = vmlal_high_laneq_u32::<MDSI7>(cumul8_a, state3, mdsv7); // MDS[7]
cumul9_a = vmlal_high_laneq_u32::<MDSI6>(cumul9_a, state3, mdsv6); // MDS[6]
cumul10_a = vmlal_high_laneq_u32::<MDSI5>(cumul10_a, state3, mdsv5); // MDS[5]
cumul11_a = vmlal_high_laneq_u32::<MDSI4>(cumul11_a, state3, mdsv4); // MDS[4]
// 3x3 real matrix-vector mul for component 0 of the FFTs.
// Multiply the vector `[x0, x1, x2]` by the matrix
// `[[ 64, 64, 128],`
// ` [128, 64, 64],`
// ` [ 64, 128, 64]]`
// The results are divided by 4 (this ends up cancelling out some later computations).
{
let x0 = state_fft[0];
let x1 = state_fft[1];
let x2 = state_fft[2];
cumul0_b = vmlal_laneq_u32::<MDSI4>(cumul0_b, state4, mdsv4); // MDS[4]
cumul1_b = vaddw_u32(cumul1_b, state4); // MDS[3]
cumul2_b = vmlal_laneq_u32::<MDSI2>(cumul2_b, state4, mdsv2); // MDS[2]
cumul3_b = vaddw_u32(cumul3_b, state4); // MDS[1]
cumul4_b = vaddw_u32(cumul4_b, state4); // MDS[0]
cumul5_b = vmlal_laneq_u32::<MDSI11>(cumul5_b, state4, mdsv11); // MDS[11]
cumul6_b = vmlal_laneq_u32::<MDSI10>(cumul6_b, state4, mdsv10); // MDS[10]
cumul7_b = vmlal_laneq_u32::<MDSI9>(cumul7_b, state4, mdsv9); // MDS[9]
cumul8_b = vmlal_laneq_u32::<MDSI8>(cumul8_b, state4, mdsv8); // MDS[8]
cumul9_b = vmlal_laneq_u32::<MDSI7>(cumul9_b, state4, mdsv7); // MDS[7]
cumul10_b = vmlal_laneq_u32::<MDSI6>(cumul10_b, state4, mdsv6); // MDS[6]
cumul11_b = vmlal_laneq_u32::<MDSI5>(cumul11_b, state4, mdsv5); // MDS[5]
let t = vshlq_n_u32::<4>(x0);
let u = vaddq_u32(x1, x2);
cumul0_a = vmlal_high_laneq_u32::<MDSI5>(cumul0_a, state5, mdsv5); // MDS[5]
cumul1_a = vmlal_high_laneq_u32::<MDSI4>(cumul1_a, state5, mdsv4); // MDS[4]
cumul2_a = vaddw_high_u32(cumul2_a, state5); // MDS[3]
cumul3_a = vmlal_high_laneq_u32::<MDSI2>(cumul3_a, state5, mdsv2); // MDS[2]
cumul4_a = vaddw_high_u32(cumul4_a, state5); // MDS[1]
cumul5_a = vaddw_high_u32(cumul5_a, state5); // MDS[0]
cumul6_a = vmlal_high_laneq_u32::<MDSI11>(cumul6_a, state5, mdsv11); // MDS[11]
cumul7_a = vmlal_high_laneq_u32::<MDSI10>(cumul7_a, state5, mdsv10); // MDS[10]
cumul8_a = vmlal_high_laneq_u32::<MDSI9>(cumul8_a, state5, mdsv9); // MDS[9]
cumul9_a = vmlal_high_laneq_u32::<MDSI8>(cumul9_a, state5, mdsv8); // MDS[8]
cumul10_a = vmlal_high_laneq_u32::<MDSI7>(cumul10_a, state5, mdsv7); // MDS[7]
cumul11_a = vmlal_high_laneq_u32::<MDSI6>(cumul11_a, state5, mdsv6); // MDS[6]
let y0 = vshlq_n_u32::<4>(u);
let y1 = vmlaq_laneq_u32::<3>(t, x2, consts);
let y2 = vmlaq_laneq_u32::<3>(t, x1, consts);
cumul0_b = vmlal_laneq_u32::<MDSI6>(cumul0_b, state6, mdsv6); // MDS[6]
cumul1_b = vmlal_laneq_u32::<MDSI5>(cumul1_b, state6, mdsv5); // MDS[5]
cumul2_b = vmlal_laneq_u32::<MDSI4>(cumul2_b, state6, mdsv4); // MDS[4]
cumul3_b = vaddw_u32(cumul3_b, state6); // MDS[3]
cumul4_b = vmlal_laneq_u32::<MDSI2>(cumul4_b, state6, mdsv2); // MDS[2]
cumul5_b = vaddw_u32(cumul5_b, state6); // MDS[1]
cumul6_b = vaddw_u32(cumul6_b, state6); // MDS[0]
cumul7_b = vmlal_laneq_u32::<MDSI11>(cumul7_b, state6, mdsv11); // MDS[11]
cumul8_b = vmlal_laneq_u32::<MDSI10>(cumul8_b, state6, mdsv10); // MDS[10]
cumul9_b = vmlal_laneq_u32::<MDSI9>(cumul9_b, state6, mdsv9); // MDS[9]
cumul10_b = vmlal_laneq_u32::<MDSI8>(cumul10_b, state6, mdsv8); // MDS[8]
cumul11_b = vmlal_laneq_u32::<MDSI7>(cumul11_b, state6, mdsv7); // MDS[7]
state_fft[0] = vaddq_u32(y0, y1);
state_fft[1] = vaddq_u32(y1, y2);
state_fft[2] = vaddq_u32(y0, y2);
}
cumul0_a = vmlal_high_laneq_u32::<MDSI7>(cumul0_a, state7, mdsv7); // MDS[7]
cumul1_a = vmlal_high_laneq_u32::<MDSI6>(cumul1_a, state7, mdsv6); // MDS[6]
cumul2_a = vmlal_high_laneq_u32::<MDSI5>(cumul2_a, state7, mdsv5); // MDS[5]
cumul3_a = vmlal_high_laneq_u32::<MDSI4>(cumul3_a, state7, mdsv4); // MDS[4]
cumul4_a = vaddw_high_u32(cumul4_a, state7); // MDS[3]
cumul5_a = vmlal_high_laneq_u32::<MDSI2>(cumul5_a, state7, mdsv2); // MDS[2]
cumul6_a = vaddw_high_u32(cumul6_a, state7); // MDS[1]
cumul7_a = vaddw_high_u32(cumul7_a, state7); // MDS[0]
cumul8_a = vmlal_high_laneq_u32::<MDSI11>(cumul8_a, state7, mdsv11); // MDS[11]
cumul9_a = vmlal_high_laneq_u32::<MDSI10>(cumul9_a, state7, mdsv10); // MDS[10]
cumul10_a = vmlal_high_laneq_u32::<MDSI9>(cumul10_a, state7, mdsv9); // MDS[9]
cumul11_a = vmlal_high_laneq_u32::<MDSI8>(cumul11_a, state7, mdsv8); // MDS[8]
// 3x3 real matrix-vector mul for component 2 of the FFTs.
// Multiply the vector `[x0, x1, x2]` by the matrix
// `[[ -4, -8, 32],`
// ` [-32, -4, -8],`
// ` [ 8, -32, -4]]`
// The results are divided by 4 (this ends up cancelling out some later computations).
{
let x0 = state_fft[3];
let x1 = state_fft[4];
let x2 = state_fft[5];
state_fft[3] = vmlsq_laneq_u32::<2>(vmlaq_laneq_u32::<0>(x0, x1, consts), x2, consts);
state_fft[4] = vmlaq_laneq_u32::<0>(vmlaq_laneq_u32::<2>(x1, x0, consts), x2, consts);
state_fft[5] = vmlsq_laneq_u32::<0>(x2, vmlsq_laneq_u32::<1>(x0, x1, consts), consts);
}
cumul0_b = vmlal_laneq_u32::<MDSI8>(cumul0_b, state8, mdsv8); // MDS[8]
cumul1_b = vmlal_laneq_u32::<MDSI7>(cumul1_b, state8, mdsv7); // MDS[7]
cumul2_b = vmlal_laneq_u32::<MDSI6>(cumul2_b, state8, mdsv6); // MDS[6]
cumul3_b = vmlal_laneq_u32::<MDSI5>(cumul3_b, state8, mdsv5); // MDS[5]
cumul4_b = vmlal_laneq_u32::<MDSI4>(cumul4_b, state8, mdsv4); // MDS[4]
cumul5_b = vaddw_u32(cumul5_b, state8); // MDS[3]
cumul6_b = vmlal_laneq_u32::<MDSI2>(cumul6_b, state8, mdsv2); // MDS[2]
cumul7_b = vaddw_u32(cumul7_b, state8); // MDS[1]
cumul8_b = vaddw_u32(cumul8_b, state8); // MDS[0]
cumul9_b = vmlal_laneq_u32::<MDSI11>(cumul9_b, state8, mdsv11); // MDS[11]
cumul10_b = vmlal_laneq_u32::<MDSI10>(cumul10_b, state8, mdsv10); // MDS[10]
cumul11_b = vmlal_laneq_u32::<MDSI9>(cumul11_b, state8, mdsv9); // MDS[9]
// 3x3 complex matrix-vector mul for components 1 and 3 of the FFTs.
// Multiply the vector `[x0r + x0i i, x1r + x1i i, x2r + x2i i]` by the matrix
// `[[ 4 + 2i, 2 + 32i, 2 - 8i],`
// ` [-8 - 2i, 4 + 2i, 2 + 32i],`
// ` [32 - 2i, -8 - 2i, 4 + 2i]]`
// The results are divided by 2 (this ends up cancelling out some later computations).
{
let x0r = state_fft[6];
let x1r = state_fft[7];
let x2r = state_fft[8];
cumul0_a = vmlal_high_laneq_u32::<MDSI9>(cumul0_a, state9, mdsv9); // MDS[9]
cumul1_a = vmlal_high_laneq_u32::<MDSI8>(cumul1_a, state9, mdsv8); // MDS[8]
cumul2_a = vmlal_high_laneq_u32::<MDSI7>(cumul2_a, state9, mdsv7); // MDS[7]
cumul3_a = vmlal_high_laneq_u32::<MDSI6>(cumul3_a, state9, mdsv6); // MDS[6]
cumul4_a = vmlal_high_laneq_u32::<MDSI5>(cumul4_a, state9, mdsv5); // MDS[5]
cumul5_a = vmlal_high_laneq_u32::<MDSI4>(cumul5_a, state9, mdsv4); // MDS[4]
cumul6_a = vaddw_high_u32(cumul6_a, state9); // MDS[3]
cumul7_a = vmlal_high_laneq_u32::<MDSI2>(cumul7_a, state9, mdsv2); // MDS[2]
cumul8_a = vaddw_high_u32(cumul8_a, state9); // MDS[1]
cumul9_a = vaddw_high_u32(cumul9_a, state9); // MDS[0]
cumul10_a = vmlal_high_laneq_u32::<MDSI11>(cumul10_a, state9, mdsv11); // MDS[11]
cumul11_a = vmlal_high_laneq_u32::<MDSI10>(cumul11_a, state9, mdsv10); // MDS[10]
let x0i = state_fft[9];
let x1i = state_fft[10];
let x2i = state_fft[11];
cumul0_b = vmlal_laneq_u32::<MDSI10>(cumul0_b, state10, mdsv10); // MDS[10]
cumul1_b = vmlal_laneq_u32::<MDSI9>(cumul1_b, state10, mdsv9); // MDS[9]
cumul2_b = vmlal_laneq_u32::<MDSI8>(cumul2_b, state10, mdsv8); // MDS[8]
cumul3_b = vmlal_laneq_u32::<MDSI7>(cumul3_b, state10, mdsv7); // MDS[7]
cumul4_b = vmlal_laneq_u32::<MDSI6>(cumul4_b, state10, mdsv6); // MDS[6]
cumul5_b = vmlal_laneq_u32::<MDSI5>(cumul5_b, state10, mdsv5); // MDS[5]
cumul6_b = vmlal_laneq_u32::<MDSI4>(cumul6_b, state10, mdsv4); // MDS[4]
cumul7_b = vaddw_u32(cumul7_b, state10); // MDS[3]
cumul8_b = vmlal_laneq_u32::<MDSI2>(cumul8_b, state10, mdsv2); // MDS[2]
cumul9_b = vaddw_u32(cumul9_b, state10); // MDS[1]
cumul10_b = vaddw_u32(cumul10_b, state10); // MDS[0]
cumul11_b = vmlal_laneq_u32::<MDSI11>(cumul11_b, state10, mdsv11); // MDS[11]
// real part of result <- real part of input
let r0rr = vaddq_u32(vmlaq_laneq_u32::<0>(x1r, x0r, consts), x2r);
let r1rr = vmlaq_laneq_u32::<0>(x2r, vmlsq_laneq_u32::<0>(x1r, x0r, consts), consts);
let r2rr = vmlsq_laneq_u32::<0>(x2r, vmlsq_laneq_u32::<1>(x1r, x0r, consts), consts);
cumul0_a = vmlal_high_laneq_u32::<MDSI11>(cumul0_a, state11, mdsv11); // MDS[11]
cumul1_a = vmlal_high_laneq_u32::<MDSI10>(cumul1_a, state11, mdsv10); // MDS[10]
cumul2_a = vmlal_high_laneq_u32::<MDSI9>(cumul2_a, state11, mdsv9); // MDS[9]
cumul3_a = vmlal_high_laneq_u32::<MDSI8>(cumul3_a, state11, mdsv8); // MDS[8]
cumul4_a = vmlal_high_laneq_u32::<MDSI7>(cumul4_a, state11, mdsv7); // MDS[7]
cumul5_a = vmlal_high_laneq_u32::<MDSI6>(cumul5_a, state11, mdsv6); // MDS[6]
cumul6_a = vmlal_high_laneq_u32::<MDSI5>(cumul6_a, state11, mdsv5); // MDS[5]
cumul7_a = vmlal_high_laneq_u32::<MDSI4>(cumul7_a, state11, mdsv4); // MDS[4]
cumul8_a = vaddw_high_u32(cumul8_a, state11); // MDS[3]
cumul9_a = vmlal_high_laneq_u32::<MDSI2>(cumul9_a, state11, mdsv2); // MDS[2]
cumul10_a = vaddw_high_u32(cumul10_a, state11); // MDS[1]
cumul11_a = vaddw_high_u32(cumul11_a, state11); // MDS[0]
// real part of result <- imaginary part of input
let r0ri = vmlsq_laneq_u32::<1>(vmlaq_laneq_u32::<3>(x0i, x1i, consts), x2i, consts);
let r1ri = vmlsq_laneq_u32::<3>(vsubq_u32(x0i, x1i), x2i, consts);
let r2ri = vsubq_u32(vaddq_u32(x0i, x1i), x2i);
let reduced = [
mds_reduce([[cumul0_a, cumul0_b], [cumul1_a, cumul1_b]]),
mds_reduce([[cumul2_a, cumul2_b], [cumul3_a, cumul3_b]]),
mds_reduce([[cumul4_a, cumul4_b], [cumul5_a, cumul5_b]]),
mds_reduce([[cumul6_a, cumul6_b], [cumul7_a, cumul7_b]]),
mds_reduce([[cumul8_a, cumul8_b], [cumul9_a, cumul9_b]]),
mds_reduce([[cumul10_a, cumul10_b], [cumul11_a, cumul11_b]]),
];
[
vgetq_lane_u64::<0>(reduced[0]),
vgetq_lane_u64::<1>(reduced[0]),
vgetq_lane_u64::<0>(reduced[1]),
vgetq_lane_u64::<1>(reduced[1]),
vgetq_lane_u64::<0>(reduced[2]),
vgetq_lane_u64::<1>(reduced[2]),
vgetq_lane_u64::<0>(reduced[3]),
vgetq_lane_u64::<1>(reduced[3]),
vgetq_lane_u64::<0>(reduced[4]),
vgetq_lane_u64::<1>(reduced[4]),
vgetq_lane_u64::<0>(reduced[5]),
vgetq_lane_u64::<1>(reduced[5]),
]
// real part of result (total)
let r0r = vsubq_u32(r0rr, r0ri);
let r1r = vaddq_u32(r1rr, r1ri);
let r2r = vmlaq_laneq_u32::<0>(r2ri, r2rr, consts);
// imaginary part of result <- real part of input
let r0ir = vmlsq_laneq_u32::<1>(vmlaq_laneq_u32::<3>(x0r, x1r, consts), x2r, consts);
let r1ir = vmlaq_laneq_u32::<3>(vsubq_u32(x1r, x0r), x2r, consts);
let r2ir = vsubq_u32(x2r, vaddq_u32(x0r, x1r));
// imaginary part of result <- imaginary part of input
let r0ii = vaddq_u32(vmlaq_laneq_u32::<0>(x1i, x0i, consts), x2i);
let r1ii = vmlaq_laneq_u32::<0>(x2i, vmlsq_laneq_u32::<0>(x1i, x0i, consts), consts);
let r2ii = vmlsq_laneq_u32::<0>(x2i, vmlsq_laneq_u32::<1>(x1i, x0i, consts), consts);
// imaginary part of result (total)
let r0i = vaddq_u32(r0ir, r0ii);
let r1i = vaddq_u32(r1ir, r1ii);
let r2i = vmlaq_laneq_u32::<0>(r2ir, r2ii, consts);
state_fft[6] = r0r;
state_fft[7] = r1r;
state_fft[8] = r2r;
state_fft[9] = r0i;
state_fft[10] = r1i;
state_fft[11] = r2i;
}
// Three length-4 inverse FFTs.
// Normally, such IFFT would divide by 4, but we've already taken care of that.
for i in 0..3 {
let z0 = state_fft[i];
let z1 = state_fft[i + 3];
let z2 = state_fft[i + 6];
let z3 = state_fft[i + 9];
let y0 = vsubq_u32(z0, z1);
let y1 = vaddq_u32(z0, z1);
let y2 = z2;
let y3 = z3;
let x0 = vaddq_u32(y0, y2);
let x1 = vaddq_u32(y1, y3);
let x2 = vsubq_u32(y0, y2);
let x3 = vsubq_u32(y1, y3);
state_fft[i] = x0;
state_fft[i + 3] = x1;
state_fft[i + 6] = x2;
state_fft[i + 9] = x3;
}
// Perform `res[0] += state[0] * 8` for the diagonal component of the MDS matrix.
state_fft[0] = vmlal_laneq_u16::<4>(
state_fft[0],
vcreate_u16(state[0]), // Each 16-bit chunk gets zero-extended.
vreinterpretq_u16_u32(consts), // Hack: these constants fit in `u16s`, so we can bit-cast.
);
let mut res_arr = [0; 12];
for i in 0..6 {
let res = mds_reduce([state_fft[2 * i], state_fft[2 * i + 1]]);
res_arr[2 * i] = vgetq_lane_u64::<0>(res);
res_arr[2 * i + 1] = vgetq_lane_u64::<1>(res);
}
res_arr
}
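For reference, the length-4 transform computed in the first loop above sends [x0, x1, x2, x3] to [x0 + x1 + x2 + x3, (x0 - x2) + (x1 - x3)i, x0 - x1 + x2 - x3, (x0 - x2) - (x1 - x3)i], i.e. the polynomial x0 + x1 t + x2 t^2 + x3 t^3 evaluated at t = 1, i, -1, -i; this is what the in-code comment `[z0, z2 + z3 i, z1, z2 - z3 i]` denotes, and the last loop applies the corresponding inverse transform (with the division by 4 folded into the matrix constants, as noted above).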
// ======================================== PARTIAL ROUNDS =========================================
/*
#[rustfmt::skip]
macro_rules! mds_reduce_asm {
($c0:literal, $c1:literal, $out:literal, $consts:literal) => {
@ -961,13 +864,15 @@ unsafe fn partial_round(
[res23, res45, res67, res89, res1011],
)
}
*/
// ========================================== GLUE CODE ===========================================
/*
#[inline(always)]
unsafe fn full_round(state: [u64; 12], round_constants: &[u64; WIDTH]) -> [u64; 12] {
let state = sbox_layer_full(state);
mds_const_layers_full(state, round_constants)
mds_layer_full(state, round_constants)
}
#[inline]
@ -1001,43 +906,19 @@ unsafe fn partial_rounds(
}
state.0
}
*/
#[inline(always)]
fn unwrap_state(state: [GoldilocksField; 12]) -> [u64; 12] {
[
state[0].0,
state[1].0,
state[2].0,
state[3].0,
state[4].0,
state[5].0,
state[6].0,
state[7].0,
state[8].0,
state[9].0,
state[10].0,
state[11].0,
]
state.map(|s| s.0)
}
#[inline(always)]
fn wrap_state(state: [u64; 12]) -> [GoldilocksField; 12] {
[
GoldilocksField(state[0]),
GoldilocksField(state[1]),
GoldilocksField(state[2]),
GoldilocksField(state[3]),
GoldilocksField(state[4]),
GoldilocksField(state[5]),
GoldilocksField(state[6]),
GoldilocksField(state[7]),
GoldilocksField(state[8]),
GoldilocksField(state[9]),
GoldilocksField(state[10]),
GoldilocksField(state[11]),
]
state.map(GoldilocksField)
}
/*
#[inline(always)]
pub unsafe fn poseidon(state: [GoldilocksField; 12]) -> [GoldilocksField; 12] {
let state = unwrap_state(state);
@ -1058,6 +939,7 @@ pub unsafe fn poseidon(state: [GoldilocksField; 12]) -> [GoldilocksField; 12] {
let state = full_rounds(state, &FINAL_ROUND_CONSTANTS);
wrap_state(state)
}
*/
#[inline(always)]
pub unsafe fn sbox_layer(state: &mut [GoldilocksField; WIDTH]) {
@ -1067,8 +949,6 @@ pub unsafe fn sbox_layer(state: &mut [GoldilocksField; WIDTH]) {
#[inline(always)]
pub unsafe fn mds_layer(state: &[GoldilocksField; WIDTH]) -> [GoldilocksField; WIDTH] {
let state = unwrap_state(*state);
// We want to do an MDS layer without the constant layer.
let round_consts = [0u64; WIDTH];
let state = mds_const_layers_full(state, &round_consts);
let state = mds_layer_full(state);
wrap_state(state)
}

View File

@ -1,5 +1,5 @@
// Requires:
// - AVX2
// - BMI2 (for MULX and SHRX)
#[cfg(all(target_feature = "avx2", target_feature = "bmi2"))]
pub(crate) mod poseidon_goldilocks_avx2_bmi2;
// // Requires:
// // - AVX2
// // - BMI2 (for MULX and SHRX)
// #[cfg(all(target_feature = "avx2", target_feature = "bmi2"))]
// pub(crate) mod poseidon_goldilocks_avx2_bmi2;

View File

@ -55,96 +55,96 @@ pub const ALL_ROUND_CONSTANTS: [u64; MAX_WIDTH * N_ROUNDS] = [
// WARNING: If these are changed in any way, then all the
// implementations of Poseidon must be regenerated. See comments
// in `poseidon_goldilocks.rs`.
0xb585f767417ee042, 0x7746a55f77c10331, 0xb2fb0d321d356f7a, 0x0f6760a486f1621f,
0xe10d6666b36abcdf, 0x8cae14cb455cc50b, 0xd438539cf2cee334, 0xef781c7d4c1fd8b4,
0xcdc4a23a0aca4b1f, 0x277fa208d07b52e3, 0xe17653a300493d38, 0xc54302f27c287dc1,
0x8628782231d47d10, 0x59cd1a8a690b49f2, 0xc3b919ad9efec0b0, 0xa484c4c637641d97,
0x308bbd23f191398b, 0x6e4a40c1bf713cf1, 0x9a2eedb7510414fb, 0xe360c6e111c2c63b,
0xd5c771901d4d89aa, 0xc35eae076e7d6b2f, 0x849c2656d0a09cad, 0xc0572c8c5cf1df2b,
0xe9fa634a883b8bf3, 0xf56f6d4900fb1fdd, 0xf7d713e872a72a1b, 0x8297132b6ba47612,
0xad6805e12ee8af1c, 0xac51d9f6485c22b9, 0x502ad7dc3bd56bf8, 0x57a1550c3761c577,
0x66bbd30e99d311da, 0x0da2abef5e948f87, 0xf0612750443f8e94, 0x28b8ec3afb937d8c,
0x92a756e6be54ca18, 0x70e741ec304e925d, 0x019d5ee2b037c59f, 0x6f6f2ed7a30707d1,
0x7cf416d01e8c169c, 0x61df517bb17617df, 0x85dc499b4c67dbaa, 0x4b959b48dad27b23,
0xe8be3e5e0dd779a0, 0xf5c0bc1e525ed8e6, 0x40b12cbf263cf853, 0xa637093f13e2ea3c,
0x3cc3f89232e3b0c8, 0x2e479dc16bfe86c0, 0x6f49de07d6d39469, 0x213ce7beecc232de,
0x5b043134851fc00a, 0xa2de45784a861506, 0x7103aaf97bed8dd5, 0x5326fc0dbb88a147,
0xa9ceb750364cb77a, 0x27f8ec88cc9e991f, 0xfceb4fda8c93fb83, 0xfac6ff13b45b260e,
0x7131aa455813380b, 0x93510360d5d68119, 0xad535b24fb96e3db, 0x4627f5c6b7efc045,
0x645cf794e4da78a9, 0x241c70ed1ac2877f, 0xacb8e076b009e825, 0x3737e9db6477bd9d,
0xe7ea5e344cd688ed, 0x90dee4a009214640, 0xd1b1edf7c77e74af, 0x0b65481bab42158e,
0x99ad1aab4b4fe3e7, 0x438a7c91f1a360cd, 0xb60de3bd159088bf, 0xc99cab6b47a3e3bb,
0x69a5ed92d5677cef, 0x5e7b329c482a9396, 0x5fc0ac0829f893c9, 0x32db82924fb757ea,
0x0ade699c5cf24145, 0x7cc5583b46d7b5bb, 0x85df9ed31bf8abcb, 0x6604df501ad4de64,
0xeb84f60941611aec, 0xda60883523989bd4, 0x8f97fe40bf3470bf, 0xa93f485ce0ff2b32,
0x6704e8eebc2afb4b, 0xcee3e9ac788ad755, 0x510d0e66062a270d, 0xf6323f48d74634a0,
0x0b508cdf04990c90, 0xf241708a4ef7ddf9, 0x60e75c28bb368f82, 0xa6217d8c3f0f9989,
0x7159cd30f5435b53, 0x839b4e8fe97ec79f, 0x0d3f3e5e885db625, 0x8f7d83be1daea54b,
0x780f22441e8dbc04, 0xeb9158465aedacd3, 0xd19e120d826c1b6c, 0x016ee53a7f007110,
0xcb5fd54ed22dd1ca, 0xacb84178c58de144, 0x9c22190c2c463227, 0x5d693c1bcc98406d,
0xdcef0798235f321a, 0x3d639263f55e0b1e, 0xe273fd977edb8fda, 0x418f027049d10fe7,
0x8c25fda3f253a284, 0x2cbaed4dc25a884e, 0x5f58e6aff78dc2af, 0x284650ac6fb9d206,
0x635b337f1391c13c, 0x9f9a036f1ac6361f, 0xb93e260cff6747b4, 0xb0a7eae8c7272e33,
0xd0762cbce7da0a9f, 0x34c6efb829c754d6, 0x40bf0ab6166855c1, 0xb6b570fccc46a242,
0x5a27b90055549545, 0xb1a5b166048b306f, 0x8722e0ad24f1006d, 0x788ee3b3b315049a,
0x14a726661e5b0351, 0x98b7672fe1c3f13e, 0xbb93ae77bdc3aa8f, 0x28fd3b04756fc222,
0x30a46805a86d7109, 0x337dc00c7844a0e7, 0xd5eca245253c861b, 0x77626382990d8546,
0xc1e434bf33c3ae7a, 0x0299351a54dbf35e, 0xb2d456e4fb620184, 0x3e9ed1fdc00265ea,
0x2972a92bb672e8db, 0x20216dd789f333ec, 0xadffe8cf746494a1, 0x1c4dbb1c5889d420,
0x15a16a8a8c9972f5, 0x388a128b98960e26, 0x2300e5d6ca3e5589, 0x2f63aa865c9ceb9f,
0xf1c36ce8d894420f, 0x271811252953f84a, 0xe5840293d5466a8e, 0x4d9bbc3e24e5f20e,
0xea35bc29cfa2794b, 0x18e21b4bf59e2d28, 0x1e3b9fc632ef6adb, 0x25d643627a05e678,
0x5a3f1bb1ecb63263, 0xdb7f0238ca031e31, 0xb462065960bfc4c4, 0x49c24ae463c280f4,
0xd793862c6f7b901a, 0xaadd1106bdce475e, 0xc43b6e0eed8ad58f, 0xe29024c1f2060cb7,
0x5e50c2755efbe17a, 0x10383f20ac183625, 0x38e8ee9d8a8a435d, 0xdd511837bcc52452,
0x7750059861a7da6a, 0x86ab99b518d1dbef, 0xb1204f608ccfe33b, 0xef61ac84d8dfca49,
0x1bbcd90f1f4eff36, 0x0cd1dabd9be9850a, 0x11a3ae5bf354bb11, 0xf755bfef11bb5516,
0xa3b832506e2f3adb, 0x516306f4b617e6ba, 0xddb4ac4a2aeead3a, 0x64bb6dec62af4430,
0xf9cc95c29895a152, 0x08d37f75632771b9, 0xeec49b619cee6b56, 0xf143933b56b3711a,
0xe4c5dd82b9f6570c, 0xe7ad775756eefdc4, 0x92c2318bc834ef78, 0x739c25f93007aa0a,
0x5636caca1725f788, 0xdd8f909af47cd0b6, 0xc6401fe16bc24d4e, 0x8ad97b342e6b3a3c,
0x0c49366bb7be8ce2, 0x0784d3d2f4b39fb5, 0x530fb67ec5d77a58, 0x41049229b8221f3b,
0x139542347cb606a3, 0x9cb0bd5ee62e6438, 0x02e3f615c4d3054a, 0x985d4f4adefb64a0,
0x775b9feb32053cde, 0x304265a64d6c1ba6, 0x593664c3be7acd42, 0x4f0a2e5fd2bd6718,
0xdd611f10619bf1da, 0xd8185f9b3e74f9a4, 0xef87139d126ec3b3, 0x3ba71336dd67f99b,
0x7d3a455d8d808091, 0x660d32e15cbdecc7, 0x297a863f5af2b9ff, 0x90e0a736e6b434df,
0x549f80ce7a12182e, 0x0f73b29235fb5b84, 0x16bf1f74056e3a01, 0x6d1f5a593019a39f,
0x02ff876fa73f6305, 0xc5cb72a2fb9a5bd7, 0x8470f39d674dfaa3, 0x25abb3f1e41aea30,
0x23eb8cc9c32951c7, 0xd687ba56242ac4ea, 0xda8d9e915d2de6b7, 0xe3cbdc7d938d8f1e,
0xb9a8c9b4001efad6, 0xc0d28a5c64f2285c, 0x45d7ac9b878575b8, 0xeeb76e39d8da283e,
0x3d06c8bd2fc7daac, 0x9c9c9820c13589f5, 0x65700b51db40bae3, 0x911f451579044242,
0x7ae6849ff1fee8cc, 0x3bb340ebba896ae5, 0xb46e9d8bb71f0b4b, 0x8dcf22f9e1bde2a3,
0x77bdaeda8cc55427, 0xf19e400ababa0e12, 0xc368a34939eb5c7f, 0x9ef1cd612c03bc5e,
0xe89cd8553b94bbd8, 0x5cd377dcb4550713, 0xa7b0fb78cd4c5665, 0x7684403ef76c7128,
0x5fa3f06f79c4f483, 0x8df57ac159dbade6, 0x2db01efa321b2625, 0x54846de4cfd58cb6,
0xba674538aa20f5cd, 0x541d4963699f9777, 0xe9096784dadaa548, 0xdfe8992458bf85ff,
0xece5a71e74a35593, 0x5ff98fd5ff1d14fd, 0x83e89419524c06e1, 0x5922040b6ef03286,
0xf97d750eab002858, 0x5080d4c2dba7b3ec, 0xa7de115ba038b508, 0x6a9242acb5f37ec0,
0xf7856ef865619ed0, 0x2265fc930dbd7a89, 0x17dfc8e5022c723b, 0x9001a64248f2d676,
0x90004c13b0b8b50e, 0xb932b7cfc63485b0, 0xa0b1df81fd4c2bc5, 0x8ef1dd26b594c383,
0x0541a4f9d20ba562, 0x9e611061be0a3c5b, 0xb3767e80e1e1624a, 0x0098d57820a88c6b,
0x31d191cd71e01691, 0x410fefafbf90a57a, 0xbdf8f2433633aea8, 0x9e8cd55b9cc11c28,
0xde122bec4acb869f, 0x4d001fd5b0b03314, 0xca66370067416209, 0x2f2339d6399888c6,
0x6d1a7918f7c98a13, 0xdf9a493995f688f3, 0xebc2151f4ded22ca, 0x03cc2ba8a2bab82f,
0xd341d03844ad9a9b, 0x387cb5d273ab3f58, 0xbba2515f74a7a221, 0x7248fe7737f37d9c,
0x4d61e56a7437f6b9, 0x262e963c9e54bef8, 0x59e89b097477d296, 0x055d5b52b9e47452,
0x82b27eb36e430708, 0xd30094caf3080f94, 0xcf5cb38227c2a3be, 0xfeed4db701262c7c,
0x41703f5391dd0154, 0x5eeea9412666f57b, 0x4cd1f1b196abdbc4, 0x4a20358594b3662b,
0x1478d361e4b47c26, 0x6f02dc0801d2c79f, 0x296a202eeb03c4b6, 0x2afd6799aec20c38,
0x7acfd96f3050383d, 0x6798ba0c380dfdd3, 0x34c6f57b3de02c88, 0x5736e1baf82eb8a0,
0x20057d2a0e58b8de, 0x3dea5bd5eb6e1404, 0x16e50d89874a6a98, 0x29bff3eccbfba19a,
0x475cd3207974793c, 0x18a42105cde34cfa, 0x023e7414b0618331, 0x151471081b52594b,
0xe4a3dff23bdeb0f3, 0x01a8d1a588c232ef, 0x11b4c74ee221d621, 0xe587cc0dce129c8c,
0x1ff7327025a65080, 0x594e29c44b8602b1, 0xf6f31db1f5a56fd3, 0xc02ac5e4c7258a5e,
0xe70201e9c5dc598f, 0x6f90ff3b9b3560b2, 0x42747a7262faf016, 0xd1f507e496927d26,
0x1c86d265fdd24cd9, 0x3996ce73f6b5266e, 0x8e7fba02d68a061e, 0xba0dec71548b7546,
0x9e9cbd785b8d8f40, 0xdae86459f6b3828c, 0xdebe08541314f71d, 0xa49229d29501358f,
0x7be5ba0010c4df7c, 0xa3c95eaf09ecc39c, 0x0230bca8f5d457cd, 0x4135c2bedc68cdf9,
0x166fc0cc4d5b20cc, 0x3762b59aa3236e6e, 0xe8928a4ceed163d2, 0x2a440b51b71223d9,
0x80cefd2bb5f48e46, 0xbb9879c738328b71, 0x6e7c8f1ab47cced0, 0x164bb2de257ffc0a,
0xf3c12fe5b800ea30, 0x40b9e92309e8c7e1, 0x551f5b0fe3b8d017, 0x25032aa7d4fc7aba,
0xaaed340795de0a0a, 0x8ffd96bc38c8ba0f, 0x70fc91eb8aa58833, 0x7f795e2a97566d73,
0x4543d9df72c4831d, 0xf172d73e69f20739, 0xdfd1c4ff1eb3d868, 0xbc8dfb62d26376f7,
0xb585f766f2144405, 0x7746a55f43921ad7, 0xb2fb0d31cee799b4, 0x0f6760a4803427d7,
0xe10d666650f4e012, 0x8cae14cb07d09bf1, 0xd438539c95f63e9f, 0xef781c7ce35b4c3d,
0xcdc4a239b0c44426, 0x277fa208bf337bff, 0xe17653a29da578a1, 0xc54302f225db2c76,
0x86287821f722c881, 0x59cd1a8a41c18e55, 0xc3b919ad495dc574, 0xa484c4c5ef6a0781,
0x308bbd23dc5416cc, 0x6e4a40c18f30c09c, 0x9a2eedb70d8f8cfa, 0xe360c6e0ae486f38,
0xd5c7718fbfc647fb, 0xc35eae071903ff0b, 0x849c2656969c4be7, 0xc0572c8c08cbbbad,
0xe9fa634a21de0082, 0xf56f6d48959a600d, 0xf7d713e806391165, 0x8297132b32825daf,
0xad6805e0e30b2c8a, 0xac51d9f5fcf8535e, 0x502ad7dc18c2ad87, 0x57a1550c110b3041,
0x66bbd30e6ce0e583, 0x0da2abef589d644e, 0xf061274fdb150d61, 0x28b8ec3ae9c29633,
0x92a756e67e2b9413, 0x70e741ebfee96586, 0x019d5ee2af82ec1c, 0x6f6f2ed772466352,
0x7cf416cfe7e14ca1, 0x61df517b86a46439, 0x85dc499b11d77b75, 0x4b959b48b9c10733,
0xe8be3e5da8043e57, 0xf5c0bc1de6da8699, 0x40b12cbf09ef74bf, 0xa637093ecb2ad631,
0x3cc3f892184df408, 0x2e479dc157bf31bb, 0x6f49de07a6234346, 0x213ce7bede378d7b,
0x5b0431345d4dea83, 0xa2de45780344d6a1, 0x7103aaf94a7bf308, 0x5326fc0d97279301,
0xa9ceb74fec024747, 0x27f8ec88bb21b1a3, 0xfceb4fda1ded0893, 0xfac6ff1346a41675,
0x7131aa45268d7d8c, 0x9351036095630f9f, 0xad535b24afc26bfb, 0x4627f5c6993e44be,
0x645cf794b8f1cc58, 0x241c70ed0af61617, 0xacb8e076647905f1, 0x3737e9db4c4f474d,
0xe7ea5e33e75fffb6, 0x90dee49fc9bfc23a, 0xd1b1edf76bc09c92, 0x0b65481ba645c602,
0x99ad1aab0814283b, 0x438a7c91d416ca4d, 0xb60de3bcc5ea751c, 0xc99cab6aef6f58bc,
0x69a5ed92a72ee4ff, 0x5e7b329c1ed4ad71, 0x5fc0ac0800144885, 0x32db829239774eca,
0x0ade699c5830f310, 0x7cc5583b10415f21, 0x85df9ed2e166d64f, 0x6604df4fee32bcb1,
0xeb84f608da56ef48, 0xda608834c40e603d, 0x8f97fe408061f183, 0xa93f485c96f37b89,
0x6704e8ee8f18d563, 0xcee3e9ac1e072119, 0x510d0e65e2b470c1, 0xf6323f486b9038f0,
0x0b508cdeffa5ceef, 0xf2417089e4fb3cbd, 0x60e75c2890d15730, 0xa6217d8bf660f29c,
0x7159cd30c3ac118e, 0x839b4e8fafead540, 0x0d3f3e5e82920adc, 0x8f7d83bddee7bba8,
0x780f2243ea071d06, 0xeb915845f3de1634, 0xd19e120d26b6f386, 0x016ee53a7e5fecc6,
0xcb5fd54e7933e477, 0xacb8417879fd449f, 0x9c22190be7f74732, 0x5d693c1ba3ba3621,
0xdcef0797c2b69ec7, 0x3d639263da827b13, 0xe273fd971bc8d0e7, 0x418f02702d227ed5,
0x8c25fda3b503038c, 0x2cbaed4daec8c07c, 0x5f58e6afcdd6ddc2, 0x284650ac5e1b0eba,
0x635b337ee819dab5, 0x9f9a036ed4f2d49f, 0xb93e260cae5c170e, 0xb0a7eae879ddb76d,
0xd0762cbc8ca6570c, 0x34c6efb812b04bf5, 0x40bf0ab5fa14c112, 0xb6b570fc7c5740d3,
0x5a27b9002de33454, 0xb1a5b165b6d2b2d2, 0x8722e0ace9d1be22, 0x788ee3b37e5680fb,
0x14a726661551e284, 0x98b7672f9ef3b419, 0xbb93ae776bb30e3a, 0x28fd3b046380f850,
0x30a4680593258387, 0x337dc00c61bd9ce1, 0xd5eca244c7a4ff1d, 0x7762638264d279bd,
0xc1e434bedeefd767, 0x0299351a53b8ec22, 0xb2d456e4ad251b80, 0x3e9ed1fda49cea0b,
0x2972a92ba450bed8, 0x20216dd77be493de, 0xadffe8cf28449ec6, 0x1c4dbb1c4c27d243,
0x15a16a8a8322d458, 0x388a128b7fd9a609, 0x2300e5d6baedf0fb, 0x2f63aa8647e15104,
0xf1c36ce86ecec269, 0x27181125183970c9, 0xe584029370dca96d, 0x4d9bbc3e02f1cfb2,
0xea35bc29692af6f8, 0x18e21b4beabb4137, 0x1e3b9fc625b554f4, 0x25d64362697828fd,
0x5a3f1bb1c53a9645, 0xdb7f023869fb8d38, 0xb462065911d4e1fc, 0x49c24ae4437d8030,
0xd793862c112b0566, 0xaadd1106730d8feb, 0xc43b6e0e97b0d568, 0xe29024c18ee6fca2,
0x5e50c27535b88c66, 0x10383f20a4ff9a87, 0x38e8ee9d71a45af8, 0xdd5118375bf1a9b9,
0x775005982d74d7f7, 0x86ab99b4dde6c8b0, 0xb1204f603f51c080, 0xef61ac8470250ecf,
0x1bbcd90f132c603f, 0x0cd1dabd964db557, 0x11a3ae5beb9d1ec9, 0xf755bfeea585d11d,
0xa3b83250268ea4d7, 0x516306f4927c93af, 0xddb4ac49c9efa1da, 0x64bb6dec369d4418,
0xf9cc95c22b4c1fcc, 0x08d37f755f4ae9f6, 0xeec49b613478675b, 0xf143933aed25e0b0,
0xe4c5dd8255dfc622, 0xe7ad7756f193198e, 0x92c2318b87fff9cb, 0x739c25f8fd73596d,
0x5636cac9f16dfed0, 0xdd8f909a938e0172, 0xc6401fe115063f5b, 0x8ad97b33f1ac1455,
0x0c49366bb25e8513, 0x0784d3d2f1698309, 0x530fb67ea1809a81, 0x410492299bb01f49,
0x139542347424b9ac, 0x9cb0bd5ea1a1115e, 0x02e3f615c38f49a1, 0x985d4f4a9c5291ef,
0x775b9feafdcd26e7, 0x304265a6384f0f2d, 0x593664c39773012c, 0x4f0a2e5fb028f2ce,
0xdd611f1000c17442, 0xd8185f9adfea4fd0, 0xef87139ca9a3ab1e, 0x3ba71336c34ee133,
0x7d3a455d56b70238, 0x660d32e130182684, 0x297a863f48cd1f43, 0x90e0a736a751ebb7,
0x549f80ce550c4fd3, 0x0f73b2922f38bd64, 0x16bf1f73fb7a9c3f, 0x6d1f5a59005bec17,
0x02ff876fa5ef97c4, 0xc5cb72a2a51159b0, 0x8470f39d2d5c900e, 0x25abb3f1d39fcb76,
0x23eb8cc9b372442f, 0xd687ba55c64f6364, 0xda8d9e90fd8ff158, 0xe3cbdc7d2fe45ea7,
0xb9a8c9b3aee52297, 0xc0d28a5c10960bd3, 0x45d7ac9b68f71a34, 0xeeb76e397069e804,
0x3d06c8bd1514e2d9, 0x9c9c98207cb10767, 0x65700b51aedfb5ef, 0x911f451539869408,
0x7ae6849fbc3a0ec6, 0x3bb340eba06afe7e, 0xb46e9d8b682ea65e, 0x8dcf22f9a3b34356,
0x77bdaeda586257a7, 0xf19e400a5104d20d, 0xc368a348e46d950f, 0x9ef1cd60e679f284,
0xe89cd854d5d01d33, 0x5cd377dc8bb882a2, 0xa7b0fb7883eee860, 0x7684403ec392950d,
0x5fa3f06f4fed3b52, 0x8df57ac11bc04831, 0x2db01efa1e1e1897, 0x54846de4aadb9ca2,
0xba6745385893c784, 0x541d496344d2c75b, 0xe909678474e687fe, 0xdfe89923f6c9c2ff,
0xece5a71e0cfedc75, 0x5ff98fd5d51fe610, 0x83e8941918964615, 0x5922040b47f150c1,
0xf97d750e3dd94521, 0x5080d4c2b86f56d7, 0xa7de115b56c78d70, 0x6a9242ac87538194,
0xf7856ef7f9173e44, 0x2265fc92feb0dc09, 0x17dfc8e4f7ba8a57, 0x9001a64209f21db8,
0x90004c1371b893c5, 0xb932b7cf752e5545, 0xa0b1df81b6fe59fc, 0x8ef1dd26770af2c2,
0x0541a4f9cfbeed35, 0x9e61106178bfc530, 0xb3767e80935d8af2, 0x0098d5782065af06,
0x31d191cd5c1466c7, 0x410fefafa319ac9d, 0xbdf8f242e316c4ab, 0x9e8cd55b57637ed0,
0xde122bebe9a39368, 0x4d001fd58f002526, 0xca6637000eb4a9f8, 0x2f2339d624f91f78,
0x6d1a7918c80df518, 0xdf9a4939342308e9, 0xebc2151ee6c8398c, 0x03cc2ba8a1116515,
0xd341d037e840cf83, 0x387cb5d25af4afcc, 0xbba2515f22909e87, 0x7248fe7705f38e47,
0x4d61e56a525d225a, 0x262e963c8da05d3d, 0x59e89b094d220ec2, 0x055d5b52b78b9c5e,
0x82b27eb33514ef99, 0xd30094ca96b7ce7b, 0xcf5cb381cd0a1535, 0xfeed4db6919e5a7c,
0x41703f53753be59f, 0x5eeea940fcde8b6f, 0x4cd1f1b175100206, 0x4a20358574454ec0,
0x1478d361dbbf9fac, 0x6f02dc07d141875c, 0x296a202ed8e556a2, 0x2afd67999bf32ee5,
0x7acfd96efa95491d, 0x6798ba0c0abb2c6d, 0x34c6f57b26c92122, 0x5736e1bad206b5de,
0x20057d2a0056521b, 0x3dea5bd5d0578bd7, 0x16e50d897d4634ac, 0x29bff3ecb9b7a6e3,
0x475cd3205a3bdcde, 0x18a42105c31b7e88, 0x023e7414af663068, 0x15147108121967d7,
0xe4a3dff1d7d6fef9, 0x01a8d1a588085737, 0x11b4c74eda62beef, 0xe587cc0d69a73346,
0x1ff7327017aa2a6e, 0x594e29c42473d06b, 0xf6f31db1899b12d5, 0xc02ac5e47312d3ca,
0xe70201e960cb78b8, 0x6f90ff3b6a65f108, 0x42747a7245e7fa84, 0xd1f507e43ab749b2,
0x1c86d265f15750cd, 0x3996ce73dd832c1c, 0x8e7fba02983224bd, 0xba0dec7103255dd4,
0x9e9cbd781628fc5b, 0xdae8645996edd6a5, 0xdebe0853b1a1d378, 0xa49229d24d014343,
0x7be5b9ffda905e1c, 0xa3c95eaec244aa30, 0x0230bca8f4df0544, 0x4135c2bebfe148c6,
0x166fc0cc438a3c72, 0x3762b59a8ae83efa, 0xe8928a4c89114750, 0x2a440b51a4945ee5,
0x80cefd2b7d99ff83, 0xbb9879c6e61fd62a, 0x6e7c8f1a84265034, 0x164bb2de1bbeddc8,
0xf3c12fe54d5c653b, 0x40b9e922ed9771e2, 0x551f5b0fbe7b1840, 0x25032aa7c4cb1811,
0xaaed34074b164346, 0x8ffd96bbf9c9c81d, 0x70fc91eb5937085c, 0x7f795e2a5f915440,
0x4543d9df5476d3cb, 0xf172d73e004fc90d, 0xdfd1c4febcc81238, 0xbc8dfb627fe558fc,
];
const WIDTH: usize = SPONGE_WIDTH;
@@ -153,9 +153,10 @@ pub trait Poseidon: PrimeField64 {
// times number of rounds.
const N_ROUND_CONSTANTS: usize = WIDTH * N_ROUNDS;
// Use the MDS matrix which is circulant with entries 2^x for each
// x in MDS_MATRIX_EXPS.
const MDS_MATRIX_EXPS: [u64; WIDTH];
// The MDS matrix we use is C + D, where C is the circulant matrix whose first row is given by
// `MDS_MATRIX_CIRC`, and D is the diagonal matrix whose diagonal is given by `MDS_MATRIX_DIAG`.
const MDS_MATRIX_CIRC: [u64; WIDTH];
const MDS_MATRIX_DIAG: [u64; WIDTH];
// Precomputed constants for the fast Poseidon calculation. See
// the paper.
@@ -169,9 +170,10 @@ pub trait Poseidon: PrimeField64 {
#[unroll_for_loops]
fn mds_row_shf(r: usize, v: &[u64; WIDTH]) -> u128 {
debug_assert!(r < WIDTH);
// The values of MDS_MATRIX_EXPS are known to be small, so we can
// accumulate all the products for each row and reduce just once
// at the end (done by the caller).
// The values of `MDS_MATRIX_CIRC` and `MDS_MATRIX_DIAG` are
// known to be small, so we can accumulate all the products for
// each row and reduce just once at the end (done by the
// caller).
// NB: Unrolling this, calculating each term independently, and
// summing at the end, didn't improve performance for me.
@@ -180,9 +182,10 @@ pub trait Poseidon: PrimeField64 {
// This is a hacky way of fully unrolling the loop.
for i in 0..12 {
if i < WIDTH {
res += (v[(i + r) % WIDTH] as u128) << Self::MDS_MATRIX_EXPS[i];
res += (v[(i + r) % WIDTH] as u128) * (Self::MDS_MATRIX_CIRC[i] as u128);
}
}
res += (v[r] as u128) * (Self::MDS_MATRIX_DIAG[r] as u128);
res
}
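A quick bound behind the "reduce just once at the end" comment above, using the Goldilocks constants that appear later in this diff: every state element is below 2^64, the circulant entries are at most 41 < 2^6, and the only nonzero diagonal entry is 8, so one row accumulates at most 12 * 41 * 2^64 + 8 * 2^64 < 2^73, which fits comfortably in the `u128` accumulator before the caller's single reduction.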
@@ -196,8 +199,9 @@ pub trait Poseidon: PrimeField64 {
let mut res = F::ZERO;
for i in 0..WIDTH {
res += v[(i + r) % WIDTH] * F::from_canonical_u64(1 << Self::MDS_MATRIX_EXPS[i]);
res += v[(i + r) % WIDTH] * F::from_canonical_u64(Self::MDS_MATRIX_CIRC[i]);
}
res += v[r] * F::from_canonical_u64(Self::MDS_MATRIX_DIAG[r]);
res
}
@@ -215,9 +219,13 @@ pub trait Poseidon: PrimeField64 {
let mut res = builder.zero_extension();
for i in 0..WIDTH {
let c = Self::from_canonical_u64(1 << <Self as Poseidon>::MDS_MATRIX_EXPS[i]);
let c = Self::from_canonical_u64(<Self as Poseidon>::MDS_MATRIX_CIRC[i]);
res = builder.mul_const_add_extension(c, v[(i + r) % WIDTH], res);
}
{
let c = Self::from_canonical_u64(<Self as Poseidon>::MDS_MATRIX_DIAG[r]);
res = builder.mul_const_add_extension(c, v[r], res);
}
res
}
@@ -395,7 +403,8 @@ pub trait Poseidon: PrimeField64 {
}
}
let s0 = state[0].to_noncanonical_u64() as u128;
d_sum = add_u160_u128(d_sum, s0 << Self::MDS_MATRIX_EXPS[0]);
let mds0to0 = (Self::MDS_MATRIX_CIRC[0] + Self::MDS_MATRIX_DIAG[0]) as u128;
d_sum = add_u160_u128(d_sum, s0 * mds0to0);
let d = reduce_u160::<Self>(d_sum);
// result = [d] concat [state[0] * v + state[shift up by 1]]
@@ -416,7 +425,8 @@ pub trait Poseidon: PrimeField64 {
r: usize,
) -> [F; WIDTH] {
let s0 = state[0];
let mut d = s0 * F::from_canonical_u64(1 << Self::MDS_MATRIX_EXPS[0]);
let mds0to0 = Self::MDS_MATRIX_CIRC[0] + Self::MDS_MATRIX_DIAG[0];
let mut d = s0 * F::from_canonical_u64(mds0to0);
for i in 1..WIDTH {
let t = F::from_canonical_u64(Self::FAST_PARTIAL_ROUND_W_HATS[r][i - 1]);
d += state[i] * t;
@@ -442,10 +452,8 @@ pub trait Poseidon: PrimeField64 {
Self: RichField + Extendable<D>,
{
let s0 = state[0];
let mut d = builder.mul_const_extension(
Self::from_canonical_u64(1 << <Self as Poseidon>::MDS_MATRIX_EXPS[0]),
s0,
);
let mds0to0 = Self::MDS_MATRIX_CIRC[0] + Self::MDS_MATRIX_DIAG[0];
let mut d = builder.mul_const_extension(Self::from_canonical_u64(mds0to0), s0);
for i in 1..WIDTH {
let t = <Self as Poseidon>::FAST_PARTIAL_ROUND_W_HATS[r][i - 1];
let t = Self::Extension::from_canonical_u64(t);

View File

@@ -10,8 +10,8 @@ use crate::hash::poseidon::{Poseidon, N_PARTIAL_ROUNDS};
#[rustfmt::skip]
impl Poseidon for GoldilocksField {
// The MDS matrix we use is the circulant matrix with first row given by the vector
// [ 2^x for x in MDS_MATRIX_EXPS] = [1, 1, 2, 1, 8, 32, 2, 256, 4096, 8, 65536, 1024]
// The MDS matrix we use is C + D, where C is the circulant matrix whose first row is given by
// `MDS_MATRIX_CIRC`, and D is the diagonal matrix whose diagonal is given by `MDS_MATRIX_DIAG`.
//
// WARNING: If the MDS matrix is changed, then the following
// constants need to be updated accordingly:
@@ -19,237 +19,238 @@ impl Poseidon for GoldilocksField {
// - FAST_PARTIAL_ROUND_VS
// - FAST_PARTIAL_ROUND_W_HATS
// - FAST_PARTIAL_ROUND_INITIAL_MATRIX
const MDS_MATRIX_EXPS: [u64; 12] = [0, 0, 1, 0, 3, 5, 1, 8, 12, 3, 16, 10];
const MDS_MATRIX_CIRC: [u64; 12] = [17, 15, 41, 16, 2, 28, 13, 13, 39, 18, 34, 20];
const MDS_MATRIX_DIAG: [u64; 12] = [8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
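With these constants, the `MDS_MATRIX_CIRC[0] + MDS_MATRIX_DIAG[0]` term used by the partial-round hunks in the previous file is 17 + 8 = 25, i.e. the (0,0) entry of C + D; it replaces the old `1 << MDS_MATRIX_EXPS[0]`, which was 1.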
const FAST_PARTIAL_FIRST_ROUND_CONSTANT: [u64; 12] = [
0x3cc3f89232e3b0c8, 0x3a8304bc56985013, 0x2a9f75c2280d2a8e, 0x53b9e0fac07c9b2b,
0x276ef5190ab36dd6, 0xdccc95c1f434ce8d, 0x28d717d689301db6, 0x2662f1723650b872,
0xc6b0375cf47850da, 0xbdfcca7661d81f17, 0x911992a4f6d9591f, 0xb718e4720c9f542f,
0x3cc3f892184df408, 0xe993fd841e7e97f1, 0xf2831d3575f0f3af, 0xd2500e0a350994ca,
0xc5571f35d7288633, 0x91d89c5184109a02, 0xf37f925d04e5667b, 0x2d6e448371955a69,
0x740ef19ce01398a1, 0x694d24c0752fdf45, 0x60936af96ee2f148, 0xc33448feadc78f0c,
];
const FAST_PARTIAL_ROUND_CONSTANTS: [u64; N_PARTIAL_ROUNDS] = [
0x1c92804be083d129, 0x81d932f4620fcfc6, 0x29f58a72045f76a0, 0x434472d6c6e34f30,
0xc82c90fad781bb5c, 0xe6dfefae3135c450, 0xd0a0c9c9fff4798f, 0x97517f4034e7c8e6,
0xae8b5030952e5949, 0xf77251b77cc297e2, 0x879c3a97606f1160, 0xed4e1e98780bdc19,
0x5a9120e0c05b1660, 0xc4b244ea04b27221, 0x7fe9d55a335d7b82, 0xd69ff91c66ec999a,
0x4c389b1b8180f1f5, 0x1b289f8c7fdeea1e, 0x3d464c75140b20e7, 0x74d158e1be40eb73,
0xfc787193d2a84ea4, 0x0,
0x74cb2e819ae421ab, 0xd2559d2370e7f663, 0x62bf78acf843d17c, 0xd5ab7b67e14d1fb4,
0xb9fe2ae6e0969bdc, 0xe33fdf79f92a10e8, 0x0ea2bb4c2b25989b, 0xca9121fbf9d38f06,
0xbdd9b0aa81f58fa4, 0x83079fa4ecf20d7e, 0x650b838edfcc4ad3, 0x77180c88583c76ac,
0xaf8c20753143a180, 0xb8ccfe9989a39175, 0x954a1729f60cc9c5, 0xdeb5b550c4dca53b,
0xf01bb0b00f77011e, 0xa1ebb404b676afd9, 0x860b6e1597a0173e, 0x308bb65a036acbce,
0x1aca78f31c97c876, 0x0,
];
const FAST_PARTIAL_ROUND_VS: [[u64; 12 - 1]; N_PARTIAL_ROUNDS] = [
[0x9a5dd25dc32e6569, 0xd4b82de00e7510fa, 0x165bdcd7b344404a, 0xa85b4c126b8edfd4,
0xcd2735bf92ab4f96, 0xdc07742c7da8ac41, 0x953fc266fc5ae49f, 0x0a151c20bfc847bf,
0x0c550caef5afedb5, 0x74d28901888c5fa8, 0xdc51b68c30cc1741, ],
[0x4f765e0a4246c828, 0xbbdc8cbadd477a84, 0x052a5abd7de2344c, 0xab88daa04d9c7fab,
0xbc8fd7acbee798ef, 0xe55d796c0d8a7a09, 0x40824732ed2c556c, 0x298a94d56eabeaa4,
0x719fcd5e11312b6c, 0x1ec9a560131d1ac7, 0xabc54a42497f7fd1, ],
[0xb51f81e6eeeeb0d6, 0xc6f3c34e7161d1ef, 0x1e93b9e2255eed5b, 0xa78338e63ec48cc2,
0xea6e89d1c7220a56, 0xaa52f6a1c2814bc5, 0x5896b6395e09fba0, 0xf7fc97a18d5f1eee,
0xf2712e64111823e8, 0x4f84821bf1f857f4, 0x02041415d72da206, ],
[0x39286a4a4a391e77, 0x4ac16c7bebc97214, 0x7427cbbcb895a01f, 0x2ef8491d0b14759b,
0xbec7625ee20fa616, 0x7c64393faf749b6f, 0x0f61c751c9826dc5, 0x700e6f3ee8ccb8a7,
0x5bdea3b447ef8667, 0xa0f569a5a6e97588, 0xcc9e78115d7cae2d, ],
[0x0933079ab678e5ee, 0xed6861bf33c54a28, 0x62503e6e1749a497, 0x745a9c65dea83ac6,
0x20ce351f6e700cf0, 0x2ec0b18d30fafb8a, 0x0312f54c22b5f299, 0x5222977218fd6cd5,
0x82662e8445868eec, 0xc4cab6335040265d, 0x12e5790e9efb9217, ],
[0x0d829aec63871f55, 0x384d8a425086dd8c, 0x13e78b54657bfd3e, 0x2a45a17a03093566,
0x7b6872656233b9be, 0xddc0281bb12bbb4c, 0xa224ebff0652d7c8, 0xc5ca97207780ea5c,
0x484236194d3586ba, 0x432a56d44a44f3f7, 0xc41f926f862fc532, ],
[0x9366cd7ed9ef5e06, 0xd7f941098175f223, 0x9af7dda3e1c9f2b1, 0x9a0ec6d0a03525f5,
0x3ab244f4fb0fb387, 0xd8c4e357eb1d5778, 0xe62157e2e25edbbb, 0xafcd6630f841f1f8,
0xc3969199738708fb, 0xa8224d311e6a551f, 0xc2c0a01fc655fd9f, ],
[0xd78498f2013cd9b6, 0x675d21a200b2908c, 0x70bfd23b9e88c707, 0x85472dcbcfd078e3,
0x5658c961cfffd574, 0x89e05a2cda3ca315, 0x1b51ae1ff8186a9f, 0xca648f8c6c7822cb,
0x7233c92647957f4d, 0x520bf21c62d37ffa, 0x897496c7407a2ca7, ],
[0x8e80cf5bca4eee19, 0x754779126bc1afcf, 0x07e887764b379cb0, 0x7dc7c14e12f91d5e,
0xc8f5dab5fb6b0264, 0x1c842cf8021f9176, 0x69b56a7e2e2db2c0, 0xf30253f77fef3445,
0x14bb3a62919efb99, 0xff9976d424a5d89c, 0x59dde7be0331a202, ],
[0xdbe04b62126330a2, 0x0409b2138da1eaec, 0x7bd4558eb2262691, 0xafa86cfa8d52b05b,
0xb83f570197d8c584, 0xb3ded6cc13990ac1, 0xfd33937cb072c9e1, 0xe3b3989341d92952,
0xd26e76d6ca949ad9, 0x35c89a8548f88e86, 0x8af785bd940c3b43, ],
[0xcbf3b86701c790da, 0x63634f67e29f4005, 0x008f903982363b81, 0xc2b07f99d6eb0229,
0xa8344b83d15e2558, 0x880f4e5fd103b7b0, 0xd40eddb0a5929072, 0x476e27ccee571f49,
0xe71439b4b989f9eb, 0x97e55074f852b2fe, 0xdd258c2137e1a2c5, ],
[0x982b90366d23259b, 0xb2667eacaa76b306, 0xecf233e82020ede1, 0x3cee7ac07d4a88c7,
0x31428be2fe5a5854, 0xf1beea1d55c4c4db, 0x584fd6b580f1ffd2, 0x6e2381c3c8ba0d0b,
0x21ab749cbafc0611, 0x8ed389f39aba3001, 0xa24ba694f2b42f13, ],
[0xdb30cd9db02606f9, 0x1b0d6736682ba257, 0x0d3bcdecf5808443, 0x31c330001dbd3dbd,
0x9684d22370447946, 0xde0e24e6426c6935, 0xf487270dd081ef69, 0xd943f4ef48f2b252,
0x4c52a7fdd1c52d24, 0xc293082029ea139d, 0xc2ba73ab3da0468a, ],
[0xd093bd0dcc74e0d1, 0xe91428f9ce6a98e5, 0x673dee716909dc21, 0xf22e3223548219d7,
0x3297978d881a1300, 0x51157b1e8218d77c, 0x0e3b0a5c07843889, 0x273b48dfa36752b6,
0x5dbf2c6323576866, 0x1c032b70763df9a7, 0x1a8d7ed4159ecbf4, ],
[0x8e40b29fa6c4f3ad, 0x43bc06dba91daa9b, 0x445df1620dd6d846, 0xae1e72ed68c45c46,
0x496ee4e593ade46d, 0x1d3642eddce9118f, 0x71a88114bd8fd755, 0x4a10d6b22514943d,
0x56dca305d4d72fee, 0xe2e4d9ce95fa62bf, 0xfb6bfffd47b50b0a, ],
[0x4c6c14946cc557ee, 0x9b1bcbaac7ba3226, 0xdd7410361fa0dd20, 0x9c8a098cbaf95b26,
0x3da4f26593503adf, 0xffb07b45cd3bf859, 0xaf034373af54a559, 0xd6b9bace407146bb,
0x7b92c04c972f4ec6, 0xfe71df71165b9845, 0xad0134b9dc9ebe51, ],
[0xfdaa64ceec88aa7c, 0x565342e2d815525c, 0xe382458f259429a8, 0x0f6ba5afd5d1d1ca,
0xcba85de412439a41, 0x212d3c62049ccb1a, 0x930c0bf5950267e3, 0x60f87fe43fc560d8,
0x8f1fbdbcd878a33b, 0xd28b789abf9af16f, 0xd921f0434fa0eb07, ],
[0xd69c2c80635e7c18, 0x5a3d78c8772f293f, 0x844fe5e72ad1ceb5, 0x81b217e5910dc916,
0x2951409fb7c8ba85, 0x5c135dd95693e367, 0xc2e8a723f9f7ebd2, 0x10bb79bf5d63f38d,
0x34625b1550385a89, 0xdc6235328d791163, 0x1eb12b7aed4d5133, ],
[0x01426faca89577d0, 0x003ca90136ac4fd0, 0x00289223dc45a17f, 0x0009921704320612,
0x0007efae3669e451, 0x006499f206b3349d, 0x1001120d9b5dcfe1, 0x000e3aa47db4da94,
0x0320dc8339d35692, 0x4030a0a16247ecbd, 0x04368a659c160a6b, ],
[0x0000001237b408f0, 0x00000004c8f1b79c, 0x0000000446de5309, 0x00000032a3e2d4ac,
0x00000c007600eeb7, 0x000100040ee771b0, 0x00000198394d0817, 0x0000301810a981ba,
0x0000030f37d86f5a, 0x0000030ab1cc04d4, 0x000000c0e7c0b7e9, ],
[0x00000000000234a0, 0x0000000000114630, 0x000000000800260c, 0x0000000100005288,
0x0000000000900194, 0x00000000200800a3, 0x0000000002011034, 0x000000000105100e,
0x0000000000604025, 0x0000000000114a03, 0x0000000000061481, ],
[0x0000000000000400, 0x0000000000010000, 0x0000000000000008, 0x0000000000001000,
0x0000000000000100, 0x0000000000000002, 0x0000000000000020, 0x0000000000000008,
0x0000000000000001, 0x0000000000000002, 0x0000000000000001, ],
[0x94877900674181c3, 0xc6c67cc37a2a2bbd, 0xd667c2055387940f, 0x0ba63a63e94b5ff0,
0x99460cc41b8f079f, 0x7ff02375ed524bb3, 0xea0870b47a8caf0e, 0xabcad82633b7bc9d,
0x3b8d135261052241, 0xfb4515f5e5b0d539, 0x3ee8011c2b37f77c, ],
[0x0adef3740e71c726, 0xa37bf67c6f986559, 0xc6b16f7ed4fa1b00, 0x6a065da88d8bfc3c,
0x4cabc0916844b46f, 0x407faac0f02e78d1, 0x07a786d9cf0852cf, 0x42433fb6949a629a,
0x891682a147ce43b0, 0x26cfd58e7b003b55, 0x2bbf0ed7b657acb3, ],
[0x481ac7746b159c67, 0xe367de32f108e278, 0x73f260087ad28bec, 0x5cfc82216bc1bdca,
0xcaccc870a2663a0e, 0xdb69cd7b4298c45d, 0x7bc9e0c57243e62d, 0x3cc51c5d368693ae,
0x366b4e8cc068895b, 0x2bd18715cdabbca4, 0xa752061c4f33b8cf, ],
[0xb22d2432b72d5098, 0x9e18a487f44d2fe4, 0x4b39e14ce22abd3c, 0x9e77fde2eb315e0d,
0xca5e0385fe67014d, 0x0c2cb99bf1b6bddb, 0x99ec1cd2a4460bfe, 0x8577a815a2ff843f,
0x7d80a6b4fd6518a5, 0xeb6c67123eab62cb, 0x8f7851650eca21a5, ],
[0x11ba9a1b81718c2a, 0x9f7d798a3323410c, 0xa821855c8c1cf5e5, 0x535e8d6fac0031b2,
0x404e7c751b634320, 0xa729353f6e55d354, 0x4db97d92e58bb831, 0xb53926c27897bf7d,
0x965040d52fe115c5, 0x9565fa41ebd31fd7, 0xaae4438c877ea8f4, ],
[0x37f4e36af6073c6e, 0x4edc0918210800e9, 0xc44998e99eae4188, 0x9f4310d05d068338,
0x9ec7fe4350680f29, 0xc5b2c1fdc0b50874, 0xa01920c5ef8b2ebe, 0x59fa6f8bd91d58ba,
0x8bfc9eb89b515a82, 0xbe86a7a2555ae775, 0xcbb8bbaa3810babf, ],
[0x577f9a9e7ee3f9c2, 0x88c522b949ace7b1, 0x82f07007c8b72106, 0x8283d37c6675b50e,
0x98b074d9bbac1123, 0x75c56fb7758317c1, 0xfed24e206052bc72, 0x26d7c3d1bc07dae5,
0xf88c5e441e28dbb4, 0x4fe27f9f96615270, 0x514d4ba49c2b14fe, ],
[0xf02a3ac068ee110b, 0x0a3630dafb8ae2d7, 0xce0dc874eaf9b55c, 0x9a95f6cff5b55c7e,
0x626d76abfed00c7b, 0xa0c1cf1251c204ad, 0xdaebd3006321052c, 0x3d4bd48b625a8065,
0x7f1e584e071f6ed2, 0x720574f0501caed3, 0xe3260ba93d23540a, ],
[0xab1cbd41d8c1e335, 0x9322ed4c0bc2df01, 0x51c3c0983d4284e5, 0x94178e291145c231,
0xfd0f1a973d6b2085, 0xd427ad96e2b39719, 0x8a52437fecaac06b, 0xdc20ee4b8c4c9a80,
0xa2c98e9549da2100, 0x1603fe12613db5b6, 0x0e174929433c5505, ],
[0x3d4eab2b8ef5f796, 0xcfff421583896e22, 0x4143cb32d39ac3d9, 0x22365051b78a5b65,
0x6f7fd010d027c9b6, 0xd9dd36fba77522ab, 0xa44cf1cb33e37165, 0x3fc83d3038c86417,
0xc4588d418e88d270, 0xce1320f10ab80fe2, 0xdb5eadbbec18de5d, ],
[0x1183dfce7c454afd, 0x21cea4aa3d3ed949, 0x0fce6f70303f2304, 0x19557d34b55551be,
0x4c56f689afc5bbc9, 0xa1e920844334f944, 0xbad66d423d2ec861, 0xf318c785dc9e0479,
0x99e2032e765ddd81, 0x400ccc9906d66f45, 0xe1197454db2e0dd9, ],
[0x84d1ecc4d53d2ff1, 0xd8af8b9ceb4e11b6, 0x335856bb527b52f4, 0xc756f17fb59be595,
0xc0654e4ea5553a78, 0x9e9a46b61f2ea942, 0x14fc8b5b3b809127, 0xd7009f0f103be413,
0x3e0ee7b7a9fb4601, 0xa74e888922085ed7, 0xe80a7cde3d4ac526, ],
[0x238aa6daa612186d, 0x9137a5c630bad4b4, 0xc7db3817870c5eda, 0x217e4f04e5718dc9,
0xcae814e2817bd99d, 0xe3292e7ab770a8ba, 0x7bb36ef70b6b9482, 0x3c7835fb85bca2d3,
0xfe2cdf8ee3c25e86, 0x61b3915ad7274b20, 0xeab75ca7c918e4ef, ],
[0xd6e15ffc055e154e, 0xec67881f381a32bf, 0xfbb1196092bf409c, 0xdc9d2e07830ba226,
0x0698ef3245ff7988, 0x194fae2974f8b576, 0x7a5d9bea6ca4910e, 0x7aebfea95ccdd1c9,
0xf9bd38a67d5f0e86, 0xfa65539de65492d8, 0xf0dfcbe7653ff787, ],
[0x0bd87ad390420258, 0x0ad8617bca9e33c8, 0x0c00ad377a1e2666, 0x0ac6fc58b3f0518f,
0x0c0cc8a892cc4173, 0x0c210accb117bc21, 0x0b73630dbb46ca18, 0x0c8be4920cbd4a54,
0x0bfe877a21be1690, 0x0ae790559b0ded81, 0x0bf50db2f8d6ce31, ],
[0x000cf29427ff7c58, 0x000bd9b3cf49eec8, 0x000d1dc8aa81fb26, 0x000bc792d5c394ef,
0x000d2ae0b2266453, 0x000d413f12c496c1, 0x000c84128cfed618, 0x000db5ebd48fc0d4,
0x000d1b77326dcb90, 0x000beb0ccc145421, 0x000d10e5b22b11d1, ],
[0x00000e24c99adad8, 0x00000cf389ed4bc8, 0x00000e580cbf6966, 0x00000cde5fd7e04f,
0x00000e63628041b3, 0x00000e7e81a87361, 0x00000dabe78f6d98, 0x00000efb14cac554,
0x00000e5574743b10, 0x00000d05709f42c1, 0x00000e4690c96af1, ],
[0x0000000f7157bc98, 0x0000000e3006d948, 0x0000000fa65811e6, 0x0000000e0d127e2f,
0x0000000fc18bfe53, 0x0000000fd002d901, 0x0000000eed6461d8, 0x0000001068562754,
0x0000000fa0236f50, 0x0000000e3af13ee1, 0x0000000fa460f6d1, ],
[0x0000000011131738, 0x000000000f56d588, 0x0000000011050f86, 0x000000000f848f4f,
0x00000000111527d3, 0x00000000114369a1, 0x00000000106f2f38, 0x0000000011e2ca94,
0x00000000110a29f0, 0x000000000fa9f5c1, 0x0000000010f625d1, ],
[0x000000000011f718, 0x000000000010b6c8, 0x0000000000134a96, 0x000000000010cf7f,
0x0000000000124d03, 0x000000000013f8a1, 0x0000000000117c58, 0x0000000000132c94,
0x0000000000134fc0, 0x000000000010a091, 0x0000000000128961, ],
[0x0000000000001300, 0x0000000000001750, 0x000000000000114e, 0x000000000000131f,
0x000000000000167b, 0x0000000000001371, 0x0000000000001230, 0x000000000000182c,
0x0000000000001368, 0x0000000000000f31, 0x00000000000015c9, ],
[0x0000000000000014, 0x0000000000000022, 0x0000000000000012, 0x0000000000000027,
0x000000000000000d, 0x000000000000000d, 0x000000000000001c, 0x0000000000000002,
0x0000000000000010, 0x0000000000000029, 0x000000000000000f, ],
];
const FAST_PARTIAL_ROUND_W_HATS: [[u64; 12 - 1]; N_PARTIAL_ROUNDS] = [
[0x54accab273d3aeca, 0x12fecae33b1f1da9, 0x573bb85449ea9a27, 0x6b5ddc139f172aad,
0xd2b6d0ca34465d4c, 0x51cf0aafbddfc269, 0x6075e64679e7a403, 0x678316c041900ac9,
0x10019c84b343fc57, 0xde5b81280922f644, 0x42490a86b2f2f305, ],
[0x337c5930f7bacc46, 0x334792a4f1afb921, 0xc97ea5f1426e540e, 0x5fc74568337bd780,
0xfd5718cc391d80ef, 0xef90b77a337d923c, 0xb28561998f153fea, 0xed5f65b8894345aa,
0x7e2aacb5985893a7, 0xcbde536cb644fcf0, 0x07338300a07fc43b, ],
[0xd4c9ad02fcc8b4c1, 0x2890dac7a1caa815, 0x7d62bc45c45f5db2, 0x0a902300db5deac2,
0x663f3726307f62a4, 0x050bda7dc7d8eb3b, 0xd9db68f3f051c5b6, 0xc5110194a38210aa,
0x403862136533be0e, 0x20039e053d9b227d, 0xe2c90d16262c5f3c, ],
[0x6578da963396c755, 0xea6b546e6bc1e86f, 0x4e562ef0c66c2be3, 0x35b839dae0f9d22e,
0x4aab3d88857b058c, 0x4f7443e07ac462d3, 0x93c2c5bbc385e50f, 0xc0c0c5c8ea023ce2,
0x8409c53d4b62965d, 0x0489f2258135dcd1, 0x32958358c736aec9, ],
[0xe13b50ca15b0a455, 0x9878071e2b5d4547, 0xb8e50d27b4172b30, 0xbf312f828d3ea142,
0x5b8510573020e6e8, 0x7c3091c29d8d6afa, 0x7e2d900a50f194fa, 0xb236d5080d0b0409,
0x08f148b6c3b99320, 0x679c6b9cadbe604c, 0x6b0313be2ad9b9f2, ],
[0x12038ac320459b0e, 0x7abd36c6b25cd8e0, 0x37cc3583930e5a13, 0xafe725c4446a691d,
0x99d89ccadeb38d80, 0x96c820be5528ec36, 0x9b63969fdc84ede6, 0x8f8f21cf5ad78c48,
0x1a4d3573bc3c2d8b, 0x9f5a7bd9e771866e, 0x5bcef938b72497fc, ],
[0x5f969817be6add7a, 0x572b04c1ae5a4c6d, 0x8d219b8fac9a287b, 0x4566b3c56372f434,
0xdd3f46f108bf4441, 0xd7e1469baa3912c4, 0xac36377b68e071fc, 0xf348c609201d771a,
0x0bb926a5e2ebdd96, 0x30efa780aee4705a, 0xb24ff2673691146a, ],
[0x5d0324b3a1dab6e2, 0xbd1491a0cc9e564b, 0xb8699e13b528ef99, 0x7743d9a8753ee023,
0xce577363cdb5bcbc, 0xc056688d4f006774, 0x61f9363c10d7fdf2, 0x5f730e5530f6e06d,
0x25efb9ef3adf0072, 0xcf971d58e21a8aa7, 0xd830d7e8d0d70680, ],
[0x36e69157ac42f39d, 0x3e7aca69ddf62d3e, 0xbbbef86cac42bb30, 0xa2e793ae56c27043,
0x2a315dc4bc40c8a0, 0x84022758f3b3af55, 0x668809e74e7a470d, 0xf2d91eaafdee1820,
0x50f19afd16d03294, 0x30c087d3223bcd4b, 0xf5739d95458cc633, ],
[0x15266b5a75028317, 0x8059f198c9f88799, 0x437a070386c65244, 0xc70e0bb73942929d,
0xa8b32cb37ae137ea, 0xc2e556278323a459, 0xbc486da754091692, 0x7815a23467d6b541,
0x3e6dba4e930e8be6, 0x6b4277b0915d56ba, 0x20212bfac7922ea0, ],
[0xeeba270c067b0c8b, 0xa4d576458941f29a, 0xecdf04a28c8c83be, 0xc808f0af215d7dda,
0x424f4bfbecced0fb, 0xe4cbf6c0c10e58b3, 0x66a87bebfa09c031, 0x614ffc9443d5f0a4,
0x96c96636f7b7975a, 0x58d4222a6f860cc5, 0x2d4f51c75bf50169, ],
[0xab43452aec55310f, 0x0a719e77ec2b398c, 0x8f946888a3f5f74f, 0x7b447e0d9f7ad4fb,
0x7a2887ceb40ef226, 0x8840b904c1c49e50, 0xd91ea2510b0eaddc, 0x6617fa40a1a220fb,
0xb1c41a72a845cb45, 0x02c2715281868092, 0xaf5b1b6c46ca37bd, ],
[0xe27649b9dbcbe631, 0x4afdf11d1d5e73b2, 0x05285a0e99160910, 0x23bfd6197ed8d3ba,
0xb1e6292028792aab, 0xc997f6cc14e05cae, 0x34793ec255a555bd, 0xeb4f2da35a76dd03,
0x767a5552c9910f3a, 0x4c4cc6987c30a447, 0x64da2b6920578f8d, ],
[0xe97ce2fecc0720ac, 0x99fc5741fcdeae8a, 0x0ac47be58b345692, 0x75a446121f2cccda,
0xf38e40a102691c8e, 0xdbe5d707594714ef, 0x6ab183bdab92e450, 0x0aed83850dc10451,
0x66e16941a4373c93, 0x22af15bb3e1034a1, 0xab2136f22ed23ccc, ],
[0xb0d3214d3c4c46c1, 0x3983bffd4053346c, 0xab1239b72a6a9e64, 0x669bcbda2406c089,
0xf3118af8e563feda, 0x58323dbdd43a9c95, 0x5438aa910b51fd8c, 0xcbf071f9573f7e4f,
0x476c8fde40075e51, 0xa10f54d3c77d8bed, 0xfecafe7ec7346beb, ],
[0x79e00c6916f68fa8, 0x80e39c20c11400d6, 0x242e2b46a7c116b7, 0xea660990074fcff6,
0x18e3369da4c9272b, 0xfa6471be8be33b80, 0xede2ed2a83a4574a, 0x9e595d610deaaed6,
0xc7d2cf35fcacdc58, 0xc65cf113a9af2302, 0x35a74c3d0cac5fde, ],
[0x35d6cf1a9aeabd4b, 0x4dc004b0b64954c3, 0xcb67ab54210b4c8f, 0xa2359b770621d28e,
0x027a0a0a5e315bf6, 0xed6aad0492a86ef6, 0x127074e28969232c, 0x3e3d68e6354d396f,
0x3cf204ab96edf7c6, 0x513a9050b70c18bf, 0x73b3b7399a3f5281, ],
[0x0af9319d5b7cd620, 0x0514fbcecd8a897d, 0x542dd32e46738f8d, 0x49248ae425e9bd45,
0x8bb9ef7ac36e53ea, 0x97981020c414a723, 0xe587f186c024e0c8, 0x14f01dd28e990ad2,
0x4d3fca72e19ea756, 0x01a3824f1ee8e7f1, 0xb048d25b575f250e, ],
[0xe78a4cfe6c6aa236, 0x4840deffdefd3b04, 0x6e0952d028e63e47, 0x249d49fb1d93304d,
0xd41ce9ed49f7fbb3, 0xba255e808ea77466, 0x5ce52e6dc2005436, 0x8b5bf13acd881a04,
0xf80f439f3ac011d1, 0x1d3618fb2cc3f916, 0xf41489c837e14938, ],
[0x41e065665af15054, 0x71752ac86d1bba64, 0x9bfddd30f8ceadeb, 0x4f59dd5e6c985767,
0x8aa3e0718ecaa657, 0x355f734ed4199ca2, 0x110f361baec4d693, 0x283a46e9e134b5b1,
0x4fda33376f5c6514, 0xcca192f9565e7d13, 0x2251835db1c24c39, ],
[0xc583f62f5970a849, 0xb6cc325741cd89dd, 0xf83288467f07ac1f, 0xfd82624964b845e7,
0x11967e4e00a49fdd, 0x2fb200fae9f72577, 0xd6fb31913c7d5da7, 0xfad9ae578dd090cc,
0xcd13b2be741ea5d8, 0xc1c54f9cf54b0c27, 0x29520a761b657cce, ],
[0x0ac0e496a2b39f4a, 0x20571abb59e27953, 0xe9971143579a1d30, 0x980359c3dba518cb,
0x05ecee5a85b427c4, 0x4620dd90ad0b5366, 0x95c98f9c5b859365, 0x0fbb1806fbc56995,
0xfe4526fd802afae2, 0x70e3786431084092, 0xa8d78a0494939111, ],
[0x3d999c961b7c63b0, 0x814e82efcd172529, 0x2421e5d236704588, 0x887af7d4dd482328,
0xa5e9c291f6119b27, 0xbdc52b2676a4b4aa, 0x64832009d29bcf57, 0x09c4155174a552cc,
0x463f9ee03d290810, 0xc810936e64982542, 0x043b1c289f7bc3ac, ],
[0x673655aae8be5a8b, 0xd510fe714f39fa10, 0x2c68a099b51c9e73, 0xa667bfa9aa96999d,
0x4d67e72f063e2108, 0xf84dde3e6acda179, 0x40f9cc8c08f80981, 0x5ead032050097142,
0x6591b02092d671bb, 0x00e18c71963dd1b7, 0x8a21bcd24a14218a, ],
[0x202800f4addbdc87, 0xe4b5bdb1cc3504ff, 0xbe32b32a825596e7, 0x8e0f68c5dc223b9a,
0x58022d9e1c256ce3, 0x584d29227aa073ac, 0x8b9352ad04bef9e7, 0xaead42a3f445ecbf,
0x3c667a1d833a3cca, 0xda6f61838efa1ffe, 0xe8f749470bd7c446, ],
[0xc5b85bab9e5b3869, 0x45245258aec51cf7, 0x16e6b8e68b931830, 0xe2ae0f051418112c,
0x0470e26a0093a65b, 0x6bef71973a8146ed, 0x119265be51812daf, 0xb0be7356254bea2e,
0x8584defff7589bd7, 0x3c5fe4aeb1fb52ba, 0x9e7cd88acf543a5e, ],
[0x179be4bba87f0a8c, 0xacf63d95d8887355, 0x6696670196b0074f, 0xd99ddf1fe75085f9,
0xc2597881fef0283b, 0xcf48395ee6c54f14, 0x15226a8e4cd8d3b6, 0xc053297389af5d3b,
0x2c08893f0d1580e2, 0x0ed3cbcff6fcc5ba, 0xc82f510ecf81f6d0, ],
[0x94b06183acb715cc, 0x500392ed0d431137, 0x861cc95ad5c86323, 0x05830a443f86c4ac,
0x3b68225874a20a7c, 0x10b3309838e236fb, 0x9b77fc8bcd559e2c, 0xbdecf5e0cb9cb213,
0x30276f1221ace5fa, 0x7935dd342764a144, 0xeac6db520bb03708, ],
[0x7186a80551025f8f, 0x622247557e9b5371, 0xc4cbe326d1ad9742, 0x55f1523ac6a23ea2,
0xa13dfe77a3d52f53, 0xe30750b6301c0452, 0x08bd488070a3a32b, 0xcd800caef5b72ae3,
0x83329c90f04233ce, 0xb5b99e6664a0a3ee, 0x6b0731849e200a7f, ],
[0xec3fabc192b01799, 0x382b38cee8ee5375, 0x3bfb6c3f0e616572, 0x514abd0cf6c7bc86,
0x47521b1361dcc546, 0x178093843f863d14, 0xad1003c5d28918e7, 0x738450e42495bc81,
0xaf947c59af5e4047, 0x4653fb0685084ef2, 0x057fde2062ae35bf, ],
[0xe376678d843ce55e, 0x66f3860d7514e7fc, 0x7817f3dfff8b4ffa, 0x3929624a9def725b,
0x0126ca37f215a80a, 0xfce2f5d02762a303, 0x1bc927375febbad7, 0x85b481e5243f60bf,
0x2d3c5f42a39c91a0, 0x0811719919351ae8, 0xf669de0add993131, ],
[0x7de38bae084da92d, 0x5b848442237e8a9b, 0xf6c705da84d57310, 0x31e6a4bdb6a49017,
0x889489706e5c5c0f, 0x0e4a205459692a1b, 0xbac3fa75ee26f299, 0x5f5894f4057d755e,
0xb0dc3ecd724bb076, 0x5e34d8554a6452ba, 0x04f78fd8c1fdcc5f, ],
[0x4dd19c38779512ea, 0xdb79ba02704620e9, 0x92a29a3675a5d2be, 0xd5177029fe495166,
0xd32b3298a13330c1, 0x251c4a3eb2c5f8fd, 0xe1c48b26e0d98825, 0x3301d3362a4ffccb,
0x09bb6c88de8cd178, 0xdc05b676564f538a, 0x60192d883e473fee, ],
[0x16b9774801ac44a0, 0x3cb8411e786d3c8e, 0xa86e9cf505072491, 0x0178928152e109ae,
0x5317b905a6e1ab7b, 0xda20b3be7f53d59f, 0xcb97dedecebee9ad, 0x4bd545218c59f58d,
0x77dc8d856c05a44a, 0x87948589e4f243fd, 0x7e5217af969952c2, ],
[0xbc58987d06a84e4d, 0x0b5d420244c9cae3, 0xa3c4711b938c02c0, 0x3aace640a3e03990,
0x865a0f3249aacd8a, 0x8d00b2a7dbed06c7, 0x6eacb905beb7e2f8, 0x045322b216ec3ec7,
0xeb9de00d594828e6, 0x088c5f20df9e5c26, 0xf555f4112b19781f, ],
[0xa8cedbff1813d3a7, 0x50dcaee0fd27d164, 0xf1cb02417e23bd82, 0xfaf322786e2abe8b,
0x937a4315beb5d9b6, 0x1b18992921a11d85, 0x7d66c4368b3c497b, 0x0e7946317a6b4e99,
0xbe4430134182978b, 0x3771e82493ab262d, 0xa671690d8095ce82, ],
[0xb035585f6e929d9d, 0xba1579c7e219b954, 0xcb201cf846db4ba3, 0x287bf9177372cf45,
0xa350e4f61147d0a6, 0xd5d0ecfb50bcff99, 0x2e166aa6c776ed21, 0xe1e66c991990e282,
0x662b329b01e7bb38, 0x8aa674b36144d9a9, 0xcbabf78f97f95e65, ],
[0xeec24b15a06b53fe, 0xc8a7aa07c5633533, 0xefe9c6fa4311ad51, 0xb9173f13977109a1,
0x69ce43c9cc94aedc, 0xecf623c9cd118815, 0x28625def198c33c7, 0xccfc5f7de5c3636a,
0xf5e6c40f1621c299, 0xcec0e58c34cb64b1, 0xa868ea113387939f, ],
[0xd8dddbdc5ce4ef45, 0xacfc51de8131458c, 0x146bb3c0fe499ac0, 0x9e65309f15943903,
0x80d0ad980773aa70, 0xf97817d4ddbf0607, 0xe4626620a75ba276, 0x0dfdc7fd6fc74f66,
0xf464864ad6f2bb93, 0x02d55e52a5d44414, 0xdd8de62487c40925, ],
[0xc15acf44759545a3, 0xcbfdcf39869719d4, 0x33f62042e2f80225, 0x2599c5ead81d8fa3,
0x0b306cb6c1d7c8d0, 0x658c80d3df3729b1, 0xe8d1b2b21b41429c, 0xa1b67f09d4b3ccb8,
0x0e1adf8b84437180, 0x0d593a5e584af47b, 0xa023d94c56e151c7, ],
[0x49026cc3a4afc5a6, 0xe06dff00ab25b91b, 0x0ab38c561e8850ff, 0x92c3c8275e105eeb,
0xb65256e546889bd0, 0x3c0468236ea142f6, 0xee61766b889e18f2, 0xa206f41b12c30415,
0x02fe9d756c9f12d1, 0xe9633210630cbf12, 0x1ffea9fe85a0b0b1, ],
[0x81d1ae8cc50240f3, 0xf4c77a079a4607d7, 0xed446b2315e3efc1, 0x0b0a6b70915178c3,
0xb11ff3e089f15d9a, 0x1d4dba0b7ae9cc18, 0x65d74e2f43b48d05, 0xa2df8c6b8ae0804a,
0xa4e6f0a8c33348a6, 0xc0a26efc7be5669b, 0xa6b6582c547d0d60, ],
[0x84afc741f1c13213, 0x2f8f43734fc906f3, 0xde682d72da0a02d9, 0x0bb005236adb9ef2,
0x5bdf35c10a8b5624, 0x0739a8a343950010, 0x52f515f44785cfbc, 0xcbaf4e5d82856c60,
0xac9ea09074e3e150, 0x8f0fa011a2035fb0, 0x1a37905d8450904a, ],
[0x3abeb80def61cc85, 0x9d19c9dd4eac4133, 0x075a652d9641a985, 0x9daf69ae1b67e667,
0x364f71da77920a18, 0x50bd769f745c95b1, 0xf223d1180dbbf3fc, 0x2f885e584e04aa99,
0xb69a0fa70aea684a, 0x09584acaa6e062a0, 0x0bc051640145b19b, ],
];
// NB: This is in ROW-major order to support cache-friendly pre-multiplication.
const FAST_PARTIAL_ROUND_INITIAL_MATRIX: [[u64; 12 - 1]; 12 - 1] = [
[0xb8dee12bf8e622dc, 0x2a0bcfdad25a7a77, 0x35f873e941f6055d, 0x99b7b85b6028982e,
0x86d6993880e836f7, 0x1ef8de305b9c354d, 0x8b0a80ef933c37dc, 0x715c7164aacaf4a8,
0x43845bd4f75ac7f5, 0x3e71bb7b0ec57a1a, 0xffc5b2f8946575c3, ],
[0x863ca0992eae09b0, 0x68901dfa3ecc7696, 0x6ba9546fc13ba8be, 0x555b7567255c9650,
0x4570c6ac5e80551b, 0x8e440c6cc2d0ed18, 0xbad8ae4dbfba0799, 0x8b71ed9e65a6ed7a,
0xaade0f9eb69ee576, 0xdebe1855920c6e64, 0x3e71bb7b0ec57a1a, ],
[0x2c3887c29246a985, 0x5aeb127ffeece78f, 0xa86e940514be2461, 0x2cb276ddf6094068,
0x81e59e8f82a28b3c, 0x27bc037b1569fb52, 0x706ee8b692c2ebc7, 0xeba6949241aedb71,
0xc416ad39f1f908f8, 0xaade0f9eb69ee576, 0x43845bd4f75ac7f5, ],
[0x03df3a62e1ea48d2, 0xbb484c2d408e9b12, 0x0fbf2169623ec24c, 0x50955930c2f9eb19,
0x3dfc3cc6123745cc, 0xa2a8d3774d197b2c, 0xd16417e43d20feab, 0xd998a362dba538ba,
0xeba6949241aedb71, 0x8b71ed9e65a6ed7a, 0x715c7164aacaf4a8, ],
[0xbbf73d77fc6c411c, 0xad7f124615d240ee, 0x4e413fcebe9020ee, 0x540bd8044c672f2b,
0x6db739f6d2e9f37d, 0x9aa1b0a8f56ad33d, 0x53c179d92714378f, 0xd16417e43d20feab,
0x706ee8b692c2ebc7, 0xbad8ae4dbfba0799, 0x8b0a80ef933c37dc, ],
[0xab92e860ecde7bdc, 0xa58fc91c605c26d5, 0xfbe68b79a8d5e0b9, 0x3e7edc1407cbd848,
0xf69c76d11eaf57bf, 0x941ef2c6beace374, 0x9aa1b0a8f56ad33d, 0xa2a8d3774d197b2c,
0x27bc037b1569fb52, 0x8e440c6cc2d0ed18, 0x1ef8de305b9c354d, ],
[0xb522132046b25eaf, 0x2b7b18e882c3e2c6, 0xe3322ad433ba15c8, 0x87355794faf87b1b,
0x14f6e5ac86065fce, 0xf69c76d11eaf57bf, 0x6db739f6d2e9f37d, 0x3dfc3cc6123745cc,
0x81e59e8f82a28b3c, 0x4570c6ac5e80551b, 0x86d6993880e836f7, ],
[0x0084dd11f5c0d55c, 0x9d664d307df18036, 0x1d80d847dca52945, 0xee3eecb9b2df1658,
0x87355794faf87b1b, 0x3e7edc1407cbd848, 0x540bd8044c672f2b, 0x50955930c2f9eb19,
0x2cb276ddf6094068, 0x555b7567255c9650, 0x99b7b85b6028982e, ],
[0xeb7c39655546eba5, 0xf07245b62d94cf71, 0x17db9b690f0031a3, 0x1d80d847dca52945,
0xe3322ad433ba15c8, 0xfbe68b79a8d5e0b9, 0x4e413fcebe9020ee, 0x0fbf2169623ec24c,
0xa86e940514be2461, 0x6ba9546fc13ba8be, 0x35f873e941f6055d, ],
[0xcb7fc57923717f84, 0x795a850bf5f9e397, 0xf07245b62d94cf71, 0x9d664d307df18036,
0x2b7b18e882c3e2c6, 0xa58fc91c605c26d5, 0xad7f124615d240ee, 0xbb484c2d408e9b12,
0x5aeb127ffeece78f, 0x68901dfa3ecc7696, 0x2a0bcfdad25a7a77, ],
[0x3107f5edca2f02b8, 0xcb7fc57923717f84, 0xeb7c39655546eba5, 0x0084dd11f5c0d55c,
0xb522132046b25eaf, 0xab92e860ecde7bdc, 0xbbf73d77fc6c411c, 0x03df3a62e1ea48d2,
0x2c3887c29246a985, 0x863ca0992eae09b0, 0xb8dee12bf8e622dc, ],
[0x80772dc2645b280b, 0xdc927721da922cf8, 0xc1978156516879ad, 0x90e80c591f48b603,
0x3a2432625475e3ae, 0x00a2d4321cca94fe, 0x77736f524010c932, 0x904d3f2804a36c54,
0xbf9b39e28a16f354, 0x3a1ded54a6cd058b, 0x42392870da5737cf, ],
[0xe796d293a47a64cb, 0xb124c33152a2421a, 0x0ee5dc0ce131268a, 0xa9032a52f930fae6,
0x7e33ca8c814280de, 0xad11180f69a8c29e, 0xc75ac6d5b5a10ff3, 0xf0674a8dc5a387ec,
0xb36d43120eaa5e2b, 0x6f232aab4b533a25, 0x3a1ded54a6cd058b, ],
[0xdcedab70f40718ba, 0x14a4a64da0b2668f, 0x4715b8e5ab34653b, 0x1e8916a99c93a88e,
0xbba4b5d86b9a3b2c, 0xe76649f9bd5d5c2e, 0xaf8e2518a1ece54d, 0xdcda1344cdca873f,
0xcd080204256088e5, 0xb36d43120eaa5e2b, 0xbf9b39e28a16f354, ],
[0xf4a437f2888ae909, 0xc537d44dc2875403, 0x7f68007619fd8ba9, 0xa4911db6a32612da,
0x2f7e9aade3fdaec1, 0xe7ffd578da4ea43d, 0x43a608e7afa6b5c2, 0xca46546aa99e1575,
0xdcda1344cdca873f, 0xf0674a8dc5a387ec, 0x904d3f2804a36c54, ],
[0xf97abba0dffb6c50, 0x5e40f0c9bb82aab5, 0x5996a80497e24a6b, 0x07084430a7307c9a,
0xad2f570a5b8545aa, 0xab7f81fef4274770, 0xcb81f535cf98c9e9, 0x43a608e7afa6b5c2,
0xaf8e2518a1ece54d, 0xc75ac6d5b5a10ff3, 0x77736f524010c932, ],
[0x7f8e41e0b0a6cdff, 0x4b1ba8d40afca97d, 0x623708f28fca70e8, 0xbf150dc4914d380f,
0xc26a083554767106, 0x753b8b1126665c22, 0xab7f81fef4274770, 0xe7ffd578da4ea43d,
0xe76649f9bd5d5c2e, 0xad11180f69a8c29e, 0x00a2d4321cca94fe, ],
[0x726af914971c1374, 0x1d7f8a2cce1a9d00, 0x18737784700c75cd, 0x7fb45d605dd82838,
0x862361aeab0f9b6e, 0xc26a083554767106, 0xad2f570a5b8545aa, 0x2f7e9aade3fdaec1,
0xbba4b5d86b9a3b2c, 0x7e33ca8c814280de, 0x3a2432625475e3ae, ],
[0x64dd936da878404d, 0x4db9a2ead2bd7262, 0xbe2e19f6d07f1a83, 0x02290fe23c20351a,
0x7fb45d605dd82838, 0xbf150dc4914d380f, 0x07084430a7307c9a, 0xa4911db6a32612da,
0x1e8916a99c93a88e, 0xa9032a52f930fae6, 0x90e80c591f48b603, ],
[0x85418a9fef8a9890, 0xd8a2eb7ef5e707ad, 0xbfe85ababed2d882, 0xbe2e19f6d07f1a83,
0x18737784700c75cd, 0x623708f28fca70e8, 0x5996a80497e24a6b, 0x7f68007619fd8ba9,
0x4715b8e5ab34653b, 0x0ee5dc0ce131268a, 0xc1978156516879ad, ],
[0x156048ee7a738154, 0x91f7562377e81df5, 0xd8a2eb7ef5e707ad, 0x4db9a2ead2bd7262,
0x1d7f8a2cce1a9d00, 0x4b1ba8d40afca97d, 0x5e40f0c9bb82aab5, 0xc537d44dc2875403,
0x14a4a64da0b2668f, 0xb124c33152a2421a, 0xdc927721da922cf8, ],
[0xd841e8ef9dde8ba0, 0x156048ee7a738154, 0x85418a9fef8a9890, 0x64dd936da878404d,
0x726af914971c1374, 0x7f8e41e0b0a6cdff, 0xf97abba0dffb6c50, 0xf4a437f2888ae909,
0xdcedab70f40718ba, 0xe796d293a47a64cb, 0x80772dc2645b280b, ],
];
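The row-major note above refers to the access pattern sketched below. This is a schematic only, with invented names and a generic stand-in for the field type, and the commit's own loop may order its indices differently; the point is that when the state is treated as a row vector, each input element scans one contiguous row of the table, so the inner loop streams through memory sequentially.

use std::ops::{AddAssign, Mul};

// Schematic v^T * M over a row-major table; `F` stands in for the field type, so the
// modular arithmetic lives in F's Mul/AddAssign impls rather than in this sketch.
fn premultiply_sketch<F: Copy + Default + AddAssign + Mul<Output = F>>(
    state: &[F; 11],
    m: &[[F; 11]; 11],
) -> [F; 11] {
    let mut result = [F::default(); 11];
    for (r, row) in m.iter().enumerate() {
        // `row` is contiguous because the table is stored row-major.
        for (d, &coeff) in row.iter().enumerate() {
            result[d] += state[r] * coeff;
        }
    }
    result
}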
#[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))]
#[inline]
fn poseidon(input: [Self; 12]) -> [Self; 12] {
unsafe {
crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::poseidon(&input)
}
}
// #[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))]
// #[inline]
// fn poseidon(input: [Self; 12]) -> [Self; 12] {
// unsafe {
// crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::poseidon(&input)
// }
// }
#[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))]
#[inline(always)]
fn constant_layer(state: &mut [Self; 12], round_ctr: usize) {
unsafe {
crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::constant_layer(state, round_ctr);
}
}
// #[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))]
// #[inline(always)]
// fn constant_layer(state: &mut [Self; 12], round_ctr: usize) {
// unsafe {
// crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::constant_layer(state, round_ctr);
// }
// }
#[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))]
#[inline(always)]
fn sbox_layer(state: &mut [Self; 12]) {
unsafe {
crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::sbox_layer(state);
}
}
// #[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))]
// #[inline(always)]
// fn sbox_layer(state: &mut [Self; 12]) {
// unsafe {
// crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::sbox_layer(state);
// }
// }
#[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))]
#[inline(always)]
fn mds_layer(state: &[Self; 12]) -> [Self; 12] {
unsafe {
crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::mds_layer(state)
}
}
// #[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))]
// #[inline(always)]
// fn mds_layer(state: &[Self; 12]) -> [Self; 12] {
// unsafe {
// crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::mds_layer(state)
// }
// }
#[cfg(all(target_arch="aarch64", target_feature="neon"))]
#[inline]
fn poseidon(input: [Self; 12]) -> [Self; 12] {
unsafe {
crate::hash::arch::aarch64::poseidon_goldilocks_neon::poseidon(input)
}
}
// #[cfg(all(target_arch="aarch64", target_feature="neon"))]
// #[inline]
// fn poseidon(input: [Self; 12]) -> [Self; 12] {
// unsafe {
// crate::hash::arch::aarch64::poseidon_goldilocks_neon::poseidon(input)
// }
// }
#[cfg(all(target_arch="aarch64", target_feature="neon"))]
#[inline(always)]
@@ -287,46 +288,28 @@ mod tests {
let neg_one: u64 = F::NEG_ONE.to_canonical_u64();
#[rustfmt::skip]
let _test_vectors8: Vec<([u64; 8], [u64; 8])> = vec![
([0, 0, 0, 0, 0, 0, 0, 0, ],
[0x649eec3229475d06, 0x72afe85b8b600222, 0x816d0a50ddd39228, 0x5083133a721a187c,
0xbb69bd7d90c490a6, 0xea1d33a65d0a3287, 0xb4d27542d2fba3bc, 0xf9756d565d90c20a, ]),
([0, 1, 2, 3, 4, 5, 6, 7, ],
[0xdfda4e2a7ec338f4, 0x3ac8d668054b1873, 0xeaaef2f72528e7ff, 0xee7bcc836ae165bc,
0x95561d9377c3e696, 0x2e7d39c369dfccaa, 0x992178c050936f8f, 0x34e38ec33f572850, ]),
([neg_one, neg_one, neg_one, neg_one,
neg_one, neg_one, neg_one, neg_one, ],
[0x9d8553546c658f67, 0xd5f6422aea26962b, 0xffb40b4db302da75, 0x34f43bbd7882c16c,
0xccb375313fa146b0, 0x87574c332e89201a, 0x60e9e6c0c0be3a16, 0xf0e2a741e90756ba, ]),
([0x016f2dde9ccdaf6f, 0x77e29cda821fece4, 0x2f6686f781255f78, 0xd2c4c9a53070b44f,
0x4d7035c9fd01fc40, 0xc8d460945c91d509, 0x14855cd8a36a097f, 0x49f640d6a30f9cf0, ],
[0x4c3c58a3fac4ba05, 0x3f26fc2bcb33a3d4, 0xe13fcddcd7a136bb, 0x27b05be73a91e2f2,
0x37804ed8ca07fcd5, 0xe78ec2f213e28456, 0xecf67d2aacb4dbe3, 0xad14575187c496ca, ]),
];
#[rustfmt::skip]
let test_vectors12: Vec<([u64; 12], [u64; 12])> = vec![
([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ],
[0x3901858a44be6b3a, 0xb3470607c5f0ba0e, 0xb3b3ac3d89b37e8e, 0xd389513a7f6fe6e9,
0x1eceb92f5da1c96b, 0x55d0bdfc6a842adf, 0x0112c568afb8819c, 0x6ac21107619569ee,
0x3de33babbb421a85, 0x83688eb15ffe4ca3, 0x47e285b477551fa9, 0x1dd3dda781901271, ]),
[0x3c18a9786cb0b359, 0xc4055e3364a246c3, 0x7953db0ab48808f4, 0xc71603f33a1144ca,
0xd7709673896996dc, 0x46a84e87642f44ed, 0xd032648251ee0b3c, 0x1c687363b207df62,
0xdf8565563e8045fe, 0x40f5b37ff4254dae, 0xd070f637b431067c, 0x1792b1c4342109d7, ]),
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ],
[0x641772a94a77c7e5, 0x38d2cec9c47e7314, 0x3577218e825058c9, 0x1cdb3b4d22c54bcc,
0x803234d4b16eb152, 0xbbb6c8438627c0f0, 0x1b219561c95a41fa, 0x9bdc97531bacc401,
0x4251f4fac8271d9d, 0x0279ffa7ba5ce9aa, 0x63baf77c533b5874, 0xb7ada3e1f98b25e7, ]),
[0xd64e1e3efc5b8e9e, 0x53666633020aaa47, 0xd40285597c6a8825, 0x613a4f81e81231d2,
0x414754bfebd051f0, 0xcb1f8980294a023f, 0x6eb2a9e4d54a9d0f, 0x1902bc3af467e056,
0xf045d5eafdc6021f, 0xe4150f77caaa3be5, 0xc9bfd01d39b50cce, 0x5c0a27fcb0e1459b, ]),
([neg_one, neg_one, neg_one, neg_one,
neg_one, neg_one, neg_one, neg_one,
neg_one, neg_one, neg_one, neg_one, ],
[0xd2e4605ed1eb9613, 0x62510e8cbaf8a3b5, 0x64dc1e941dbaf46c, 0x1d6c5a5fd43cc4c5,
0xac4b4f6bf503a6b4, 0x19e17983f5e52404, 0x927b08e033b29b6f, 0xa41bc2cb5ddb9bc0,
0x270d528b1accc148, 0x022169acf46c71ae, 0xbbd4566e7b49ad7d, 0x0ed1ea54401533ef, ]),
([0xa48728856b047229, 0xc43ab5e4aa986608, 0x715f470f075c057f, 0x36e955a095478013,
0x7c036db7200ba52d, 0x20377cd3410dc7dc, 0x058c0956659b05b2, 0xa66c880ee57e8399,
0xb06521c88afbd610, 0xdfa4d72ba95c8895, 0x25b403dac3622acc, 0xda607d79268a8fce, ],
[0xe85b56b0764df429, 0x7c0796201b43fe68, 0x231673b8300a6a16, 0x25db4745a952a677,
0x01431a6817415a4d, 0xfdfbbe63602076eb, 0x82c643dabf1154c1, 0x896e7e87b3f3417d,
0x27eca78818ef9c27, 0xf08c93583c24dc47, 0x1c9e1552c07a9f73, 0x7659179192cfdc88, ]),
[0xbe0085cfc57a8357, 0xd95af71847d05c09, 0xcf55a13d33c1c953, 0x95803a74f4530e82,
0xfcd99eb30a135df1, 0xe095905e913a3029, 0xde0392461b42919b, 0x7d3260e24e81d031,
0x10d3d0465d9deaa0, 0xa87571083dfc2a47, 0xe18263681e9958f8, 0xe28e96f1ae5e60d3, ]),
([0x8ccbbbea4fe5d2b7, 0xc2af59ee9ec49970, 0x90f7e1a9e658446a, 0xdcc0630a3ab8b1b8,
0x7ff8256bca20588c, 0x5d99a7ca0c44ecfb, 0x48452b17a70fbee3, 0xeb09d654690b6c88,
0x4a55d3a39c676a88, 0xc0407a38d2285139, 0xa234bac9356386d1, 0xe1633f2bad98a52f, ],
[0xa89280105650c4ec, 0xab542d53860d12ed, 0x5704148e9ccab94f, 0xd3a826d4b62da9f5,
0x8a7a6ca87892574f, 0xc7017e1cad1a674e, 0x1f06668922318e34, 0xa3b203bc8102676f,
0xfcc781b0ce382bf2, 0x934c69ff3ed14ba5, 0x504688a5996e8f13, 0x401f3f2ed524a2ba, ]),
];
check_test_vectors::<F>(test_vectors12);

View File

@@ -208,7 +208,7 @@ impl<F: RichField + Extendable<D>, H: AlgebraicHasher<F>, const D: usize>
}
}
pub(crate) fn get_challenge(&mut self, builder: &mut CircuitBuilder<F, D>) -> Target {
pub fn get_challenge(&mut self, builder: &mut CircuitBuilder<F, D>) -> Target {
self.absorb_buffered_inputs(builder);
if self.output_buffer.is_empty() {

View File

@@ -640,6 +640,7 @@ impl<F: RichField + Extendable<D>, const D: usize> CircuitBuilder<F, D> {
let mut timing = TimingTree::new("preprocess", Level::Trace);
let start = Instant::now();
let rate_bits = self.config.fri_config.rate_bits;
let cap_height = self.config.fri_config.cap_height;
// Hash the public inputs, and route them to a `PublicInputGate` which will enforce that
// those hash wires match the claimed public inputs.
@@ -665,7 +666,7 @@ impl<F: RichField + Extendable<D>, const D: usize> CircuitBuilder<F, D> {
let degree_bits = log2_strict(degree);
let fri_params = self.fri_params(degree_bits);
assert!(
fri_params.total_arities() <= degree_bits,
fri_params.total_arities() <= degree_bits + rate_bits - cap_height,
"FRI total reduction arity is too large.",
);
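A plausible reading of the corrected bound (not spelled out in the commit): the initial FRI codeword has 2^(degree_bits + rate_bits) points and each reduction-arity bit halves it, so after `total_arities()` bits the smallest committed layer has 2^(degree_bits + rate_bits - total_arities()) points; for its Merkle tree to still support a cap of height `cap_height`, that count must be at least 2^cap_height, giving total_arities() <= degree_bits + rate_bits - cap_height. For example, degree_bits = 12, rate_bits = 3, cap_height = 4 allows at most 11 bits of total reduction.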
@@ -706,7 +707,7 @@ impl<F: RichField + Extendable<D>, const D: usize> CircuitBuilder<F, D> {
constants_sigmas_vecs,
rate_bits,
PlonkOracle::CONSTANTS_SIGMAS.blinding,
self.config.fri_config.cap_height,
cap_height,
&mut timing,
Some(&fft_root_table),
);

View File

@@ -4,6 +4,7 @@ use crate::hash::hash_types::{HashOutTarget, RichField};
use crate::plonk::circuit_builder::CircuitBuilder;
use crate::plonk::circuit_data::{CommonCircuitData, VerifierCircuitTarget};
use crate::plonk::config::{AlgebraicHasher, GenericConfig};
use crate::plonk::plonk_common::salt_size;
use crate::plonk::proof::{
OpeningSetTarget, ProofChallengesTarget, ProofTarget, ProofWithPublicInputsTarget,
};
@@ -141,11 +142,12 @@ impl<F: RichField + Extendable<D>, const D: usize> CircuitBuilder<F, D> {
let fri_params = &common_data.fri_params;
let cap_height = fri_params.config.cap_height;
let salt = salt_size(common_data.fri_params.hiding);
let num_leaves_per_oracle = &[
common_data.num_preprocessed_polys(),
config.num_wires,
common_data.num_zs_partial_products_polys(),
common_data.num_quotient_polys(),
config.num_wires + salt,
common_data.num_zs_partial_products_polys() + salt,
common_data.num_quotient_polys() + salt,
];
ProofTarget {
@@ -200,7 +202,7 @@ mod tests {
const D: usize = 2;
type C = PoseidonGoldilocksConfig;
type F = <C as GenericConfig<D>>::F;
let config = CircuitConfig::standard_recursion_config();
let config = CircuitConfig::standard_recursion_zk_config();
let (proof, vd, cd) = dummy_proof::<F, C, D>(&config, 4_000)?;
let (proof, _vd, cd) =

View File

@@ -1,6 +1,6 @@
use std::borrow::Borrow;
use plonky2_field::extension_field::Extendable;
use plonky2_field::extension_field::{Extendable, FieldExtension};
use plonky2_field::field_types::Field;
use plonky2_field::polynomial::PolynomialCoeffs;
@@ -35,6 +35,11 @@ impl<F: Field> ReducingFactor<F> {
self.base * x
}
fn mul_ext<FE: FieldExtension<D, BaseField = F>, const D: usize>(&mut self, x: FE) -> FE {
self.count += 1;
x.scalar_mul(self.base)
}
fn mul_poly(&mut self, p: &mut PolynomialCoeffs<F>) {
self.count += 1;
*p *= self.base;
@@ -45,6 +50,14 @@ impl<F: Field> ReducingFactor<F> {
.fold(F::ZERO, |acc, x| self.mul(acc) + *x.borrow())
}
pub fn reduce_ext<FE: FieldExtension<D, BaseField = F>, const D: usize>(
&mut self,
iter: impl DoubleEndedIterator<Item = impl Borrow<FE>>,
) -> FE {
iter.rev()
.fold(FE::ZERO, |acc, x| self.mul_ext(acc) + *x.borrow())
}
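The new `reduce_ext` mirrors `reduce`: folding from the back and multiplying the accumulator by `base` at each step is Horner's rule, so for values x0, x1, ..., x_{n-1} it returns x0 + base*x1 + base^2*x2 + ... + base^(n-1)*x_{n-1}, with `base` kept in the base field and applied through `scalar_mul`. As a plain-integer illustration of the same fold, reducing [2, 3, 5] with base 10 gives ((5*10 + 3)*10 + 2) = 532 = 2 + 3*10 + 5*100.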
pub fn reduce_polys(
&mut self,
polys: impl DoubleEndedIterator<Item = impl Borrow<PolynomialCoeffs<F>>>,

View File

@@ -15,6 +15,7 @@ use crate::hash::merkle_proofs::MerkleProof;
use crate::hash::merkle_tree::MerkleCap;
use crate::plonk::circuit_data::CommonCircuitData;
use crate::plonk::config::{GenericConfig, GenericHashOut, Hasher};
use crate::plonk::plonk_common::salt_size;
use crate::plonk::proof::{
CompressedProof, CompressedProofWithPublicInputs, OpeningSet, Proof, ProofWithPublicInputs,
};
@@ -235,6 +236,7 @@ impl Buffer {
common_data: &CommonCircuitData<F, C, D>,
) -> Result<FriInitialTreeProof<F, C::Hasher>> {
let config = &common_data.config;
let salt = salt_size(common_data.fri_params.hiding);
let mut evals_proofs = Vec::with_capacity(4);
let constants_sigmas_v =
@@ -242,17 +244,18 @@ impl Buffer {
let constants_sigmas_p = self.read_merkle_proof()?;
evals_proofs.push((constants_sigmas_v, constants_sigmas_p));
let wires_v = self.read_field_vec(config.num_wires)?;
let wires_v = self.read_field_vec(config.num_wires + salt)?;
let wires_p = self.read_merkle_proof()?;
evals_proofs.push((wires_v, wires_p));
let zs_partial_v =
self.read_field_vec(config.num_challenges * (1 + common_data.num_partial_products))?;
let zs_partial_v = self.read_field_vec(
config.num_challenges * (1 + common_data.num_partial_products) + salt,
)?;
let zs_partial_p = self.read_merkle_proof()?;
evals_proofs.push((zs_partial_v, zs_partial_p));
let quotient_v =
self.read_field_vec(config.num_challenges * common_data.quotient_degree_factor)?;
self.read_field_vec(config.num_challenges * common_data.quotient_degree_factor + salt)?;
let quotient_p = self.read_merkle_proof()?;
evals_proofs.push((quotient_v, quotient_p));
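
The common thread in these deserialization changes (and in the matching leaf-width changes in the recursive verifier above) is that enabling the zero-knowledge `hiding` flag adds `salt` blinding columns to each committed oracle, so the expected leaf sizes must be widened accordingly. A rough sketch of that bookkeeping; the helper name and constant below are illustrative, and the real width comes from `salt_size(hiding)`:

// Hypothetical leaf widths mirroring the reads above; SALT_WIDTH stands in for whatever
// `salt_size` returns when hiding is enabled (illustrative, not the library's constant).
const SALT_WIDTH: usize = 4;

fn leaf_widths(
    num_wires: usize,
    num_challenges: usize,
    num_partial_products: usize,
    quotient_degree_factor: usize,
    hiding: bool,
) -> [usize; 3] {
    let salt = if hiding { SALT_WIDTH } else { 0 };
    [
        num_wires + salt,                                   // wires oracle
        num_challenges * (1 + num_partial_products) + salt, // Z / partial-products oracle
        num_challenges * quotient_degree_factor + salt,     // quotient oracle
    ]
}

fn main() {
    assert_eq!(leaf_widths(80, 2, 9, 8, false), [80, 20, 16]);
    assert_eq!(leaf_widths(80, 2, 9, 8, true), [84, 24, 20]);
}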

View File

@ -53,12 +53,12 @@ impl<P: PackedField> ConstraintConsumer<P> {
}
/// Add one constraint valid on all rows except the last.
pub fn constraint(&mut self, constraint: P) {
self.constraint_wrapping(constraint * self.z_last);
pub fn constraint_transition(&mut self, constraint: P) {
self.constraint(constraint * self.z_last);
}
/// Add one constraint on all rows.
pub fn constraint_wrapping(&mut self, constraint: P) {
pub fn constraint(&mut self, constraint: P) {
for (&alpha, acc) in self.alphas.iter().zip(&mut self.constraint_accs) {
*acc *= alpha;
*acc += constraint;
@ -68,13 +68,13 @@ impl<P: PackedField> ConstraintConsumer<P> {
/// Add one constraint, but first multiply it by a filter such that it will only apply to the
/// first row of the trace.
pub fn constraint_first_row(&mut self, constraint: P) {
self.constraint_wrapping(constraint * self.lagrange_basis_first);
self.constraint(constraint * self.lagrange_basis_first);
}
/// Add one constraint, but first multiply it by a filter such that it will only apply to the
/// last row of the trace.
pub fn constraint_last_row(&mut self, constraint: P) {
self.constraint_wrapping(constraint * self.lagrange_basis_last);
self.constraint(constraint * self.lagrange_basis_last);
}
}
@ -122,17 +122,17 @@ impl<F: RichField + Extendable<D>, const D: usize> RecursiveConstraintConsumer<F
}
/// Add one constraint valid on all rows except the last.
pub fn constraint(
pub fn constraint_transition(
&mut self,
builder: &mut CircuitBuilder<F, D>,
constraint: ExtensionTarget<D>,
) {
let filtered_constraint = builder.mul_extension(constraint, self.z_last);
self.constraint_wrapping(builder, filtered_constraint);
self.constraint(builder, filtered_constraint);
}
/// Add one constraint valid on all rows.
pub fn constraint_wrapping(
pub fn constraint(
&mut self,
builder: &mut CircuitBuilder<F, D>,
constraint: ExtensionTarget<D>,
@ -150,7 +150,7 @@ impl<F: RichField + Extendable<D>, const D: usize> RecursiveConstraintConsumer<F
constraint: ExtensionTarget<D>,
) {
let filtered_constraint = builder.mul_extension(constraint, self.lagrange_basis_first);
self.constraint_wrapping(builder, filtered_constraint);
self.constraint(builder, filtered_constraint);
}
/// Add one constraint, but first multiply it by a filter such that it will only apply to the
@ -161,6 +161,6 @@ impl<F: RichField + Extendable<D>, const D: usize> RecursiveConstraintConsumer<F
constraint: ExtensionTarget<D>,
) {
let filtered_constraint = builder.mul_extension(constraint, self.lagrange_basis_last);
self.constraint_wrapping(builder, filtered_constraint);
self.constraint(builder, filtered_constraint);
}
}
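
The rename is easier to follow with the filtering spelled out: `constraint` folds its argument into every accumulator via a random `alpha`, while `constraint_transition` first multiplies by `z_last`, a polynomial that vanishes on the last row, so the constraint is waived exactly there (the first-/last-row variants filter with Lagrange basis polynomials in the same way). A toy accumulator over a small prime field, not plonky2 types:

// Toy accumulator over u64 arithmetic modulo a small prime; in the real code the values
// are field (or packed-field) elements and there is one accumulator per alpha.
const P: u64 = 101;

struct ToyConsumer {
    alpha: u64,
    acc: u64,
    z_last: u64, // evaluation of a polynomial that vanishes on the last row
}

impl ToyConsumer {
    /// Constraint that must hold on every row.
    fn constraint(&mut self, c: u64) {
        self.acc = (self.acc * self.alpha + c) % P;
    }
    /// Constraint that must hold on every row except the last.
    fn constraint_transition(&mut self, c: u64) {
        self.constraint(c * self.z_last % P);
    }
}

fn main() {
    let mut consumer = ToyConsumer { alpha: 7, acc: 0, z_last: 0 };
    // On the last row z_last = 0, so even a "violated" transition constraint is filtered out.
    consumer.constraint_transition(42);
    assert_eq!(consumer.acc, 0);
}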

View File

@ -2,16 +2,20 @@ use std::marker::PhantomData;
use plonky2::field::extension_field::{Extendable, FieldExtension};
use plonky2::field::packed_field::PackedField;
use plonky2::field::polynomial::PolynomialValues;
use plonky2::hash::hash_types::RichField;
use plonky2::plonk::circuit_builder::CircuitBuilder;
use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
use crate::permutation::PermutationPair;
use crate::stark::Stark;
use crate::util::trace_rows_to_poly_values;
use crate::vars::{StarkEvaluationTargets, StarkEvaluationVars};
/// Toy STARK system used for testing.
/// Computes a Fibonacci sequence with state `[x0, x1]` using the state transition
/// `x0 <- x1, x1 <- x0 + x1`.
/// Computes a Fibonacci sequence with state `[x0, x1, i, j]` using the state transition
/// `x0' <- x1, x1' <- x0 + x1, i' <- i+1, j' <- j+1`.
/// Note: The `i, j` columns are only used to test the permutation argument.
#[derive(Copy, Clone)]
struct FibonacciStark<F: RichField + Extendable<D>, const D: usize> {
num_rows: usize,
@ -34,21 +38,25 @@ impl<F: RichField + Extendable<D>, const D: usize> FibonacciStark<F, D> {
}
}
/// Generate the trace using `x0, x1` as inital state values.
fn generate_trace(&self, x0: F, x1: F) -> Vec<[F; Self::COLUMNS]> {
(0..self.num_rows)
.scan([x0, x1], |acc, _| {
/// Generate the trace using `x0, x1, 0, 1` as initial state values.
fn generate_trace(&self, x0: F, x1: F) -> Vec<PolynomialValues<F>> {
let mut trace_rows = (0..self.num_rows)
.scan([x0, x1, F::ZERO, F::ONE], |acc, _| {
let tmp = *acc;
acc[0] = tmp[1];
acc[1] = tmp[0] + tmp[1];
acc[2] = tmp[2] + F::ONE;
acc[3] = tmp[3] + F::ONE;
Some(tmp)
})
.collect()
.collect::<Vec<_>>();
trace_rows[self.num_rows - 1][3] = F::ZERO; // So that columns 2 and 3 are permutations of one another.
trace_rows_to_poly_values(trace_rows)
}
}
impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for FibonacciStark<F, D> {
const COLUMNS: usize = 2;
const COLUMNS: usize = 4;
const PUBLIC_INPUTS: usize = 3;
fn eval_packed_generic<FE, P, const D2: usize>(
@ -68,9 +76,11 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for FibonacciStar
.constraint_last_row(vars.local_values[1] - vars.public_inputs[Self::PI_INDEX_RES]);
// x0' <- x1
yield_constr.constraint(vars.next_values[0] - vars.local_values[1]);
yield_constr.constraint_transition(vars.next_values[0] - vars.local_values[1]);
// x1' <- x0 + x1
yield_constr.constraint(vars.next_values[1] - vars.local_values[0] - vars.local_values[1]);
yield_constr.constraint_transition(
vars.next_values[1] - vars.local_values[0] - vars.local_values[1],
);
}
fn eval_ext_recursively(
@ -91,18 +101,22 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for FibonacciStar
// x0' <- x1
let first_col_constraint = builder.sub_extension(vars.next_values[0], vars.local_values[1]);
yield_constr.constraint(builder, first_col_constraint);
yield_constr.constraint_transition(builder, first_col_constraint);
// x1' <- x0 + x1
let second_col_constraint = {
let tmp = builder.sub_extension(vars.next_values[1], vars.local_values[0]);
builder.sub_extension(tmp, vars.local_values[1])
};
yield_constr.constraint(builder, second_col_constraint);
yield_constr.constraint_transition(builder, second_col_constraint);
}
fn constraint_degree(&self) -> usize {
2
}
fn permutation_pairs(&self) -> Vec<PermutationPair> {
vec![PermutationPair::singletons(2, 3)]
}
}
#[cfg(test)]
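
The new four-column trace can be reproduced outside the STARK machinery; the sketch below uses plain `u64` instead of field elements and mirrors the transition and the last-row tweak described in the hunk above.

// Plain-Rust sketch of the trace layout: rows are [x0, x1, i, j] with the transition
// x0' = x1, x1' = x0 + x1, i' = i + 1, j' = j + 1; the final j is zeroed so that
// columns 2 and 3 are permutations of one another (exercising the permutation argument).
fn fibonacci_trace(num_rows: usize, x0: u64, x1: u64) -> Vec<[u64; 4]> {
    let mut rows = Vec::with_capacity(num_rows);
    let mut state = [x0, x1, 0, 1];
    for _ in 0..num_rows {
        rows.push(state);
        state = [state[1], state[0] + state[1], state[2] + 1, state[3] + 1];
    }
    if let Some(last) = rows.last_mut() {
        last[3] = 0;
    }
    rows
}

fn main() {
    let trace = fibonacci_trace(5, 0, 1);
    assert_eq!(trace[4][1], 5); // Fibonacci: 1, 1, 2, 3, 5
    let mut col_i: Vec<u64> = trace.iter().map(|r| r[2]).collect();
    let mut col_j: Vec<u64> = trace.iter().map(|r| r[3]).collect();
    col_i.sort();
    col_j.sort();
    assert_eq!(col_i, col_j); // the two auxiliary columns are permutations of each other
}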

View File

@ -1,4 +1,3 @@
use anyhow::Result;
use plonky2::field::extension_field::Extendable;
use plonky2::field::polynomial::PolynomialCoeffs;
use plonky2::fri::proof::{FriProof, FriProofTarget};
@ -11,7 +10,9 @@ use plonky2::plonk::circuit_builder::CircuitBuilder;
use plonky2::plonk::config::{AlgebraicHasher, GenericConfig};
use crate::config::StarkConfig;
use crate::permutation::get_n_permutation_challenge_sets;
use crate::permutation::{
get_n_permutation_challenge_sets, get_n_permutation_challenge_sets_target,
};
use crate::proof::*;
use crate::stark::Stark;
@ -26,7 +27,7 @@ fn get_challenges<F, C, S, const D: usize>(
pow_witness: F,
config: &StarkConfig,
degree_bits: usize,
) -> Result<StarkProofChallenges<F, D>>
) -> StarkProofChallenges<F, D>
where
F: RichField + Extendable<D>,
C: GenericConfig<D, F = F>,
@ -38,20 +39,15 @@ where
challenger.observe_cap(trace_cap);
let permutation_challenge_sets = if stark.uses_permutation_args() {
get_n_permutation_challenge_sets(
let permutation_challenge_sets = permutation_zs_cap.map(|permutation_zs_cap| {
let tmp = get_n_permutation_challenge_sets(
&mut challenger,
num_challenges,
stark.permutation_batch_size(),
)
} else {
vec![]
};
if stark.uses_permutation_args() {
let cap =
permutation_zs_cap.ok_or_else(|| anyhow::Error::msg("expected permutation_zs_cap"));
challenger.observe_cap(cap?);
}
);
challenger.observe_cap(permutation_zs_cap);
tmp
});
let stark_alphas = challenger.get_n_challenges(num_challenges);
@ -60,7 +56,7 @@ where
challenger.observe_openings(&openings.to_fri_openings());
Ok(StarkProofChallenges {
StarkProofChallenges {
permutation_challenge_sets,
stark_alphas,
stark_zeta,
@ -71,7 +67,7 @@ where
degree_bits,
&config.fri_config,
),
})
}
}
impl<F, C, const D: usize> StarkProofWithPublicInputs<F, C, D>
@ -84,11 +80,10 @@ where
stark: &S,
config: &StarkConfig,
degree_bits: usize,
) -> anyhow::Result<Vec<usize>> {
Ok(self
.get_challenges(stark, config, degree_bits)?
) -> Vec<usize> {
self.get_challenges(stark, config, degree_bits)
.fri_challenges
.fri_query_indices)
.fri_query_indices
}
/// Computes all Fiat-Shamir challenges used in the STARK proof.
@ -97,7 +92,7 @@ where
stark: &S,
config: &StarkConfig,
degree_bits: usize,
) -> Result<StarkProofChallenges<F, D>> {
) -> StarkProofChallenges<F, D> {
let StarkProof {
trace_cap,
permutation_zs_cap,
@ -131,9 +126,11 @@ where
pub(crate) fn get_challenges_target<
F: RichField + Extendable<D>,
C: GenericConfig<D, F = F>,
S: Stark<F, D>,
const D: usize,
>(
builder: &mut CircuitBuilder<F, D>,
stark: &S,
trace_cap: &MerkleCapTarget,
permutation_zs_cap: Option<&MerkleCapTarget>,
quotient_polys_cap: &MerkleCapTarget,
@ -151,6 +148,18 @@ where
let mut challenger = RecursiveChallenger::<F, C::Hasher, D>::new(builder);
challenger.observe_cap(trace_cap);
let permutation_challenge_sets = permutation_zs_cap.map(|permutation_zs_cap| {
let tmp = get_n_permutation_challenge_sets_target(
builder,
&mut challenger,
num_challenges,
stark.permutation_batch_size(),
);
challenger.observe_cap(permutation_zs_cap);
tmp
});
let stark_alphas = challenger.get_n_challenges(builder, num_challenges);
challenger.observe_cap(quotient_polys_cap);
@ -159,6 +168,7 @@ where
challenger.observe_openings(&openings.to_fri_openings());
StarkProofChallengesTarget {
permutation_challenge_sets,
stark_alphas,
stark_zeta,
fri_challenges: challenger.fri_challenges::<C>(
@ -172,9 +182,14 @@ where
}
impl<const D: usize> StarkProofWithPublicInputsTarget<D> {
pub(crate) fn get_challenges<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>>(
pub(crate) fn get_challenges<
F: RichField + Extendable<D>,
C: GenericConfig<D, F = F>,
S: Stark<F, D>,
>(
&self,
builder: &mut CircuitBuilder<F, D>,
stark: &S,
config: &StarkConfig,
) -> StarkProofChallengesTarget<D>
where
@ -194,8 +209,9 @@ impl<const D: usize> StarkProofWithPublicInputsTarget<D> {
},
} = &self.proof;
get_challenges_target::<F, C, D>(
get_challenges_target::<F, C, S, D>(
builder,
stark,
trace_cap,
permutation_zs_cap.as_ref(),
quotient_polys_cap,

View File

@ -3,6 +3,7 @@
#![allow(unused_variables)]
#![allow(incomplete_features)]
#![allow(clippy::too_many_arguments)]
#![allow(clippy::type_complexity)]
#![feature(generic_const_exprs)]
pub mod config;
@ -14,6 +15,8 @@ pub mod prover;
pub mod recursive_verifier;
pub mod stark;
pub mod stark_testing;
pub mod util;
pub mod vanishing_poly;
pub mod vars;
pub mod verifier;

View File

@ -2,16 +2,23 @@
use itertools::Itertools;
use plonky2::field::batch_util::batch_multiply_inplace;
use plonky2::field::extension_field::Extendable;
use plonky2::field::extension_field::{Extendable, FieldExtension};
use plonky2::field::field_types::Field;
use plonky2::field::packed_field::PackedField;
use plonky2::field::polynomial::PolynomialValues;
use plonky2::hash::hash_types::RichField;
use plonky2::iop::challenger::Challenger;
use plonky2::plonk::config::{GenericConfig, Hasher};
use plonky2::iop::challenger::{Challenger, RecursiveChallenger};
use plonky2::iop::ext_target::ExtensionTarget;
use plonky2::iop::target::Target;
use plonky2::plonk::circuit_builder::CircuitBuilder;
use plonky2::plonk::config::{AlgebraicHasher, GenericConfig, Hasher};
use plonky2::util::reducing::{ReducingFactor, ReducingFactorTarget};
use rayon::prelude::*;
use crate::config::StarkConfig;
use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
use crate::stark::Stark;
use crate::vars::{StarkEvaluationTargets, StarkEvaluationVars};
/// A pair of lists of columns, `lhs` and `rhs`, that should be permutations of one another.
/// In particular, there should exist some permutation `pi` such that for any `i`,
@ -23,32 +30,41 @@ pub struct PermutationPair {
pub column_pairs: Vec<(usize, usize)>,
}
impl PermutationPair {
pub fn singletons(lhs: usize, rhs: usize) -> Self {
Self {
column_pairs: vec![(lhs, rhs)],
}
}
}
/// A single instance of a permutation check protocol.
pub(crate) struct PermutationInstance<'a, F: Field> {
pub(crate) struct PermutationInstance<'a, T: Copy> {
pub(crate) pair: &'a PermutationPair,
pub(crate) challenge: PermutationChallenge<F>,
pub(crate) challenge: PermutationChallenge<T>,
}
/// Randomness for a single instance of a permutation check protocol.
#[derive(Copy, Clone)]
pub(crate) struct PermutationChallenge<F: Field> {
pub(crate) struct PermutationChallenge<T: Copy> {
/// Randomness used to combine multiple columns into one.
pub(crate) beta: F,
pub(crate) beta: T,
/// Random offset that's added to the beta-reduced column values.
pub(crate) gamma: F,
pub(crate) gamma: T,
}
/// Like `PermutationChallenge`, but with `num_challenges` copies to boost soundness.
pub(crate) struct PermutationChallengeSet<F: Field> {
pub(crate) challenges: Vec<PermutationChallenge<F>>,
#[derive(Clone)]
pub(crate) struct PermutationChallengeSet<T: Copy> {
pub(crate) challenges: Vec<PermutationChallenge<T>>,
}
/// Compute all Z polynomials (for permutation arguments).
pub(crate) fn compute_permutation_z_polys<F, C, S, const D: usize>(
stark: &S,
config: &StarkConfig,
challenger: &mut Challenger<F, C::Hasher>,
trace_poly_values: &[PolynomialValues<F>],
permutation_challenge_sets: &[PermutationChallengeSet<F>],
) -> Vec<PolynomialValues<F>>
where
F: RichField + Extendable<D>,
@ -56,59 +72,37 @@ where
S: Stark<F, D>,
{
let permutation_pairs = stark.permutation_pairs();
let permutation_challenge_sets = get_n_permutation_challenge_sets(
challenger,
let permutation_batches = get_permutation_batches(
&permutation_pairs,
permutation_challenge_sets,
config.num_challenges,
stark.permutation_batch_size(),
);
// Get a list of instances of our batch-permutation argument. These are permutation arguments
// where the same `Z(x)` polynomial is used to check more than one permutation.
// Before batching, each permutation pair leads to `num_challenges` permutation arguments, so we
// start with the cartesian product of `permutation_pairs` and `0..num_challenges`. Then we
// chunk these arguments based on our batch size.
let permutation_instances = permutation_pairs
.iter()
.cartesian_product(0..config.num_challenges)
.chunks(stark.permutation_batch_size())
.into_iter()
.flat_map(|batch| {
batch.enumerate().map(|(i, (pair, chal))| {
let challenge = permutation_challenge_sets[i].challenges[chal];
PermutationInstance { pair, challenge }
})
})
.collect_vec();
permutation_instances
permutation_batches
.into_par_iter()
.map(|instance| compute_permutation_z_poly(instance, trace_poly_values))
.map(|instances| compute_permutation_z_poly(&instances, trace_poly_values))
.collect()
}
/// Compute a single Z polynomial.
// TODO: Change this to handle a batch of `PermutationInstance`s.
fn compute_permutation_z_poly<F: Field>(
instance: PermutationInstance<F>,
instances: &[PermutationInstance<F>],
trace_poly_values: &[PolynomialValues<F>],
) -> PolynomialValues<F> {
let PermutationInstance { pair, challenge } = instance;
let PermutationPair { column_pairs } = pair;
let PermutationChallenge { beta, gamma } = challenge;
let degree = trace_poly_values[0].len();
let mut reduced_lhs = PolynomialValues::constant(gamma, degree);
let mut reduced_rhs = PolynomialValues::constant(gamma, degree);
let (reduced_lhs_polys, reduced_rhs_polys): (Vec<_>, Vec<_>) = instances
.iter()
.map(|instance| permutation_reduced_polys(instance, trace_poly_values, degree))
.unzip();
for ((lhs, rhs), weight) in column_pairs.iter().zip(beta.powers()) {
reduced_lhs.add_assign_scaled(&trace_poly_values[*lhs], weight);
reduced_rhs.add_assign_scaled(&trace_poly_values[*rhs], weight);
}
let numerator = poly_product_elementwise(reduced_lhs_polys.into_iter());
let denominator = poly_product_elementwise(reduced_rhs_polys.into_iter());
// Compute the quotients.
let reduced_rhs_inverses = F::batch_multiplicative_inverse(&reduced_rhs.values);
let mut quotients = reduced_lhs.values;
batch_multiply_inplace(&mut quotients, &reduced_rhs_inverses);
let denominator_inverses = F::batch_multiplicative_inverse(&denominator.values);
let mut quotients = numerator.values;
batch_multiply_inplace(&mut quotients, &denominator_inverses);
// Compute Z, which contains partial products of the quotients.
let mut partial_products = Vec::with_capacity(degree);
@ -120,6 +114,39 @@ fn compute_permutation_z_poly<F: Field>(
PolynomialValues::new(partial_products)
}
/// Computes the reduced polynomial, `\sum beta^i f_i(x) + gamma`, for both the "left" and "right"
/// sides of a given `PermutationPair`.
fn permutation_reduced_polys<F: Field>(
instance: &PermutationInstance<F>,
trace_poly_values: &[PolynomialValues<F>],
degree: usize,
) -> (PolynomialValues<F>, PolynomialValues<F>) {
let PermutationInstance {
pair: PermutationPair { column_pairs },
challenge: PermutationChallenge { beta, gamma },
} = instance;
let mut reduced_lhs = PolynomialValues::constant(*gamma, degree);
let mut reduced_rhs = PolynomialValues::constant(*gamma, degree);
for ((lhs, rhs), weight) in column_pairs.iter().zip(beta.powers()) {
reduced_lhs.add_assign_scaled(&trace_poly_values[*lhs], weight);
reduced_rhs.add_assign_scaled(&trace_poly_values[*rhs], weight);
}
(reduced_lhs, reduced_rhs)
}
/// Computes the elementwise product of a set of polynomials. Assumes that the set is non-empty and
/// that each polynomial has the same length.
fn poly_product_elementwise<F: Field>(
mut polys: impl Iterator<Item = PolynomialValues<F>>,
) -> PolynomialValues<F> {
let mut product = polys.next().expect("Expected at least one polynomial");
for poly in polys {
batch_multiply_inplace(&mut product.values, &poly.values)
}
product
}
fn get_permutation_challenge<F: RichField, H: Hasher<F>>(
challenger: &mut Challenger<F, H>,
) -> PermutationChallenge<F> {
@ -147,3 +174,221 @@ pub(crate) fn get_n_permutation_challenge_sets<F: RichField, H: Hasher<F>>(
.map(|_| get_permutation_challenge_set(challenger, num_challenges))
.collect()
}
fn get_permutation_challenge_target<
F: RichField + Extendable<D>,
H: AlgebraicHasher<F>,
const D: usize,
>(
builder: &mut CircuitBuilder<F, D>,
challenger: &mut RecursiveChallenger<F, H, D>,
) -> PermutationChallenge<Target> {
let beta = challenger.get_challenge(builder);
let gamma = challenger.get_challenge(builder);
PermutationChallenge { beta, gamma }
}
fn get_permutation_challenge_set_target<
F: RichField + Extendable<D>,
H: AlgebraicHasher<F>,
const D: usize,
>(
builder: &mut CircuitBuilder<F, D>,
challenger: &mut RecursiveChallenger<F, H, D>,
num_challenges: usize,
) -> PermutationChallengeSet<Target> {
let challenges = (0..num_challenges)
.map(|_| get_permutation_challenge_target(builder, challenger))
.collect();
PermutationChallengeSet { challenges }
}
pub(crate) fn get_n_permutation_challenge_sets_target<
F: RichField + Extendable<D>,
H: AlgebraicHasher<F>,
const D: usize,
>(
builder: &mut CircuitBuilder<F, D>,
challenger: &mut RecursiveChallenger<F, H, D>,
num_challenges: usize,
num_sets: usize,
) -> Vec<PermutationChallengeSet<Target>> {
(0..num_sets)
.map(|_| get_permutation_challenge_set_target(builder, challenger, num_challenges))
.collect()
}
/// Get a list of instances of our batch-permutation argument. These are permutation arguments
/// where the same `Z(x)` polynomial is used to check more than one permutation.
/// Before batching, each permutation pair leads to `num_challenges` permutation arguments, so we
/// start with the cartesian product of `permutation_pairs` and `0..num_challenges`. Then we
/// chunk these arguments based on our batch size.
pub(crate) fn get_permutation_batches<'a, T: Copy>(
permutation_pairs: &'a [PermutationPair],
permutation_challenge_sets: &[PermutationChallengeSet<T>],
num_challenges: usize,
batch_size: usize,
) -> Vec<Vec<PermutationInstance<'a, T>>> {
permutation_pairs
.iter()
.cartesian_product(0..num_challenges)
.chunks(batch_size)
.into_iter()
.map(|batch| {
batch
.enumerate()
.map(|(i, (pair, chal))| {
let challenge = permutation_challenge_sets[i].challenges[chal];
PermutationInstance { pair, challenge }
})
.collect_vec()
})
.collect()
}
// TODO: Use slices.
pub struct PermutationCheckVars<F: Field, FE: FieldExtension<D2, BaseField = F>, const D2: usize> {
pub(crate) local_zs: Vec<FE>,
pub(crate) next_zs: Vec<FE>,
pub(crate) permutation_challenge_sets: Vec<PermutationChallengeSet<F>>,
}
pub(crate) fn eval_permutation_checks<F, FE, P, C, S, const D: usize, const D2: usize>(
stark: &S,
config: &StarkConfig,
vars: StarkEvaluationVars<FE, FE, { S::COLUMNS }, { S::PUBLIC_INPUTS }>,
permutation_data: PermutationCheckVars<F, FE, D2>,
consumer: &mut ConstraintConsumer<FE>,
) where
F: RichField + Extendable<D>,
FE: FieldExtension<D2, BaseField = F>,
P: PackedField<Scalar = FE>,
C: GenericConfig<D, F = F>,
S: Stark<F, D>,
[(); S::COLUMNS]:,
[(); S::PUBLIC_INPUTS]:,
{
let PermutationCheckVars {
local_zs,
next_zs,
permutation_challenge_sets,
} = permutation_data;
// Check that Z(1) = 1;
for &z in &local_zs {
consumer.constraint_first_row(z - FE::ONE);
}
let permutation_pairs = stark.permutation_pairs();
let permutation_batches = get_permutation_batches(
&permutation_pairs,
&permutation_challenge_sets,
config.num_challenges,
stark.permutation_batch_size(),
);
// Each zs value corresponds to a permutation batch.
for (i, instances) in permutation_batches.iter().enumerate() {
// Check the recurrence Z(gx) * reduced_rhs = Z(x) * reduced_lhs.
let (reduced_lhs, reduced_rhs): (Vec<FE>, Vec<FE>) = instances
.iter()
.map(|instance| {
let PermutationInstance {
pair: PermutationPair { column_pairs },
challenge: PermutationChallenge { beta, gamma },
} = instance;
let mut factor = ReducingFactor::new(*beta);
let (lhs, rhs): (Vec<_>, Vec<_>) = column_pairs
.iter()
.map(|&(i, j)| (vars.local_values[i], vars.local_values[j]))
.unzip();
(
factor.reduce_ext(lhs.into_iter()) + FE::from_basefield(*gamma),
factor.reduce_ext(rhs.into_iter()) + FE::from_basefield(*gamma),
)
})
.unzip();
let constraint = next_zs[i] * reduced_rhs.into_iter().product()
- local_zs[i] * reduced_lhs.into_iter().product();
consumer.constraint(constraint);
}
}
// TODO: Use slices.
pub struct PermutationCheckDataTarget<const D: usize> {
pub(crate) local_zs: Vec<ExtensionTarget<D>>,
pub(crate) next_zs: Vec<ExtensionTarget<D>>,
pub(crate) permutation_challenge_sets: Vec<PermutationChallengeSet<Target>>,
}
pub(crate) fn eval_permutation_checks_recursively<F, S, const D: usize>(
builder: &mut CircuitBuilder<F, D>,
stark: &S,
config: &StarkConfig,
vars: StarkEvaluationTargets<D, { S::COLUMNS }, { S::PUBLIC_INPUTS }>,
permutation_data: PermutationCheckDataTarget<D>,
consumer: &mut RecursiveConstraintConsumer<F, D>,
) where
F: RichField + Extendable<D>,
S: Stark<F, D>,
[(); S::COLUMNS]:,
[(); S::PUBLIC_INPUTS]:,
{
let PermutationCheckDataTarget {
local_zs,
next_zs,
permutation_challenge_sets,
} = permutation_data;
let one = builder.one_extension();
// Check that Z(1) = 1;
for &z in &local_zs {
let z_1 = builder.sub_extension(z, one);
consumer.constraint_first_row(builder, z_1);
}
let permutation_pairs = stark.permutation_pairs();
let permutation_batches = get_permutation_batches(
&permutation_pairs,
&permutation_challenge_sets,
config.num_challenges,
stark.permutation_batch_size(),
);
// Each zs value corresponds to a permutation batch.
for (i, instances) in permutation_batches.iter().enumerate() {
let (reduced_lhs, reduced_rhs): (Vec<ExtensionTarget<D>>, Vec<ExtensionTarget<D>>) =
instances
.iter()
.map(|instance| {
let PermutationInstance {
pair: PermutationPair { column_pairs },
challenge: PermutationChallenge { beta, gamma },
} = instance;
let beta_ext = builder.convert_to_ext(*beta);
let gamma_ext = builder.convert_to_ext(*gamma);
let mut factor = ReducingFactorTarget::new(beta_ext);
let (lhs, rhs): (Vec<_>, Vec<_>) = column_pairs
.iter()
.map(|&(i, j)| (vars.local_values[i], vars.local_values[j]))
.unzip();
let reduced_lhs = factor.reduce(&lhs, builder);
let reduced_rhs = factor.reduce(&rhs, builder);
(
builder.add_extension(reduced_lhs, gamma_ext),
builder.add_extension(reduced_rhs, gamma_ext),
)
})
.unzip();
let reduced_lhs_product = builder.mul_many_extension(&reduced_lhs);
let reduced_rhs_product = builder.mul_many_extension(&reduced_rhs);
// constraint = next_zs[i] * reduced_rhs_product - local_zs[i] * reduced_lhs_product
let constraint = {
let tmp = builder.mul_extension(local_zs[i], reduced_lhs_product);
builder.mul_sub_extension(next_zs[i], reduced_rhs_product, tmp)
};
consumer.constraint(builder, constraint)
}
}
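
Numerically, each Z column above is a running product of quotients of the gamma-shifted, beta-reduced column values; if the left and right column sets really are permutations of one another, the product wraps back to 1, which is exactly what the `Z(1) = 1` and `Z(gx) * reduced_rhs = Z(x) * reduced_lhs` constraints enforce. A toy check over F_101 with a single column pair (so the beta reduction is trivial), independent of plonky2's types:

const P: u64 = 101;

/// Fermat inverse: x^(P-2) mod P (P is prime, x != 0).
fn inv(x: u64) -> u64 {
    let (mut acc, mut base, mut e) = (1u64, x % P, P - 2);
    while e > 0 {
        if e & 1 == 1 {
            acc = acc * base % P;
        }
        base = base * base % P;
        e >>= 1;
    }
    acc
}

fn main() {
    let gamma = 7u64;
    let lhs: [u64; 4] = [3, 1, 4, 2];
    let rhs: [u64; 4] = [1, 2, 3, 4]; // a permutation of lhs
    // Z(1) = 1, and Z(g^{i+1}) = Z(g^i) * (lhs[i] + gamma) / (rhs[i] + gamma).
    let mut z = vec![1u64];
    for i in 0..lhs.len() {
        let quot = (lhs[i] + gamma) % P * inv((rhs[i] + gamma) % P) % P;
        z.push(z[i] * quot % P);
    }
    assert_eq!(z[0], 1); // the `constraint_first_row(z - 1)` check
    assert_eq!(z[lhs.len()], 1); // permutation => numerator and denominator products agree
}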

View File

@ -32,6 +32,7 @@ pub struct StarkProof<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>,
}
impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize> StarkProof<F, C, D> {
/// Recover the length of the trace from a STARK proof and a STARK config.
pub(crate) fn recover_degree_bits(&self, config: &StarkConfig) -> usize {
let initial_merkle_proof = &self.opening_proof.query_round_proofs[0]
.initial_trees_proof
@ -51,6 +52,7 @@ pub struct StarkProofTarget<const D: usize> {
}
impl<const D: usize> StarkProofTarget<D> {
/// Recover the length of the trace from a STARK proof and a STARK config.
pub(crate) fn recover_degree_bits(&self, config: &StarkConfig) -> usize {
let initial_merkle_proof = &self.opening_proof.query_round_proofs[0]
.initial_trees_proof
@ -101,7 +103,7 @@ pub struct CompressedStarkProofWithPublicInputs<
pub(crate) struct StarkProofChallenges<F: RichField + Extendable<D>, const D: usize> {
/// Randomness used in any permutation arguments.
pub permutation_challenge_sets: Vec<PermutationChallengeSet<F>>,
pub permutation_challenge_sets: Option<Vec<PermutationChallengeSet<F>>>,
/// Random values used to combine STARK constraints.
pub stark_alphas: Vec<F>,
@ -113,6 +115,7 @@ pub(crate) struct StarkProofChallenges<F: RichField + Extendable<D>, const D: us
}
pub(crate) struct StarkProofChallengesTarget<const D: usize> {
pub permutation_challenge_sets: Option<Vec<PermutationChallengeSet<Target>>>,
pub stark_alphas: Vec<Target>,
pub stark_zeta: ExtensionTarget<D>,
pub fri_challenges: FriChallengesTarget<D>,
@ -179,27 +182,29 @@ impl<F: RichField + Extendable<D>, const D: usize> StarkOpeningSet<F, D> {
pub struct StarkOpeningSetTarget<const D: usize> {
pub local_values: Vec<ExtensionTarget<D>>,
pub next_values: Vec<ExtensionTarget<D>>,
pub permutation_zs: Vec<ExtensionTarget<D>>,
pub permutation_zs_right: Vec<ExtensionTarget<D>>,
pub permutation_zs: Option<Vec<ExtensionTarget<D>>>,
pub permutation_zs_right: Option<Vec<ExtensionTarget<D>>>,
pub quotient_polys: Vec<ExtensionTarget<D>>,
}
impl<const D: usize> StarkOpeningSetTarget<D> {
pub(crate) fn to_fri_openings(&self) -> FriOpeningsTarget<D> {
let zeta_batch = FriOpeningBatchTarget {
values: [
self.local_values.as_slice(),
self.quotient_polys.as_slice(),
self.permutation_zs.as_slice(),
]
.concat(),
values: self
.local_values
.iter()
.chain(self.permutation_zs.iter().flatten())
.chain(&self.quotient_polys)
.copied()
.collect_vec(),
};
let zeta_right_batch = FriOpeningBatchTarget {
values: [
self.next_values.as_slice(),
self.permutation_zs_right.as_slice(),
]
.concat(),
values: self
.next_values
.iter()
.chain(self.permutation_zs_right.iter().flatten())
.copied()
.collect_vec(),
};
FriOpeningsTarget {
batches: vec![zeta_batch, zeta_right_batch],
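
Making the openings `Option<Vec<_>>` composes cleanly here because `Option::iter().flatten()` yields nothing for `None` and the inner elements for `Some`, so permutation openings simply vanish from the FRI batches when the STARK has no permutation argument. A tiny standalone illustration of the idiom:

fn batch(values: &[u64], permutation_zs: &Option<Vec<u64>>, quotient: &[u64]) -> Vec<u64> {
    values
        .iter()
        .chain(permutation_zs.iter().flatten())
        .chain(quotient)
        .copied()
        .collect()
}

fn main() {
    assert_eq!(batch(&[1, 2], &None, &[5]), vec![1, 2, 5]);
    assert_eq!(batch(&[1, 2], &Some(vec![3, 4]), &[5]), vec![1, 2, 3, 4, 5]);
}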

View File

@ -18,15 +18,19 @@ use rayon::prelude::*;
use crate::config::StarkConfig;
use crate::constraint_consumer::ConstraintConsumer;
use crate::permutation::compute_permutation_z_polys;
use crate::permutation::PermutationCheckVars;
use crate::permutation::{
compute_permutation_z_polys, get_n_permutation_challenge_sets, PermutationChallengeSet,
};
use crate::proof::{StarkOpeningSet, StarkProof, StarkProofWithPublicInputs};
use crate::stark::Stark;
use crate::vanishing_poly::eval_vanishing_poly;
use crate::vars::StarkEvaluationVars;
pub fn prove<F, C, S, const D: usize>(
stark: S,
config: &StarkConfig,
trace: Vec<[F; S::COLUMNS]>,
trace_poly_values: Vec<PolynomialValues<F>>,
public_inputs: [F; S::PUBLIC_INPUTS],
timing: &mut TimingTree,
) -> Result<StarkProofWithPublicInputs<F, C, D>>
@ -38,23 +42,16 @@ where
[(); S::PUBLIC_INPUTS]:,
[(); C::Hasher::HASH_SIZE]:,
{
let degree = trace.len();
let degree = trace_poly_values[0].len();
let degree_bits = log2_strict(degree);
let trace_vecs = trace.iter().map(|row| row.to_vec()).collect_vec();
let trace_col_major: Vec<Vec<F>> = transpose(&trace_vecs);
let trace_poly_values: Vec<PolynomialValues<F>> = timed!(
timing,
"compute trace polynomials",
trace_col_major
.par_iter()
.map(|column| PolynomialValues::new(column.clone()))
.collect()
);
let fri_params = config.fri_params(degree_bits);
let rate_bits = config.fri_config.rate_bits;
let cap_height = config.fri_config.cap_height;
assert!(
fri_params.total_arities() <= degree_bits + rate_bits - cap_height,
"FRI total reduction arity is too large.",
);
let trace_commitment = timed!(
timing,
"compute trace commitment",
@ -75,28 +72,36 @@ where
challenger.observe_cap(&trace_cap);
// Permutation arguments.
let permutation_zs_commitment = if stark.uses_permutation_args() {
let permutation_zs_commitment_challenges = stark.uses_permutation_args().then(|| {
let permutation_challenge_sets = get_n_permutation_challenge_sets(
&mut challenger,
config.num_challenges,
stark.permutation_batch_size(),
);
let permutation_z_polys = compute_permutation_z_polys::<F, C, S, D>(
&stark,
config,
&mut challenger,
&trace_poly_values,
&permutation_challenge_sets,
);
timed!(
let permutation_zs_commitment = timed!(
timing,
"compute permutation Z commitments",
Some(PolynomialBatch::from_values(
PolynomialBatch::from_values(
permutation_z_polys,
rate_bits,
false,
config.fri_config.cap_height,
timing,
None,
))
)
} else {
None
};
)
);
(permutation_zs_commitment, permutation_challenge_sets)
});
let permutation_zs_commitment = permutation_zs_commitment_challenges
.as_ref()
.map(|(comm, _)| comm);
let permutation_zs_cap = permutation_zs_commitment
.as_ref()
.map(|commit| commit.merkle_tree.cap.clone());
@ -108,10 +113,11 @@ where
let quotient_polys = compute_quotient_polys::<F, C, S, D>(
&stark,
&trace_commitment,
&permutation_zs_commitment_challenges,
public_inputs,
alphas,
degree_bits,
rate_bits,
config,
);
let all_quotient_chunks = quotient_polys
.into_par_iter()
@ -151,16 +157,15 @@ where
zeta,
g,
&trace_commitment,
permutation_zs_commitment.as_ref(),
permutation_zs_commitment,
&quotient_commitment,
);
challenger.observe_openings(&openings.to_fri_openings());
let initial_merkle_trees = once(&trace_commitment)
.chain(permutation_zs_commitment.as_ref())
.chain(permutation_zs_commitment)
.chain(once(&quotient_commitment))
.collect_vec();
let fri_params = config.fri_params(degree_bits);
let opening_proof = timed!(
timing,
@ -189,13 +194,17 @@ where
/// Computes the quotient polynomials `(sum alpha^i C_i(x)) / Z_H(x)` for `alpha` in `alphas`,
/// where the `C_i`s are the Stark constraints.
fn compute_quotient_polys<F, C, S, const D: usize>(
fn compute_quotient_polys<'a, F, C, S, const D: usize>(
stark: &S,
trace_commitment: &PolynomialBatch<F, C, D>,
trace_commitment: &'a PolynomialBatch<F, C, D>,
permutation_zs_commitment_challenges: &'a Option<(
PolynomialBatch<F, C, D>,
Vec<PermutationChallengeSet<F>>,
)>,
public_inputs: [F; S::PUBLIC_INPUTS],
alphas: Vec<F>,
degree_bits: usize,
rate_bits: usize,
config: &StarkConfig,
) -> Vec<PolynomialCoeffs<F>>
where
F: RichField + Extendable<D>,
@ -205,6 +214,7 @@ where
[(); S::PUBLIC_INPUTS]:,
{
let degree = 1 << degree_bits;
let rate_bits = config.fri_config.rate_bits;
let quotient_degree_bits = log2_ceil(stark.quotient_degree_factor());
assert!(
@ -224,9 +234,10 @@ where
let z_h_on_coset = ZeroPolyOnCoset::<F>::new(degree_bits, quotient_degree_bits);
// Retrieve the LDE values at index `i`.
let get_at_index = |comm: &PolynomialBatch<F, C, D>, i: usize| -> [F; S::COLUMNS] {
comm.get_lde_values(i * step).try_into().unwrap()
};
let get_at_index =
|comm: &'a PolynomialBatch<F, C, D>, i: usize| -> &'a [F] { comm.get_lde_values(i * step) };
let get_trace_at_index = |i| get_at_index(trace_commitment, i).try_into().unwrap();
// Last element of the subgroup.
let last = F::primitive_root_of_unity(degree_bits).inverse();
let size = degree << quotient_degree_bits;
@ -247,12 +258,26 @@ where
lagrange_last.values[i],
);
let vars = StarkEvaluationVars::<F, F, { S::COLUMNS }, { S::PUBLIC_INPUTS }> {
local_values: &get_at_index(trace_commitment, i),
next_values: &get_at_index(trace_commitment, (i + next_step) % size),
local_values: &get_trace_at_index(i),
next_values: &get_trace_at_index((i + next_step) % size),
public_inputs: &public_inputs,
};
stark.eval_packed_base(vars, &mut consumer);
// TODO: Add in constraints for permutation arguments.
let permutation_check_data = permutation_zs_commitment_challenges.as_ref().map(
|(permutation_zs_commitment, permutation_challenge_sets)| PermutationCheckVars {
local_zs: get_at_index(permutation_zs_commitment, i).to_vec(),
next_zs: get_at_index(permutation_zs_commitment, (i + next_step) % size)
.to_vec(),
permutation_challenge_sets: permutation_challenge_sets.to_vec(),
},
);
// TODO: Use packed field for F.
eval_vanishing_poly::<F, F, F, C, S, D, 1>(
stark,
config,
vars,
permutation_check_data,
&mut consumer,
);
// TODO: Fix this once we use a genuine `PackedField`.
let mut constraints_evals = consumer.accumulators();
// We divide the constraints evaluations by `Z_H(x)`.

View File

@ -1,5 +1,6 @@
use std::iter::once;
use anyhow::{ensure, Result};
use itertools::Itertools;
use plonky2::field::extension_field::Extendable;
use plonky2::field::field_types::Field;
@ -13,11 +14,13 @@ use plonky2::util::reducing::ReducingFactorTarget;
use crate::config::StarkConfig;
use crate::constraint_consumer::RecursiveConstraintConsumer;
use crate::permutation::PermutationCheckDataTarget;
use crate::proof::{
StarkOpeningSetTarget, StarkProof, StarkProofChallengesTarget, StarkProofTarget,
StarkProofWithPublicInputs, StarkProofWithPublicInputsTarget,
};
use crate::stark::Stark;
use crate::vanishing_poly::eval_vanishing_poly_recursively;
use crate::vars::StarkEvaluationTargets;
pub fn recursively_verify_stark_proof<
@ -37,7 +40,7 @@ pub fn recursively_verify_stark_proof<
{
assert_eq!(proof_with_pis.public_inputs.len(), S::PUBLIC_INPUTS);
let degree_bits = proof_with_pis.proof.recover_degree_bits(inner_config);
let challenges = proof_with_pis.get_challenges::<F, C>(builder, inner_config);
let challenges = proof_with_pis.get_challenges::<F, C, S>(builder, &stark, inner_config);
recursively_verify_stark_proof_with_challenges::<F, C, S, D>(
builder,
@ -67,6 +70,7 @@ fn recursively_verify_stark_proof_with_challenges<
[(); S::COLUMNS]:,
[(); S::PUBLIC_INPUTS]:,
{
check_permutation_options(&stark, &proof_with_pis, &challenges).unwrap();
let one = builder.one_extension();
let StarkProofWithPublicInputsTarget {
@ -104,8 +108,21 @@ fn recursively_verify_stark_proof_with_challenges<
l_1,
l_last,
);
stark.eval_ext_recursively(builder, vars, &mut consumer);
// TODO: Add in constraints for permutation arguments.
let permutation_data = stark
.uses_permutation_args()
.then(|| PermutationCheckDataTarget {
local_zs: permutation_zs.as_ref().unwrap().clone(),
next_zs: permutation_zs_right.as_ref().unwrap().clone(),
permutation_challenge_sets: challenges.permutation_challenge_sets.unwrap(),
});
eval_vanishing_poly_recursively::<F, C, S, D>(
builder,
&stark,
inner_config,
vars,
permutation_data,
&mut consumer,
);
let vanishing_polys_zeta = consumer.accumulators();
// Check each polynomial identity, of the form `vanishing(x) = Z_H(x) quotient(x)`, at zeta.
@ -187,24 +204,25 @@ pub fn add_virtual_stark_proof<F: RichField + Extendable<D>, S: Stark<F, D>, con
let fri_params = config.fri_params(degree_bits);
let cap_height = fri_params.config.cap_height;
let num_leaves_per_oracle = &[
S::COLUMNS,
// TODO: permutation polys
stark.quotient_degree_factor() * config.num_challenges,
];
let num_leaves_per_oracle = once(S::COLUMNS)
.chain(
stark
.uses_permutation_args()
.then(|| stark.num_permutation_batches(config)),
)
.chain(once(stark.quotient_degree_factor() * config.num_challenges))
.collect_vec();
let permutation_zs_cap = if stark.uses_permutation_args() {
Some(builder.add_virtual_cap(cap_height))
} else {
None
};
let permutation_zs_cap = stark
.uses_permutation_args()
.then(|| builder.add_virtual_cap(cap_height));
StarkProofTarget {
trace_cap: builder.add_virtual_cap(cap_height),
permutation_zs_cap,
quotient_polys_cap: builder.add_virtual_cap(cap_height),
openings: add_stark_opening_set::<F, S, D>(builder, stark, config),
opening_proof: builder.add_virtual_fri_proof(num_leaves_per_oracle, &fri_params),
opening_proof: builder.add_virtual_fri_proof(&num_leaves_per_oracle, &fri_params),
}
}
@ -217,8 +235,12 @@ fn add_stark_opening_set<F: RichField + Extendable<D>, S: Stark<F, D>, const D:
StarkOpeningSetTarget {
local_values: builder.add_virtual_extension_targets(S::COLUMNS),
next_values: builder.add_virtual_extension_targets(S::COLUMNS),
permutation_zs: vec![/*TODO*/],
permutation_zs_right: vec![/*TODO*/],
permutation_zs: stark
.uses_permutation_args()
.then(|| builder.add_virtual_extension_targets(stark.num_permutation_batches(config))),
permutation_zs_right: stark
.uses_permutation_args()
.then(|| builder.add_virtual_extension_targets(stark.num_permutation_batches(config))),
quotient_polys: builder
.add_virtual_extension_targets(stark.quotient_degree_factor() * num_challenges),
}
@ -267,5 +289,33 @@ pub fn set_stark_proof_target<F, C: GenericConfig<D, F = F>, W, const D: usize>(
&proof.openings.to_fri_openings(),
);
if let (Some(permutation_zs_cap_target), Some(permutation_zs_cap)) =
(&proof_target.permutation_zs_cap, &proof.permutation_zs_cap)
{
witness.set_cap_target(permutation_zs_cap_target, permutation_zs_cap);
}
set_fri_proof_target(witness, &proof_target.opening_proof, &proof.opening_proof);
}
/// Utility function to check that all permutation data wrapped in `Option`s are `Some` iff
/// the Stark uses a permutation argument.
fn check_permutation_options<F: RichField + Extendable<D>, S: Stark<F, D>, const D: usize>(
stark: &S,
proof_with_pis: &StarkProofWithPublicInputsTarget<D>,
challenges: &StarkProofChallengesTarget<D>,
) -> Result<()> {
let options_is_some = [
proof_with_pis.proof.permutation_zs_cap.is_some(),
proof_with_pis.proof.openings.permutation_zs.is_some(),
proof_with_pis.proof.openings.permutation_zs_right.is_some(),
challenges.permutation_challenge_sets.is_some(),
];
ensure!(
options_is_some
.into_iter()
.all(|b| b == stark.uses_permutation_args()),
"Permutation data doesn't match with Stark configuration."
);
Ok(())
}
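
The same pattern runs through this file: optional permutation data is created with `bool::then`, threaded around as `Option`s, and `check_permutation_options` asserts that their presence is consistent with `uses_permutation_args()`. A minimal standalone sketch of that plumbing:

fn main() {
    for uses_permutation_args in [false, true] {
        // `bool::then` replaces the earlier `if flag { Some(..) } else { None }` blocks.
        let permutation_zs_cap: Option<Vec<u64>> = uses_permutation_args.then(|| vec![0; 4]);
        let permutation_zs: Option<Vec<u64>> = uses_permutation_args.then(|| vec![0; 2]);

        // The consistency check: every optional piece must be Some iff the flag is set.
        let options_is_some = [permutation_zs_cap.is_some(), permutation_zs.is_some()];
        assert!(options_is_some.into_iter().all(|b| b == uses_permutation_args));
    }
}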

View File

@ -16,7 +16,6 @@ use crate::vars::StarkEvaluationTargets;
use crate::vars::StarkEvaluationVars;
/// Represents a STARK system.
// TODO: Add a `constraint_degree` fn that returns the maximum constraint degree.
pub trait Stark<F: RichField + Extendable<D>, const D: usize>: Sync {
/// The total number of columns in the trace.
const COLUMNS: usize;

starky/src/util.rs (new file)
View File

@ -0,0 +1,16 @@
use itertools::Itertools;
use plonky2::field::field_types::Field;
use plonky2::field::polynomial::PolynomialValues;
use plonky2::util::transpose;
/// A helper function to transpose a row-wise trace and put it in the format that `prove` expects.
pub fn trace_rows_to_poly_values<F: Field, const COLUMNS: usize>(
trace_rows: Vec<[F; COLUMNS]>,
) -> Vec<PolynomialValues<F>> {
let trace_row_vecs = trace_rows.into_iter().map(|row| row.to_vec()).collect_vec();
let trace_col_vecs: Vec<Vec<F>> = transpose(&trace_row_vecs);
trace_col_vecs
.into_iter()
.map(|column| PolynomialValues::new(column))
.collect()
}
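
A small usage sketch for the new helper (paths and field types as they appear elsewhere in this diff; the values are arbitrary and the snippet is illustrative rather than canonical): a row-major trace becomes one `PolynomialValues` per column, which is the shape `prove` now expects.

use plonky2::field::field_types::Field;
use plonky2::field::goldilocks_field::GoldilocksField;
use starky::util::trace_rows_to_poly_values;

fn main() {
    type F = GoldilocksField;
    let rows = vec![
        [F::ZERO, F::ONE],
        [F::ONE, F::ONE],
        [F::ONE, F::from_canonical_u64(2)],
    ];
    let columns = trace_rows_to_poly_values(rows);
    assert_eq!(columns.len(), 2); // one PolynomialValues<F> per column
    assert_eq!(columns[0].len(), 3); // each of length num_rows
}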

View File

@ -0,0 +1,68 @@
use plonky2::field::extension_field::{Extendable, FieldExtension};
use plonky2::field::packed_field::PackedField;
use plonky2::hash::hash_types::RichField;
use plonky2::plonk::circuit_builder::CircuitBuilder;
use plonky2::plonk::config::GenericConfig;
use crate::config::StarkConfig;
use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
use crate::permutation::{
eval_permutation_checks, eval_permutation_checks_recursively, PermutationCheckDataTarget,
PermutationCheckVars,
};
use crate::stark::Stark;
use crate::vars::{StarkEvaluationTargets, StarkEvaluationVars};
pub(crate) fn eval_vanishing_poly<F, FE, P, C, S, const D: usize, const D2: usize>(
stark: &S,
config: &StarkConfig,
vars: StarkEvaluationVars<FE, FE, { S::COLUMNS }, { S::PUBLIC_INPUTS }>,
permutation_data: Option<PermutationCheckVars<F, FE, D2>>,
consumer: &mut ConstraintConsumer<FE>,
) where
F: RichField + Extendable<D>,
FE: FieldExtension<D2, BaseField = F>,
P: PackedField<Scalar = FE>,
C: GenericConfig<D, F = F>,
S: Stark<F, D>,
[(); S::COLUMNS]:,
[(); S::PUBLIC_INPUTS]:,
{
stark.eval_packed_generic(vars, consumer);
if let Some(permutation_data) = permutation_data {
eval_permutation_checks::<F, FE, P, C, S, D, D2>(
stark,
config,
vars,
permutation_data,
consumer,
);
}
}
pub(crate) fn eval_vanishing_poly_recursively<F, C, S, const D: usize>(
builder: &mut CircuitBuilder<F, D>,
stark: &S,
config: &StarkConfig,
vars: StarkEvaluationTargets<D, { S::COLUMNS }, { S::PUBLIC_INPUTS }>,
permutation_data: Option<PermutationCheckDataTarget<D>>,
consumer: &mut RecursiveConstraintConsumer<F, D>,
) where
F: RichField + Extendable<D>,
C: GenericConfig<D, F = F>,
S: Stark<F, D>,
[(); S::COLUMNS]:,
[(); S::PUBLIC_INPUTS]:,
{
stark.eval_ext_recursively(builder, vars, consumer);
if let Some(permutation_data) = permutation_data {
eval_permutation_checks_recursively::<F, S, D>(
builder,
stark,
config,
vars,
permutation_data,
consumer,
);
}
}

View File

@ -11,8 +11,10 @@ use plonky2::plonk::plonk_common::reduce_with_powers;
use crate::config::StarkConfig;
use crate::constraint_consumer::ConstraintConsumer;
use crate::permutation::PermutationCheckVars;
use crate::proof::{StarkOpeningSet, StarkProofChallenges, StarkProofWithPublicInputs};
use crate::stark::Stark;
use crate::vanishing_poly::eval_vanishing_poly;
use crate::vars::StarkEvaluationVars;
pub fn verify_stark_proof<
@ -32,7 +34,7 @@ where
{
ensure!(proof_with_pis.public_inputs.len() == S::PUBLIC_INPUTS);
let degree_bits = proof_with_pis.proof.recover_degree_bits(config);
let challenges = proof_with_pis.get_challenges(&stark, config, degree_bits)?;
let challenges = proof_with_pis.get_challenges(&stark, config, degree_bits);
verify_stark_proof_with_challenges(stark, proof_with_pis, challenges, degree_bits, config)
}
@ -53,6 +55,7 @@ where
[(); S::PUBLIC_INPUTS]:,
[(); C::Hasher::HASH_SIZE]:,
{
check_permutation_options(&stark, &proof_with_pis, &challenges)?;
let StarkProofWithPublicInputs {
proof,
public_inputs,
@ -88,8 +91,18 @@ where
l_1,
l_last,
);
stark.eval_ext(vars, &mut consumer);
// TODO: Add in constraints for permutation arguments.
let permutation_data = stark.uses_permutation_args().then(|| PermutationCheckVars {
local_zs: permutation_zs.as_ref().unwrap().clone(),
next_zs: permutation_zs_right.as_ref().unwrap().clone(),
permutation_challenge_sets: challenges.permutation_challenge_sets.unwrap(),
});
eval_vanishing_poly::<F, F::Extension, F::Extension, C, S, D, D>(
&stark,
config,
vars,
permutation_data,
&mut consumer,
);
let vanishing_polys_zeta = consumer.accumulators();
// Check each polynomial identity, of the form `vanishing(x) = Z_H(x) quotient(x)`, at zeta.
@ -105,7 +118,10 @@ where
.chunks(stark.quotient_degree_factor())
.enumerate()
{
ensure!(vanishing_polys_zeta[i] == z_h_zeta * reduce_with_powers(chunk, zeta_pow_deg));
ensure!(
vanishing_polys_zeta[i] == z_h_zeta * reduce_with_powers(chunk, zeta_pow_deg),
"Mismatch between evaluation and opening of quotient polynomial"
);
}
let merkle_caps = once(proof.trace_cap)
@ -141,7 +157,32 @@ fn eval_l_1_and_l_last<F: Field>(log_n: usize, x: F) -> (F, F) {
(z_x * invs[0], z_x * invs[1])
}
/// Recover the length of the trace from a STARK proof and a STARK config.
/// Utility function to check that all permutation data wrapped in `Option`s are `Some` iff
/// the Stark uses a permutation argument.
fn check_permutation_options<
F: RichField + Extendable<D>,
C: GenericConfig<D, F = F>,
S: Stark<F, D>,
const D: usize,
>(
stark: &S,
proof_with_pis: &StarkProofWithPublicInputs<F, C, D>,
challenges: &StarkProofChallenges<F, D>,
) -> Result<()> {
let options_is_some = [
proof_with_pis.proof.permutation_zs_cap.is_some(),
proof_with_pis.proof.openings.permutation_zs.is_some(),
proof_with_pis.proof.openings.permutation_zs_right.is_some(),
challenges.permutation_challenge_sets.is_some(),
];
ensure!(
options_is_some
.into_iter()
.all(|b| b == stark.uses_permutation_args()),
"Permutation data doesn't match with Stark configuration."
);
Ok(())
}
#[cfg(test)]
mod tests {

View File

@ -6,9 +6,18 @@ edition = "2021"
[dependencies]
plonky2 = { path = "../plonky2" }
plonky2_util = { path = "../util" }
starky = { path = "../starky" }
anyhow = "1.0.40"
env_logger = "0.9.0"
itertools = "0.10.0"
log = "0.4.14"
rand = "0.8.4"
rand_chacha = "0.3.1"
[dev-dependencies]
criterion = "0.3.5"
[[bench]]
name = "lookup_permuted_cols"
harness = false

View File

@ -0,0 +1,30 @@
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use itertools::Itertools;
use plonky2::field::field_types::Field;
use plonky2::field::goldilocks_field::GoldilocksField;
use rand::{thread_rng, Rng};
use system_zero::lookup::permuted_cols;
type F = GoldilocksField;
fn criterion_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("lookup-permuted-cols");
for size_log in [16, 17, 18] {
let size = 1 << size_log;
group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, _| {
// We could benchmark a table of random values with
// let table = F::rand_vec(size);
// But in practice we currently use tables that are pre-sorted, which makes
// permuted_cols cheaper, since the sort it performs on the table then has little work to do.
let table = (0..size).map(F::from_canonical_usize).collect_vec();
let input = (0..size)
.map(|_| table[thread_rng().gen_range(0..size)])
.collect_vec();
b.iter(|| permuted_cols(&input, &table));
});
}
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

View File

@ -7,18 +7,18 @@ use plonky2::plonk::circuit_builder::CircuitBuilder;
use plonky2::plonk::plonk_common::reduce_with_powers_ext_recursive;
use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
use crate::registers::arithmetic::*;
use crate::registers::alu::*;
use crate::registers::NUM_COLUMNS;
pub(crate) fn generate_addition<F: PrimeField64>(values: &mut [F; NUM_COLUMNS]) {
let in_1 = values[COL_ADD_INPUT_1].to_canonical_u64();
let in_2 = values[COL_ADD_INPUT_2].to_canonical_u64();
let in_3 = values[COL_ADD_INPUT_3].to_canonical_u64();
let in_1 = values[COL_ADD_INPUT_0].to_canonical_u64();
let in_2 = values[COL_ADD_INPUT_1].to_canonical_u64();
let in_3 = values[COL_ADD_INPUT_2].to_canonical_u64();
let output = in_1 + in_2 + in_3;
values[COL_ADD_OUTPUT_1] = F::from_canonical_u16(output as u16);
values[COL_ADD_OUTPUT_2] = F::from_canonical_u16((output >> 16) as u16);
values[COL_ADD_OUTPUT_3] = F::from_canonical_u16((output >> 32) as u16);
values[COL_ADD_OUTPUT_0] = F::from_canonical_u16(output as u16);
values[COL_ADD_OUTPUT_1] = F::from_canonical_u16((output >> 16) as u16);
values[COL_ADD_OUTPUT_2] = F::from_canonical_u16((output >> 32) as u16);
}
pub(crate) fn eval_addition<F: Field, P: PackedField<Scalar = F>>(
@ -26,12 +26,12 @@ pub(crate) fn eval_addition<F: Field, P: PackedField<Scalar = F>>(
yield_constr: &mut ConstraintConsumer<P>,
) {
let is_add = local_values[IS_ADD];
let in_1 = local_values[COL_ADD_INPUT_1];
let in_2 = local_values[COL_ADD_INPUT_2];
let in_3 = local_values[COL_ADD_INPUT_3];
let out_1 = local_values[COL_ADD_OUTPUT_1];
let out_2 = local_values[COL_ADD_OUTPUT_2];
let out_3 = local_values[COL_ADD_OUTPUT_3];
let in_1 = local_values[COL_ADD_INPUT_0];
let in_2 = local_values[COL_ADD_INPUT_1];
let in_3 = local_values[COL_ADD_INPUT_2];
let out_1 = local_values[COL_ADD_OUTPUT_0];
let out_2 = local_values[COL_ADD_OUTPUT_1];
let out_3 = local_values[COL_ADD_OUTPUT_2];
let weight_2 = F::from_canonical_u64(1 << 16);
let weight_3 = F::from_canonical_u64(1 << 32);
@ -41,7 +41,7 @@ pub(crate) fn eval_addition<F: Field, P: PackedField<Scalar = F>>(
let computed_out = in_1 + in_2 + in_3;
yield_constr.constraint_wrapping(is_add * (out - computed_out));
yield_constr.constraint(is_add * (out - computed_out));
}
pub(crate) fn eval_addition_recursively<F: RichField + Extendable<D>, const D: usize>(
@ -50,12 +50,12 @@ pub(crate) fn eval_addition_recursively<F: RichField + Extendable<D>, const D: u
yield_constr: &mut RecursiveConstraintConsumer<F, D>,
) {
let is_add = local_values[IS_ADD];
let in_1 = local_values[COL_ADD_INPUT_1];
let in_2 = local_values[COL_ADD_INPUT_2];
let in_3 = local_values[COL_ADD_INPUT_3];
let out_1 = local_values[COL_ADD_OUTPUT_1];
let out_2 = local_values[COL_ADD_OUTPUT_2];
let out_3 = local_values[COL_ADD_OUTPUT_3];
let in_1 = local_values[COL_ADD_INPUT_0];
let in_2 = local_values[COL_ADD_INPUT_1];
let in_3 = local_values[COL_ADD_INPUT_2];
let out_1 = local_values[COL_ADD_OUTPUT_0];
let out_2 = local_values[COL_ADD_OUTPUT_1];
let out_3 = local_values[COL_ADD_OUTPUT_2];
let limb_base = builder.constant(F::from_canonical_u64(1 << 16));
// Note that this can't overflow. Since each output limb has been range checked as 16-bits,
@ -66,5 +66,5 @@ pub(crate) fn eval_addition_recursively<F: RichField + Extendable<D>, const D: u
let diff = builder.sub_extension(out, computed_out);
let filtered_diff = builder.mul_extension(is_add, diff);
yield_constr.constraint_wrapping(builder, filtered_diff);
yield_constr.constraint(builder, filtered_diff);
}
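
The limb bookkeeping is easier to see outside the constraint system. Assuming, as the three 16-bit output limbs suggest, that the inputs are range-checked to 32 bits elsewhere, the generator splits `in_1 + in_2 + in_3` (at most 34 bits) into 16-bit limbs and the constraint recombines them with weights `2^16` and `2^32`. A plain-u64 sketch:

fn generate_limbs(in_1: u64, in_2: u64, in_3: u64) -> [u64; 3] {
    let output = in_1 + in_2 + in_3; // at most 34 bits for 32-bit inputs
    [
        output & 0xFFFF,
        (output >> 16) & 0xFFFF,
        (output >> 32) & 0xFFFF,
    ]
}

fn recombine(limbs: [u64; 3]) -> u64 {
    // Mirrors the constraint: out_1 + 2^16 * out_2 + 2^32 * out_3 == in_1 + in_2 + in_3.
    limbs[0] + (limbs[1] << 16) + (limbs[2] << 32)
}

fn main() {
    let (a, b, c) = (u32::MAX as u64, u32::MAX as u64, 123);
    assert_eq!(recombine(generate_limbs(a, b, c)), a + b + c);
}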

View File

@ -0,0 +1,109 @@
//! Helper methods for checking that a value is canonical, i.e. is less than `|F|`.
//!
//! See https://hackmd.io/NC-yRmmtRQSvToTHb96e8Q#Checking-element-validity
use plonky2::field::extension_field::Extendable;
use plonky2::field::field_types::Field;
use plonky2::field::packed_field::PackedField;
use plonky2::hash::hash_types::RichField;
use plonky2::iop::ext_target::ExtensionTarget;
use plonky2::plonk::circuit_builder::CircuitBuilder;
use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
/// Computes the helper value used in the is-canonical check.
pub(crate) fn compute_canonical_inv<F: Field>(value_to_check: u64) -> F {
let value_hi_32 = (value_to_check >> 32) as u32;
if value_hi_32 == u32::MAX {
debug_assert_eq!(value_to_check as u32, 0, "Value was not canonical.");
// In this case it doesn't matter what we put for the purported inverse value. The
// constraint containing this value will get multiplied by the low u32 limb, which will be
// zero, satisfying the constraint regardless of what we put here.
F::ZERO
} else {
F::from_canonical_u32(u32::MAX - value_hi_32).inverse()
}
}
/// Adds constraints to require that a list of four `u16`s, in little-endian order, represent a
/// canonical field element, i.e. that their combined value is less than `|F|`. Returns their
/// combined value.
pub(crate) fn combine_u16s_check_canonical<F: Field, P: PackedField<Scalar = F>>(
limb_0_u16: P,
limb_1_u16: P,
limb_2_u16: P,
limb_3_u16: P,
inverse: P,
yield_constr: &mut ConstraintConsumer<P>,
) -> P {
let base = F::from_canonical_u32(1 << 16);
let limb_0_u32 = limb_0_u16 + limb_1_u16 * base;
let limb_1_u32 = limb_2_u16 + limb_3_u16 * base;
combine_u32s_check_canonical(limb_0_u32, limb_1_u32, inverse, yield_constr)
}
/// Adds constraints to require that a list of four `u16`s, in little-endian order, represent a
/// canonical field element, i.e. that their combined value is less than `|F|`. Returns their
/// combined value.
pub(crate) fn combine_u16s_check_canonical_circuit<F: RichField + Extendable<D>, const D: usize>(
builder: &mut CircuitBuilder<F, D>,
limb_0_u16: ExtensionTarget<D>,
limb_1_u16: ExtensionTarget<D>,
limb_2_u16: ExtensionTarget<D>,
limb_3_u16: ExtensionTarget<D>,
inverse: ExtensionTarget<D>,
yield_constr: &mut RecursiveConstraintConsumer<F, D>,
) -> ExtensionTarget<D> {
let base = F::from_canonical_u32(1 << 16);
let limb_0_u32 = builder.mul_const_add_extension(base, limb_1_u16, limb_0_u16);
let limb_1_u32 = builder.mul_const_add_extension(base, limb_3_u16, limb_2_u16);
combine_u32s_check_canonical_circuit(builder, limb_0_u32, limb_1_u32, inverse, yield_constr)
}
/// Adds constraints to require that a pair of `u32`s, in little-endian order, represent a canonical
/// field element, i.e. that their combined value is less than `|F|`. Returns their combined value.
pub(crate) fn combine_u32s_check_canonical<F: Field, P: PackedField<Scalar = F>>(
limb_0_u32: P,
limb_1_u32: P,
inverse: P,
yield_constr: &mut ConstraintConsumer<P>,
) -> P {
let u32_max = P::from(F::from_canonical_u32(u32::MAX));
// This is zero if and only if the high limb is `u32::MAX`.
let diff = u32_max - limb_1_u32;
// If this is zero, the diff is invertible, so the high limb is not `u32::MAX`.
let hi_not_max = inverse * diff - F::ONE;
// If this is zero, either the high limb is not `u32::MAX`, or the low limb is zero.
let hi_not_max_or_lo_zero = hi_not_max * limb_0_u32;
yield_constr.constraint(hi_not_max_or_lo_zero);
// Return the combined value.
limb_0_u32 + limb_1_u32 * F::from_canonical_u64(1 << 32)
}
/// Adds constraints to require that a pair of `u32`s, in little-endian order, represent a canonical
/// field element, i.e. that their combined value is less than `|F|`. Returns their combined value.
pub(crate) fn combine_u32s_check_canonical_circuit<F: RichField + Extendable<D>, const D: usize>(
builder: &mut CircuitBuilder<F, D>,
limb_0_u32: ExtensionTarget<D>,
limb_1_u32: ExtensionTarget<D>,
inverse: ExtensionTarget<D>,
yield_constr: &mut RecursiveConstraintConsumer<F, D>,
) -> ExtensionTarget<D> {
let one = builder.one_extension();
let u32_max = builder.constant_extension(F::Extension::from_canonical_u32(u32::MAX));
// This is zero if and only if the high limb is `u32::MAX`.
let diff = builder.sub_extension(u32_max, limb_1_u32);
// If this is zero, the diff is invertible, so the high limb is not `u32::MAX`.
let hi_not_max = builder.mul_sub_extension(inverse, diff, one);
// If this is zero, either the high limb is not `u32::MAX`, or the low limb is zero.
let hi_not_max_or_lo_zero = builder.mul_extension(hi_not_max, limb_0_u32);
yield_constr.constraint(builder, hi_not_max_or_lo_zero);
// Return the combined value.
builder.mul_const_add_extension(F::from_canonical_u64(1 << 32), limb_1_u32, limb_0_u32)
}
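
Stated directly: for a modulus of the Goldilocks shape `p = 2^64 - 2^32 + 1` (the field these limb widths suggest, though this file does not name it), a 64-bit value is canonical exactly when its high 32 bits differ from `u32::MAX` or its low 32 bits are zero; that is the predicate the `inverse`-assisted constraints above enforce without a full range decomposition. A self-contained check of the predicate:

// Canonicity predicate for p = 2^64 - 2^32 + 1: value < p iff the high 32 bits differ
// from u32::MAX, or the low 32 bits are zero.
const P: u128 = (1u128 << 64) - (1u128 << 32) + 1;

fn is_canonical(value: u64) -> bool {
    let hi = (value >> 32) as u32;
    let lo = value as u32;
    hi != u32::MAX || lo == 0
}

fn main() {
    let samples = [
        0u64,
        u32::MAX as u64,               // hi = 0
        (u32::MAX as u64) << 32,       // hi = u32::MAX, lo = 0: exactly p - 1
        ((u32::MAX as u64) << 32) + 1, // equals p: not canonical
        u64::MAX,
    ];
    for v in samples {
        assert_eq!(is_canonical(v), (v as u128) < P);
    }
}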

View File

@ -6,7 +6,7 @@ use plonky2::iop::ext_target::ExtensionTarget;
use plonky2::plonk::circuit_builder::CircuitBuilder;
use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
use crate::registers::arithmetic::*;
use crate::registers::alu::*;
use crate::registers::NUM_COLUMNS;
pub(crate) fn generate_division<F: PrimeField64>(values: &mut [F; NUM_COLUMNS]) {

View File

@ -7,54 +7,53 @@ use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsume
use starky::vars::StarkEvaluationTargets;
use starky::vars::StarkEvaluationVars;
use crate::arithmetic::addition::{eval_addition, eval_addition_recursively, generate_addition};
use crate::arithmetic::division::{eval_division, eval_division_recursively, generate_division};
use crate::arithmetic::multiplication::{
eval_multiplication, eval_multiplication_recursively, generate_multiplication,
};
use crate::arithmetic::subtraction::{
use crate::alu::addition::{eval_addition, eval_addition_recursively, generate_addition};
use crate::alu::division::{eval_division, eval_division_recursively, generate_division};
use crate::alu::mul_add::{eval_mul_add, eval_mul_add_recursively, generate_mul_add};
use crate::alu::subtraction::{
eval_subtraction, eval_subtraction_recursively, generate_subtraction,
};
use crate::public_input_layout::NUM_PUBLIC_INPUTS;
use crate::registers::arithmetic::*;
use crate::registers::alu::*;
use crate::registers::NUM_COLUMNS;
mod addition;
mod canonical;
mod division;
mod multiplication;
mod mul_add;
mod subtraction;
pub(crate) fn generate_arithmetic_unit<F: PrimeField64>(values: &mut [F; NUM_COLUMNS]) {
pub(crate) fn generate_alu<F: PrimeField64>(values: &mut [F; NUM_COLUMNS]) {
if values[IS_ADD].is_one() {
generate_addition(values);
} else if values[IS_SUB].is_one() {
generate_subtraction(values);
} else if values[IS_MUL].is_one() {
generate_multiplication(values);
} else if values[IS_MUL_ADD].is_one() {
generate_mul_add(values);
} else if values[IS_DIV].is_one() {
generate_division(values);
}
}
pub(crate) fn eval_arithmetic_unit<F: Field, P: PackedField<Scalar = F>>(
pub(crate) fn eval_alu<F: Field, P: PackedField<Scalar = F>>(
vars: StarkEvaluationVars<F, P, NUM_COLUMNS, NUM_PUBLIC_INPUTS>,
yield_constr: &mut ConstraintConsumer<P>,
) {
let local_values = &vars.local_values;
// Check that the operation flag values are binary.
for col in [IS_ADD, IS_SUB, IS_MUL, IS_DIV] {
for col in [IS_ADD, IS_SUB, IS_MUL_ADD, IS_DIV] {
let val = local_values[col];
yield_constr.constraint_wrapping(val * val - val);
yield_constr.constraint(val * val - val);
}
eval_addition(local_values, yield_constr);
eval_subtraction(local_values, yield_constr);
eval_multiplication(local_values, yield_constr);
eval_mul_add(local_values, yield_constr);
eval_division(local_values, yield_constr);
}
pub(crate) fn eval_arithmetic_unit_recursively<F: RichField + Extendable<D>, const D: usize>(
pub(crate) fn eval_alu_recursively<F: RichField + Extendable<D>, const D: usize>(
builder: &mut CircuitBuilder<F, D>,
vars: StarkEvaluationTargets<D, NUM_COLUMNS, NUM_PUBLIC_INPUTS>,
yield_constr: &mut RecursiveConstraintConsumer<F, D>,
@ -62,14 +61,14 @@ pub(crate) fn eval_arithmetic_unit_recursively<F: RichField + Extendable<D>, con
let local_values = &vars.local_values;
// Check that the operation flag values are binary.
for col in [IS_ADD, IS_SUB, IS_MUL, IS_DIV] {
for col in [IS_ADD, IS_SUB, IS_MUL_ADD, IS_DIV] {
let val = local_values[col];
let constraint = builder.mul_sub_extension(val, val, val);
yield_constr.constraint_wrapping(builder, constraint);
yield_constr.constraint(builder, constraint);
}
eval_addition_recursively(builder, local_values, yield_constr);
eval_subtraction_recursively(builder, local_values, yield_constr);
eval_multiplication_recursively(builder, local_values, yield_constr);
eval_mul_add_recursively(builder, local_values, yield_constr);
eval_division_recursively(builder, local_values, yield_constr);
}

View File

@ -0,0 +1,91 @@
use plonky2::field::extension_field::Extendable;
use plonky2::field::field_types::{Field, PrimeField64};
use plonky2::field::packed_field::PackedField;
use plonky2::hash::hash_types::RichField;
use plonky2::iop::ext_target::ExtensionTarget;
use plonky2::plonk::circuit_builder::CircuitBuilder;
use plonky2_util::assume;
use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
use crate::alu::canonical::*;
use crate::registers::alu::*;
use crate::registers::NUM_COLUMNS;
pub(crate) fn generate_mul_add<F: PrimeField64>(values: &mut [F; NUM_COLUMNS]) {
let factor_0 = values[COL_MUL_ADD_FACTOR_0].to_canonical_u64();
let factor_1 = values[COL_MUL_ADD_FACTOR_1].to_canonical_u64();
let addend = values[COL_MUL_ADD_ADDEND].to_canonical_u64();
// Let the compiler know that each input must fit in 32 bits.
assume(factor_0 <= u32::MAX as u64);
assume(factor_1 <= u32::MAX as u64);
assume(addend <= u32::MAX as u64);
let output = factor_0 * factor_1 + addend;
// An advice value used to help verify that the limbs represent a canonical field element.
values[COL_MUL_ADD_RESULT_CANONICAL_INV] = compute_canonical_inv(output);
values[COL_MUL_ADD_OUTPUT_0] = F::from_canonical_u16(output as u16);
values[COL_MUL_ADD_OUTPUT_1] = F::from_canonical_u16((output >> 16) as u16);
values[COL_MUL_ADD_OUTPUT_2] = F::from_canonical_u16((output >> 32) as u16);
values[COL_MUL_ADD_OUTPUT_3] = F::from_canonical_u16((output >> 48) as u16);
}
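A hedged side note on why the unchecked u64 multiply-add above is safe (helper name hypothetical, not part of the diff): with all three inputs at most 2^32 - 1, the worst case is (2^32 - 1)^2 + (2^32 - 1) = 2^64 - 2^32, which fits in a u64 and, assuming a Goldilocks-style order 2^64 - 2^32 + 1, is also always a canonical field element.
fn mul_add_u32_as_u64(factor_0: u32, factor_1: u32, addend: u32) -> u64 {
    // Checked arithmetic never trips: (2^32 - 1)^2 + (2^32 - 1) = 2^64 - 2^32 < 2^64.
    (factor_0 as u64)
        .checked_mul(factor_1 as u64)
        .and_then(|product| product.checked_add(addend as u64))
        .expect("u32 * u32 + u32 always fits in a u64")
}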
pub(crate) fn eval_mul_add<F: Field, P: PackedField<Scalar = F>>(
local_values: &[P; NUM_COLUMNS],
yield_constr: &mut ConstraintConsumer<P>,
) {
let is_mul_add = local_values[IS_MUL_ADD];
let factor_0 = local_values[COL_MUL_ADD_FACTOR_0];
let factor_1 = local_values[COL_MUL_ADD_FACTOR_1];
let addend = local_values[COL_MUL_ADD_ADDEND];
let output_0 = local_values[COL_MUL_ADD_OUTPUT_0];
let output_1 = local_values[COL_MUL_ADD_OUTPUT_1];
let output_2 = local_values[COL_MUL_ADD_OUTPUT_2];
let output_3 = local_values[COL_MUL_ADD_OUTPUT_3];
let result_canonical_inv = local_values[COL_MUL_ADD_RESULT_CANONICAL_INV];
let computed_output = factor_0 * factor_1 + addend;
// TODO: Needs to be filtered by IS_MUL_ADD.
let output = combine_u16s_check_canonical(
output_0,
output_1,
output_2,
output_3,
result_canonical_inv,
yield_constr,
);
yield_constr.constraint(is_mul_add * (computed_output - output));
}
pub(crate) fn eval_mul_add_recursively<F: RichField + Extendable<D>, const D: usize>(
builder: &mut CircuitBuilder<F, D>,
local_values: &[ExtensionTarget<D>; NUM_COLUMNS],
yield_constr: &mut RecursiveConstraintConsumer<F, D>,
) {
let is_mul_add = local_values[IS_MUL_ADD];
let factor_0 = local_values[COL_MUL_ADD_FACTOR_0];
let factor_1 = local_values[COL_MUL_ADD_FACTOR_1];
let addend = local_values[COL_MUL_ADD_ADDEND];
let output_0 = local_values[COL_MUL_ADD_OUTPUT_0];
let output_1 = local_values[COL_MUL_ADD_OUTPUT_1];
let output_2 = local_values[COL_MUL_ADD_OUTPUT_2];
let output_3 = local_values[COL_MUL_ADD_OUTPUT_3];
let result_canonical_inv = local_values[COL_MUL_ADD_RESULT_CANONICAL_INV];
let computed_output = builder.mul_add_extension(factor_0, factor_1, addend);
// TODO: Needs to be filtered by IS_MUL_ADD.
let output = combine_u16s_check_canonical_circuit(
builder,
output_0,
output_1,
output_2,
output_3,
result_canonical_inv,
yield_constr,
);
let diff = builder.sub_extension(computed_output, output);
let filtered_diff = builder.mul_extension(is_mul_add, diff);
yield_constr.constraint(builder, filtered_diff);
}

View File

@ -0,0 +1,78 @@
use plonky2::field::extension_field::Extendable;
use plonky2::field::field_types::{Field, PrimeField64};
use plonky2::field::packed_field::PackedField;
use plonky2::hash::hash_types::RichField;
use plonky2::iop::ext_target::ExtensionTarget;
use plonky2::plonk::circuit_builder::CircuitBuilder;
use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
use crate::registers::alu::*;
use crate::registers::NUM_COLUMNS;
pub(crate) fn generate_subtraction<F: PrimeField64>(values: &mut [F; NUM_COLUMNS]) {
let in_1 = values[COL_SUB_INPUT_0].to_canonical_u64() as u32;
let in_2 = values[COL_SUB_INPUT_1].to_canonical_u64() as u32;
// in_1 - in_2 == diff - br*2^32
let (diff, br) = in_1.overflowing_sub(in_2);
let diff_1 = F::from_canonical_u16(diff as u16);
let diff_2 = F::from_canonical_u16((diff >> 16) as u16);
values[COL_SUB_OUTPUT_0] = diff_1;
values[COL_SUB_OUTPUT_1] = diff_2;
values[COL_SUB_OUTPUT_BORROW] = F::from_canonical_u16(br as u16);
}
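A small native check of the borrow identity noted above, in_1 - in_2 == diff - br * 2^32; a hedged sketch, not part of the diff (helper name hypothetical):
fn check_borrow_identity(in_1: u32, in_2: u32) {
    let (diff, br) = in_1.overflowing_sub(in_2);
    // Compare in a wide signed type so nothing wraps.
    let lhs = in_1 as i128 - in_2 as i128;
    let rhs = diff as i128 - ((br as i128) << 32);
    assert_eq!(lhs, rhs);
}
For example, check_borrow_identity(3, 5) passes with diff = 0xFFFF_FFFE and br = 1.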
pub(crate) fn eval_subtraction<F: Field, P: PackedField<Scalar = F>>(
local_values: &[P; NUM_COLUMNS],
yield_constr: &mut ConstraintConsumer<P>,
) {
let is_sub = local_values[IS_SUB];
let in_1 = local_values[COL_SUB_INPUT_0];
let in_2 = local_values[COL_SUB_INPUT_1];
let out_1 = local_values[COL_SUB_OUTPUT_0];
let out_2 = local_values[COL_SUB_OUTPUT_1];
let out_br = local_values[COL_SUB_OUTPUT_BORROW];
let base = F::from_canonical_u64(1 << 16);
let base_sqr = F::from_canonical_u64(1 << 32);
let out_br = out_br * base_sqr;
let lhs = (out_br + in_1) - in_2;
let rhs = out_1 + out_2 * base;
yield_constr.constraint(is_sub * (lhs - rhs));
// We don't need to check that out_br is in {0, 1} because it's
// checked by boolean::col_bit(0) in the ALU.
}
pub(crate) fn eval_subtraction_recursively<F: RichField + Extendable<D>, const D: usize>(
builder: &mut CircuitBuilder<F, D>,
local_values: &[ExtensionTarget<D>; NUM_COLUMNS],
yield_constr: &mut RecursiveConstraintConsumer<F, D>,
) {
let is_sub = local_values[IS_SUB];
let in_1 = local_values[COL_SUB_INPUT_0];
let in_2 = local_values[COL_SUB_INPUT_1];
let out_1 = local_values[COL_SUB_OUTPUT_0];
let out_2 = local_values[COL_SUB_OUTPUT_1];
let out_br = local_values[COL_SUB_OUTPUT_BORROW];
let base = builder.constant_extension(F::Extension::from_canonical_u64(1 << 16));
let base_sqr = builder.constant_extension(F::Extension::from_canonical_u64(1 << 32));
// lhs = (out_br * 2^32 + in_1) - in_2
let scaled_br = builder.mul_extension(out_br, base_sqr);
let lhs = builder.add_extension(scaled_br, in_1);
let lhs = builder.sub_extension(lhs, in_2);
// rhs = out_1 + base * out_2
let rhs = builder.mul_add_extension(out_2, base, out_1);
// filtered_diff = is_sub * (lhs - rhs)
let diff = builder.sub_extension(lhs, rhs);
let filtered_diff = builder.mul_extension(is_sub, diff);
yield_constr.constraint(builder, filtered_diff);
}

View File

@ -1,31 +0,0 @@
use plonky2::field::extension_field::Extendable;
use plonky2::field::field_types::{Field, PrimeField64};
use plonky2::field::packed_field::PackedField;
use plonky2::hash::hash_types::RichField;
use plonky2::iop::ext_target::ExtensionTarget;
use plonky2::plonk::circuit_builder::CircuitBuilder;
use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
use crate::registers::arithmetic::*;
use crate::registers::NUM_COLUMNS;
pub(crate) fn generate_multiplication<F: PrimeField64>(values: &mut [F; NUM_COLUMNS]) {
// TODO
}
pub(crate) fn eval_multiplication<F: Field, P: PackedField<Scalar = F>>(
local_values: &[P; NUM_COLUMNS],
yield_constr: &mut ConstraintConsumer<P>,
) {
let is_mul = local_values[IS_MUL];
// TODO
}
pub(crate) fn eval_multiplication_recursively<F: RichField + Extendable<D>, const D: usize>(
builder: &mut CircuitBuilder<F, D>,
local_values: &[ExtensionTarget<D>; NUM_COLUMNS],
yield_constr: &mut RecursiveConstraintConsumer<F, D>,
) {
let is_mul = local_values[IS_MUL];
// TODO
}

View File

@ -1,31 +0,0 @@
use plonky2::field::extension_field::Extendable;
use plonky2::field::field_types::{Field, PrimeField64};
use plonky2::field::packed_field::PackedField;
use plonky2::hash::hash_types::RichField;
use plonky2::iop::ext_target::ExtensionTarget;
use plonky2::plonk::circuit_builder::CircuitBuilder;
use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
use crate::registers::arithmetic::*;
use crate::registers::NUM_COLUMNS;
pub(crate) fn generate_subtraction<F: PrimeField64>(values: &mut [F; NUM_COLUMNS]) {
// TODO
}
pub(crate) fn eval_subtraction<F: Field, P: PackedField<Scalar = F>>(
local_values: &[P; NUM_COLUMNS],
yield_constr: &mut ConstraintConsumer<P>,
) {
let is_sub = local_values[IS_SUB];
// TODO
}
pub(crate) fn eval_subtraction_recursively<F: RichField + Extendable<D>, const D: usize>(
builder: &mut CircuitBuilder<F, D>,
local_values: &[ExtensionTarget<D>; NUM_COLUMNS],
yield_constr: &mut RecursiveConstraintConsumer<F, D>,
) {
let is_sub = local_values[IS_SUB];
// TODO
}

View File

@ -49,7 +49,7 @@ pub(crate) fn eval_core_registers<F: Field, P: PackedField<Scalar = F>>(
let next_clock = vars.next_values[COL_CLOCK];
let delta_clock = next_clock - local_clock;
yield_constr.constraint_first_row(local_clock);
yield_constr.constraint(delta_clock - F::ONE);
yield_constr.constraint_transition(delta_clock - F::ONE);
// The 16-bit table must start with 0, end with 2^16 - 1, and increment by 0 or 1.
let local_range_16 = vars.local_values[COL_RANGE_16];
@ -57,7 +57,7 @@ pub(crate) fn eval_core_registers<F: Field, P: PackedField<Scalar = F>>(
let delta_range_16 = next_range_16 - local_range_16;
yield_constr.constraint_first_row(local_range_16);
yield_constr.constraint_last_row(local_range_16 - F::from_canonical_u64((1 << 16) - 1));
yield_constr.constraint(delta_range_16 * delta_range_16 - delta_range_16);
yield_constr.constraint_transition(delta_range_16 * delta_range_16 - delta_range_16);
// TODO constraints for stack etc.
}
@ -77,7 +77,7 @@ pub(crate) fn eval_core_registers_recursively<F: RichField + Extendable<D>, cons
let delta_clock = builder.sub_extension(next_clock, local_clock);
yield_constr.constraint_first_row(builder, local_clock);
let constraint = builder.sub_extension(delta_clock, one_ext);
yield_constr.constraint(builder, constraint);
yield_constr.constraint_transition(builder, constraint);
// The 16-bit table must start with 0, end with 2^16 - 1, and increment by 0 or 1.
let local_range_16 = vars.local_values[COL_RANGE_16];
@ -87,7 +87,7 @@ pub(crate) fn eval_core_registers_recursively<F: RichField + Extendable<D>, cons
let constraint = builder.sub_extension(local_range_16, max_u16_ext);
yield_constr.constraint_last_row(builder, constraint);
let constraint = builder.mul_add_extension(delta_range_16, delta_range_16, delta_range_16);
yield_constr.constraint(builder, constraint);
yield_constr.constraint_transition(builder, constraint);
// TODO constraints for stack etc.
}
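For intuition, a hedged sketch (not part of the diff) of a COL_RANGE_16 column that satisfies the three constraints above for a trace of num_rows >= 2^16 rows: start at 0, increment by 1 up to 2^16 - 1, then repeat the final value.
fn range_16_column(num_rows: usize) -> Vec<u64> {
    // First row is 0, last row is 2^16 - 1 (given num_rows >= 2^16),
    // and every transition increments by 0 or 1.
    (0..num_rows).map(|i| (i as u64).min((1 << 16) - 1)).collect()
}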

View File

@ -2,8 +2,9 @@
#![allow(dead_code)]
#![allow(unused_variables)]
mod arithmetic;
mod alu;
mod core_registers;
pub mod lookup;
mod memory;
mod permutation_unit;
mod public_input_layout;

system_zero/src/lookup.rs Normal file
View File

@ -0,0 +1,147 @@
//! Implementation of the Halo2 lookup argument.
//!
//! References:
//! - https://zcash.github.io/halo2/design/proving-system/lookup.html
//! - https://www.youtube.com/watch?v=YlTt12s7vGE&t=5237s
use std::cmp::Ordering;
use itertools::Itertools;
use plonky2::field::extension_field::Extendable;
use plonky2::field::field_types::{Field, PrimeField64};
use plonky2::field::packed_field::PackedField;
use plonky2::hash::hash_types::RichField;
use plonky2::plonk::circuit_builder::CircuitBuilder;
use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
use starky::vars::StarkEvaluationTargets;
use starky::vars::StarkEvaluationVars;
use crate::public_input_layout::NUM_PUBLIC_INPUTS;
use crate::registers::lookup::*;
use crate::registers::NUM_COLUMNS;
pub(crate) fn generate_lookups<F: PrimeField64>(trace_cols: &mut [Vec<F>]) {
for i in 0..NUM_LOOKUPS {
let inputs = &trace_cols[col_input(i)];
let table = &trace_cols[col_table(i)];
let (permuted_inputs, permuted_table) = permuted_cols(inputs, table);
trace_cols[col_permuted_input(i)] = permuted_inputs;
trace_cols[col_permuted_table(i)] = permuted_table;
}
}
/// Given an input column and a table column, generate the permuted input and permuted table columns
/// used in the Halo2 permutation argument.
pub fn permuted_cols<F: PrimeField64>(inputs: &[F], table: &[F]) -> (Vec<F>, Vec<F>) {
let n = inputs.len();
// The permuted inputs do not have to be ordered, but we found that sorting was faster than
// hash-based grouping. We also sort the table, as this helps us identify "unused" table
// elements efficiently.
// To compare elements, e.g. for sorting, we first need them in canonical form. It would be
// wasteful to canonicalize in each comparison, as a single element may be involved in many
// comparisons. So we will canonicalize once upfront, then use `to_noncanonical_u64` when
// comparing elements.
let sorted_inputs = inputs
.iter()
.map(|x| x.to_canonical())
.sorted_unstable_by_key(|x| x.to_noncanonical_u64())
.collect_vec();
let sorted_table = table
.iter()
.map(|x| x.to_canonical())
.sorted_unstable_by_key(|x| x.to_noncanonical_u64())
.collect_vec();
let mut unused_table_inds = Vec::with_capacity(n);
let mut unused_table_vals = Vec::with_capacity(n);
let mut permuted_table = vec![F::ZERO; n];
let mut i = 0;
let mut j = 0;
while (j < n) && (i < n) {
let input_val = sorted_inputs[i].to_noncanonical_u64();
let table_val = sorted_table[j].to_noncanonical_u64();
match input_val.cmp(&table_val) {
Ordering::Greater => {
unused_table_vals.push(sorted_table[j]);
j += 1;
}
Ordering::Less => {
if let Some(x) = unused_table_vals.pop() {
permuted_table[i] = x;
} else {
unused_table_inds.push(i);
}
i += 1;
}
Ordering::Equal => {
permuted_table[i] = sorted_table[j];
i += 1;
j += 1;
}
}
}
#[allow(clippy::needless_range_loop)] // indexing is just more natural here
for jj in j..n {
unused_table_vals.push(sorted_table[jj]);
}
for ii in i..n {
unused_table_inds.push(ii);
}
for (ind, val) in unused_table_inds.into_iter().zip_eq(unused_table_vals) {
permuted_table[ind] = val;
}
(sorted_inputs, permuted_table)
}
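A hedged usage sketch for permuted_cols (the concrete values and the GoldilocksField import are assumptions, not part of the diff), illustrating the property the constraints below rely on: each permuted input either matches the permuted table value in its row or repeats the previous permuted input.
#[cfg(test)]
mod permuted_cols_example {
    use plonky2::field::field_types::Field;
    use plonky2::field::goldilocks_field::GoldilocksField as F;

    use super::permuted_cols;

    #[test]
    fn small_example() {
        let inputs: Vec<F> = [2u64, 1, 2, 5].map(F::from_canonical_u64).to_vec();
        let table: Vec<F> = [1u64, 2, 3, 5].map(F::from_canonical_u64).to_vec();
        let (perm_inputs, perm_table) = permuted_cols(&inputs, &table);
        for i in 0..inputs.len() {
            // Each permuted input matches its table entry or repeats the previous input.
            assert!(perm_inputs[i] == perm_table[i] || (i > 0 && perm_inputs[i] == perm_inputs[i - 1]));
        }
    }
}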
pub(crate) fn eval_lookups<F: Field, P: PackedField<Scalar = F>>(
vars: StarkEvaluationVars<F, P, NUM_COLUMNS, NUM_PUBLIC_INPUTS>,
yield_constr: &mut ConstraintConsumer<P>,
) {
for i in 0..NUM_LOOKUPS {
let local_perm_input = vars.local_values[col_permuted_input(i)];
let next_perm_table = vars.next_values[col_permuted_table(i)];
let next_perm_input = vars.next_values[col_permuted_input(i)];
// A "vertical" diff between the local and next permuted inputs.
let diff_input_prev = next_perm_input - local_perm_input;
// A "horizontal" diff between the next permuted input and permuted table value.
let diff_input_table = next_perm_input - next_perm_table;
yield_constr.constraint(diff_input_prev * diff_input_table);
// This is actually constraining the first row, as per the spec, since `diff_input_table`
// is a diff of the next row's values. In the context of `constraint_last_row`, the next
// row is the first row.
yield_constr.constraint_last_row(diff_input_table);
}
}
pub(crate) fn eval_lookups_recursively<F: RichField + Extendable<D>, const D: usize>(
builder: &mut CircuitBuilder<F, D>,
vars: StarkEvaluationTargets<D, NUM_COLUMNS, NUM_PUBLIC_INPUTS>,
yield_constr: &mut RecursiveConstraintConsumer<F, D>,
) {
for i in 0..NUM_LOOKUPS {
let local_perm_input = vars.local_values[col_permuted_input(i)];
let next_perm_table = vars.next_values[col_permuted_table(i)];
let next_perm_input = vars.next_values[col_permuted_input(i)];
// A "vertical" diff between the local and next permuted inputs.
let diff_input_prev = builder.sub_extension(next_perm_input, local_perm_input);
// A "horizontal" diff between the next permuted input and permuted table value.
let diff_input_table = builder.sub_extension(next_perm_input, next_perm_table);
let diff_product = builder.mul_extension(diff_input_prev, diff_input_table);
yield_constr.constraint(builder, diff_product);
// This is actually constraining the first row, as per the spec, since `diff_input_table`
// is a diff of the next row's values. In the context of `constraint_last_row`, the next
// row is the first row.
yield_constr.constraint_last_row(builder, diff_input_table);
}
}
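Both evaluators above enforce the same per-row predicate over the permuted input column A' and permuted table column S'; a hedged native restatement (not part of the diff):
// For every transition row i >= 1, the product constraint
// (A'[i] - A'[i-1]) * (A'[i] - S'[i]) = 0 is equivalent, over a field, to:
fn lookup_row_ok(prev_perm_input: u64, perm_input: u64, perm_table: u64) -> bool {
    perm_input == prev_perm_input || perm_input == perm_table
}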

View File

@ -127,8 +127,7 @@ pub(crate) fn eval_permutation_unit<F, FE, P, const D: usize>(
for i in 0..SPONGE_WIDTH {
let state_cubed = state[i] * state[i].square();
yield_constr
.constraint_wrapping(state_cubed - local_values[col_full_first_mid_sbox(r, i)]);
yield_constr.constraint(state_cubed - local_values[col_full_first_mid_sbox(r, i)]);
let state_cubed = local_values[col_full_first_mid_sbox(r, i)];
state[i] *= state_cubed.square(); // Form state ** 7.
}
@ -136,8 +135,7 @@ pub(crate) fn eval_permutation_unit<F, FE, P, const D: usize>(
state = mds_layer(state);
for i in 0..SPONGE_WIDTH {
yield_constr
.constraint_wrapping(state[i] - local_values[col_full_first_after_mds(r, i)]);
yield_constr.constraint(state[i] - local_values[col_full_first_after_mds(r, i)]);
state[i] = local_values[col_full_first_after_mds(r, i)];
}
}
@ -146,10 +144,10 @@ pub(crate) fn eval_permutation_unit<F, FE, P, const D: usize>(
state = constant_layer(state, HALF_N_FULL_ROUNDS + r);
let state0_cubed = state[0] * state[0].square();
yield_constr.constraint_wrapping(state0_cubed - local_values[col_partial_mid_sbox(r)]);
yield_constr.constraint(state0_cubed - local_values[col_partial_mid_sbox(r)]);
let state0_cubed = local_values[col_partial_mid_sbox(r)];
state[0] *= state0_cubed.square(); // Form state ** 7.
yield_constr.constraint_wrapping(state[0] - local_values[col_partial_after_sbox(r)]);
yield_constr.constraint(state[0] - local_values[col_partial_after_sbox(r)]);
state[0] = local_values[col_partial_after_sbox(r)];
state = mds_layer(state);
@ -160,8 +158,7 @@ pub(crate) fn eval_permutation_unit<F, FE, P, const D: usize>(
for i in 0..SPONGE_WIDTH {
let state_cubed = state[i] * state[i].square();
yield_constr
.constraint_wrapping(state_cubed - local_values[col_full_second_mid_sbox(r, i)]);
yield_constr.constraint(state_cubed - local_values[col_full_second_mid_sbox(r, i)]);
let state_cubed = local_values[col_full_second_mid_sbox(r, i)];
state[i] *= state_cubed.square(); // Form state ** 7.
}
@ -169,8 +166,7 @@ pub(crate) fn eval_permutation_unit<F, FE, P, const D: usize>(
state = mds_layer(state);
for i in 0..SPONGE_WIDTH {
yield_constr
.constraint_wrapping(state[i] - local_values[col_full_second_after_mds(r, i)]);
yield_constr.constraint(state[i] - local_values[col_full_second_after_mds(r, i)]);
state[i] = local_values[col_full_second_after_mds(r, i)];
}
}
@ -197,7 +193,7 @@ pub(crate) fn eval_permutation_unit_recursively<F: RichField + Extendable<D>, co
let state_cubed = builder.cube_extension(state[i]);
let diff =
builder.sub_extension(state_cubed, local_values[col_full_first_mid_sbox(r, i)]);
yield_constr.constraint_wrapping(builder, diff);
yield_constr.constraint(builder, diff);
let state_cubed = local_values[col_full_first_mid_sbox(r, i)];
state[i] = builder.mul_many_extension(&[state[i], state_cubed, state_cubed]);
// Form state ** 7.
@ -208,7 +204,7 @@ pub(crate) fn eval_permutation_unit_recursively<F: RichField + Extendable<D>, co
for i in 0..SPONGE_WIDTH {
let diff =
builder.sub_extension(state[i], local_values[col_full_first_after_mds(r, i)]);
yield_constr.constraint_wrapping(builder, diff);
yield_constr.constraint(builder, diff);
state[i] = local_values[col_full_first_after_mds(r, i)];
}
}
@ -218,11 +214,11 @@ pub(crate) fn eval_permutation_unit_recursively<F: RichField + Extendable<D>, co
let state0_cubed = builder.cube_extension(state[0]);
let diff = builder.sub_extension(state0_cubed, local_values[col_partial_mid_sbox(r)]);
yield_constr.constraint_wrapping(builder, diff);
yield_constr.constraint(builder, diff);
let state0_cubed = local_values[col_partial_mid_sbox(r)];
state[0] = builder.mul_many_extension(&[state[0], state0_cubed, state0_cubed]); // Form state ** 7.
let diff = builder.sub_extension(state[0], local_values[col_partial_after_sbox(r)]);
yield_constr.constraint_wrapping(builder, diff);
yield_constr.constraint(builder, diff);
state[0] = local_values[col_partial_after_sbox(r)];
state = F::mds_layer_recursive(builder, &state);
@ -239,7 +235,7 @@ pub(crate) fn eval_permutation_unit_recursively<F: RichField + Extendable<D>, co
let state_cubed = builder.cube_extension(state[i]);
let diff =
builder.sub_extension(state_cubed, local_values[col_full_second_mid_sbox(r, i)]);
yield_constr.constraint_wrapping(builder, diff);
yield_constr.constraint(builder, diff);
let state_cubed = local_values[col_full_second_mid_sbox(r, i)];
state[i] = builder.mul_many_extension(&[state[i], state_cubed, state_cubed]);
// Form state ** 7.
@ -250,7 +246,7 @@ pub(crate) fn eval_permutation_unit_recursively<F: RichField + Extendable<D>, co
for i in 0..SPONGE_WIDTH {
let diff =
builder.sub_extension(state[i], local_values[col_full_second_after_mds(r, i)]);
yield_constr.constraint_wrapping(builder, diff);
yield_constr.constraint(builder, diff);
state[i] = local_values[col_full_second_after_mds(r, i)];
}
}

View File

@ -0,0 +1,69 @@
//! Arithmetic and logic unit.
pub(crate) const IS_ADD: usize = super::START_ALU;
pub(crate) const IS_SUB: usize = IS_ADD + 1;
pub(crate) const IS_MUL_ADD: usize = IS_SUB + 1;
pub(crate) const IS_DIV: usize = IS_MUL_ADD + 1;
const START_SHARED_COLS: usize = IS_DIV + 1;
/// Within the ALU, there are shared columns which can be used by any arithmetic/logic
/// circuit, depending on which one is active this cycle.
// Can be increased as needed as other operations are implemented.
const NUM_SHARED_COLS: usize = 4;
const fn shared_col(i: usize) -> usize {
debug_assert!(i < NUM_SHARED_COLS);
START_SHARED_COLS + i
}
/// The first value to be added; treated as an unsigned u32.
pub(crate) const COL_ADD_INPUT_0: usize = shared_col(0);
/// The second value to be added; treated as an unsigned u32.
pub(crate) const COL_ADD_INPUT_1: usize = shared_col(1);
/// The third value to be added; treated as an unsigned u32.
pub(crate) const COL_ADD_INPUT_2: usize = shared_col(2);
// Note: Addition outputs three 16-bit chunks, and since these values need to be range-checked
// anyway, we might as well use the range check unit's columns as our addition outputs. So the
// three following columns are basically aliases, not columns owned by the ALU.
/// The first 16-bit chunk of the output, based on little-endian ordering.
pub(crate) const COL_ADD_OUTPUT_0: usize = super::range_check_16::col_rc_16_input(0);
/// The second 16-bit chunk of the output, based on little-endian ordering.
pub(crate) const COL_ADD_OUTPUT_1: usize = super::range_check_16::col_rc_16_input(1);
/// The third 16-bit chunk of the output, based on little-endian ordering.
pub(crate) const COL_ADD_OUTPUT_2: usize = super::range_check_16::col_rc_16_input(2);
/// Inputs for subtraction; the second value is subtracted from the
/// first; inputs treated as an unsigned u32.
pub(crate) const COL_SUB_INPUT_0: usize = shared_col(0);
pub(crate) const COL_SUB_INPUT_1: usize = shared_col(1);
/// The first 16-bit chunk of the output, based on little-endian ordering.
pub(crate) const COL_SUB_OUTPUT_0: usize = super::range_check_16::col_rc_16_input(0);
/// The second 16-bit chunk of the output, based on little-endian ordering.
pub(crate) const COL_SUB_OUTPUT_1: usize = super::range_check_16::col_rc_16_input(1);
/// The borrow output
pub(crate) const COL_SUB_OUTPUT_BORROW: usize = super::boolean::col_bit(0);
/// The first value to be multiplied; treated as an unsigned u32.
pub(crate) const COL_MUL_ADD_FACTOR_0: usize = shared_col(0);
/// The second value to be multiplied; treated as an unsigned u32.
pub(crate) const COL_MUL_ADD_FACTOR_1: usize = shared_col(1);
/// The value to be added to the product; treated as an unsigned u32.
pub(crate) const COL_MUL_ADD_ADDEND: usize = shared_col(2);
/// The inverse of `u32::MAX - output_hi`, where `output_hi` is the high 32 bits of the result.
/// See https://hackmd.io/NC-yRmmtRQSvToTHb96e8Q#Checking-element-validity
pub(crate) const COL_MUL_ADD_RESULT_CANONICAL_INV: usize = shared_col(3);
/// The first 16-bit chunk of the output, based on little-endian ordering.
pub(crate) const COL_MUL_ADD_OUTPUT_0: usize = super::range_check_16::col_rc_16_input(0);
/// The second 16-bit chunk of the output, based on little-endian ordering.
pub(crate) const COL_MUL_ADD_OUTPUT_1: usize = super::range_check_16::col_rc_16_input(1);
/// The third 16-bit chunk of the output, based on little-endian ordering.
pub(crate) const COL_MUL_ADD_OUTPUT_2: usize = super::range_check_16::col_rc_16_input(2);
/// The fourth 16-bit chunk of the output, based on little-endian ordering.
pub(crate) const COL_MUL_ADD_OUTPUT_3: usize = super::range_check_16::col_rc_16_input(3);
pub(super) const END: usize = START_SHARED_COLS + NUM_SHARED_COLS;
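Since the operation-specific constants above all map onto the same shared and range-check columns, they alias one another; a hedged sketch of that aliasing (test module name hypothetical, not part of the diff):
#[cfg(test)]
mod layout_sanity {
    use super::*;

    #[test]
    fn shared_columns_alias() {
        // The first operand of each operation lives in shared column 0...
        assert_eq!(COL_ADD_INPUT_0, COL_SUB_INPUT_0);
        assert_eq!(COL_SUB_INPUT_0, COL_MUL_ADD_FACTOR_0);
        // ...and the low 16-bit output chunks all alias range-check-16 input 0.
        assert_eq!(COL_ADD_OUTPUT_0, COL_SUB_OUTPUT_0);
        assert_eq!(COL_SUB_OUTPUT_0, COL_MUL_ADD_OUTPUT_0);
    }
}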

View File

@ -1,37 +0,0 @@
//! Arithmetic unit.
pub(crate) const IS_ADD: usize = super::START_ARITHMETIC;
pub(crate) const IS_SUB: usize = IS_ADD + 1;
pub(crate) const IS_MUL: usize = IS_SUB + 1;
pub(crate) const IS_DIV: usize = IS_MUL + 1;
const START_SHARED_COLS: usize = IS_DIV + 1;
/// Within the arithmetic unit, there are shared columns which can be used by any arithmetic
/// circuit, depending on which one is active this cycle.
// Can be increased as needed as other operations are implemented.
const NUM_SHARED_COLS: usize = 3;
const fn shared_col(i: usize) -> usize {
debug_assert!(i < NUM_SHARED_COLS);
START_SHARED_COLS + i
}
/// The first value to be added; treated as an unsigned u32.
pub(crate) const COL_ADD_INPUT_1: usize = shared_col(0);
/// The second value to be added; treated as an unsigned u32.
pub(crate) const COL_ADD_INPUT_2: usize = shared_col(1);
/// The third value to be added; treated as an unsigned u32.
pub(crate) const COL_ADD_INPUT_3: usize = shared_col(2);
// Note: Addition outputs three 16-bit chunks, and since these values need to be range-checked
// anyway, we might as well use the range check unit's columns as our addition outputs. So the
// three proceeding columns are basically aliases, not columns owned by the arithmetic unit.
/// The first 16-bit chunk of the output, based on little-endian ordering.
pub(crate) const COL_ADD_OUTPUT_1: usize = super::range_check_16::col_rc_16_input(0);
/// The second 16-bit chunk of the output, based on little-endian ordering.
pub(crate) const COL_ADD_OUTPUT_2: usize = super::range_check_16::col_rc_16_input(1);
/// The third 16-bit chunk of the output, based on little-endian ordering.
pub(crate) const COL_ADD_OUTPUT_3: usize = super::range_check_16::col_rc_16_input(2);
pub(super) const END: usize = super::START_ARITHMETIC + NUM_SHARED_COLS;

View File

@ -3,19 +3,35 @@
const START_UNIT: usize = super::START_LOOKUP;
const NUM_LOOKUPS: usize =
pub(crate) const NUM_LOOKUPS: usize =
super::range_check_16::NUM_RANGE_CHECKS + super::range_check_degree::NUM_RANGE_CHECKS;
pub(crate) const fn col_input(i: usize) -> usize {
if i < super::range_check_16::NUM_RANGE_CHECKS {
super::range_check_16::col_rc_16_input(i)
} else {
super::range_check_degree::col_rc_degree_input(i - super::range_check_16::NUM_RANGE_CHECKS)
}
}
/// This column contains a permutation of the input values.
const fn col_permuted_input(i: usize) -> usize {
pub(crate) const fn col_permuted_input(i: usize) -> usize {
debug_assert!(i < NUM_LOOKUPS);
START_UNIT + 2 * i
}
pub(crate) const fn col_table(i: usize) -> usize {
if i < super::range_check_16::NUM_RANGE_CHECKS {
super::core::COL_RANGE_16
} else {
super::core::COL_CLOCK
}
}
/// This column contains a permutation of the table values.
const fn col_permuted_table(i: usize) -> usize {
pub(crate) const fn col_permuted_table(i: usize) -> usize {
debug_assert!(i < NUM_LOOKUPS);
START_UNIT + 2 * i + 1
}
pub(super) const END: usize = START_UNIT + NUM_LOOKUPS;
pub(super) const END: usize = START_UNIT + NUM_LOOKUPS * 2;

View File

@ -1,4 +1,4 @@
pub(crate) mod arithmetic;
pub(crate) mod alu;
pub(crate) mod boolean;
pub(crate) mod core;
pub(crate) mod logic;
@ -8,8 +8,8 @@ pub(crate) mod permutation;
pub(crate) mod range_check_16;
pub(crate) mod range_check_degree;
const START_ARITHMETIC: usize = 0;
const START_BOOLEAN: usize = arithmetic::END;
const START_ALU: usize = 0;
const START_BOOLEAN: usize = alu::END;
const START_CORE: usize = boolean::END;
const START_LOGIC: usize = core::END;
const START_LOOKUP: usize = logic::END;

View File

@ -1,6 +1,6 @@
//! Range check unit which checks that values are in `[0, 2^16)`.
pub(super) const NUM_RANGE_CHECKS: usize = 5;
pub(crate) const NUM_RANGE_CHECKS: usize = 5;
/// The input of the `i`th range check, i.e. the value being range checked.
pub(crate) const fn col_rc_16_input(i: usize) -> usize {

View File

@ -1,6 +1,6 @@
//! Range check unit which checks that values are in `[0, degree)`.
pub(super) const NUM_RANGE_CHECKS: usize = 5;
pub(crate) const NUM_RANGE_CHECKS: usize = 5;
/// The input of the `i`th range check, i.e. the value being range checked.
pub(crate) const fn col_rc_degree_input(i: usize) -> usize {

View File

@ -2,27 +2,30 @@ use std::marker::PhantomData;
use plonky2::field::extension_field::{Extendable, FieldExtension};
use plonky2::field::packed_field::PackedField;
use plonky2::field::polynomial::PolynomialValues;
use plonky2::hash::hash_types::RichField;
use plonky2::plonk::circuit_builder::CircuitBuilder;
use plonky2::timed;
use plonky2::util::timing::TimingTree;
use plonky2::util::transpose;
use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
use starky::permutation::PermutationPair;
use starky::stark::Stark;
use starky::vars::StarkEvaluationTargets;
use starky::vars::StarkEvaluationVars;
use crate::arithmetic::{
eval_arithmetic_unit, eval_arithmetic_unit_recursively, generate_arithmetic_unit,
};
use crate::alu::{eval_alu, eval_alu_recursively, generate_alu};
use crate::core_registers::{
eval_core_registers, eval_core_registers_recursively, generate_first_row_core_registers,
generate_next_row_core_registers,
};
use crate::lookup::{eval_lookups, eval_lookups_recursively, generate_lookups};
use crate::memory::TransactionMemory;
use crate::permutation_unit::{
eval_permutation_unit, eval_permutation_unit_recursively, generate_permutation_unit,
};
use crate::public_input_layout::NUM_PUBLIC_INPUTS;
use crate::registers::NUM_COLUMNS;
use crate::registers::{lookup, NUM_COLUMNS};
/// We require at least 2^16 rows as it helps support efficient 16-bit range checks.
const MIN_TRACE_ROWS: usize = 1 << 16;
@ -33,12 +36,14 @@ pub struct SystemZero<F: RichField + Extendable<D>, const D: usize> {
}
impl<F: RichField + Extendable<D>, const D: usize> SystemZero<F, D> {
fn generate_trace(&self) -> Vec<[F; NUM_COLUMNS]> {
/// Generate the rows of the trace. Note that this does not generate the permuted columns used
/// in our lookup arguments, as those are computed after transposing to column-wise form.
fn generate_trace_rows(&self) -> Vec<[F; NUM_COLUMNS]> {
let memory = TransactionMemory::default();
let mut row = [F::ZERO; NUM_COLUMNS];
generate_first_row_core_registers(&mut row);
generate_arithmetic_unit(&mut row);
generate_alu(&mut row);
generate_permutation_unit(&mut row);
let mut trace = Vec::with_capacity(MIN_TRACE_ROWS);
@ -46,7 +51,7 @@ impl<F: RichField + Extendable<D>, const D: usize> SystemZero<F, D> {
loop {
let mut next_row = [F::ZERO; NUM_COLUMNS];
generate_next_row_core_registers(&row, &mut next_row);
generate_arithmetic_unit(&mut next_row);
generate_alu(&mut next_row);
generate_permutation_unit(&mut next_row);
trace.push(row);
@ -61,6 +66,45 @@ impl<F: RichField + Extendable<D>, const D: usize> SystemZero<F, D> {
trace.push(row);
trace
}
fn generate_trace(&self) -> Vec<PolynomialValues<F>> {
let mut timing = TimingTree::new("generate trace", log::Level::Debug);
// Generate the witness, except for permuted columns in the lookup argument.
let trace_rows = timed!(
&mut timing,
"generate trace rows",
self.generate_trace_rows()
);
// Transpose from row-wise to column-wise.
let trace_row_vecs: Vec<_> = timed!(
&mut timing,
"convert to Vecs",
trace_rows.into_iter().map(|row| row.to_vec()).collect()
);
let mut trace_col_vecs: Vec<Vec<F>> =
timed!(&mut timing, "transpose", transpose(&trace_row_vecs));
// Generate permuted columns in the lookup argument.
timed!(
&mut timing,
"generate lookup columns",
generate_lookups(&mut trace_col_vecs)
);
let trace_polys = timed!(
&mut timing,
"convert to PolynomialValues",
trace_col_vecs
.into_iter()
.map(PolynomialValues::new)
.collect()
);
timing.print();
trace_polys
}
}
impl<F: RichField + Extendable<D>, const D: usize> Default for SystemZero<F, D> {
@ -84,8 +128,9 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for SystemZero<F,
P: PackedField<Scalar = FE>,
{
eval_core_registers(vars, yield_constr);
eval_arithmetic_unit(vars, yield_constr);
eval_alu(vars, yield_constr);
eval_permutation_unit::<F, FE, P, D2>(vars, yield_constr);
eval_lookups(vars, yield_constr);
// TODO: Other units
}
@ -96,8 +141,9 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for SystemZero<F,
yield_constr: &mut RecursiveConstraintConsumer<F, D>,
) {
eval_core_registers_recursively(builder, vars, yield_constr);
eval_arithmetic_unit_recursively(builder, vars, yield_constr);
eval_alu_recursively(builder, vars, yield_constr);
eval_permutation_unit_recursively(builder, vars, yield_constr);
eval_lookups_recursively(builder, vars, yield_constr);
// TODO: Other units
}
@ -106,9 +152,22 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for SystemZero<F,
}
fn permutation_pairs(&self) -> Vec<PermutationPair> {
let mut pairs = Vec::new();
for i in 0..lookup::NUM_LOOKUPS {
pairs.push(PermutationPair::singletons(
lookup::col_input(i),
lookup::col_permuted_input(i),
));
pairs.push(PermutationPair::singletons(
lookup::col_table(i),
lookup::col_permuted_table(i),
));
}
// TODO: Add permutation pairs for memory.
// TODO: Add permutation pairs for range checks.
vec![]
pairs
}
}
@ -129,8 +188,9 @@ mod tests {
use crate::system_zero::SystemZero;
#[test]
#[ignore] // A bit slow.
fn run() -> Result<()> {
init_logger();
type F = GoldilocksField;
type C = PoseidonGoldilocksConfig;
const D: usize = 2;
@ -156,4 +216,8 @@ mod tests {
let system = S::default();
test_stark_low_degree(system)
}
fn init_logger() {
let _ = env_logger::builder().format_timestamp(None).try_init();
}
}