From b28cd55326dc618e7596c97a9dccb1c38886ca6b Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Wed, 16 Feb 2022 13:37:01 +0100 Subject: [PATCH 01/32] Fix reduction strategy --- plonky2/src/fri/mod.rs | 2 +- plonky2/src/fri/reduction_strategies.rs | 21 +++++++++++---------- plonky2/src/plonk/circuit_builder.rs | 2 +- plonky2/src/plonk/circuit_data.rs | 2 +- starky/src/config.rs | 2 +- starky/src/prover.rs | 6 +++++- 6 files changed, 20 insertions(+), 15 deletions(-) diff --git a/plonky2/src/fri/mod.rs b/plonky2/src/fri/mod.rs index c491f8f0..5792444e 100644 --- a/plonky2/src/fri/mod.rs +++ b/plonky2/src/fri/mod.rs @@ -67,7 +67,7 @@ pub struct FriParams { } impl FriParams { - pub(crate) fn total_arities(&self) -> usize { + pub fn total_arities(&self) -> usize { self.reduction_arity_bits.iter().sum() } diff --git a/plonky2/src/fri/reduction_strategies.rs b/plonky2/src/fri/reduction_strategies.rs index 49eda3ba..84505ec2 100644 --- a/plonky2/src/fri/reduction_strategies.rs +++ b/plonky2/src/fri/reduction_strategies.rs @@ -8,11 +8,12 @@ pub enum FriReductionStrategy { /// Specifies the exact sequence of arities (expressed in bits) to use. Fixed(Vec), - /// `ConstantArityBits(arity_bits, final_poly_bits)` applies reductions of arity `2^arity_bits` - /// until the polynomial degree is `2^final_poly_bits` or less. This tends to work well in the - /// recursive setting, as it avoids needing multiple configurations of gates used in FRI - /// verification, such as `InterpolationGate`. - ConstantArityBits(usize, usize), + /// `ConstantArityBits(arity_bits, final_poly_bits, cap_height)` applies reductions of arity `2^arity_bits` + /// until the polynomial degree is less than or equal to `2^final_poly_bits` or until any further + /// `arity_bits`-reduction makes the polynomial degree smaller than `2^cap_height` (which would make FRI fail). 
+ /// This tends to work well in the recursive setting, as it avoids needing multiple configurations + /// of gates used in FRI verification, such as `InterpolationGate`. + ConstantArityBits(usize, usize, usize), /// `MinSize(opt_max_arity_bits)` searches for an optimal sequence of reduction arities, with an /// optional max `arity_bits`. If this proof will have recursive proofs on top of it, a max @@ -31,12 +32,12 @@ impl FriReductionStrategy { match self { FriReductionStrategy::Fixed(reduction_arity_bits) => reduction_arity_bits.to_vec(), - FriReductionStrategy::ConstantArityBits(arity_bits, final_poly_bits) => { + &FriReductionStrategy::ConstantArityBits(arity_bits, final_poly_bits, cap_height) => { let mut result = Vec::new(); - while degree_bits > *final_poly_bits { - result.push(*arity_bits); - assert!(degree_bits >= *arity_bits); - degree_bits -= *arity_bits; + while degree_bits > final_poly_bits && degree_bits - arity_bits >= cap_height { + result.push(arity_bits); + assert!(degree_bits >= arity_bits); + degree_bits -= arity_bits; } result.shrink_to_fit(); result diff --git a/plonky2/src/plonk/circuit_builder.rs b/plonky2/src/plonk/circuit_builder.rs index bd216389..d045aa6e 100644 --- a/plonky2/src/plonk/circuit_builder.rs +++ b/plonky2/src/plonk/circuit_builder.rs @@ -664,7 +664,7 @@ impl, const D: usize> CircuitBuilder { let degree_bits = log2_strict(degree); let fri_params = self.fri_params(degree_bits); assert!( - fri_params.total_arities() <= degree_bits, + fri_params.total_arities() <= degree_bits - self.config.fri_config.cap_height, "FRI total reduction arity is too large.", ); diff --git a/plonky2/src/plonk/circuit_data.rs b/plonky2/src/plonk/circuit_data.rs index 3d4ee2df..fdec495e 100644 --- a/plonky2/src/plonk/circuit_data.rs +++ b/plonky2/src/plonk/circuit_data.rs @@ -73,7 +73,7 @@ impl CircuitConfig { rate_bits: 3, cap_height: 4, proof_of_work_bits: 16, - reduction_strategy: FriReductionStrategy::ConstantArityBits(4, 5), + reduction_strategy: 
FriReductionStrategy::ConstantArityBits(4, 5, 4), num_query_rounds: 28, }, } diff --git a/starky/src/config.rs b/starky/src/config.rs index 500cd957..2e2cced7 100644 --- a/starky/src/config.rs +++ b/starky/src/config.rs @@ -22,7 +22,7 @@ impl StarkConfig { rate_bits: 1, cap_height: 4, proof_of_work_bits: 10, - reduction_strategy: FriReductionStrategy::ConstantArityBits(4, 5), + reduction_strategy: FriReductionStrategy::ConstantArityBits(4, 5, 4), num_query_rounds: 90, }, } diff --git a/starky/src/prover.rs b/starky/src/prover.rs index 2d57a60a..902fd1f9 100644 --- a/starky/src/prover.rs +++ b/starky/src/prover.rs @@ -37,6 +37,11 @@ where { let degree = trace.len(); let degree_bits = log2_strict(degree); + let fri_params = config.fri_params(degree_bits); + assert!( + fri_params.total_arities() <= degree_bits - config.fri_config.cap_height, + "FRI total reduction arity is too large.", + ); let trace_vecs = trace.into_iter().map(|row| row.to_vec()).collect_vec(); let trace_col_major: Vec> = transpose(&trace_vecs); @@ -117,7 +122,6 @@ where // TODO: Add permutation checks let initial_merkle_trees = &[&trace_commitment, "ient_commitment]; - let fri_params = config.fri_params(degree_bits); let opening_proof = timed!( timing, From ea9006f52eb98a96ff706167ea9b67cfe0f033a5 Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Wed, 16 Feb 2022 13:51:10 +0100 Subject: [PATCH 02/32] Add rate_bits --- plonky2/src/plonk/circuit_builder.rs | 5 +++-- starky/src/prover.rs | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/plonky2/src/plonk/circuit_builder.rs b/plonky2/src/plonk/circuit_builder.rs index d045aa6e..8e2f2e10 100644 --- a/plonky2/src/plonk/circuit_builder.rs +++ b/plonky2/src/plonk/circuit_builder.rs @@ -639,6 +639,7 @@ impl, const D: usize> CircuitBuilder { let mut timing = TimingTree::new("preprocess", Level::Trace); let start = Instant::now(); let rate_bits = self.config.fri_config.rate_bits; + let cap_height = self.config.fri_config.cap_height; // 
Hash the public inputs, and route them to a `PublicInputGate` which will enforce that // those hash wires match the claimed public inputs. @@ -664,7 +665,7 @@ impl, const D: usize> CircuitBuilder { let degree_bits = log2_strict(degree); let fri_params = self.fri_params(degree_bits); assert!( - fri_params.total_arities() <= degree_bits - self.config.fri_config.cap_height, + fri_params.total_arities() <= degree_bits + rate_bits - cap_height, "FRI total reduction arity is too large.", ); @@ -705,7 +706,7 @@ impl, const D: usize> CircuitBuilder { constants_sigmas_vecs, rate_bits, PlonkOracle::CONSTANTS_SIGMAS.blinding, - self.config.fri_config.cap_height, + cap_height, &mut timing, Some(&fft_root_table), ); diff --git a/starky/src/prover.rs b/starky/src/prover.rs index 4fef0b4a..be1f198b 100644 --- a/starky/src/prover.rs +++ b/starky/src/prover.rs @@ -41,8 +41,10 @@ where let degree = trace.len(); let degree_bits = log2_strict(degree); let fri_params = config.fri_params(degree_bits); + let rate_bits = config.fri_config.rate_bits; + let cap_height = config.fri_config.cap_height; assert!( - fri_params.total_arities() <= degree_bits - config.fri_config.cap_height, + fri_params.total_arities() <= degree_bits + rate_bits - cap_height, "FRI total reduction arity is too large.", ); @@ -58,8 +60,6 @@ where .collect() ); - let rate_bits = config.fri_config.rate_bits; - let cap_height = config.fri_config.cap_height; let trace_commitment = timed!( timing, "compute trace commitment", From 56336e396d40f0d6fe221334a6ef761a2bb0394b Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Wed, 16 Feb 2022 14:17:14 +0100 Subject: [PATCH 03/32] Fix --- plonky2/src/fri/reduction_strategies.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/plonky2/src/fri/reduction_strategies.rs b/plonky2/src/fri/reduction_strategies.rs index 84505ec2..d81ac2ae 100644 --- a/plonky2/src/fri/reduction_strategies.rs +++ b/plonky2/src/fri/reduction_strategies.rs @@ -10,7 +10,7 @@ pub enum 
FriReductionStrategy { /// `ConstantArityBits(arity_bits, final_poly_bits, cap_height)` applies reductions of arity `2^arity_bits` /// until the polynomial degree is less than or equal to `2^final_poly_bits` or until any further - /// `arity_bits`-reduction makes the polynomial degree smaller than `2^cap_height` (which would make FRI fail). + /// `arity_bits`-reduction makes the last FRI tree have height less than `cap_height`. /// This tends to work well in the recursive setting, as it avoids needing multiple configurations /// of gates used in FRI verification, such as `InterpolationGate`. ConstantArityBits(usize, usize, usize), @@ -34,7 +34,9 @@ impl FriReductionStrategy { &FriReductionStrategy::ConstantArityBits(arity_bits, final_poly_bits, cap_height) => { let mut result = Vec::new(); - while degree_bits > final_poly_bits && degree_bits - arity_bits >= cap_height { + while degree_bits > final_poly_bits + && degree_bits + rate_bits - arity_bits >= cap_height + { result.push(arity_bits); assert!(degree_bits >= arity_bits); degree_bits -= arity_bits; From 431faccbdbee989300992a1d1d04a42bc2602b7e Mon Sep 17 00:00:00 2001 From: Daniel Lubarov Date: Wed, 16 Feb 2022 22:37:20 -0800 Subject: [PATCH 04/32] Change `compute_permutation_z_polys` to batch permutation checks (#492) * Change `compute_permutation_z_polys` to batch permutation checks * feedback --- starky/src/permutation.rs | 77 +++++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 23 deletions(-) diff --git a/starky/src/permutation.rs b/starky/src/permutation.rs index 1f7655b4..01cfa8bf 100644 --- a/starky/src/permutation.rs +++ b/starky/src/permutation.rs @@ -67,48 +67,46 @@ where // Before batching, each permutation pair leads to `num_challenges` permutation arguments, so we // start with the cartesian product of `permutation_pairs` and `0..num_challenges`. Then we // chunk these arguments based on our batch size. 
- let permutation_instances = permutation_pairs + let permutation_batches = permutation_pairs .iter() .cartesian_product(0..config.num_challenges) .chunks(stark.permutation_batch_size()) .into_iter() - .flat_map(|batch| { - batch.enumerate().map(|(i, (pair, chal))| { - let challenge = permutation_challenge_sets[i].challenges[chal]; - PermutationInstance { pair, challenge } - }) + .map(|batch| { + batch + .enumerate() + .map(|(i, (pair, chal))| { + let challenge = permutation_challenge_sets[i].challenges[chal]; + PermutationInstance { pair, challenge } + }) + .collect_vec() }) .collect_vec(); - permutation_instances + permutation_batches .into_par_iter() - .map(|instance| compute_permutation_z_poly(instance, trace_poly_values)) + .map(|instances| compute_permutation_z_poly(&instances, trace_poly_values)) .collect() } /// Compute a single Z polynomial. -// TODO: Change this to handle a batch of `PermutationInstance`s. fn compute_permutation_z_poly( - instance: PermutationInstance, + instances: &[PermutationInstance], trace_poly_values: &[PolynomialValues], ) -> PolynomialValues { - let PermutationInstance { pair, challenge } = instance; - let PermutationPair { column_pairs } = pair; - let PermutationChallenge { beta, gamma } = challenge; - let degree = trace_poly_values[0].len(); - let mut reduced_lhs = PolynomialValues::constant(gamma, degree); - let mut reduced_rhs = PolynomialValues::constant(gamma, degree); + let (reduced_lhs_polys, reduced_rhs_polys): (Vec<_>, Vec<_>) = instances + .iter() + .map(|instance| permutation_reduced_polys(instance, trace_poly_values, degree)) + .unzip(); - for ((lhs, rhs), weight) in column_pairs.iter().zip(beta.powers()) { - reduced_lhs.add_assign_scaled(&trace_poly_values[*lhs], weight); - reduced_rhs.add_assign_scaled(&trace_poly_values[*rhs], weight); - } + let numerator = poly_product_elementwise(reduced_lhs_polys.into_iter()); + let denominator = poly_product_elementwise(reduced_rhs_polys.into_iter()); // Compute the quotients. 
- let reduced_rhs_inverses = F::batch_multiplicative_inverse(&reduced_rhs.values); - let mut quotients = reduced_lhs.values; - batch_multiply_inplace(&mut quotients, &reduced_rhs_inverses); + let denominator_inverses = F::batch_multiplicative_inverse(&denominator.values); + let mut quotients = numerator.values; + batch_multiply_inplace(&mut quotients, &denominator_inverses); // Compute Z, which contains partial products of the quotients. let mut partial_products = Vec::with_capacity(degree); @@ -120,6 +118,39 @@ fn compute_permutation_z_poly( PolynomialValues::new(partial_products) } +/// Computes the reduced polynomial, `\sum beta^i f_i(x) + gamma`, for both the "left" and "right" +/// sides of a given `PermutationPair`. +fn permutation_reduced_polys( + instance: &PermutationInstance, + trace_poly_values: &[PolynomialValues], + degree: usize, +) -> (PolynomialValues, PolynomialValues) { + let PermutationInstance { + pair: PermutationPair { column_pairs }, + challenge: PermutationChallenge { beta, gamma }, + } = instance; + + let mut reduced_lhs = PolynomialValues::constant(*gamma, degree); + let mut reduced_rhs = PolynomialValues::constant(*gamma, degree); + for ((lhs, rhs), weight) in column_pairs.iter().zip(beta.powers()) { + reduced_lhs.add_assign_scaled(&trace_poly_values[*lhs], weight); + reduced_rhs.add_assign_scaled(&trace_poly_values[*rhs], weight); + } + (reduced_lhs, reduced_rhs) +} + +/// Computes the elementwise product of a set of polynomials. Assumes that the set is non-empty and +/// that each polynomial has the same length. 
+fn poly_product_elementwise( + mut polys: impl Iterator>, +) -> PolynomialValues { + let mut product = polys.next().expect("Expected at least one polynomial"); + for poly in polys { + batch_multiply_inplace(&mut product.values, &poly.values) + } + product +} + fn get_permutation_challenge>( challenger: &mut Challenger, ) -> PermutationChallenge { From 67cb5dfd5880ffc8a080bd22f63c62f0d83fd7e4 Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Thu, 17 Feb 2022 08:26:23 +0100 Subject: [PATCH 05/32] PR feedback --- plonky2/src/fri/mod.rs | 1 + plonky2/src/fri/reduction_strategies.rs | 7 ++++--- plonky2/src/plonk/circuit_data.rs | 2 +- starky/src/config.rs | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/plonky2/src/fri/mod.rs b/plonky2/src/fri/mod.rs index 5792444e..4ed2ea3b 100644 --- a/plonky2/src/fri/mod.rs +++ b/plonky2/src/fri/mod.rs @@ -35,6 +35,7 @@ impl FriConfig { let reduction_arity_bits = self.reduction_strategy.reduction_arity_bits( degree_bits, self.rate_bits, + self.cap_height, self.num_query_rounds, ); FriParams { diff --git a/plonky2/src/fri/reduction_strategies.rs b/plonky2/src/fri/reduction_strategies.rs index d81ac2ae..4252564e 100644 --- a/plonky2/src/fri/reduction_strategies.rs +++ b/plonky2/src/fri/reduction_strategies.rs @@ -8,12 +8,12 @@ pub enum FriReductionStrategy { /// Specifies the exact sequence of arities (expressed in bits) to use. Fixed(Vec), - /// `ConstantArityBits(arity_bits, final_poly_bits, cap_height)` applies reductions of arity `2^arity_bits` + /// `ConstantArityBits(arity_bits, final_poly_bits)` applies reductions of arity `2^arity_bits` /// until the polynomial degree is less than or equal to `2^final_poly_bits` or until any further /// `arity_bits`-reduction makes the last FRI tree have height less than `cap_height`. /// This tends to work well in the recursive setting, as it avoids needing multiple configurations /// of gates used in FRI verification, such as `InterpolationGate`. 
- ConstantArityBits(usize, usize, usize), + ConstantArityBits(usize, usize), /// `MinSize(opt_max_arity_bits)` searches for an optimal sequence of reduction arities, with an /// optional max `arity_bits`. If this proof will have recursive proofs on top of it, a max @@ -27,12 +27,13 @@ impl FriReductionStrategy { &self, mut degree_bits: usize, rate_bits: usize, + cap_height: usize, num_queries: usize, ) -> Vec { match self { FriReductionStrategy::Fixed(reduction_arity_bits) => reduction_arity_bits.to_vec(), - &FriReductionStrategy::ConstantArityBits(arity_bits, final_poly_bits, cap_height) => { + &FriReductionStrategy::ConstantArityBits(arity_bits, final_poly_bits) => { let mut result = Vec::new(); while degree_bits > final_poly_bits && degree_bits + rate_bits - arity_bits >= cap_height diff --git a/plonky2/src/plonk/circuit_data.rs b/plonky2/src/plonk/circuit_data.rs index fdec495e..3d4ee2df 100644 --- a/plonky2/src/plonk/circuit_data.rs +++ b/plonky2/src/plonk/circuit_data.rs @@ -73,7 +73,7 @@ impl CircuitConfig { rate_bits: 3, cap_height: 4, proof_of_work_bits: 16, - reduction_strategy: FriReductionStrategy::ConstantArityBits(4, 5, 4), + reduction_strategy: FriReductionStrategy::ConstantArityBits(4, 5), num_query_rounds: 28, }, } diff --git a/starky/src/config.rs b/starky/src/config.rs index 2e2cced7..500cd957 100644 --- a/starky/src/config.rs +++ b/starky/src/config.rs @@ -22,7 +22,7 @@ impl StarkConfig { rate_bits: 1, cap_height: 4, proof_of_work_bits: 10, - reduction_strategy: FriReductionStrategy::ConstantArityBits(4, 5, 4), + reduction_strategy: FriReductionStrategy::ConstantArityBits(4, 5), num_query_rounds: 90, }, } From a736aa8e705314ba5b061556946129ae838ffa76 Mon Sep 17 00:00:00 2001 From: Jakub Nabaglo Date: Thu, 17 Feb 2022 22:01:07 -0800 Subject: [PATCH 06/32] Update MDS matrix and round consts in Poseidon; disable vectorization (#493) --- plonky2/src/bin/generate_constants.rs | 11 +- plonky2/src/gates/poseidon_mds.rs | 14 +- 
plonky2/src/hash/arch/aarch64/mod.rs | 4 +- plonky2/src/hash/arch/x86_64/mod.rs | 10 +- plonky2/src/hash/poseidon.rs | 218 ++++++----- plonky2/src/hash/poseidon_goldilocks.rs | 501 ++++++++++++------------ 6 files changed, 376 insertions(+), 382 deletions(-) diff --git a/plonky2/src/bin/generate_constants.rs b/plonky2/src/bin/generate_constants.rs index d2744991..6527b361 100644 --- a/plonky2/src/bin/generate_constants.rs +++ b/plonky2/src/bin/generate_constants.rs @@ -7,22 +7,15 @@ use plonky2_field::goldilocks_field::GoldilocksField; use rand::{Rng, SeedableRng}; use rand_chacha::ChaCha8Rng; -// For historical reasons, we sample from 0..0xffffffff70000001, which is slightly larger than the -// range of GoldilocksField, then verify that each constant also fits in GoldilocksField. -const SAMPLE_RANGE_END: u64 = 0xffffffff70000001; +const SAMPLE_RANGE_END: u64 = GoldilocksField::ORDER; -// const N: usize = 8 * 30; // For Posiedon-8 -const N: usize = 12 * 30; // For Posiedon-12 +const N: usize = 12 * 30; // For Poseidon-12 pub(crate) fn main() { let mut rng = ChaCha8Rng::seed_from_u64(0); let mut constants = [0u64; N]; for i in 0..N { constants[i] = rng.gen_range(0..SAMPLE_RANGE_END); - // Make sure the constant fits in Goldilocks. If so, we also have random numbers in - // GoldilocksField::ORDER. This may be viewed as rejection sampling, except that we never - // encounter a rejection in practice, so we don't bother handling it. - assert!(constants[i] < GoldilocksField::ORDER); } // Print the constants in the format we prefer in our code. 
diff --git a/plonky2/src/gates/poseidon_mds.rs b/plonky2/src/gates/poseidon_mds.rs index 81583f88..8a989078 100644 --- a/plonky2/src/gates/poseidon_mds.rs +++ b/plonky2/src/gates/poseidon_mds.rs @@ -51,9 +51,13 @@ impl + Poseidon, const D: usize> PoseidonMdsGate::MDS_MATRIX_EXPS[i]); + let coeff = F::Extension::from_canonical_u64(::MDS_MATRIX_CIRC[i]); res += v[(i + r) % SPONGE_WIDTH].scalar_mul(coeff); } + { + let coeff = F::Extension::from_canonical_u64(::MDS_MATRIX_DIAG[r]); + res += v[r].scalar_mul(coeff); + } res } @@ -69,10 +73,16 @@ impl + Poseidon, const D: usize> PoseidonMdsGate::MDS_MATRIX_EXPS[i], + ::MDS_MATRIX_CIRC[i], )); res = builder.scalar_mul_add_ext_algebra(coeff, v[(i + r) % SPONGE_WIDTH], res); } + { + let coeff = builder.constant_extension(F::Extension::from_canonical_u64( + ::MDS_MATRIX_DIAG[r], + )); + res = builder.scalar_mul_add_ext_algebra(coeff, v[r], res); + } res } diff --git a/plonky2/src/hash/arch/aarch64/mod.rs b/plonky2/src/hash/arch/aarch64/mod.rs index b8ae14af..ba86797d 100644 --- a/plonky2/src/hash/arch/aarch64/mod.rs +++ b/plonky2/src/hash/arch/aarch64/mod.rs @@ -1,2 +1,2 @@ -#[cfg(target_feature = "neon")] -pub(crate) mod poseidon_goldilocks_neon; +// #[cfg(target_feature = "neon")] +// pub(crate) mod poseidon_goldilocks_neon; diff --git a/plonky2/src/hash/arch/x86_64/mod.rs b/plonky2/src/hash/arch/x86_64/mod.rs index fa3681d0..0730b626 100644 --- a/plonky2/src/hash/arch/x86_64/mod.rs +++ b/plonky2/src/hash/arch/x86_64/mod.rs @@ -1,5 +1,5 @@ -// Requires: -// - AVX2 -// - BMI2 (for MULX and SHRX) -#[cfg(all(target_feature = "avx2", target_feature = "bmi2"))] -pub(crate) mod poseidon_goldilocks_avx2_bmi2; +// // Requires: +// // - AVX2 +// // - BMI2 (for MULX and SHRX) +// #[cfg(all(target_feature = "avx2", target_feature = "bmi2"))] +// pub(crate) mod poseidon_goldilocks_avx2_bmi2; diff --git a/plonky2/src/hash/poseidon.rs b/plonky2/src/hash/poseidon.rs index 09c5d2fc..9c202834 100644 --- a/plonky2/src/hash/poseidon.rs +++ 
b/plonky2/src/hash/poseidon.rs @@ -55,96 +55,96 @@ pub const ALL_ROUND_CONSTANTS: [u64; MAX_WIDTH * N_ROUNDS] = [ // WARNING: If these are changed in any way, then all the // implementations of Poseidon must be regenerated. See comments // in `poseidon_goldilocks.rs`. - 0xb585f767417ee042, 0x7746a55f77c10331, 0xb2fb0d321d356f7a, 0x0f6760a486f1621f, - 0xe10d6666b36abcdf, 0x8cae14cb455cc50b, 0xd438539cf2cee334, 0xef781c7d4c1fd8b4, - 0xcdc4a23a0aca4b1f, 0x277fa208d07b52e3, 0xe17653a300493d38, 0xc54302f27c287dc1, - 0x8628782231d47d10, 0x59cd1a8a690b49f2, 0xc3b919ad9efec0b0, 0xa484c4c637641d97, - 0x308bbd23f191398b, 0x6e4a40c1bf713cf1, 0x9a2eedb7510414fb, 0xe360c6e111c2c63b, - 0xd5c771901d4d89aa, 0xc35eae076e7d6b2f, 0x849c2656d0a09cad, 0xc0572c8c5cf1df2b, - 0xe9fa634a883b8bf3, 0xf56f6d4900fb1fdd, 0xf7d713e872a72a1b, 0x8297132b6ba47612, - 0xad6805e12ee8af1c, 0xac51d9f6485c22b9, 0x502ad7dc3bd56bf8, 0x57a1550c3761c577, - 0x66bbd30e99d311da, 0x0da2abef5e948f87, 0xf0612750443f8e94, 0x28b8ec3afb937d8c, - 0x92a756e6be54ca18, 0x70e741ec304e925d, 0x019d5ee2b037c59f, 0x6f6f2ed7a30707d1, - 0x7cf416d01e8c169c, 0x61df517bb17617df, 0x85dc499b4c67dbaa, 0x4b959b48dad27b23, - 0xe8be3e5e0dd779a0, 0xf5c0bc1e525ed8e6, 0x40b12cbf263cf853, 0xa637093f13e2ea3c, - 0x3cc3f89232e3b0c8, 0x2e479dc16bfe86c0, 0x6f49de07d6d39469, 0x213ce7beecc232de, - 0x5b043134851fc00a, 0xa2de45784a861506, 0x7103aaf97bed8dd5, 0x5326fc0dbb88a147, - 0xa9ceb750364cb77a, 0x27f8ec88cc9e991f, 0xfceb4fda8c93fb83, 0xfac6ff13b45b260e, - 0x7131aa455813380b, 0x93510360d5d68119, 0xad535b24fb96e3db, 0x4627f5c6b7efc045, - 0x645cf794e4da78a9, 0x241c70ed1ac2877f, 0xacb8e076b009e825, 0x3737e9db6477bd9d, - 0xe7ea5e344cd688ed, 0x90dee4a009214640, 0xd1b1edf7c77e74af, 0x0b65481bab42158e, - 0x99ad1aab4b4fe3e7, 0x438a7c91f1a360cd, 0xb60de3bd159088bf, 0xc99cab6b47a3e3bb, - 0x69a5ed92d5677cef, 0x5e7b329c482a9396, 0x5fc0ac0829f893c9, 0x32db82924fb757ea, - 0x0ade699c5cf24145, 0x7cc5583b46d7b5bb, 0x85df9ed31bf8abcb, 0x6604df501ad4de64, - 
0xeb84f60941611aec, 0xda60883523989bd4, 0x8f97fe40bf3470bf, 0xa93f485ce0ff2b32, - 0x6704e8eebc2afb4b, 0xcee3e9ac788ad755, 0x510d0e66062a270d, 0xf6323f48d74634a0, - 0x0b508cdf04990c90, 0xf241708a4ef7ddf9, 0x60e75c28bb368f82, 0xa6217d8c3f0f9989, - 0x7159cd30f5435b53, 0x839b4e8fe97ec79f, 0x0d3f3e5e885db625, 0x8f7d83be1daea54b, - 0x780f22441e8dbc04, 0xeb9158465aedacd3, 0xd19e120d826c1b6c, 0x016ee53a7f007110, - 0xcb5fd54ed22dd1ca, 0xacb84178c58de144, 0x9c22190c2c463227, 0x5d693c1bcc98406d, - 0xdcef0798235f321a, 0x3d639263f55e0b1e, 0xe273fd977edb8fda, 0x418f027049d10fe7, - 0x8c25fda3f253a284, 0x2cbaed4dc25a884e, 0x5f58e6aff78dc2af, 0x284650ac6fb9d206, - 0x635b337f1391c13c, 0x9f9a036f1ac6361f, 0xb93e260cff6747b4, 0xb0a7eae8c7272e33, - 0xd0762cbce7da0a9f, 0x34c6efb829c754d6, 0x40bf0ab6166855c1, 0xb6b570fccc46a242, - 0x5a27b90055549545, 0xb1a5b166048b306f, 0x8722e0ad24f1006d, 0x788ee3b3b315049a, - 0x14a726661e5b0351, 0x98b7672fe1c3f13e, 0xbb93ae77bdc3aa8f, 0x28fd3b04756fc222, - 0x30a46805a86d7109, 0x337dc00c7844a0e7, 0xd5eca245253c861b, 0x77626382990d8546, - 0xc1e434bf33c3ae7a, 0x0299351a54dbf35e, 0xb2d456e4fb620184, 0x3e9ed1fdc00265ea, - 0x2972a92bb672e8db, 0x20216dd789f333ec, 0xadffe8cf746494a1, 0x1c4dbb1c5889d420, - 0x15a16a8a8c9972f5, 0x388a128b98960e26, 0x2300e5d6ca3e5589, 0x2f63aa865c9ceb9f, - 0xf1c36ce8d894420f, 0x271811252953f84a, 0xe5840293d5466a8e, 0x4d9bbc3e24e5f20e, - 0xea35bc29cfa2794b, 0x18e21b4bf59e2d28, 0x1e3b9fc632ef6adb, 0x25d643627a05e678, - 0x5a3f1bb1ecb63263, 0xdb7f0238ca031e31, 0xb462065960bfc4c4, 0x49c24ae463c280f4, - 0xd793862c6f7b901a, 0xaadd1106bdce475e, 0xc43b6e0eed8ad58f, 0xe29024c1f2060cb7, - 0x5e50c2755efbe17a, 0x10383f20ac183625, 0x38e8ee9d8a8a435d, 0xdd511837bcc52452, - 0x7750059861a7da6a, 0x86ab99b518d1dbef, 0xb1204f608ccfe33b, 0xef61ac84d8dfca49, - 0x1bbcd90f1f4eff36, 0x0cd1dabd9be9850a, 0x11a3ae5bf354bb11, 0xf755bfef11bb5516, - 0xa3b832506e2f3adb, 0x516306f4b617e6ba, 0xddb4ac4a2aeead3a, 0x64bb6dec62af4430, - 0xf9cc95c29895a152, 
0x08d37f75632771b9, 0xeec49b619cee6b56, 0xf143933b56b3711a, - 0xe4c5dd82b9f6570c, 0xe7ad775756eefdc4, 0x92c2318bc834ef78, 0x739c25f93007aa0a, - 0x5636caca1725f788, 0xdd8f909af47cd0b6, 0xc6401fe16bc24d4e, 0x8ad97b342e6b3a3c, - 0x0c49366bb7be8ce2, 0x0784d3d2f4b39fb5, 0x530fb67ec5d77a58, 0x41049229b8221f3b, - 0x139542347cb606a3, 0x9cb0bd5ee62e6438, 0x02e3f615c4d3054a, 0x985d4f4adefb64a0, - 0x775b9feb32053cde, 0x304265a64d6c1ba6, 0x593664c3be7acd42, 0x4f0a2e5fd2bd6718, - 0xdd611f10619bf1da, 0xd8185f9b3e74f9a4, 0xef87139d126ec3b3, 0x3ba71336dd67f99b, - 0x7d3a455d8d808091, 0x660d32e15cbdecc7, 0x297a863f5af2b9ff, 0x90e0a736e6b434df, - 0x549f80ce7a12182e, 0x0f73b29235fb5b84, 0x16bf1f74056e3a01, 0x6d1f5a593019a39f, - 0x02ff876fa73f6305, 0xc5cb72a2fb9a5bd7, 0x8470f39d674dfaa3, 0x25abb3f1e41aea30, - 0x23eb8cc9c32951c7, 0xd687ba56242ac4ea, 0xda8d9e915d2de6b7, 0xe3cbdc7d938d8f1e, - 0xb9a8c9b4001efad6, 0xc0d28a5c64f2285c, 0x45d7ac9b878575b8, 0xeeb76e39d8da283e, - 0x3d06c8bd2fc7daac, 0x9c9c9820c13589f5, 0x65700b51db40bae3, 0x911f451579044242, - 0x7ae6849ff1fee8cc, 0x3bb340ebba896ae5, 0xb46e9d8bb71f0b4b, 0x8dcf22f9e1bde2a3, - 0x77bdaeda8cc55427, 0xf19e400ababa0e12, 0xc368a34939eb5c7f, 0x9ef1cd612c03bc5e, - 0xe89cd8553b94bbd8, 0x5cd377dcb4550713, 0xa7b0fb78cd4c5665, 0x7684403ef76c7128, - 0x5fa3f06f79c4f483, 0x8df57ac159dbade6, 0x2db01efa321b2625, 0x54846de4cfd58cb6, - 0xba674538aa20f5cd, 0x541d4963699f9777, 0xe9096784dadaa548, 0xdfe8992458bf85ff, - 0xece5a71e74a35593, 0x5ff98fd5ff1d14fd, 0x83e89419524c06e1, 0x5922040b6ef03286, - 0xf97d750eab002858, 0x5080d4c2dba7b3ec, 0xa7de115ba038b508, 0x6a9242acb5f37ec0, - 0xf7856ef865619ed0, 0x2265fc930dbd7a89, 0x17dfc8e5022c723b, 0x9001a64248f2d676, - 0x90004c13b0b8b50e, 0xb932b7cfc63485b0, 0xa0b1df81fd4c2bc5, 0x8ef1dd26b594c383, - 0x0541a4f9d20ba562, 0x9e611061be0a3c5b, 0xb3767e80e1e1624a, 0x0098d57820a88c6b, - 0x31d191cd71e01691, 0x410fefafbf90a57a, 0xbdf8f2433633aea8, 0x9e8cd55b9cc11c28, - 0xde122bec4acb869f, 0x4d001fd5b0b03314, 
0xca66370067416209, 0x2f2339d6399888c6, - 0x6d1a7918f7c98a13, 0xdf9a493995f688f3, 0xebc2151f4ded22ca, 0x03cc2ba8a2bab82f, - 0xd341d03844ad9a9b, 0x387cb5d273ab3f58, 0xbba2515f74a7a221, 0x7248fe7737f37d9c, - 0x4d61e56a7437f6b9, 0x262e963c9e54bef8, 0x59e89b097477d296, 0x055d5b52b9e47452, - 0x82b27eb36e430708, 0xd30094caf3080f94, 0xcf5cb38227c2a3be, 0xfeed4db701262c7c, - 0x41703f5391dd0154, 0x5eeea9412666f57b, 0x4cd1f1b196abdbc4, 0x4a20358594b3662b, - 0x1478d361e4b47c26, 0x6f02dc0801d2c79f, 0x296a202eeb03c4b6, 0x2afd6799aec20c38, - 0x7acfd96f3050383d, 0x6798ba0c380dfdd3, 0x34c6f57b3de02c88, 0x5736e1baf82eb8a0, - 0x20057d2a0e58b8de, 0x3dea5bd5eb6e1404, 0x16e50d89874a6a98, 0x29bff3eccbfba19a, - 0x475cd3207974793c, 0x18a42105cde34cfa, 0x023e7414b0618331, 0x151471081b52594b, - 0xe4a3dff23bdeb0f3, 0x01a8d1a588c232ef, 0x11b4c74ee221d621, 0xe587cc0dce129c8c, - 0x1ff7327025a65080, 0x594e29c44b8602b1, 0xf6f31db1f5a56fd3, 0xc02ac5e4c7258a5e, - 0xe70201e9c5dc598f, 0x6f90ff3b9b3560b2, 0x42747a7262faf016, 0xd1f507e496927d26, - 0x1c86d265fdd24cd9, 0x3996ce73f6b5266e, 0x8e7fba02d68a061e, 0xba0dec71548b7546, - 0x9e9cbd785b8d8f40, 0xdae86459f6b3828c, 0xdebe08541314f71d, 0xa49229d29501358f, - 0x7be5ba0010c4df7c, 0xa3c95eaf09ecc39c, 0x0230bca8f5d457cd, 0x4135c2bedc68cdf9, - 0x166fc0cc4d5b20cc, 0x3762b59aa3236e6e, 0xe8928a4ceed163d2, 0x2a440b51b71223d9, - 0x80cefd2bb5f48e46, 0xbb9879c738328b71, 0x6e7c8f1ab47cced0, 0x164bb2de257ffc0a, - 0xf3c12fe5b800ea30, 0x40b9e92309e8c7e1, 0x551f5b0fe3b8d017, 0x25032aa7d4fc7aba, - 0xaaed340795de0a0a, 0x8ffd96bc38c8ba0f, 0x70fc91eb8aa58833, 0x7f795e2a97566d73, - 0x4543d9df72c4831d, 0xf172d73e69f20739, 0xdfd1c4ff1eb3d868, 0xbc8dfb62d26376f7, + 0xb585f766f2144405, 0x7746a55f43921ad7, 0xb2fb0d31cee799b4, 0x0f6760a4803427d7, + 0xe10d666650f4e012, 0x8cae14cb07d09bf1, 0xd438539c95f63e9f, 0xef781c7ce35b4c3d, + 0xcdc4a239b0c44426, 0x277fa208bf337bff, 0xe17653a29da578a1, 0xc54302f225db2c76, + 0x86287821f722c881, 0x59cd1a8a41c18e55, 0xc3b919ad495dc574, 
0xa484c4c5ef6a0781, + 0x308bbd23dc5416cc, 0x6e4a40c18f30c09c, 0x9a2eedb70d8f8cfa, 0xe360c6e0ae486f38, + 0xd5c7718fbfc647fb, 0xc35eae071903ff0b, 0x849c2656969c4be7, 0xc0572c8c08cbbbad, + 0xe9fa634a21de0082, 0xf56f6d48959a600d, 0xf7d713e806391165, 0x8297132b32825daf, + 0xad6805e0e30b2c8a, 0xac51d9f5fcf8535e, 0x502ad7dc18c2ad87, 0x57a1550c110b3041, + 0x66bbd30e6ce0e583, 0x0da2abef589d644e, 0xf061274fdb150d61, 0x28b8ec3ae9c29633, + 0x92a756e67e2b9413, 0x70e741ebfee96586, 0x019d5ee2af82ec1c, 0x6f6f2ed772466352, + 0x7cf416cfe7e14ca1, 0x61df517b86a46439, 0x85dc499b11d77b75, 0x4b959b48b9c10733, + 0xe8be3e5da8043e57, 0xf5c0bc1de6da8699, 0x40b12cbf09ef74bf, 0xa637093ecb2ad631, + 0x3cc3f892184df408, 0x2e479dc157bf31bb, 0x6f49de07a6234346, 0x213ce7bede378d7b, + 0x5b0431345d4dea83, 0xa2de45780344d6a1, 0x7103aaf94a7bf308, 0x5326fc0d97279301, + 0xa9ceb74fec024747, 0x27f8ec88bb21b1a3, 0xfceb4fda1ded0893, 0xfac6ff1346a41675, + 0x7131aa45268d7d8c, 0x9351036095630f9f, 0xad535b24afc26bfb, 0x4627f5c6993e44be, + 0x645cf794b8f1cc58, 0x241c70ed0af61617, 0xacb8e076647905f1, 0x3737e9db4c4f474d, + 0xe7ea5e33e75fffb6, 0x90dee49fc9bfc23a, 0xd1b1edf76bc09c92, 0x0b65481ba645c602, + 0x99ad1aab0814283b, 0x438a7c91d416ca4d, 0xb60de3bcc5ea751c, 0xc99cab6aef6f58bc, + 0x69a5ed92a72ee4ff, 0x5e7b329c1ed4ad71, 0x5fc0ac0800144885, 0x32db829239774eca, + 0x0ade699c5830f310, 0x7cc5583b10415f21, 0x85df9ed2e166d64f, 0x6604df4fee32bcb1, + 0xeb84f608da56ef48, 0xda608834c40e603d, 0x8f97fe408061f183, 0xa93f485c96f37b89, + 0x6704e8ee8f18d563, 0xcee3e9ac1e072119, 0x510d0e65e2b470c1, 0xf6323f486b9038f0, + 0x0b508cdeffa5ceef, 0xf2417089e4fb3cbd, 0x60e75c2890d15730, 0xa6217d8bf660f29c, + 0x7159cd30c3ac118e, 0x839b4e8fafead540, 0x0d3f3e5e82920adc, 0x8f7d83bddee7bba8, + 0x780f2243ea071d06, 0xeb915845f3de1634, 0xd19e120d26b6f386, 0x016ee53a7e5fecc6, + 0xcb5fd54e7933e477, 0xacb8417879fd449f, 0x9c22190be7f74732, 0x5d693c1ba3ba3621, + 0xdcef0797c2b69ec7, 0x3d639263da827b13, 0xe273fd971bc8d0e7, 0x418f02702d227ed5, + 
0x8c25fda3b503038c, 0x2cbaed4daec8c07c, 0x5f58e6afcdd6ddc2, 0x284650ac5e1b0eba, + 0x635b337ee819dab5, 0x9f9a036ed4f2d49f, 0xb93e260cae5c170e, 0xb0a7eae879ddb76d, + 0xd0762cbc8ca6570c, 0x34c6efb812b04bf5, 0x40bf0ab5fa14c112, 0xb6b570fc7c5740d3, + 0x5a27b9002de33454, 0xb1a5b165b6d2b2d2, 0x8722e0ace9d1be22, 0x788ee3b37e5680fb, + 0x14a726661551e284, 0x98b7672f9ef3b419, 0xbb93ae776bb30e3a, 0x28fd3b046380f850, + 0x30a4680593258387, 0x337dc00c61bd9ce1, 0xd5eca244c7a4ff1d, 0x7762638264d279bd, + 0xc1e434bedeefd767, 0x0299351a53b8ec22, 0xb2d456e4ad251b80, 0x3e9ed1fda49cea0b, + 0x2972a92ba450bed8, 0x20216dd77be493de, 0xadffe8cf28449ec6, 0x1c4dbb1c4c27d243, + 0x15a16a8a8322d458, 0x388a128b7fd9a609, 0x2300e5d6baedf0fb, 0x2f63aa8647e15104, + 0xf1c36ce86ecec269, 0x27181125183970c9, 0xe584029370dca96d, 0x4d9bbc3e02f1cfb2, + 0xea35bc29692af6f8, 0x18e21b4beabb4137, 0x1e3b9fc625b554f4, 0x25d64362697828fd, + 0x5a3f1bb1c53a9645, 0xdb7f023869fb8d38, 0xb462065911d4e1fc, 0x49c24ae4437d8030, + 0xd793862c112b0566, 0xaadd1106730d8feb, 0xc43b6e0e97b0d568, 0xe29024c18ee6fca2, + 0x5e50c27535b88c66, 0x10383f20a4ff9a87, 0x38e8ee9d71a45af8, 0xdd5118375bf1a9b9, + 0x775005982d74d7f7, 0x86ab99b4dde6c8b0, 0xb1204f603f51c080, 0xef61ac8470250ecf, + 0x1bbcd90f132c603f, 0x0cd1dabd964db557, 0x11a3ae5beb9d1ec9, 0xf755bfeea585d11d, + 0xa3b83250268ea4d7, 0x516306f4927c93af, 0xddb4ac49c9efa1da, 0x64bb6dec369d4418, + 0xf9cc95c22b4c1fcc, 0x08d37f755f4ae9f6, 0xeec49b613478675b, 0xf143933aed25e0b0, + 0xe4c5dd8255dfc622, 0xe7ad7756f193198e, 0x92c2318b87fff9cb, 0x739c25f8fd73596d, + 0x5636cac9f16dfed0, 0xdd8f909a938e0172, 0xc6401fe115063f5b, 0x8ad97b33f1ac1455, + 0x0c49366bb25e8513, 0x0784d3d2f1698309, 0x530fb67ea1809a81, 0x410492299bb01f49, + 0x139542347424b9ac, 0x9cb0bd5ea1a1115e, 0x02e3f615c38f49a1, 0x985d4f4a9c5291ef, + 0x775b9feafdcd26e7, 0x304265a6384f0f2d, 0x593664c39773012c, 0x4f0a2e5fb028f2ce, + 0xdd611f1000c17442, 0xd8185f9adfea4fd0, 0xef87139ca9a3ab1e, 0x3ba71336c34ee133, + 0x7d3a455d56b70238, 
0x660d32e130182684, 0x297a863f48cd1f43, 0x90e0a736a751ebb7, + 0x549f80ce550c4fd3, 0x0f73b2922f38bd64, 0x16bf1f73fb7a9c3f, 0x6d1f5a59005bec17, + 0x02ff876fa5ef97c4, 0xc5cb72a2a51159b0, 0x8470f39d2d5c900e, 0x25abb3f1d39fcb76, + 0x23eb8cc9b372442f, 0xd687ba55c64f6364, 0xda8d9e90fd8ff158, 0xe3cbdc7d2fe45ea7, + 0xb9a8c9b3aee52297, 0xc0d28a5c10960bd3, 0x45d7ac9b68f71a34, 0xeeb76e397069e804, + 0x3d06c8bd1514e2d9, 0x9c9c98207cb10767, 0x65700b51aedfb5ef, 0x911f451539869408, + 0x7ae6849fbc3a0ec6, 0x3bb340eba06afe7e, 0xb46e9d8b682ea65e, 0x8dcf22f9a3b34356, + 0x77bdaeda586257a7, 0xf19e400a5104d20d, 0xc368a348e46d950f, 0x9ef1cd60e679f284, + 0xe89cd854d5d01d33, 0x5cd377dc8bb882a2, 0xa7b0fb7883eee860, 0x7684403ec392950d, + 0x5fa3f06f4fed3b52, 0x8df57ac11bc04831, 0x2db01efa1e1e1897, 0x54846de4aadb9ca2, + 0xba6745385893c784, 0x541d496344d2c75b, 0xe909678474e687fe, 0xdfe89923f6c9c2ff, + 0xece5a71e0cfedc75, 0x5ff98fd5d51fe610, 0x83e8941918964615, 0x5922040b47f150c1, + 0xf97d750e3dd94521, 0x5080d4c2b86f56d7, 0xa7de115b56c78d70, 0x6a9242ac87538194, + 0xf7856ef7f9173e44, 0x2265fc92feb0dc09, 0x17dfc8e4f7ba8a57, 0x9001a64209f21db8, + 0x90004c1371b893c5, 0xb932b7cf752e5545, 0xa0b1df81b6fe59fc, 0x8ef1dd26770af2c2, + 0x0541a4f9cfbeed35, 0x9e61106178bfc530, 0xb3767e80935d8af2, 0x0098d5782065af06, + 0x31d191cd5c1466c7, 0x410fefafa319ac9d, 0xbdf8f242e316c4ab, 0x9e8cd55b57637ed0, + 0xde122bebe9a39368, 0x4d001fd58f002526, 0xca6637000eb4a9f8, 0x2f2339d624f91f78, + 0x6d1a7918c80df518, 0xdf9a4939342308e9, 0xebc2151ee6c8398c, 0x03cc2ba8a1116515, + 0xd341d037e840cf83, 0x387cb5d25af4afcc, 0xbba2515f22909e87, 0x7248fe7705f38e47, + 0x4d61e56a525d225a, 0x262e963c8da05d3d, 0x59e89b094d220ec2, 0x055d5b52b78b9c5e, + 0x82b27eb33514ef99, 0xd30094ca96b7ce7b, 0xcf5cb381cd0a1535, 0xfeed4db6919e5a7c, + 0x41703f53753be59f, 0x5eeea940fcde8b6f, 0x4cd1f1b175100206, 0x4a20358574454ec0, + 0x1478d361dbbf9fac, 0x6f02dc07d141875c, 0x296a202ed8e556a2, 0x2afd67999bf32ee5, + 0x7acfd96efa95491d, 0x6798ba0c0abb2c6d, 
0x34c6f57b26c92122, 0x5736e1bad206b5de, + 0x20057d2a0056521b, 0x3dea5bd5d0578bd7, 0x16e50d897d4634ac, 0x29bff3ecb9b7a6e3, + 0x475cd3205a3bdcde, 0x18a42105c31b7e88, 0x023e7414af663068, 0x15147108121967d7, + 0xe4a3dff1d7d6fef9, 0x01a8d1a588085737, 0x11b4c74eda62beef, 0xe587cc0d69a73346, + 0x1ff7327017aa2a6e, 0x594e29c42473d06b, 0xf6f31db1899b12d5, 0xc02ac5e47312d3ca, + 0xe70201e960cb78b8, 0x6f90ff3b6a65f108, 0x42747a7245e7fa84, 0xd1f507e43ab749b2, + 0x1c86d265f15750cd, 0x3996ce73dd832c1c, 0x8e7fba02983224bd, 0xba0dec7103255dd4, + 0x9e9cbd781628fc5b, 0xdae8645996edd6a5, 0xdebe0853b1a1d378, 0xa49229d24d014343, + 0x7be5b9ffda905e1c, 0xa3c95eaec244aa30, 0x0230bca8f4df0544, 0x4135c2bebfe148c6, + 0x166fc0cc438a3c72, 0x3762b59a8ae83efa, 0xe8928a4c89114750, 0x2a440b51a4945ee5, + 0x80cefd2b7d99ff83, 0xbb9879c6e61fd62a, 0x6e7c8f1a84265034, 0x164bb2de1bbeddc8, + 0xf3c12fe54d5c653b, 0x40b9e922ed9771e2, 0x551f5b0fbe7b1840, 0x25032aa7c4cb1811, + 0xaaed34074b164346, 0x8ffd96bbf9c9c81d, 0x70fc91eb5937085c, 0x7f795e2a5f915440, + 0x4543d9df5476d3cb, 0xf172d73e004fc90d, 0xdfd1c4febcc81238, 0xbc8dfb627fe558fc, ]; const WIDTH: usize = SPONGE_WIDTH; @@ -153,9 +153,10 @@ pub trait Poseidon: PrimeField64 { // times number of rounds. const N_ROUND_CONSTANTS: usize = WIDTH * N_ROUNDS; - // Use the MDS matrix which is circulant with entries 2^x for each - // x in MDS_MATRIX_EXPS. - const MDS_MATRIX_EXPS: [u64; WIDTH]; + // The MDS matrix we use is C + D, where C is the circulant matrix whose first row is given by + // `MDS_MATRIX_CIRC`, and D is the diagonal matrix whose diagonal is given by `MDS_MATRIX_DIAG`. + const MDS_MATRIX_CIRC: [u64; WIDTH]; + const MDS_MATRIX_DIAG: [u64; WIDTH]; // Precomputed constants for the fast Poseidon calculation. See // the paper. 
@@ -169,9 +170,10 @@ pub trait Poseidon: PrimeField64 { #[unroll_for_loops] fn mds_row_shf(r: usize, v: &[u64; WIDTH]) -> u128 { debug_assert!(r < WIDTH); - // The values of MDS_MATRIX_EXPS are known to be small, so we can - // accumulate all the products for each row and reduce just once - // at the end (done by the caller). + // The values of `MDS_MATRIX_CIRC` and `MDS_MATRIX_DIAG` are + // known to be small, so we can accumulate all the products for + // each row and reduce just once at the end (done by the + // caller). // NB: Unrolling this, calculating each term independently, and // summing at the end, didn't improve performance for me. @@ -180,9 +182,10 @@ pub trait Poseidon: PrimeField64 { // This is a hacky way of fully unrolling the loop. for i in 0..12 { if i < WIDTH { - res += (v[(i + r) % WIDTH] as u128) << Self::MDS_MATRIX_EXPS[i]; + res += (v[(i + r) % WIDTH] as u128) * (Self::MDS_MATRIX_CIRC[i] as u128); } } + res += (v[r] as u128) * (Self::MDS_MATRIX_DIAG[r] as u128); res } @@ -196,8 +199,9 @@ pub trait Poseidon: PrimeField64 { let mut res = F::ZERO; for i in 0..WIDTH { - res += v[(i + r) % WIDTH] * F::from_canonical_u64(1 << Self::MDS_MATRIX_EXPS[i]); + res += v[(i + r) % WIDTH] * F::from_canonical_u64(Self::MDS_MATRIX_CIRC[i]); } + res += v[r] * F::from_canonical_u64(Self::MDS_MATRIX_DIAG[r]); res } @@ -215,9 +219,13 @@ pub trait Poseidon: PrimeField64 { let mut res = builder.zero_extension(); for i in 0..WIDTH { - let c = Self::from_canonical_u64(1 << ::MDS_MATRIX_EXPS[i]); + let c = Self::from_canonical_u64(::MDS_MATRIX_CIRC[i]); res = builder.mul_const_add_extension(c, v[(i + r) % WIDTH], res); } + { + let c = Self::from_canonical_u64(::MDS_MATRIX_DIAG[r]); + res = builder.mul_const_add_extension(c, v[r], res); + } res } @@ -395,7 +403,8 @@ pub trait Poseidon: PrimeField64 { } } let s0 = state[0].to_noncanonical_u64() as u128; - d_sum = add_u160_u128(d_sum, s0 << Self::MDS_MATRIX_EXPS[0]); + let mds0to0 = (Self::MDS_MATRIX_CIRC[0] + 
Self::MDS_MATRIX_DIAG[0]) as u128; + d_sum = add_u160_u128(d_sum, s0 * mds0to0); let d = reduce_u160::(d_sum); // result = [d] concat [state[0] * v + state[shift up by 1]] @@ -416,7 +425,8 @@ pub trait Poseidon: PrimeField64 { r: usize, ) -> [F; WIDTH] { let s0 = state[0]; - let mut d = s0 * F::from_canonical_u64(1 << Self::MDS_MATRIX_EXPS[0]); + let mds0to0 = Self::MDS_MATRIX_CIRC[0] + Self::MDS_MATRIX_DIAG[0]; + let mut d = s0 * F::from_canonical_u64(mds0to0); for i in 1..WIDTH { let t = F::from_canonical_u64(Self::FAST_PARTIAL_ROUND_W_HATS[r][i - 1]); d += state[i] * t; @@ -442,10 +452,8 @@ pub trait Poseidon: PrimeField64 { Self: RichField + Extendable, { let s0 = state[0]; - let mut d = builder.mul_const_extension( - Self::from_canonical_u64(1 << ::MDS_MATRIX_EXPS[0]), - s0, - ); + let mds0to0 = Self::MDS_MATRIX_CIRC[0] + Self::MDS_MATRIX_DIAG[0]; + let mut d = builder.mul_const_extension(Self::from_canonical_u64(mds0to0), s0); for i in 1..WIDTH { let t = ::FAST_PARTIAL_ROUND_W_HATS[r][i - 1]; let t = Self::Extension::from_canonical_u64(t); diff --git a/plonky2/src/hash/poseidon_goldilocks.rs b/plonky2/src/hash/poseidon_goldilocks.rs index 7b82bb01..971fda0f 100644 --- a/plonky2/src/hash/poseidon_goldilocks.rs +++ b/plonky2/src/hash/poseidon_goldilocks.rs @@ -10,8 +10,8 @@ use crate::hash::poseidon::{Poseidon, N_PARTIAL_ROUNDS}; #[rustfmt::skip] impl Poseidon for GoldilocksField { - // The MDS matrix we use is the circulant matrix with first row given by the vector - // [ 2^x for x in MDS_MATRIX_EXPS] = [1, 1, 2, 1, 8, 32, 2, 256, 4096, 8, 65536, 1024] + // The MDS matrix we use is C + D, where C is the circulant matrix whose first row is given by + // `MDS_MATRIX_CIRC`, and D is the diagonal matrix whose diagonal is given by `MDS_MATRIX_DIAG`. 
// // WARNING: If the MDS matrix is changed, then the following // constants need to be updated accordingly: @@ -19,253 +19,254 @@ impl Poseidon for GoldilocksField { // - FAST_PARTIAL_ROUND_VS // - FAST_PARTIAL_ROUND_W_HATS // - FAST_PARTIAL_ROUND_INITIAL_MATRIX - const MDS_MATRIX_EXPS: [u64; 12] = [0, 0, 1, 0, 3, 5, 1, 8, 12, 3, 16, 10]; + const MDS_MATRIX_CIRC: [u64; 12] = [17, 15, 41, 16, 2, 28, 13, 13, 39, 18, 34, 20]; + const MDS_MATRIX_DIAG: [u64; 12] = [8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; const FAST_PARTIAL_FIRST_ROUND_CONSTANT: [u64; 12] = [ - 0x3cc3f89232e3b0c8, 0x3a8304bc56985013, 0x2a9f75c2280d2a8e, 0x53b9e0fac07c9b2b, - 0x276ef5190ab36dd6, 0xdccc95c1f434ce8d, 0x28d717d689301db6, 0x2662f1723650b872, - 0xc6b0375cf47850da, 0xbdfcca7661d81f17, 0x911992a4f6d9591f, 0xb718e4720c9f542f, + 0x3cc3f892184df408, 0xe993fd841e7e97f1, 0xf2831d3575f0f3af, 0xd2500e0a350994ca, + 0xc5571f35d7288633, 0x91d89c5184109a02, 0xf37f925d04e5667b, 0x2d6e448371955a69, + 0x740ef19ce01398a1, 0x694d24c0752fdf45, 0x60936af96ee2f148, 0xc33448feadc78f0c, ]; const FAST_PARTIAL_ROUND_CONSTANTS: [u64; N_PARTIAL_ROUNDS] = [ - 0x1c92804be083d129, 0x81d932f4620fcfc6, 0x29f58a72045f76a0, 0x434472d6c6e34f30, - 0xc82c90fad781bb5c, 0xe6dfefae3135c450, 0xd0a0c9c9fff4798f, 0x97517f4034e7c8e6, - 0xae8b5030952e5949, 0xf77251b77cc297e2, 0x879c3a97606f1160, 0xed4e1e98780bdc19, - 0x5a9120e0c05b1660, 0xc4b244ea04b27221, 0x7fe9d55a335d7b82, 0xd69ff91c66ec999a, - 0x4c389b1b8180f1f5, 0x1b289f8c7fdeea1e, 0x3d464c75140b20e7, 0x74d158e1be40eb73, - 0xfc787193d2a84ea4, 0x0, + 0x74cb2e819ae421ab, 0xd2559d2370e7f663, 0x62bf78acf843d17c, 0xd5ab7b67e14d1fb4, + 0xb9fe2ae6e0969bdc, 0xe33fdf79f92a10e8, 0x0ea2bb4c2b25989b, 0xca9121fbf9d38f06, + 0xbdd9b0aa81f58fa4, 0x83079fa4ecf20d7e, 0x650b838edfcc4ad3, 0x77180c88583c76ac, + 0xaf8c20753143a180, 0xb8ccfe9989a39175, 0x954a1729f60cc9c5, 0xdeb5b550c4dca53b, + 0xf01bb0b00f77011e, 0xa1ebb404b676afd9, 0x860b6e1597a0173e, 0x308bb65a036acbce, + 0x1aca78f31c97c876, 0x0, ]; const 
FAST_PARTIAL_ROUND_VS: [[u64; 12 - 1]; N_PARTIAL_ROUNDS] = [ - [0x9a5dd25dc32e6569, 0xd4b82de00e7510fa, 0x165bdcd7b344404a, 0xa85b4c126b8edfd4, - 0xcd2735bf92ab4f96, 0xdc07742c7da8ac41, 0x953fc266fc5ae49f, 0x0a151c20bfc847bf, - 0x0c550caef5afedb5, 0x74d28901888c5fa8, 0xdc51b68c30cc1741, ], - [0x4f765e0a4246c828, 0xbbdc8cbadd477a84, 0x052a5abd7de2344c, 0xab88daa04d9c7fab, - 0xbc8fd7acbee798ef, 0xe55d796c0d8a7a09, 0x40824732ed2c556c, 0x298a94d56eabeaa4, - 0x719fcd5e11312b6c, 0x1ec9a560131d1ac7, 0xabc54a42497f7fd1, ], - [0xb51f81e6eeeeb0d6, 0xc6f3c34e7161d1ef, 0x1e93b9e2255eed5b, 0xa78338e63ec48cc2, - 0xea6e89d1c7220a56, 0xaa52f6a1c2814bc5, 0x5896b6395e09fba0, 0xf7fc97a18d5f1eee, - 0xf2712e64111823e8, 0x4f84821bf1f857f4, 0x02041415d72da206, ], - [0x39286a4a4a391e77, 0x4ac16c7bebc97214, 0x7427cbbcb895a01f, 0x2ef8491d0b14759b, - 0xbec7625ee20fa616, 0x7c64393faf749b6f, 0x0f61c751c9826dc5, 0x700e6f3ee8ccb8a7, - 0x5bdea3b447ef8667, 0xa0f569a5a6e97588, 0xcc9e78115d7cae2d, ], - [0x0933079ab678e5ee, 0xed6861bf33c54a28, 0x62503e6e1749a497, 0x745a9c65dea83ac6, - 0x20ce351f6e700cf0, 0x2ec0b18d30fafb8a, 0x0312f54c22b5f299, 0x5222977218fd6cd5, - 0x82662e8445868eec, 0xc4cab6335040265d, 0x12e5790e9efb9217, ], - [0x0d829aec63871f55, 0x384d8a425086dd8c, 0x13e78b54657bfd3e, 0x2a45a17a03093566, - 0x7b6872656233b9be, 0xddc0281bb12bbb4c, 0xa224ebff0652d7c8, 0xc5ca97207780ea5c, - 0x484236194d3586ba, 0x432a56d44a44f3f7, 0xc41f926f862fc532, ], - [0x9366cd7ed9ef5e06, 0xd7f941098175f223, 0x9af7dda3e1c9f2b1, 0x9a0ec6d0a03525f5, - 0x3ab244f4fb0fb387, 0xd8c4e357eb1d5778, 0xe62157e2e25edbbb, 0xafcd6630f841f1f8, - 0xc3969199738708fb, 0xa8224d311e6a551f, 0xc2c0a01fc655fd9f, ], - [0xd78498f2013cd9b6, 0x675d21a200b2908c, 0x70bfd23b9e88c707, 0x85472dcbcfd078e3, - 0x5658c961cfffd574, 0x89e05a2cda3ca315, 0x1b51ae1ff8186a9f, 0xca648f8c6c7822cb, - 0x7233c92647957f4d, 0x520bf21c62d37ffa, 0x897496c7407a2ca7, ], - [0x8e80cf5bca4eee19, 0x754779126bc1afcf, 0x07e887764b379cb0, 0x7dc7c14e12f91d5e, - 
0xc8f5dab5fb6b0264, 0x1c842cf8021f9176, 0x69b56a7e2e2db2c0, 0xf30253f77fef3445, - 0x14bb3a62919efb99, 0xff9976d424a5d89c, 0x59dde7be0331a202, ], - [0xdbe04b62126330a2, 0x0409b2138da1eaec, 0x7bd4558eb2262691, 0xafa86cfa8d52b05b, - 0xb83f570197d8c584, 0xb3ded6cc13990ac1, 0xfd33937cb072c9e1, 0xe3b3989341d92952, - 0xd26e76d6ca949ad9, 0x35c89a8548f88e86, 0x8af785bd940c3b43, ], - [0xcbf3b86701c790da, 0x63634f67e29f4005, 0x008f903982363b81, 0xc2b07f99d6eb0229, - 0xa8344b83d15e2558, 0x880f4e5fd103b7b0, 0xd40eddb0a5929072, 0x476e27ccee571f49, - 0xe71439b4b989f9eb, 0x97e55074f852b2fe, 0xdd258c2137e1a2c5, ], - [0x982b90366d23259b, 0xb2667eacaa76b306, 0xecf233e82020ede1, 0x3cee7ac07d4a88c7, - 0x31428be2fe5a5854, 0xf1beea1d55c4c4db, 0x584fd6b580f1ffd2, 0x6e2381c3c8ba0d0b, - 0x21ab749cbafc0611, 0x8ed389f39aba3001, 0xa24ba694f2b42f13, ], - [0xdb30cd9db02606f9, 0x1b0d6736682ba257, 0x0d3bcdecf5808443, 0x31c330001dbd3dbd, - 0x9684d22370447946, 0xde0e24e6426c6935, 0xf487270dd081ef69, 0xd943f4ef48f2b252, - 0x4c52a7fdd1c52d24, 0xc293082029ea139d, 0xc2ba73ab3da0468a, ], - [0xd093bd0dcc74e0d1, 0xe91428f9ce6a98e5, 0x673dee716909dc21, 0xf22e3223548219d7, - 0x3297978d881a1300, 0x51157b1e8218d77c, 0x0e3b0a5c07843889, 0x273b48dfa36752b6, - 0x5dbf2c6323576866, 0x1c032b70763df9a7, 0x1a8d7ed4159ecbf4, ], - [0x8e40b29fa6c4f3ad, 0x43bc06dba91daa9b, 0x445df1620dd6d846, 0xae1e72ed68c45c46, - 0x496ee4e593ade46d, 0x1d3642eddce9118f, 0x71a88114bd8fd755, 0x4a10d6b22514943d, - 0x56dca305d4d72fee, 0xe2e4d9ce95fa62bf, 0xfb6bfffd47b50b0a, ], - [0x4c6c14946cc557ee, 0x9b1bcbaac7ba3226, 0xdd7410361fa0dd20, 0x9c8a098cbaf95b26, - 0x3da4f26593503adf, 0xffb07b45cd3bf859, 0xaf034373af54a559, 0xd6b9bace407146bb, - 0x7b92c04c972f4ec6, 0xfe71df71165b9845, 0xad0134b9dc9ebe51, ], - [0xfdaa64ceec88aa7c, 0x565342e2d815525c, 0xe382458f259429a8, 0x0f6ba5afd5d1d1ca, - 0xcba85de412439a41, 0x212d3c62049ccb1a, 0x930c0bf5950267e3, 0x60f87fe43fc560d8, - 0x8f1fbdbcd878a33b, 0xd28b789abf9af16f, 0xd921f0434fa0eb07, ], - 
[0xd69c2c80635e7c18, 0x5a3d78c8772f293f, 0x844fe5e72ad1ceb5, 0x81b217e5910dc916, - 0x2951409fb7c8ba85, 0x5c135dd95693e367, 0xc2e8a723f9f7ebd2, 0x10bb79bf5d63f38d, - 0x34625b1550385a89, 0xdc6235328d791163, 0x1eb12b7aed4d5133, ], - [0x01426faca89577d0, 0x003ca90136ac4fd0, 0x00289223dc45a17f, 0x0009921704320612, - 0x0007efae3669e451, 0x006499f206b3349d, 0x1001120d9b5dcfe1, 0x000e3aa47db4da94, - 0x0320dc8339d35692, 0x4030a0a16247ecbd, 0x04368a659c160a6b, ], - [0x0000001237b408f0, 0x00000004c8f1b79c, 0x0000000446de5309, 0x00000032a3e2d4ac, - 0x00000c007600eeb7, 0x000100040ee771b0, 0x00000198394d0817, 0x0000301810a981ba, - 0x0000030f37d86f5a, 0x0000030ab1cc04d4, 0x000000c0e7c0b7e9, ], - [0x00000000000234a0, 0x0000000000114630, 0x000000000800260c, 0x0000000100005288, - 0x0000000000900194, 0x00000000200800a3, 0x0000000002011034, 0x000000000105100e, - 0x0000000000604025, 0x0000000000114a03, 0x0000000000061481, ], - [0x0000000000000400, 0x0000000000010000, 0x0000000000000008, 0x0000000000001000, - 0x0000000000000100, 0x0000000000000002, 0x0000000000000020, 0x0000000000000008, - 0x0000000000000001, 0x0000000000000002, 0x0000000000000001, ], + [0x94877900674181c3, 0xc6c67cc37a2a2bbd, 0xd667c2055387940f, 0x0ba63a63e94b5ff0, + 0x99460cc41b8f079f, 0x7ff02375ed524bb3, 0xea0870b47a8caf0e, 0xabcad82633b7bc9d, + 0x3b8d135261052241, 0xfb4515f5e5b0d539, 0x3ee8011c2b37f77c, ], + [0x0adef3740e71c726, 0xa37bf67c6f986559, 0xc6b16f7ed4fa1b00, 0x6a065da88d8bfc3c, + 0x4cabc0916844b46f, 0x407faac0f02e78d1, 0x07a786d9cf0852cf, 0x42433fb6949a629a, + 0x891682a147ce43b0, 0x26cfd58e7b003b55, 0x2bbf0ed7b657acb3, ], + [0x481ac7746b159c67, 0xe367de32f108e278, 0x73f260087ad28bec, 0x5cfc82216bc1bdca, + 0xcaccc870a2663a0e, 0xdb69cd7b4298c45d, 0x7bc9e0c57243e62d, 0x3cc51c5d368693ae, + 0x366b4e8cc068895b, 0x2bd18715cdabbca4, 0xa752061c4f33b8cf, ], + [0xb22d2432b72d5098, 0x9e18a487f44d2fe4, 0x4b39e14ce22abd3c, 0x9e77fde2eb315e0d, + 0xca5e0385fe67014d, 0x0c2cb99bf1b6bddb, 0x99ec1cd2a4460bfe, 
0x8577a815a2ff843f, + 0x7d80a6b4fd6518a5, 0xeb6c67123eab62cb, 0x8f7851650eca21a5, ], + [0x11ba9a1b81718c2a, 0x9f7d798a3323410c, 0xa821855c8c1cf5e5, 0x535e8d6fac0031b2, + 0x404e7c751b634320, 0xa729353f6e55d354, 0x4db97d92e58bb831, 0xb53926c27897bf7d, + 0x965040d52fe115c5, 0x9565fa41ebd31fd7, 0xaae4438c877ea8f4, ], + [0x37f4e36af6073c6e, 0x4edc0918210800e9, 0xc44998e99eae4188, 0x9f4310d05d068338, + 0x9ec7fe4350680f29, 0xc5b2c1fdc0b50874, 0xa01920c5ef8b2ebe, 0x59fa6f8bd91d58ba, + 0x8bfc9eb89b515a82, 0xbe86a7a2555ae775, 0xcbb8bbaa3810babf, ], + [0x577f9a9e7ee3f9c2, 0x88c522b949ace7b1, 0x82f07007c8b72106, 0x8283d37c6675b50e, + 0x98b074d9bbac1123, 0x75c56fb7758317c1, 0xfed24e206052bc72, 0x26d7c3d1bc07dae5, + 0xf88c5e441e28dbb4, 0x4fe27f9f96615270, 0x514d4ba49c2b14fe, ], + [0xf02a3ac068ee110b, 0x0a3630dafb8ae2d7, 0xce0dc874eaf9b55c, 0x9a95f6cff5b55c7e, + 0x626d76abfed00c7b, 0xa0c1cf1251c204ad, 0xdaebd3006321052c, 0x3d4bd48b625a8065, + 0x7f1e584e071f6ed2, 0x720574f0501caed3, 0xe3260ba93d23540a, ], + [0xab1cbd41d8c1e335, 0x9322ed4c0bc2df01, 0x51c3c0983d4284e5, 0x94178e291145c231, + 0xfd0f1a973d6b2085, 0xd427ad96e2b39719, 0x8a52437fecaac06b, 0xdc20ee4b8c4c9a80, + 0xa2c98e9549da2100, 0x1603fe12613db5b6, 0x0e174929433c5505, ], + [0x3d4eab2b8ef5f796, 0xcfff421583896e22, 0x4143cb32d39ac3d9, 0x22365051b78a5b65, + 0x6f7fd010d027c9b6, 0xd9dd36fba77522ab, 0xa44cf1cb33e37165, 0x3fc83d3038c86417, + 0xc4588d418e88d270, 0xce1320f10ab80fe2, 0xdb5eadbbec18de5d, ], + [0x1183dfce7c454afd, 0x21cea4aa3d3ed949, 0x0fce6f70303f2304, 0x19557d34b55551be, + 0x4c56f689afc5bbc9, 0xa1e920844334f944, 0xbad66d423d2ec861, 0xf318c785dc9e0479, + 0x99e2032e765ddd81, 0x400ccc9906d66f45, 0xe1197454db2e0dd9, ], + [0x84d1ecc4d53d2ff1, 0xd8af8b9ceb4e11b6, 0x335856bb527b52f4, 0xc756f17fb59be595, + 0xc0654e4ea5553a78, 0x9e9a46b61f2ea942, 0x14fc8b5b3b809127, 0xd7009f0f103be413, + 0x3e0ee7b7a9fb4601, 0xa74e888922085ed7, 0xe80a7cde3d4ac526, ], + [0x238aa6daa612186d, 0x9137a5c630bad4b4, 0xc7db3817870c5eda, 
0x217e4f04e5718dc9, + 0xcae814e2817bd99d, 0xe3292e7ab770a8ba, 0x7bb36ef70b6b9482, 0x3c7835fb85bca2d3, + 0xfe2cdf8ee3c25e86, 0x61b3915ad7274b20, 0xeab75ca7c918e4ef, ], + [0xd6e15ffc055e154e, 0xec67881f381a32bf, 0xfbb1196092bf409c, 0xdc9d2e07830ba226, + 0x0698ef3245ff7988, 0x194fae2974f8b576, 0x7a5d9bea6ca4910e, 0x7aebfea95ccdd1c9, + 0xf9bd38a67d5f0e86, 0xfa65539de65492d8, 0xf0dfcbe7653ff787, ], + [0x0bd87ad390420258, 0x0ad8617bca9e33c8, 0x0c00ad377a1e2666, 0x0ac6fc58b3f0518f, + 0x0c0cc8a892cc4173, 0x0c210accb117bc21, 0x0b73630dbb46ca18, 0x0c8be4920cbd4a54, + 0x0bfe877a21be1690, 0x0ae790559b0ded81, 0x0bf50db2f8d6ce31, ], + [0x000cf29427ff7c58, 0x000bd9b3cf49eec8, 0x000d1dc8aa81fb26, 0x000bc792d5c394ef, + 0x000d2ae0b2266453, 0x000d413f12c496c1, 0x000c84128cfed618, 0x000db5ebd48fc0d4, + 0x000d1b77326dcb90, 0x000beb0ccc145421, 0x000d10e5b22b11d1, ], + [0x00000e24c99adad8, 0x00000cf389ed4bc8, 0x00000e580cbf6966, 0x00000cde5fd7e04f, + 0x00000e63628041b3, 0x00000e7e81a87361, 0x00000dabe78f6d98, 0x00000efb14cac554, + 0x00000e5574743b10, 0x00000d05709f42c1, 0x00000e4690c96af1, ], + [0x0000000f7157bc98, 0x0000000e3006d948, 0x0000000fa65811e6, 0x0000000e0d127e2f, + 0x0000000fc18bfe53, 0x0000000fd002d901, 0x0000000eed6461d8, 0x0000001068562754, + 0x0000000fa0236f50, 0x0000000e3af13ee1, 0x0000000fa460f6d1, ], + [0x0000000011131738, 0x000000000f56d588, 0x0000000011050f86, 0x000000000f848f4f, + 0x00000000111527d3, 0x00000000114369a1, 0x00000000106f2f38, 0x0000000011e2ca94, + 0x00000000110a29f0, 0x000000000fa9f5c1, 0x0000000010f625d1, ], + [0x000000000011f718, 0x000000000010b6c8, 0x0000000000134a96, 0x000000000010cf7f, + 0x0000000000124d03, 0x000000000013f8a1, 0x0000000000117c58, 0x0000000000132c94, + 0x0000000000134fc0, 0x000000000010a091, 0x0000000000128961, ], + [0x0000000000001300, 0x0000000000001750, 0x000000000000114e, 0x000000000000131f, + 0x000000000000167b, 0x0000000000001371, 0x0000000000001230, 0x000000000000182c, + 0x0000000000001368, 0x0000000000000f31, 
0x00000000000015c9, ], + [0x0000000000000014, 0x0000000000000022, 0x0000000000000012, 0x0000000000000027, + 0x000000000000000d, 0x000000000000000d, 0x000000000000001c, 0x0000000000000002, + 0x0000000000000010, 0x0000000000000029, 0x000000000000000f, ], ]; const FAST_PARTIAL_ROUND_W_HATS: [[u64; 12 - 1]; N_PARTIAL_ROUNDS] = [ - [0x54accab273d3aeca, 0x12fecae33b1f1da9, 0x573bb85449ea9a27, 0x6b5ddc139f172aad, - 0xd2b6d0ca34465d4c, 0x51cf0aafbddfc269, 0x6075e64679e7a403, 0x678316c041900ac9, - 0x10019c84b343fc57, 0xde5b81280922f644, 0x42490a86b2f2f305, ], - [0x337c5930f7bacc46, 0x334792a4f1afb921, 0xc97ea5f1426e540e, 0x5fc74568337bd780, - 0xfd5718cc391d80ef, 0xef90b77a337d923c, 0xb28561998f153fea, 0xed5f65b8894345aa, - 0x7e2aacb5985893a7, 0xcbde536cb644fcf0, 0x07338300a07fc43b, ], - [0xd4c9ad02fcc8b4c1, 0x2890dac7a1caa815, 0x7d62bc45c45f5db2, 0x0a902300db5deac2, - 0x663f3726307f62a4, 0x050bda7dc7d8eb3b, 0xd9db68f3f051c5b6, 0xc5110194a38210aa, - 0x403862136533be0e, 0x20039e053d9b227d, 0xe2c90d16262c5f3c, ], - [0x6578da963396c755, 0xea6b546e6bc1e86f, 0x4e562ef0c66c2be3, 0x35b839dae0f9d22e, - 0x4aab3d88857b058c, 0x4f7443e07ac462d3, 0x93c2c5bbc385e50f, 0xc0c0c5c8ea023ce2, - 0x8409c53d4b62965d, 0x0489f2258135dcd1, 0x32958358c736aec9, ], - [0xe13b50ca15b0a455, 0x9878071e2b5d4547, 0xb8e50d27b4172b30, 0xbf312f828d3ea142, - 0x5b8510573020e6e8, 0x7c3091c29d8d6afa, 0x7e2d900a50f194fa, 0xb236d5080d0b0409, - 0x08f148b6c3b99320, 0x679c6b9cadbe604c, 0x6b0313be2ad9b9f2, ], - [0x12038ac320459b0e, 0x7abd36c6b25cd8e0, 0x37cc3583930e5a13, 0xafe725c4446a691d, - 0x99d89ccadeb38d80, 0x96c820be5528ec36, 0x9b63969fdc84ede6, 0x8f8f21cf5ad78c48, - 0x1a4d3573bc3c2d8b, 0x9f5a7bd9e771866e, 0x5bcef938b72497fc, ], - [0x5f969817be6add7a, 0x572b04c1ae5a4c6d, 0x8d219b8fac9a287b, 0x4566b3c56372f434, - 0xdd3f46f108bf4441, 0xd7e1469baa3912c4, 0xac36377b68e071fc, 0xf348c609201d771a, - 0x0bb926a5e2ebdd96, 0x30efa780aee4705a, 0xb24ff2673691146a, ], - [0x5d0324b3a1dab6e2, 0xbd1491a0cc9e564b, 0xb8699e13b528ef99, 
0x7743d9a8753ee023, - 0xce577363cdb5bcbc, 0xc056688d4f006774, 0x61f9363c10d7fdf2, 0x5f730e5530f6e06d, - 0x25efb9ef3adf0072, 0xcf971d58e21a8aa7, 0xd830d7e8d0d70680, ], - [0x36e69157ac42f39d, 0x3e7aca69ddf62d3e, 0xbbbef86cac42bb30, 0xa2e793ae56c27043, - 0x2a315dc4bc40c8a0, 0x84022758f3b3af55, 0x668809e74e7a470d, 0xf2d91eaafdee1820, - 0x50f19afd16d03294, 0x30c087d3223bcd4b, 0xf5739d95458cc633, ], - [0x15266b5a75028317, 0x8059f198c9f88799, 0x437a070386c65244, 0xc70e0bb73942929d, - 0xa8b32cb37ae137ea, 0xc2e556278323a459, 0xbc486da754091692, 0x7815a23467d6b541, - 0x3e6dba4e930e8be6, 0x6b4277b0915d56ba, 0x20212bfac7922ea0, ], - [0xeeba270c067b0c8b, 0xa4d576458941f29a, 0xecdf04a28c8c83be, 0xc808f0af215d7dda, - 0x424f4bfbecced0fb, 0xe4cbf6c0c10e58b3, 0x66a87bebfa09c031, 0x614ffc9443d5f0a4, - 0x96c96636f7b7975a, 0x58d4222a6f860cc5, 0x2d4f51c75bf50169, ], - [0xab43452aec55310f, 0x0a719e77ec2b398c, 0x8f946888a3f5f74f, 0x7b447e0d9f7ad4fb, - 0x7a2887ceb40ef226, 0x8840b904c1c49e50, 0xd91ea2510b0eaddc, 0x6617fa40a1a220fb, - 0xb1c41a72a845cb45, 0x02c2715281868092, 0xaf5b1b6c46ca37bd, ], - [0xe27649b9dbcbe631, 0x4afdf11d1d5e73b2, 0x05285a0e99160910, 0x23bfd6197ed8d3ba, - 0xb1e6292028792aab, 0xc997f6cc14e05cae, 0x34793ec255a555bd, 0xeb4f2da35a76dd03, - 0x767a5552c9910f3a, 0x4c4cc6987c30a447, 0x64da2b6920578f8d, ], - [0xe97ce2fecc0720ac, 0x99fc5741fcdeae8a, 0x0ac47be58b345692, 0x75a446121f2cccda, - 0xf38e40a102691c8e, 0xdbe5d707594714ef, 0x6ab183bdab92e450, 0x0aed83850dc10451, - 0x66e16941a4373c93, 0x22af15bb3e1034a1, 0xab2136f22ed23ccc, ], - [0xb0d3214d3c4c46c1, 0x3983bffd4053346c, 0xab1239b72a6a9e64, 0x669bcbda2406c089, - 0xf3118af8e563feda, 0x58323dbdd43a9c95, 0x5438aa910b51fd8c, 0xcbf071f9573f7e4f, - 0x476c8fde40075e51, 0xa10f54d3c77d8bed, 0xfecafe7ec7346beb, ], - [0x79e00c6916f68fa8, 0x80e39c20c11400d6, 0x242e2b46a7c116b7, 0xea660990074fcff6, - 0x18e3369da4c9272b, 0xfa6471be8be33b80, 0xede2ed2a83a4574a, 0x9e595d610deaaed6, - 0xc7d2cf35fcacdc58, 0xc65cf113a9af2302, 
0x35a74c3d0cac5fde, ], - [0x35d6cf1a9aeabd4b, 0x4dc004b0b64954c3, 0xcb67ab54210b4c8f, 0xa2359b770621d28e, - 0x027a0a0a5e315bf6, 0xed6aad0492a86ef6, 0x127074e28969232c, 0x3e3d68e6354d396f, - 0x3cf204ab96edf7c6, 0x513a9050b70c18bf, 0x73b3b7399a3f5281, ], - [0x0af9319d5b7cd620, 0x0514fbcecd8a897d, 0x542dd32e46738f8d, 0x49248ae425e9bd45, - 0x8bb9ef7ac36e53ea, 0x97981020c414a723, 0xe587f186c024e0c8, 0x14f01dd28e990ad2, - 0x4d3fca72e19ea756, 0x01a3824f1ee8e7f1, 0xb048d25b575f250e, ], - [0xe78a4cfe6c6aa236, 0x4840deffdefd3b04, 0x6e0952d028e63e47, 0x249d49fb1d93304d, - 0xd41ce9ed49f7fbb3, 0xba255e808ea77466, 0x5ce52e6dc2005436, 0x8b5bf13acd881a04, - 0xf80f439f3ac011d1, 0x1d3618fb2cc3f916, 0xf41489c837e14938, ], - [0x41e065665af15054, 0x71752ac86d1bba64, 0x9bfddd30f8ceadeb, 0x4f59dd5e6c985767, - 0x8aa3e0718ecaa657, 0x355f734ed4199ca2, 0x110f361baec4d693, 0x283a46e9e134b5b1, - 0x4fda33376f5c6514, 0xcca192f9565e7d13, 0x2251835db1c24c39, ], - [0xc583f62f5970a849, 0xb6cc325741cd89dd, 0xf83288467f07ac1f, 0xfd82624964b845e7, - 0x11967e4e00a49fdd, 0x2fb200fae9f72577, 0xd6fb31913c7d5da7, 0xfad9ae578dd090cc, - 0xcd13b2be741ea5d8, 0xc1c54f9cf54b0c27, 0x29520a761b657cce, ], - [0x0ac0e496a2b39f4a, 0x20571abb59e27953, 0xe9971143579a1d30, 0x980359c3dba518cb, - 0x05ecee5a85b427c4, 0x4620dd90ad0b5366, 0x95c98f9c5b859365, 0x0fbb1806fbc56995, - 0xfe4526fd802afae2, 0x70e3786431084092, 0xa8d78a0494939111, ], + [0x3d999c961b7c63b0, 0x814e82efcd172529, 0x2421e5d236704588, 0x887af7d4dd482328, + 0xa5e9c291f6119b27, 0xbdc52b2676a4b4aa, 0x64832009d29bcf57, 0x09c4155174a552cc, + 0x463f9ee03d290810, 0xc810936e64982542, 0x043b1c289f7bc3ac, ], + [0x673655aae8be5a8b, 0xd510fe714f39fa10, 0x2c68a099b51c9e73, 0xa667bfa9aa96999d, + 0x4d67e72f063e2108, 0xf84dde3e6acda179, 0x40f9cc8c08f80981, 0x5ead032050097142, + 0x6591b02092d671bb, 0x00e18c71963dd1b7, 0x8a21bcd24a14218a, ], + [0x202800f4addbdc87, 0xe4b5bdb1cc3504ff, 0xbe32b32a825596e7, 0x8e0f68c5dc223b9a, + 0x58022d9e1c256ce3, 0x584d29227aa073ac, 
0x8b9352ad04bef9e7, 0xaead42a3f445ecbf, + 0x3c667a1d833a3cca, 0xda6f61838efa1ffe, 0xe8f749470bd7c446, ], + [0xc5b85bab9e5b3869, 0x45245258aec51cf7, 0x16e6b8e68b931830, 0xe2ae0f051418112c, + 0x0470e26a0093a65b, 0x6bef71973a8146ed, 0x119265be51812daf, 0xb0be7356254bea2e, + 0x8584defff7589bd7, 0x3c5fe4aeb1fb52ba, 0x9e7cd88acf543a5e, ], + [0x179be4bba87f0a8c, 0xacf63d95d8887355, 0x6696670196b0074f, 0xd99ddf1fe75085f9, + 0xc2597881fef0283b, 0xcf48395ee6c54f14, 0x15226a8e4cd8d3b6, 0xc053297389af5d3b, + 0x2c08893f0d1580e2, 0x0ed3cbcff6fcc5ba, 0xc82f510ecf81f6d0, ], + [0x94b06183acb715cc, 0x500392ed0d431137, 0x861cc95ad5c86323, 0x05830a443f86c4ac, + 0x3b68225874a20a7c, 0x10b3309838e236fb, 0x9b77fc8bcd559e2c, 0xbdecf5e0cb9cb213, + 0x30276f1221ace5fa, 0x7935dd342764a144, 0xeac6db520bb03708, ], + [0x7186a80551025f8f, 0x622247557e9b5371, 0xc4cbe326d1ad9742, 0x55f1523ac6a23ea2, + 0xa13dfe77a3d52f53, 0xe30750b6301c0452, 0x08bd488070a3a32b, 0xcd800caef5b72ae3, + 0x83329c90f04233ce, 0xb5b99e6664a0a3ee, 0x6b0731849e200a7f, ], + [0xec3fabc192b01799, 0x382b38cee8ee5375, 0x3bfb6c3f0e616572, 0x514abd0cf6c7bc86, + 0x47521b1361dcc546, 0x178093843f863d14, 0xad1003c5d28918e7, 0x738450e42495bc81, + 0xaf947c59af5e4047, 0x4653fb0685084ef2, 0x057fde2062ae35bf, ], + [0xe376678d843ce55e, 0x66f3860d7514e7fc, 0x7817f3dfff8b4ffa, 0x3929624a9def725b, + 0x0126ca37f215a80a, 0xfce2f5d02762a303, 0x1bc927375febbad7, 0x85b481e5243f60bf, + 0x2d3c5f42a39c91a0, 0x0811719919351ae8, 0xf669de0add993131, ], + [0x7de38bae084da92d, 0x5b848442237e8a9b, 0xf6c705da84d57310, 0x31e6a4bdb6a49017, + 0x889489706e5c5c0f, 0x0e4a205459692a1b, 0xbac3fa75ee26f299, 0x5f5894f4057d755e, + 0xb0dc3ecd724bb076, 0x5e34d8554a6452ba, 0x04f78fd8c1fdcc5f, ], + [0x4dd19c38779512ea, 0xdb79ba02704620e9, 0x92a29a3675a5d2be, 0xd5177029fe495166, + 0xd32b3298a13330c1, 0x251c4a3eb2c5f8fd, 0xe1c48b26e0d98825, 0x3301d3362a4ffccb, + 0x09bb6c88de8cd178, 0xdc05b676564f538a, 0x60192d883e473fee, ], + [0x16b9774801ac44a0, 0x3cb8411e786d3c8e, 
0xa86e9cf505072491, 0x0178928152e109ae, + 0x5317b905a6e1ab7b, 0xda20b3be7f53d59f, 0xcb97dedecebee9ad, 0x4bd545218c59f58d, + 0x77dc8d856c05a44a, 0x87948589e4f243fd, 0x7e5217af969952c2, ], + [0xbc58987d06a84e4d, 0x0b5d420244c9cae3, 0xa3c4711b938c02c0, 0x3aace640a3e03990, + 0x865a0f3249aacd8a, 0x8d00b2a7dbed06c7, 0x6eacb905beb7e2f8, 0x045322b216ec3ec7, + 0xeb9de00d594828e6, 0x088c5f20df9e5c26, 0xf555f4112b19781f, ], + [0xa8cedbff1813d3a7, 0x50dcaee0fd27d164, 0xf1cb02417e23bd82, 0xfaf322786e2abe8b, + 0x937a4315beb5d9b6, 0x1b18992921a11d85, 0x7d66c4368b3c497b, 0x0e7946317a6b4e99, + 0xbe4430134182978b, 0x3771e82493ab262d, 0xa671690d8095ce82, ], + [0xb035585f6e929d9d, 0xba1579c7e219b954, 0xcb201cf846db4ba3, 0x287bf9177372cf45, + 0xa350e4f61147d0a6, 0xd5d0ecfb50bcff99, 0x2e166aa6c776ed21, 0xe1e66c991990e282, + 0x662b329b01e7bb38, 0x8aa674b36144d9a9, 0xcbabf78f97f95e65, ], + [0xeec24b15a06b53fe, 0xc8a7aa07c5633533, 0xefe9c6fa4311ad51, 0xb9173f13977109a1, + 0x69ce43c9cc94aedc, 0xecf623c9cd118815, 0x28625def198c33c7, 0xccfc5f7de5c3636a, + 0xf5e6c40f1621c299, 0xcec0e58c34cb64b1, 0xa868ea113387939f, ], + [0xd8dddbdc5ce4ef45, 0xacfc51de8131458c, 0x146bb3c0fe499ac0, 0x9e65309f15943903, + 0x80d0ad980773aa70, 0xf97817d4ddbf0607, 0xe4626620a75ba276, 0x0dfdc7fd6fc74f66, + 0xf464864ad6f2bb93, 0x02d55e52a5d44414, 0xdd8de62487c40925, ], + [0xc15acf44759545a3, 0xcbfdcf39869719d4, 0x33f62042e2f80225, 0x2599c5ead81d8fa3, + 0x0b306cb6c1d7c8d0, 0x658c80d3df3729b1, 0xe8d1b2b21b41429c, 0xa1b67f09d4b3ccb8, + 0x0e1adf8b84437180, 0x0d593a5e584af47b, 0xa023d94c56e151c7, ], + [0x49026cc3a4afc5a6, 0xe06dff00ab25b91b, 0x0ab38c561e8850ff, 0x92c3c8275e105eeb, + 0xb65256e546889bd0, 0x3c0468236ea142f6, 0xee61766b889e18f2, 0xa206f41b12c30415, + 0x02fe9d756c9f12d1, 0xe9633210630cbf12, 0x1ffea9fe85a0b0b1, ], + [0x81d1ae8cc50240f3, 0xf4c77a079a4607d7, 0xed446b2315e3efc1, 0x0b0a6b70915178c3, + 0xb11ff3e089f15d9a, 0x1d4dba0b7ae9cc18, 0x65d74e2f43b48d05, 0xa2df8c6b8ae0804a, + 0xa4e6f0a8c33348a6, 
0xc0a26efc7be5669b, 0xa6b6582c547d0d60, ], + [0x84afc741f1c13213, 0x2f8f43734fc906f3, 0xde682d72da0a02d9, 0x0bb005236adb9ef2, + 0x5bdf35c10a8b5624, 0x0739a8a343950010, 0x52f515f44785cfbc, 0xcbaf4e5d82856c60, + 0xac9ea09074e3e150, 0x8f0fa011a2035fb0, 0x1a37905d8450904a, ], + [0x3abeb80def61cc85, 0x9d19c9dd4eac4133, 0x075a652d9641a985, 0x9daf69ae1b67e667, + 0x364f71da77920a18, 0x50bd769f745c95b1, 0xf223d1180dbbf3fc, 0x2f885e584e04aa99, + 0xb69a0fa70aea684a, 0x09584acaa6e062a0, 0x0bc051640145b19b, ], ]; // NB: This is in ROW-major order to support cache-friendly pre-multiplication. const FAST_PARTIAL_ROUND_INITIAL_MATRIX: [[u64; 12 - 1]; 12 - 1] = [ - [0xb8dee12bf8e622dc, 0x2a0bcfdad25a7a77, 0x35f873e941f6055d, 0x99b7b85b6028982e, - 0x86d6993880e836f7, 0x1ef8de305b9c354d, 0x8b0a80ef933c37dc, 0x715c7164aacaf4a8, - 0x43845bd4f75ac7f5, 0x3e71bb7b0ec57a1a, 0xffc5b2f8946575c3, ], - [0x863ca0992eae09b0, 0x68901dfa3ecc7696, 0x6ba9546fc13ba8be, 0x555b7567255c9650, - 0x4570c6ac5e80551b, 0x8e440c6cc2d0ed18, 0xbad8ae4dbfba0799, 0x8b71ed9e65a6ed7a, - 0xaade0f9eb69ee576, 0xdebe1855920c6e64, 0x3e71bb7b0ec57a1a, ], - [0x2c3887c29246a985, 0x5aeb127ffeece78f, 0xa86e940514be2461, 0x2cb276ddf6094068, - 0x81e59e8f82a28b3c, 0x27bc037b1569fb52, 0x706ee8b692c2ebc7, 0xeba6949241aedb71, - 0xc416ad39f1f908f8, 0xaade0f9eb69ee576, 0x43845bd4f75ac7f5, ], - [0x03df3a62e1ea48d2, 0xbb484c2d408e9b12, 0x0fbf2169623ec24c, 0x50955930c2f9eb19, - 0x3dfc3cc6123745cc, 0xa2a8d3774d197b2c, 0xd16417e43d20feab, 0xd998a362dba538ba, - 0xeba6949241aedb71, 0x8b71ed9e65a6ed7a, 0x715c7164aacaf4a8, ], - [0xbbf73d77fc6c411c, 0xad7f124615d240ee, 0x4e413fcebe9020ee, 0x540bd8044c672f2b, - 0x6db739f6d2e9f37d, 0x9aa1b0a8f56ad33d, 0x53c179d92714378f, 0xd16417e43d20feab, - 0x706ee8b692c2ebc7, 0xbad8ae4dbfba0799, 0x8b0a80ef933c37dc, ], - [0xab92e860ecde7bdc, 0xa58fc91c605c26d5, 0xfbe68b79a8d5e0b9, 0x3e7edc1407cbd848, - 0xf69c76d11eaf57bf, 0x941ef2c6beace374, 0x9aa1b0a8f56ad33d, 0xa2a8d3774d197b2c, - 0x27bc037b1569fb52, 
0x8e440c6cc2d0ed18, 0x1ef8de305b9c354d, ], - [0xb522132046b25eaf, 0x2b7b18e882c3e2c6, 0xe3322ad433ba15c8, 0x87355794faf87b1b, - 0x14f6e5ac86065fce, 0xf69c76d11eaf57bf, 0x6db739f6d2e9f37d, 0x3dfc3cc6123745cc, - 0x81e59e8f82a28b3c, 0x4570c6ac5e80551b, 0x86d6993880e836f7, ], - [0x0084dd11f5c0d55c, 0x9d664d307df18036, 0x1d80d847dca52945, 0xee3eecb9b2df1658, - 0x87355794faf87b1b, 0x3e7edc1407cbd848, 0x540bd8044c672f2b, 0x50955930c2f9eb19, - 0x2cb276ddf6094068, 0x555b7567255c9650, 0x99b7b85b6028982e, ], - [0xeb7c39655546eba5, 0xf07245b62d94cf71, 0x17db9b690f0031a3, 0x1d80d847dca52945, - 0xe3322ad433ba15c8, 0xfbe68b79a8d5e0b9, 0x4e413fcebe9020ee, 0x0fbf2169623ec24c, - 0xa86e940514be2461, 0x6ba9546fc13ba8be, 0x35f873e941f6055d, ], - [0xcb7fc57923717f84, 0x795a850bf5f9e397, 0xf07245b62d94cf71, 0x9d664d307df18036, - 0x2b7b18e882c3e2c6, 0xa58fc91c605c26d5, 0xad7f124615d240ee, 0xbb484c2d408e9b12, - 0x5aeb127ffeece78f, 0x68901dfa3ecc7696, 0x2a0bcfdad25a7a77, ], - [0x3107f5edca2f02b8, 0xcb7fc57923717f84, 0xeb7c39655546eba5, 0x0084dd11f5c0d55c, - 0xb522132046b25eaf, 0xab92e860ecde7bdc, 0xbbf73d77fc6c411c, 0x03df3a62e1ea48d2, - 0x2c3887c29246a985, 0x863ca0992eae09b0, 0xb8dee12bf8e622dc, ], + [0x80772dc2645b280b, 0xdc927721da922cf8, 0xc1978156516879ad, 0x90e80c591f48b603, + 0x3a2432625475e3ae, 0x00a2d4321cca94fe, 0x77736f524010c932, 0x904d3f2804a36c54, + 0xbf9b39e28a16f354, 0x3a1ded54a6cd058b, 0x42392870da5737cf, ], + [0xe796d293a47a64cb, 0xb124c33152a2421a, 0x0ee5dc0ce131268a, 0xa9032a52f930fae6, + 0x7e33ca8c814280de, 0xad11180f69a8c29e, 0xc75ac6d5b5a10ff3, 0xf0674a8dc5a387ec, + 0xb36d43120eaa5e2b, 0x6f232aab4b533a25, 0x3a1ded54a6cd058b, ], + [0xdcedab70f40718ba, 0x14a4a64da0b2668f, 0x4715b8e5ab34653b, 0x1e8916a99c93a88e, + 0xbba4b5d86b9a3b2c, 0xe76649f9bd5d5c2e, 0xaf8e2518a1ece54d, 0xdcda1344cdca873f, + 0xcd080204256088e5, 0xb36d43120eaa5e2b, 0xbf9b39e28a16f354, ], + [0xf4a437f2888ae909, 0xc537d44dc2875403, 0x7f68007619fd8ba9, 0xa4911db6a32612da, + 0x2f7e9aade3fdaec1, 
0xe7ffd578da4ea43d, 0x43a608e7afa6b5c2, 0xca46546aa99e1575, + 0xdcda1344cdca873f, 0xf0674a8dc5a387ec, 0x904d3f2804a36c54, ], + [0xf97abba0dffb6c50, 0x5e40f0c9bb82aab5, 0x5996a80497e24a6b, 0x07084430a7307c9a, + 0xad2f570a5b8545aa, 0xab7f81fef4274770, 0xcb81f535cf98c9e9, 0x43a608e7afa6b5c2, + 0xaf8e2518a1ece54d, 0xc75ac6d5b5a10ff3, 0x77736f524010c932, ], + [0x7f8e41e0b0a6cdff, 0x4b1ba8d40afca97d, 0x623708f28fca70e8, 0xbf150dc4914d380f, + 0xc26a083554767106, 0x753b8b1126665c22, 0xab7f81fef4274770, 0xe7ffd578da4ea43d, + 0xe76649f9bd5d5c2e, 0xad11180f69a8c29e, 0x00a2d4321cca94fe, ], + [0x726af914971c1374, 0x1d7f8a2cce1a9d00, 0x18737784700c75cd, 0x7fb45d605dd82838, + 0x862361aeab0f9b6e, 0xc26a083554767106, 0xad2f570a5b8545aa, 0x2f7e9aade3fdaec1, + 0xbba4b5d86b9a3b2c, 0x7e33ca8c814280de, 0x3a2432625475e3ae, ], + [0x64dd936da878404d, 0x4db9a2ead2bd7262, 0xbe2e19f6d07f1a83, 0x02290fe23c20351a, + 0x7fb45d605dd82838, 0xbf150dc4914d380f, 0x07084430a7307c9a, 0xa4911db6a32612da, + 0x1e8916a99c93a88e, 0xa9032a52f930fae6, 0x90e80c591f48b603, ], + [0x85418a9fef8a9890, 0xd8a2eb7ef5e707ad, 0xbfe85ababed2d882, 0xbe2e19f6d07f1a83, + 0x18737784700c75cd, 0x623708f28fca70e8, 0x5996a80497e24a6b, 0x7f68007619fd8ba9, + 0x4715b8e5ab34653b, 0x0ee5dc0ce131268a, 0xc1978156516879ad, ], + [0x156048ee7a738154, 0x91f7562377e81df5, 0xd8a2eb7ef5e707ad, 0x4db9a2ead2bd7262, + 0x1d7f8a2cce1a9d00, 0x4b1ba8d40afca97d, 0x5e40f0c9bb82aab5, 0xc537d44dc2875403, + 0x14a4a64da0b2668f, 0xb124c33152a2421a, 0xdc927721da922cf8, ], + [0xd841e8ef9dde8ba0, 0x156048ee7a738154, 0x85418a9fef8a9890, 0x64dd936da878404d, + 0x726af914971c1374, 0x7f8e41e0b0a6cdff, 0xf97abba0dffb6c50, 0xf4a437f2888ae909, + 0xdcedab70f40718ba, 0xe796d293a47a64cb, 0x80772dc2645b280b, ], ]; - #[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))] - #[inline] - fn poseidon(input: [Self; 12]) -> [Self; 12] { - unsafe { - crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::poseidon(&input) - } - } + // 
#[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))] + // #[inline] + // fn poseidon(input: [Self; 12]) -> [Self; 12] { + // unsafe { + // crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::poseidon(&input) + // } + // } - #[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))] - #[inline(always)] - fn constant_layer(state: &mut [Self; 12], round_ctr: usize) { - unsafe { - crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::constant_layer(state, round_ctr); - } - } + // #[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))] + // #[inline(always)] + // fn constant_layer(state: &mut [Self; 12], round_ctr: usize) { + // unsafe { + // crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::constant_layer(state, round_ctr); + // } + // } - #[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))] - #[inline(always)] - fn sbox_layer(state: &mut [Self; 12]) { - unsafe { - crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::sbox_layer(state); - } - } + // #[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))] + // #[inline(always)] + // fn sbox_layer(state: &mut [Self; 12]) { + // unsafe { + // crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::sbox_layer(state); + // } + // } - #[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))] - #[inline(always)] - fn mds_layer(state: &[Self; 12]) -> [Self; 12] { - unsafe { - crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::mds_layer(state) - } - } + // #[cfg(all(target_arch="x86_64", target_feature="avx2", target_feature="bmi2"))] + // #[inline(always)] + // fn mds_layer(state: &[Self; 12]) -> [Self; 12] { + // unsafe { + // crate::hash::arch::x86_64::poseidon_goldilocks_avx2_bmi2::mds_layer(state) + // } + // } - #[cfg(all(target_arch="aarch64", target_feature="neon"))] - #[inline] - fn poseidon(input: [Self; 12]) -> [Self; 12] { - unsafe { - 
crate::hash::arch::aarch64::poseidon_goldilocks_neon::poseidon(input) - } - } + // #[cfg(all(target_arch="aarch64", target_feature="neon"))] + // #[inline] + // fn poseidon(input: [Self; 12]) -> [Self; 12] { + // unsafe { + // crate::hash::arch::aarch64::poseidon_goldilocks_neon::poseidon(input) + // } + // } - #[cfg(all(target_arch="aarch64", target_feature="neon"))] - #[inline(always)] - fn sbox_layer(state: &mut [Self; 12]) { - unsafe { - crate::hash::arch::aarch64::poseidon_goldilocks_neon::sbox_layer(state); - } - } + // #[cfg(all(target_arch="aarch64", target_feature="neon"))] + // #[inline(always)] + // fn sbox_layer(state: &mut [Self; 12]) { + // unsafe { + // crate::hash::arch::aarch64::poseidon_goldilocks_neon::sbox_layer(state); + // } + // } - #[cfg(all(target_arch="aarch64", target_feature="neon"))] - #[inline(always)] - fn mds_layer(state: &[Self; 12]) -> [Self; 12] { - unsafe { - crate::hash::arch::aarch64::poseidon_goldilocks_neon::mds_layer(state) - } - } + // #[cfg(all(target_arch="aarch64", target_feature="neon"))] + // #[inline(always)] + // fn mds_layer(state: &[Self; 12]) -> [Self; 12] { + // unsafe { + // crate::hash::arch::aarch64::poseidon_goldilocks_neon::mds_layer(state) + // } + // } } #[cfg(test)] @@ -287,46 +288,28 @@ mod tests { let neg_one: u64 = F::NEG_ONE.to_canonical_u64(); - #[rustfmt::skip] - let _test_vectors8: Vec<([u64; 8], [u64; 8])> = vec![ - ([0, 0, 0, 0, 0, 0, 0, 0, ], - [0x649eec3229475d06, 0x72afe85b8b600222, 0x816d0a50ddd39228, 0x5083133a721a187c, - 0xbb69bd7d90c490a6, 0xea1d33a65d0a3287, 0xb4d27542d2fba3bc, 0xf9756d565d90c20a, ]), - ([0, 1, 2, 3, 4, 5, 6, 7, ], - [0xdfda4e2a7ec338f4, 0x3ac8d668054b1873, 0xeaaef2f72528e7ff, 0xee7bcc836ae165bc, - 0x95561d9377c3e696, 0x2e7d39c369dfccaa, 0x992178c050936f8f, 0x34e38ec33f572850, ]), - ([neg_one, neg_one, neg_one, neg_one, - neg_one, neg_one, neg_one, neg_one, ], - [0x9d8553546c658f67, 0xd5f6422aea26962b, 0xffb40b4db302da75, 0x34f43bbd7882c16c, - 0xccb375313fa146b0, 
0x87574c332e89201a, 0x60e9e6c0c0be3a16, 0xf0e2a741e90756ba, ]), - ([0x016f2dde9ccdaf6f, 0x77e29cda821fece4, 0x2f6686f781255f78, 0xd2c4c9a53070b44f, - 0x4d7035c9fd01fc40, 0xc8d460945c91d509, 0x14855cd8a36a097f, 0x49f640d6a30f9cf0, ], - [0x4c3c58a3fac4ba05, 0x3f26fc2bcb33a3d4, 0xe13fcddcd7a136bb, 0x27b05be73a91e2f2, - 0x37804ed8ca07fcd5, 0xe78ec2f213e28456, 0xecf67d2aacb4dbe3, 0xad14575187c496ca, ]), - ]; - #[rustfmt::skip] let test_vectors12: Vec<([u64; 12], [u64; 12])> = vec![ ([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ], - [0x3901858a44be6b3a, 0xb3470607c5f0ba0e, 0xb3b3ac3d89b37e8e, 0xd389513a7f6fe6e9, - 0x1eceb92f5da1c96b, 0x55d0bdfc6a842adf, 0x0112c568afb8819c, 0x6ac21107619569ee, - 0x3de33babbb421a85, 0x83688eb15ffe4ca3, 0x47e285b477551fa9, 0x1dd3dda781901271, ]), + [0x3c18a9786cb0b359, 0xc4055e3364a246c3, 0x7953db0ab48808f4, 0xc71603f33a1144ca, + 0xd7709673896996dc, 0x46a84e87642f44ed, 0xd032648251ee0b3c, 0x1c687363b207df62, + 0xdf8565563e8045fe, 0x40f5b37ff4254dae, 0xd070f637b431067c, 0x1792b1c4342109d7, ]), ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ], - [0x641772a94a77c7e5, 0x38d2cec9c47e7314, 0x3577218e825058c9, 0x1cdb3b4d22c54bcc, - 0x803234d4b16eb152, 0xbbb6c8438627c0f0, 0x1b219561c95a41fa, 0x9bdc97531bacc401, - 0x4251f4fac8271d9d, 0x0279ffa7ba5ce9aa, 0x63baf77c533b5874, 0xb7ada3e1f98b25e7, ]), + [0xd64e1e3efc5b8e9e, 0x53666633020aaa47, 0xd40285597c6a8825, 0x613a4f81e81231d2, + 0x414754bfebd051f0, 0xcb1f8980294a023f, 0x6eb2a9e4d54a9d0f, 0x1902bc3af467e056, + 0xf045d5eafdc6021f, 0xe4150f77caaa3be5, 0xc9bfd01d39b50cce, 0x5c0a27fcb0e1459b, ]), ([neg_one, neg_one, neg_one, neg_one, neg_one, neg_one, neg_one, neg_one, neg_one, neg_one, neg_one, neg_one, ], - [0xd2e4605ed1eb9613, 0x62510e8cbaf8a3b5, 0x64dc1e941dbaf46c, 0x1d6c5a5fd43cc4c5, - 0xac4b4f6bf503a6b4, 0x19e17983f5e52404, 0x927b08e033b29b6f, 0xa41bc2cb5ddb9bc0, - 0x270d528b1accc148, 0x022169acf46c71ae, 0xbbd4566e7b49ad7d, 0x0ed1ea54401533ef, ]), - ([0xa48728856b047229, 0xc43ab5e4aa986608, 0x715f470f075c057f, 
0x36e955a095478013, - 0x7c036db7200ba52d, 0x20377cd3410dc7dc, 0x058c0956659b05b2, 0xa66c880ee57e8399, - 0xb06521c88afbd610, 0xdfa4d72ba95c8895, 0x25b403dac3622acc, 0xda607d79268a8fce, ], - [0xe85b56b0764df429, 0x7c0796201b43fe68, 0x231673b8300a6a16, 0x25db4745a952a677, - 0x01431a6817415a4d, 0xfdfbbe63602076eb, 0x82c643dabf1154c1, 0x896e7e87b3f3417d, - 0x27eca78818ef9c27, 0xf08c93583c24dc47, 0x1c9e1552c07a9f73, 0x7659179192cfdc88, ]), + [0xbe0085cfc57a8357, 0xd95af71847d05c09, 0xcf55a13d33c1c953, 0x95803a74f4530e82, + 0xfcd99eb30a135df1, 0xe095905e913a3029, 0xde0392461b42919b, 0x7d3260e24e81d031, + 0x10d3d0465d9deaa0, 0xa87571083dfc2a47, 0xe18263681e9958f8, 0xe28e96f1ae5e60d3, ]), + ([0x8ccbbbea4fe5d2b7, 0xc2af59ee9ec49970, 0x90f7e1a9e658446a, 0xdcc0630a3ab8b1b8, + 0x7ff8256bca20588c, 0x5d99a7ca0c44ecfb, 0x48452b17a70fbee3, 0xeb09d654690b6c88, + 0x4a55d3a39c676a88, 0xc0407a38d2285139, 0xa234bac9356386d1, 0xe1633f2bad98a52f, ], + [0xa89280105650c4ec, 0xab542d53860d12ed, 0x5704148e9ccab94f, 0xd3a826d4b62da9f5, + 0x8a7a6ca87892574f, 0xc7017e1cad1a674e, 0x1f06668922318e34, 0xa3b203bc8102676f, + 0xfcc781b0ce382bf2, 0x934c69ff3ed14ba5, 0x504688a5996e8f13, 0x401f3f2ed524a2ba, ]), ]; check_test_vectors::(test_vectors12); From bedd2aa711c2628c050a0ec86d9f4f0cbcf71a52 Mon Sep 17 00:00:00 2001 From: Daniel Lubarov Date: Sat, 19 Feb 2022 18:32:11 -0700 Subject: [PATCH 07/32] Rename arithmetic unit to ALU (#496) --- system_zero/src/{arithmetic => alu}/addition.rs | 2 +- system_zero/src/{arithmetic => alu}/division.rs | 2 +- system_zero/src/{arithmetic => alu}/mod.rs | 16 ++++++++-------- .../src/{arithmetic => alu}/multiplication.rs | 2 +- .../src/{arithmetic => alu}/subtraction.rs | 2 +- system_zero/src/lib.rs | 2 +- .../src/registers/{arithmetic.rs => alu.rs} | 10 +++++----- system_zero/src/registers/mod.rs | 6 +++--- system_zero/src/system_zero.rs | 12 +++++------- 9 files changed, 26 insertions(+), 28 deletions(-) rename system_zero/src/{arithmetic => alu}/addition.rs (98%) 
rename system_zero/src/{arithmetic => alu}/division.rs (96%) rename system_zero/src/{arithmetic => alu}/mod.rs (80%) rename system_zero/src/{arithmetic => alu}/multiplication.rs (96%) rename system_zero/src/{arithmetic => alu}/subtraction.rs (96%) rename system_zero/src/registers/{arithmetic.rs => alu.rs} (84%) diff --git a/system_zero/src/arithmetic/addition.rs b/system_zero/src/alu/addition.rs similarity index 98% rename from system_zero/src/arithmetic/addition.rs rename to system_zero/src/alu/addition.rs index 7aa0d81a..068092e8 100644 --- a/system_zero/src/arithmetic/addition.rs +++ b/system_zero/src/alu/addition.rs @@ -7,7 +7,7 @@ use plonky2::plonk::circuit_builder::CircuitBuilder; use plonky2::plonk::plonk_common::reduce_with_powers_ext_recursive; use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; -use crate::registers::arithmetic::*; +use crate::registers::alu::*; use crate::registers::NUM_COLUMNS; pub(crate) fn generate_addition(values: &mut [F; NUM_COLUMNS]) { diff --git a/system_zero/src/arithmetic/division.rs b/system_zero/src/alu/division.rs similarity index 96% rename from system_zero/src/arithmetic/division.rs rename to system_zero/src/alu/division.rs index e91288b9..f0d645f1 100644 --- a/system_zero/src/arithmetic/division.rs +++ b/system_zero/src/alu/division.rs @@ -6,7 +6,7 @@ use plonky2::iop::ext_target::ExtensionTarget; use plonky2::plonk::circuit_builder::CircuitBuilder; use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; -use crate::registers::arithmetic::*; +use crate::registers::alu::*; use crate::registers::NUM_COLUMNS; pub(crate) fn generate_division(values: &mut [F; NUM_COLUMNS]) { diff --git a/system_zero/src/arithmetic/mod.rs b/system_zero/src/alu/mod.rs similarity index 80% rename from system_zero/src/arithmetic/mod.rs rename to system_zero/src/alu/mod.rs index a2b3a4f8..17a12df1 100644 --- a/system_zero/src/arithmetic/mod.rs +++ b/system_zero/src/alu/mod.rs @@ -7,16 
+7,16 @@ use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsume use starky::vars::StarkEvaluationTargets; use starky::vars::StarkEvaluationVars; -use crate::arithmetic::addition::{eval_addition, eval_addition_recursively, generate_addition}; -use crate::arithmetic::division::{eval_division, eval_division_recursively, generate_division}; -use crate::arithmetic::multiplication::{ +use crate::alu::addition::{eval_addition, eval_addition_recursively, generate_addition}; +use crate::alu::division::{eval_division, eval_division_recursively, generate_division}; +use crate::alu::multiplication::{ eval_multiplication, eval_multiplication_recursively, generate_multiplication, }; -use crate::arithmetic::subtraction::{ +use crate::alu::subtraction::{ eval_subtraction, eval_subtraction_recursively, generate_subtraction, }; use crate::public_input_layout::NUM_PUBLIC_INPUTS; -use crate::registers::arithmetic::*; +use crate::registers::alu::*; use crate::registers::NUM_COLUMNS; mod addition; @@ -24,7 +24,7 @@ mod division; mod multiplication; mod subtraction; -pub(crate) fn generate_arithmetic_unit(values: &mut [F; NUM_COLUMNS]) { +pub(crate) fn generate_alu(values: &mut [F; NUM_COLUMNS]) { if values[IS_ADD].is_one() { generate_addition(values); } else if values[IS_SUB].is_one() { @@ -36,7 +36,7 @@ pub(crate) fn generate_arithmetic_unit(values: &mut [F; NUM_COL } } -pub(crate) fn eval_arithmetic_unit>( +pub(crate) fn eval_alu>( vars: StarkEvaluationVars, yield_constr: &mut ConstraintConsumer

, ) { @@ -54,7 +54,7 @@ pub(crate) fn eval_arithmetic_unit>( eval_division(local_values, yield_constr); } -pub(crate) fn eval_arithmetic_unit_recursively, const D: usize>( +pub(crate) fn eval_alu_recursively, const D: usize>( builder: &mut CircuitBuilder, vars: StarkEvaluationTargets, yield_constr: &mut RecursiveConstraintConsumer, diff --git a/system_zero/src/arithmetic/multiplication.rs b/system_zero/src/alu/multiplication.rs similarity index 96% rename from system_zero/src/arithmetic/multiplication.rs rename to system_zero/src/alu/multiplication.rs index 70c181d8..a88b42f6 100644 --- a/system_zero/src/arithmetic/multiplication.rs +++ b/system_zero/src/alu/multiplication.rs @@ -6,7 +6,7 @@ use plonky2::iop::ext_target::ExtensionTarget; use plonky2::plonk::circuit_builder::CircuitBuilder; use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; -use crate::registers::arithmetic::*; +use crate::registers::alu::*; use crate::registers::NUM_COLUMNS; pub(crate) fn generate_multiplication(values: &mut [F; NUM_COLUMNS]) { diff --git a/system_zero/src/arithmetic/subtraction.rs b/system_zero/src/alu/subtraction.rs similarity index 96% rename from system_zero/src/arithmetic/subtraction.rs rename to system_zero/src/alu/subtraction.rs index 267bac72..8f8bb810 100644 --- a/system_zero/src/arithmetic/subtraction.rs +++ b/system_zero/src/alu/subtraction.rs @@ -6,7 +6,7 @@ use plonky2::iop::ext_target::ExtensionTarget; use plonky2::plonk::circuit_builder::CircuitBuilder; use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; -use crate::registers::arithmetic::*; +use crate::registers::alu::*; use crate::registers::NUM_COLUMNS; pub(crate) fn generate_subtraction(values: &mut [F; NUM_COLUMNS]) { diff --git a/system_zero/src/lib.rs b/system_zero/src/lib.rs index 1c097573..35576cd3 100644 --- a/system_zero/src/lib.rs +++ b/system_zero/src/lib.rs @@ -2,7 +2,7 @@ #![allow(dead_code)] #![allow(unused_variables)] -mod arithmetic; 
+mod alu; mod core_registers; mod memory; mod permutation_unit; diff --git a/system_zero/src/registers/arithmetic.rs b/system_zero/src/registers/alu.rs similarity index 84% rename from system_zero/src/registers/arithmetic.rs rename to system_zero/src/registers/alu.rs index 92c0d2c3..b4f82dff 100644 --- a/system_zero/src/registers/arithmetic.rs +++ b/system_zero/src/registers/alu.rs @@ -1,13 +1,13 @@ -//! Arithmetic unit. +//! Arithmetic and logic unit. -pub(crate) const IS_ADD: usize = super::START_ARITHMETIC; +pub(crate) const IS_ADD: usize = super::START_ALU; pub(crate) const IS_SUB: usize = IS_ADD + 1; pub(crate) const IS_MUL: usize = IS_SUB + 1; pub(crate) const IS_DIV: usize = IS_MUL + 1; const START_SHARED_COLS: usize = IS_DIV + 1; -/// Within the arithmetic unit, there are shared columns which can be used by any arithmetic +/// Within the ALU, there are shared columns which can be used by any arithmetic/logic /// circuit, depending on which one is active this cycle. // Can be increased as needed as other operations are implemented. const NUM_SHARED_COLS: usize = 3; @@ -26,7 +26,7 @@ pub(crate) const COL_ADD_INPUT_3: usize = shared_col(2); // Note: Addition outputs three 16-bit chunks, and since these values need to be range-checked // anyway, we might as well use the range check unit's columns as our addition outputs. So the -// three proceeding columns are basically aliases, not columns owned by the arithmetic unit. +// three proceeding columns are basically aliases, not columns owned by the ALU. /// The first 16-bit chunk of the output, based on little-endian ordering. pub(crate) const COL_ADD_OUTPUT_1: usize = super::range_check_16::col_rc_16_input(0); /// The second 16-bit chunk of the output, based on little-endian ordering. @@ -34,4 +34,4 @@ pub(crate) const COL_ADD_OUTPUT_2: usize = super::range_check_16::col_rc_16_inpu /// The third 16-bit chunk of the output, based on little-endian ordering. 
pub(crate) const COL_ADD_OUTPUT_3: usize = super::range_check_16::col_rc_16_input(2); -pub(super) const END: usize = super::START_ARITHMETIC + NUM_SHARED_COLS; +pub(super) const END: usize = super::START_ALU + NUM_SHARED_COLS; diff --git a/system_zero/src/registers/mod.rs b/system_zero/src/registers/mod.rs index 134a28bf..12688b1c 100644 --- a/system_zero/src/registers/mod.rs +++ b/system_zero/src/registers/mod.rs @@ -1,4 +1,4 @@ -pub(crate) mod arithmetic; +pub(crate) mod alu; pub(crate) mod boolean; pub(crate) mod core; pub(crate) mod logic; @@ -8,8 +8,8 @@ pub(crate) mod permutation; pub(crate) mod range_check_16; pub(crate) mod range_check_degree; -const START_ARITHMETIC: usize = 0; -const START_BOOLEAN: usize = arithmetic::END; +const START_ALU: usize = 0; +const START_BOOLEAN: usize = alu::END; const START_CORE: usize = boolean::END; const START_LOGIC: usize = core::END; const START_LOOKUP: usize = logic::END; diff --git a/system_zero/src/system_zero.rs b/system_zero/src/system_zero.rs index cd7796d7..c42a04a8 100644 --- a/system_zero/src/system_zero.rs +++ b/system_zero/src/system_zero.rs @@ -10,9 +10,7 @@ use starky::stark::Stark; use starky::vars::StarkEvaluationTargets; use starky::vars::StarkEvaluationVars; -use crate::arithmetic::{ - eval_arithmetic_unit, eval_arithmetic_unit_recursively, generate_arithmetic_unit, -}; +use crate::alu::{eval_alu, eval_alu_recursively, generate_alu}; use crate::core_registers::{ eval_core_registers, eval_core_registers_recursively, generate_first_row_core_registers, generate_next_row_core_registers, @@ -38,7 +36,7 @@ impl, const D: usize> SystemZero { let mut row = [F::ZERO; NUM_COLUMNS]; generate_first_row_core_registers(&mut row); - generate_arithmetic_unit(&mut row); + generate_alu(&mut row); generate_permutation_unit(&mut row); let mut trace = Vec::with_capacity(MIN_TRACE_ROWS); @@ -46,7 +44,7 @@ impl, const D: usize> SystemZero { loop { let mut next_row = [F::ZERO; NUM_COLUMNS]; generate_next_row_core_registers(&row, 
&mut next_row); - generate_arithmetic_unit(&mut next_row); + generate_alu(&mut next_row); generate_permutation_unit(&mut next_row); trace.push(row); @@ -84,7 +82,7 @@ impl, const D: usize> Stark for SystemZero, { eval_core_registers(vars, yield_constr); - eval_arithmetic_unit(vars, yield_constr); + eval_alu(vars, yield_constr); eval_permutation_unit::(vars, yield_constr); // TODO: Other units } @@ -96,7 +94,7 @@ impl, const D: usize> Stark for SystemZero, ) { eval_core_registers_recursively(builder, vars, yield_constr); - eval_arithmetic_unit_recursively(builder, vars, yield_constr); + eval_alu_recursively(builder, vars, yield_constr); eval_permutation_unit_recursively(builder, vars, yield_constr); // TODO: Other units } From bc3685587cc371f96eabd7d169498251e1db55b1 Mon Sep 17 00:00:00 2001 From: Daniel Lubarov Date: Sun, 20 Feb 2022 17:48:31 -0700 Subject: [PATCH 08/32] Rename constraint methods (#497) Most of our constraints apply to all rows, and it seems safest to make that the "default". --- starky/src/constraint_consumer.rs | 20 ++++++++++---------- starky/src/fibonacci_stark.rs | 10 ++++++---- system_zero/src/alu/addition.rs | 4 ++-- system_zero/src/alu/mod.rs | 4 ++-- system_zero/src/core_registers.rs | 8 ++++---- system_zero/src/permutation_unit.rs | 28 ++++++++++++---------------- 6 files changed, 36 insertions(+), 38 deletions(-) diff --git a/starky/src/constraint_consumer.rs b/starky/src/constraint_consumer.rs index 88f66118..ada28730 100644 --- a/starky/src/constraint_consumer.rs +++ b/starky/src/constraint_consumer.rs @@ -53,12 +53,12 @@ impl ConstraintConsumer

{ } /// Add one constraint valid on all rows except the last. - pub fn constraint(&mut self, constraint: P) { - self.constraint_wrapping(constraint * self.z_last); + pub fn constraint_transition(&mut self, constraint: P) { + self.constraint(constraint * self.z_last); } /// Add one constraint on all rows. - pub fn constraint_wrapping(&mut self, constraint: P) { + pub fn constraint(&mut self, constraint: P) { for (&alpha, acc) in self.alphas.iter().zip(&mut self.constraint_accs) { *acc *= alpha; *acc += constraint; @@ -68,13 +68,13 @@ impl ConstraintConsumer

{ /// Add one constraint, but first multiply it by a filter such that it will only apply to the /// first row of the trace. pub fn constraint_first_row(&mut self, constraint: P) { - self.constraint_wrapping(constraint * self.lagrange_basis_first); + self.constraint(constraint * self.lagrange_basis_first); } /// Add one constraint, but first multiply it by a filter such that it will only apply to the /// last row of the trace. pub fn constraint_last_row(&mut self, constraint: P) { - self.constraint_wrapping(constraint * self.lagrange_basis_last); + self.constraint(constraint * self.lagrange_basis_last); } } @@ -122,17 +122,17 @@ impl, const D: usize> RecursiveConstraintConsumer, constraint: ExtensionTarget, ) { let filtered_constraint = builder.mul_extension(constraint, self.z_last); - self.constraint_wrapping(builder, filtered_constraint); + self.constraint(builder, filtered_constraint); } /// Add one constraint valid on all rows. - pub fn constraint_wrapping( + pub fn constraint( &mut self, builder: &mut CircuitBuilder, constraint: ExtensionTarget, @@ -150,7 +150,7 @@ impl, const D: usize> RecursiveConstraintConsumer, ) { let filtered_constraint = builder.mul_extension(constraint, self.lagrange_basis_first); - self.constraint_wrapping(builder, filtered_constraint); + self.constraint(builder, filtered_constraint); } /// Add one constraint, but first multiply it by a filter such that it will only apply to the @@ -161,6 +161,6 @@ impl, const D: usize> RecursiveConstraintConsumer, ) { let filtered_constraint = builder.mul_extension(constraint, self.lagrange_basis_last); - self.constraint_wrapping(builder, filtered_constraint); + self.constraint(builder, filtered_constraint); } } diff --git a/starky/src/fibonacci_stark.rs b/starky/src/fibonacci_stark.rs index bd1775e1..a0204359 100644 --- a/starky/src/fibonacci_stark.rs +++ b/starky/src/fibonacci_stark.rs @@ -68,9 +68,11 @@ impl, const D: usize> Stark for FibonacciStar .constraint_last_row(vars.local_values[1] - 
vars.public_inputs[Self::PI_INDEX_RES]); // x0' <- x1 - yield_constr.constraint(vars.next_values[0] - vars.local_values[1]); + yield_constr.constraint_transition(vars.next_values[0] - vars.local_values[1]); // x1' <- x0 + x1 - yield_constr.constraint(vars.next_values[1] - vars.local_values[0] - vars.local_values[1]); + yield_constr.constraint_transition( + vars.next_values[1] - vars.local_values[0] - vars.local_values[1], + ); } fn eval_ext_recursively( @@ -91,13 +93,13 @@ impl, const D: usize> Stark for FibonacciStar // x0' <- x1 let first_col_constraint = builder.sub_extension(vars.next_values[0], vars.local_values[1]); - yield_constr.constraint(builder, first_col_constraint); + yield_constr.constraint_transition(builder, first_col_constraint); // x1' <- x0 + x1 let second_col_constraint = { let tmp = builder.sub_extension(vars.next_values[1], vars.local_values[0]); builder.sub_extension(tmp, vars.local_values[1]) }; - yield_constr.constraint(builder, second_col_constraint); + yield_constr.constraint_transition(builder, second_col_constraint); } fn constraint_degree(&self) -> usize { diff --git a/system_zero/src/alu/addition.rs b/system_zero/src/alu/addition.rs index 068092e8..dc83ecb8 100644 --- a/system_zero/src/alu/addition.rs +++ b/system_zero/src/alu/addition.rs @@ -41,7 +41,7 @@ pub(crate) fn eval_addition>( let computed_out = in_1 + in_2 + in_3; - yield_constr.constraint_wrapping(is_add * (out - computed_out)); + yield_constr.constraint(is_add * (out - computed_out)); } pub(crate) fn eval_addition_recursively, const D: usize>( @@ -66,5 +66,5 @@ pub(crate) fn eval_addition_recursively, const D: u let diff = builder.sub_extension(out, computed_out); let filtered_diff = builder.mul_extension(is_add, diff); - yield_constr.constraint_wrapping(builder, filtered_diff); + yield_constr.constraint(builder, filtered_diff); } diff --git a/system_zero/src/alu/mod.rs b/system_zero/src/alu/mod.rs index 17a12df1..4e7e09fa 100644 --- a/system_zero/src/alu/mod.rs +++ 
b/system_zero/src/alu/mod.rs @@ -45,7 +45,7 @@ pub(crate) fn eval_alu>( // Check that the operation flag values are binary. for col in [IS_ADD, IS_SUB, IS_MUL, IS_DIV] { let val = local_values[col]; - yield_constr.constraint_wrapping(val * val - val); + yield_constr.constraint(val * val - val); } eval_addition(local_values, yield_constr); @@ -65,7 +65,7 @@ pub(crate) fn eval_alu_recursively, const D: usize> for col in [IS_ADD, IS_SUB, IS_MUL, IS_DIV] { let val = local_values[col]; let constraint = builder.mul_sub_extension(val, val, val); - yield_constr.constraint_wrapping(builder, constraint); + yield_constr.constraint(builder, constraint); } eval_addition_recursively(builder, local_values, yield_constr); diff --git a/system_zero/src/core_registers.rs b/system_zero/src/core_registers.rs index c8c6533b..1f33611a 100644 --- a/system_zero/src/core_registers.rs +++ b/system_zero/src/core_registers.rs @@ -49,7 +49,7 @@ pub(crate) fn eval_core_registers>( let next_clock = vars.next_values[COL_CLOCK]; let delta_clock = next_clock - local_clock; yield_constr.constraint_first_row(local_clock); - yield_constr.constraint(delta_clock - F::ONE); + yield_constr.constraint_transition(delta_clock - F::ONE); // The 16-bit table must start with 0, end with 2^16 - 1, and increment by 0 or 1. let local_range_16 = vars.local_values[COL_RANGE_16]; @@ -57,7 +57,7 @@ pub(crate) fn eval_core_registers>( let delta_range_16 = next_range_16 - local_range_16; yield_constr.constraint_first_row(local_range_16); yield_constr.constraint_last_row(local_range_16 - F::from_canonical_u64((1 << 16) - 1)); - yield_constr.constraint(delta_range_16 * delta_range_16 - delta_range_16); + yield_constr.constraint_transition(delta_range_16 * delta_range_16 - delta_range_16); // TODO constraints for stack etc. 
} @@ -77,7 +77,7 @@ pub(crate) fn eval_core_registers_recursively, cons let delta_clock = builder.sub_extension(next_clock, local_clock); yield_constr.constraint_first_row(builder, local_clock); let constraint = builder.sub_extension(delta_clock, one_ext); - yield_constr.constraint(builder, constraint); + yield_constr.constraint_transition(builder, constraint); // The 16-bit table must start with 0, end with 2^16 - 1, and increment by 0 or 1. let local_range_16 = vars.local_values[COL_RANGE_16]; @@ -87,7 +87,7 @@ pub(crate) fn eval_core_registers_recursively, cons let constraint = builder.sub_extension(local_range_16, max_u16_ext); yield_constr.constraint_last_row(builder, constraint); let constraint = builder.mul_add_extension(delta_range_16, delta_range_16, delta_range_16); - yield_constr.constraint(builder, constraint); + yield_constr.constraint_transition(builder, constraint); // TODO constraints for stack etc. } diff --git a/system_zero/src/permutation_unit.rs b/system_zero/src/permutation_unit.rs index 366cff65..079ab14a 100644 --- a/system_zero/src/permutation_unit.rs +++ b/system_zero/src/permutation_unit.rs @@ -127,8 +127,7 @@ pub(crate) fn eval_permutation_unit( for i in 0..SPONGE_WIDTH { let state_cubed = state[i] * state[i].square(); - yield_constr - .constraint_wrapping(state_cubed - local_values[col_full_first_mid_sbox(r, i)]); + yield_constr.constraint(state_cubed - local_values[col_full_first_mid_sbox(r, i)]); let state_cubed = local_values[col_full_first_mid_sbox(r, i)]; state[i] *= state_cubed.square(); // Form state ** 7. 
} @@ -136,8 +135,7 @@ pub(crate) fn eval_permutation_unit( state = mds_layer(state); for i in 0..SPONGE_WIDTH { - yield_constr - .constraint_wrapping(state[i] - local_values[col_full_first_after_mds(r, i)]); + yield_constr.constraint(state[i] - local_values[col_full_first_after_mds(r, i)]); state[i] = local_values[col_full_first_after_mds(r, i)]; } } @@ -146,10 +144,10 @@ pub(crate) fn eval_permutation_unit( state = constant_layer(state, HALF_N_FULL_ROUNDS + r); let state0_cubed = state[0] * state[0].square(); - yield_constr.constraint_wrapping(state0_cubed - local_values[col_partial_mid_sbox(r)]); + yield_constr.constraint(state0_cubed - local_values[col_partial_mid_sbox(r)]); let state0_cubed = local_values[col_partial_mid_sbox(r)]; state[0] *= state0_cubed.square(); // Form state ** 7. - yield_constr.constraint_wrapping(state[0] - local_values[col_partial_after_sbox(r)]); + yield_constr.constraint(state[0] - local_values[col_partial_after_sbox(r)]); state[0] = local_values[col_partial_after_sbox(r)]; state = mds_layer(state); @@ -160,8 +158,7 @@ pub(crate) fn eval_permutation_unit( for i in 0..SPONGE_WIDTH { let state_cubed = state[i] * state[i].square(); - yield_constr - .constraint_wrapping(state_cubed - local_values[col_full_second_mid_sbox(r, i)]); + yield_constr.constraint(state_cubed - local_values[col_full_second_mid_sbox(r, i)]); let state_cubed = local_values[col_full_second_mid_sbox(r, i)]; state[i] *= state_cubed.square(); // Form state ** 7. 
} @@ -169,8 +166,7 @@ pub(crate) fn eval_permutation_unit( state = mds_layer(state); for i in 0..SPONGE_WIDTH { - yield_constr - .constraint_wrapping(state[i] - local_values[col_full_second_after_mds(r, i)]); + yield_constr.constraint(state[i] - local_values[col_full_second_after_mds(r, i)]); state[i] = local_values[col_full_second_after_mds(r, i)]; } } @@ -197,7 +193,7 @@ pub(crate) fn eval_permutation_unit_recursively, co let state_cubed = builder.cube_extension(state[i]); let diff = builder.sub_extension(state_cubed, local_values[col_full_first_mid_sbox(r, i)]); - yield_constr.constraint_wrapping(builder, diff); + yield_constr.constraint(builder, diff); let state_cubed = local_values[col_full_first_mid_sbox(r, i)]; state[i] = builder.mul_many_extension(&[state[i], state_cubed, state_cubed]); // Form state ** 7. @@ -208,7 +204,7 @@ pub(crate) fn eval_permutation_unit_recursively, co for i in 0..SPONGE_WIDTH { let diff = builder.sub_extension(state[i], local_values[col_full_first_after_mds(r, i)]); - yield_constr.constraint_wrapping(builder, diff); + yield_constr.constraint(builder, diff); state[i] = local_values[col_full_first_after_mds(r, i)]; } } @@ -218,11 +214,11 @@ pub(crate) fn eval_permutation_unit_recursively, co let state0_cubed = builder.cube_extension(state[0]); let diff = builder.sub_extension(state0_cubed, local_values[col_partial_mid_sbox(r)]); - yield_constr.constraint_wrapping(builder, diff); + yield_constr.constraint(builder, diff); let state0_cubed = local_values[col_partial_mid_sbox(r)]; state[0] = builder.mul_many_extension(&[state[0], state0_cubed, state0_cubed]); // Form state ** 7. 
let diff = builder.sub_extension(state[0], local_values[col_partial_after_sbox(r)]); - yield_constr.constraint_wrapping(builder, diff); + yield_constr.constraint(builder, diff); state[0] = local_values[col_partial_after_sbox(r)]; state = F::mds_layer_recursive(builder, &state); @@ -239,7 +235,7 @@ pub(crate) fn eval_permutation_unit_recursively, co let state_cubed = builder.cube_extension(state[i]); let diff = builder.sub_extension(state_cubed, local_values[col_full_second_mid_sbox(r, i)]); - yield_constr.constraint_wrapping(builder, diff); + yield_constr.constraint(builder, diff); let state_cubed = local_values[col_full_second_mid_sbox(r, i)]; state[i] = builder.mul_many_extension(&[state[i], state_cubed, state_cubed]); // Form state ** 7. @@ -250,7 +246,7 @@ pub(crate) fn eval_permutation_unit_recursively, co for i in 0..SPONGE_WIDTH { let diff = builder.sub_extension(state[i], local_values[col_full_second_after_mds(r, i)]); - yield_constr.constraint_wrapping(builder, diff); + yield_constr.constraint(builder, diff); state[i] = local_values[col_full_second_after_mds(r, i)]; } } From 6072fab0770eb2f9797bdc09997e72b85282e77f Mon Sep 17 00:00:00 2001 From: Daniel Lubarov Date: Mon, 21 Feb 2022 00:39:04 -0800 Subject: [PATCH 09/32] Implement a mul-add circuit in the ALU (#495) * Implement a mul-add circuit in the ALU The inputs are assumed to be `u32`s, while the output is encoded as four `u16 limbs`. Each output limb is range-checked. So, our basic mul-add constraint looks like out_0 + 2^16 out_1 + 2^32 out_2 + 2^48 out_3 = in_1 * in_2 + in_3 The right hand side will never overflow, since `u32::MAX * u32::MAX + u32::MAX < |F|`. However, the left hand side could overflow, even though we know each limb is less than `2^16`. For example, an operation like `0 * 0 + 0` could have two possible outputs, 0 and `|F|`, both of which would satisfy the constraint above. To prevent these non-canonical outputs, we need a comparison to enforce that `out < |F|`. 
Thankfully, `F::MAX` has all zeros in its low 32 bits, so `x <= F::MAX` is equivalent to `x_lo == 0 || x_hi != u32::MAX`. `x_hi != u32::MAX` can be checked by showing that `u32::MAX - x_hi` has an inverse. If `x_hi != u32::MAX`, the prover provides this (purported) inverse in an advice column. See @bobbinth's [post](https://hackmd.io/NC-yRmmtRQSvToTHb96e8Q#Checking-element-validity) for details. That post calls the purported inverse column `m`; I named it `canonical_inv` in this code. * fix * PR feedback * naming --- system_zero/Cargo.toml | 1 + system_zero/src/alu/addition.rs | 36 ++++----- system_zero/src/alu/canonical.rs | 109 ++++++++++++++++++++++++++ system_zero/src/alu/mod.rs | 13 ++- system_zero/src/alu/mul_add.rs | 88 +++++++++++++++++++++ system_zero/src/alu/multiplication.rs | 31 -------- system_zero/src/registers/alu.rs | 34 ++++++-- 7 files changed, 249 insertions(+), 63 deletions(-) create mode 100644 system_zero/src/alu/canonical.rs create mode 100644 system_zero/src/alu/mul_add.rs delete mode 100644 system_zero/src/alu/multiplication.rs diff --git a/system_zero/Cargo.toml b/system_zero/Cargo.toml index e5b617c9..032bfb53 100644 --- a/system_zero/Cargo.toml +++ b/system_zero/Cargo.toml @@ -6,6 +6,7 @@ edition = "2021" [dependencies] plonky2 = { path = "../plonky2" } +plonky2_util = { path = "../util" } starky = { path = "../starky" } anyhow = "1.0.40" env_logger = "0.9.0" diff --git a/system_zero/src/alu/addition.rs b/system_zero/src/alu/addition.rs index dc83ecb8..c2293b4a 100644 --- a/system_zero/src/alu/addition.rs +++ b/system_zero/src/alu/addition.rs @@ -11,14 +11,14 @@ use crate::registers::alu::*; use crate::registers::NUM_COLUMNS; pub(crate) fn generate_addition(values: &mut [F; NUM_COLUMNS]) { - let in_1 = values[COL_ADD_INPUT_1].to_canonical_u64(); - let in_2 = values[COL_ADD_INPUT_2].to_canonical_u64(); - let in_3 = values[COL_ADD_INPUT_3].to_canonical_u64(); + let in_1 = values[COL_ADD_INPUT_0].to_canonical_u64(); + let in_2 = 
values[COL_ADD_INPUT_1].to_canonical_u64(); + let in_3 = values[COL_ADD_INPUT_2].to_canonical_u64(); let output = in_1 + in_2 + in_3; - values[COL_ADD_OUTPUT_1] = F::from_canonical_u16(output as u16); - values[COL_ADD_OUTPUT_2] = F::from_canonical_u16((output >> 16) as u16); - values[COL_ADD_OUTPUT_3] = F::from_canonical_u16((output >> 32) as u16); + values[COL_ADD_OUTPUT_0] = F::from_canonical_u16(output as u16); + values[COL_ADD_OUTPUT_1] = F::from_canonical_u16((output >> 16) as u16); + values[COL_ADD_OUTPUT_2] = F::from_canonical_u16((output >> 32) as u16); } pub(crate) fn eval_addition>( @@ -26,12 +26,12 @@ pub(crate) fn eval_addition>( yield_constr: &mut ConstraintConsumer

, ) { let is_add = local_values[IS_ADD]; - let in_1 = local_values[COL_ADD_INPUT_1]; - let in_2 = local_values[COL_ADD_INPUT_2]; - let in_3 = local_values[COL_ADD_INPUT_3]; - let out_1 = local_values[COL_ADD_OUTPUT_1]; - let out_2 = local_values[COL_ADD_OUTPUT_2]; - let out_3 = local_values[COL_ADD_OUTPUT_3]; + let in_1 = local_values[COL_ADD_INPUT_0]; + let in_2 = local_values[COL_ADD_INPUT_1]; + let in_3 = local_values[COL_ADD_INPUT_2]; + let out_1 = local_values[COL_ADD_OUTPUT_0]; + let out_2 = local_values[COL_ADD_OUTPUT_1]; + let out_3 = local_values[COL_ADD_OUTPUT_2]; let weight_2 = F::from_canonical_u64(1 << 16); let weight_3 = F::from_canonical_u64(1 << 32); @@ -50,12 +50,12 @@ pub(crate) fn eval_addition_recursively, const D: u yield_constr: &mut RecursiveConstraintConsumer, ) { let is_add = local_values[IS_ADD]; - let in_1 = local_values[COL_ADD_INPUT_1]; - let in_2 = local_values[COL_ADD_INPUT_2]; - let in_3 = local_values[COL_ADD_INPUT_3]; - let out_1 = local_values[COL_ADD_OUTPUT_1]; - let out_2 = local_values[COL_ADD_OUTPUT_2]; - let out_3 = local_values[COL_ADD_OUTPUT_3]; + let in_1 = local_values[COL_ADD_INPUT_0]; + let in_2 = local_values[COL_ADD_INPUT_1]; + let in_3 = local_values[COL_ADD_INPUT_2]; + let out_1 = local_values[COL_ADD_OUTPUT_0]; + let out_2 = local_values[COL_ADD_OUTPUT_1]; + let out_3 = local_values[COL_ADD_OUTPUT_2]; let limb_base = builder.constant(F::from_canonical_u64(1 << 16)); // Note that this can't overflow. Since each output limb has been range checked as 16-bits, diff --git a/system_zero/src/alu/canonical.rs b/system_zero/src/alu/canonical.rs new file mode 100644 index 00000000..fb90eb0d --- /dev/null +++ b/system_zero/src/alu/canonical.rs @@ -0,0 +1,109 @@ +//! Helper methods for checking that a value is canonical, i.e. is less than `|F|`. +//! +//! 
See https://hackmd.io/NC-yRmmtRQSvToTHb96e8Q#Checking-element-validity + +use plonky2::field::extension_field::Extendable; +use plonky2::field::field_types::Field; +use plonky2::field::packed_field::PackedField; +use plonky2::hash::hash_types::RichField; +use plonky2::iop::ext_target::ExtensionTarget; +use plonky2::plonk::circuit_builder::CircuitBuilder; +use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; + +/// Computes the helper value used in the is-canonical check. +pub(crate) fn compute_canonical_inv(value_to_check: u64) -> F { + let value_hi_32 = (value_to_check >> 32) as u32; + + if value_hi_32 == u32::MAX { + debug_assert_eq!(value_to_check as u32, 0, "Value was not canonical."); + // In this case it doesn't matter what we put for the purported inverse value. The + // constraint containing this value will get multiplied by the low u32 limb, which will be + // zero, satisfying the constraint regardless of what we put here. + F::ZERO + } else { + F::from_canonical_u32(u32::MAX - value_hi_32).inverse() + } +} + +/// Adds constraints to require that a list of four `u16`s, in little-endian order, represent a +/// canonical field element, i.e. that their combined value is less than `|F|`. Returns their +/// combined value. +pub(crate) fn combine_u16s_check_canonical>( + limb_0_u16: P, + limb_1_u16: P, + limb_2_u16: P, + limb_3_u16: P, + inverse: P, + yield_constr: &mut ConstraintConsumer

, +) -> P { + let base = F::from_canonical_u32(1 << 16); + let limb_0_u32 = limb_0_u16 + limb_1_u16 * base; + let limb_1_u32 = limb_2_u16 + limb_3_u16 * base; + combine_u32s_check_canonical(limb_0_u32, limb_1_u32, inverse, yield_constr) +} + +/// Adds constraints to require that a list of four `u16`s, in little-endian order, represent a +/// canonical field element, i.e. that their combined value is less than `|F|`. Returns their +/// combined value. +pub(crate) fn combine_u16s_check_canonical_circuit, const D: usize>( + builder: &mut CircuitBuilder, + limb_0_u16: ExtensionTarget, + limb_1_u16: ExtensionTarget, + limb_2_u16: ExtensionTarget, + limb_3_u16: ExtensionTarget, + inverse: ExtensionTarget, + yield_constr: &mut RecursiveConstraintConsumer, +) -> ExtensionTarget { + let base = F::from_canonical_u32(1 << 16); + let limb_0_u32 = builder.mul_const_add_extension(base, limb_1_u16, limb_0_u16); + let limb_1_u32 = builder.mul_const_add_extension(base, limb_3_u16, limb_2_u16); + combine_u32s_check_canonical_circuit(builder, limb_0_u32, limb_1_u32, inverse, yield_constr) +} + +/// Adds constraints to require that a pair of `u32`s, in little-endian order, represent a canonical +/// field element, i.e. that their combined value is less than `|F|`. Returns their combined value. +pub(crate) fn combine_u32s_check_canonical>( + limb_0_u32: P, + limb_1_u32: P, + inverse: P, + yield_constr: &mut ConstraintConsumer

, +) -> P { + let u32_max = P::from(F::from_canonical_u32(u32::MAX)); + + // This is zero if and only if the high limb is `u32::MAX`. + let diff = u32_max - limb_1_u32; + // If this is zero, the diff is invertible, so the high limb is not `u32::MAX`. + let hi_not_max = inverse * diff - F::ONE; + // If this is zero, either the high limb is not `u32::MAX`, or the low limb is zero. + let hi_not_max_or_lo_zero = hi_not_max * limb_0_u32; + + yield_constr.constraint(hi_not_max_or_lo_zero); + + // Return the combined value. + limb_0_u32 + limb_1_u32 * F::from_canonical_u64(1 << 32) +} + +/// Adds constraints to require that a pair of `u32`s, in little-endian order, represent a canonical +/// field element, i.e. that their combined value is less than `|F|`. Returns their combined value. +pub(crate) fn combine_u32s_check_canonical_circuit, const D: usize>( + builder: &mut CircuitBuilder, + limb_0_u32: ExtensionTarget, + limb_1_u32: ExtensionTarget, + inverse: ExtensionTarget, + yield_constr: &mut RecursiveConstraintConsumer, +) -> ExtensionTarget { + let one = builder.one_extension(); + let u32_max = builder.constant_extension(F::Extension::from_canonical_u32(u32::MAX)); + + // This is zero if and only if the high limb is `u32::MAX`. + let diff = builder.sub_extension(u32_max, limb_1_u32); + // If this is zero, the diff is invertible, so the high limb is not `u32::MAX`. + let hi_not_max = builder.mul_sub_extension(inverse, diff, one); + // If this is zero, either the high limb is not `u32::MAX`, or the low limb is zero. + let hi_not_max_or_lo_zero = builder.mul_extension(hi_not_max, limb_0_u32); + + yield_constr.constraint(builder, hi_not_max_or_lo_zero); + + // Return the combined value. 
+ builder.mul_const_add_extension(F::from_canonical_u64(1 << 32), limb_1_u32, limb_0_u32) +} diff --git a/system_zero/src/alu/mod.rs b/system_zero/src/alu/mod.rs index 4e7e09fa..730ca302 100644 --- a/system_zero/src/alu/mod.rs +++ b/system_zero/src/alu/mod.rs @@ -9,9 +9,7 @@ use starky::vars::StarkEvaluationVars; use crate::alu::addition::{eval_addition, eval_addition_recursively, generate_addition}; use crate::alu::division::{eval_division, eval_division_recursively, generate_division}; -use crate::alu::multiplication::{ - eval_multiplication, eval_multiplication_recursively, generate_multiplication, -}; +use crate::alu::mul_add::{eval_mul_add, eval_mul_add_recursively, generate_mul_add}; use crate::alu::subtraction::{ eval_subtraction, eval_subtraction_recursively, generate_subtraction, }; @@ -20,8 +18,9 @@ use crate::registers::alu::*; use crate::registers::NUM_COLUMNS; mod addition; +mod canonical; mod division; -mod multiplication; +mod mul_add; mod subtraction; pub(crate) fn generate_alu(values: &mut [F; NUM_COLUMNS]) { @@ -30,7 +29,7 @@ pub(crate) fn generate_alu(values: &mut [F; NUM_COLUMNS]) { } else if values[IS_SUB].is_one() { generate_subtraction(values); } else if values[IS_MUL].is_one() { - generate_multiplication(values); + generate_mul_add(values); } else if values[IS_DIV].is_one() { generate_division(values); } @@ -50,7 +49,7 @@ pub(crate) fn eval_alu>( eval_addition(local_values, yield_constr); eval_subtraction(local_values, yield_constr); - eval_multiplication(local_values, yield_constr); + eval_mul_add(local_values, yield_constr); eval_division(local_values, yield_constr); } @@ -70,6 +69,6 @@ pub(crate) fn eval_alu_recursively, const D: usize> eval_addition_recursively(builder, local_values, yield_constr); eval_subtraction_recursively(builder, local_values, yield_constr); - eval_multiplication_recursively(builder, local_values, yield_constr); + eval_mul_add_recursively(builder, local_values, yield_constr); eval_division_recursively(builder, 
local_values, yield_constr); } diff --git a/system_zero/src/alu/mul_add.rs b/system_zero/src/alu/mul_add.rs new file mode 100644 index 00000000..53ba34a2 --- /dev/null +++ b/system_zero/src/alu/mul_add.rs @@ -0,0 +1,88 @@ +use plonky2::field::extension_field::Extendable; +use plonky2::field::field_types::{Field, PrimeField64}; +use plonky2::field::packed_field::PackedField; +use plonky2::hash::hash_types::RichField; +use plonky2::iop::ext_target::ExtensionTarget; +use plonky2::plonk::circuit_builder::CircuitBuilder; +use plonky2_util::assume; +use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; + +use crate::alu::canonical::*; +use crate::registers::alu::*; +use crate::registers::NUM_COLUMNS; + +pub(crate) fn generate_mul_add(values: &mut [F; NUM_COLUMNS]) { + let factor_0 = values[COL_MUL_ADD_FACTOR_0].to_canonical_u64(); + let factor_1 = values[COL_MUL_ADD_FACTOR_1].to_canonical_u64(); + let addend = values[COL_MUL_ADD_ADDEND].to_canonical_u64(); + + // Let the compiler know that each input must fit in 32 bits. + assume(factor_0 <= u32::MAX as u64); + assume(factor_1 <= u32::MAX as u64); + assume(addend <= u32::MAX as u64); + + let output = factor_0 * factor_1 + addend; + + // An advice value used to help verify that the limbs represent a canonical field element. + values[COL_MUL_ADD_RESULT_CANONICAL_INV] = compute_canonical_inv(output); + + values[COL_MUL_ADD_OUTPUT_0] = F::from_canonical_u16(output as u16); + values[COL_MUL_ADD_OUTPUT_1] = F::from_canonical_u16((output >> 16) as u16); + values[COL_MUL_ADD_OUTPUT_2] = F::from_canonical_u16((output >> 32) as u16); + values[COL_MUL_ADD_OUTPUT_3] = F::from_canonical_u16((output >> 48) as u16); +} + +pub(crate) fn eval_mul_add>( + local_values: &[P; NUM_COLUMNS], + yield_constr: &mut ConstraintConsumer

, +) { + let is_mul = local_values[IS_MUL]; + let factor_0 = local_values[COL_MUL_ADD_FACTOR_0]; + let factor_1 = local_values[COL_MUL_ADD_FACTOR_1]; + let addend = local_values[COL_MUL_ADD_ADDEND]; + let output_1 = local_values[COL_MUL_ADD_OUTPUT_0]; + let output_2 = local_values[COL_MUL_ADD_OUTPUT_1]; + let output_3 = local_values[COL_MUL_ADD_OUTPUT_2]; + let output_4 = local_values[COL_MUL_ADD_OUTPUT_3]; + let result_canonical_inv = local_values[COL_MUL_ADD_RESULT_CANONICAL_INV]; + + let computed_output = factor_0 * factor_1 + addend; + let output = combine_u16s_check_canonical( + output_1, + output_2, + output_3, + output_4, + result_canonical_inv, + yield_constr, + ); + yield_constr.constraint(computed_output - output); +} + +pub(crate) fn eval_mul_add_recursively, const D: usize>( + builder: &mut CircuitBuilder, + local_values: &[ExtensionTarget; NUM_COLUMNS], + yield_constr: &mut RecursiveConstraintConsumer, +) { + let is_mul = local_values[IS_MUL]; + let factor_0 = local_values[COL_MUL_ADD_FACTOR_0]; + let factor_1 = local_values[COL_MUL_ADD_FACTOR_1]; + let addend = local_values[COL_MUL_ADD_ADDEND]; + let output_1 = local_values[COL_MUL_ADD_OUTPUT_0]; + let output_2 = local_values[COL_MUL_ADD_OUTPUT_1]; + let output_3 = local_values[COL_MUL_ADD_OUTPUT_2]; + let output_4 = local_values[COL_MUL_ADD_OUTPUT_3]; + let result_canonical_inv = local_values[COL_MUL_ADD_RESULT_CANONICAL_INV]; + + let computed_output = builder.mul_add_extension(factor_0, factor_1, addend); + let output = combine_u16s_check_canonical_circuit( + builder, + output_1, + output_2, + output_3, + output_4, + result_canonical_inv, + yield_constr, + ); + let diff = builder.sub_extension(computed_output, output); + yield_constr.constraint(builder, diff); +} diff --git a/system_zero/src/alu/multiplication.rs b/system_zero/src/alu/multiplication.rs deleted file mode 100644 index a88b42f6..00000000 --- a/system_zero/src/alu/multiplication.rs +++ /dev/null @@ -1,31 +0,0 @@ -use 
plonky2::field::extension_field::Extendable; -use plonky2::field::field_types::{Field, PrimeField64}; -use plonky2::field::packed_field::PackedField; -use plonky2::hash::hash_types::RichField; -use plonky2::iop::ext_target::ExtensionTarget; -use plonky2::plonk::circuit_builder::CircuitBuilder; -use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; - -use crate::registers::alu::*; -use crate::registers::NUM_COLUMNS; - -pub(crate) fn generate_multiplication(values: &mut [F; NUM_COLUMNS]) { - // TODO -} - -pub(crate) fn eval_multiplication>( - local_values: &[P; NUM_COLUMNS], - yield_constr: &mut ConstraintConsumer

, -) { - let is_mul = local_values[IS_MUL]; - // TODO -} - -pub(crate) fn eval_multiplication_recursively, const D: usize>( - builder: &mut CircuitBuilder, - local_values: &[ExtensionTarget; NUM_COLUMNS], - yield_constr: &mut RecursiveConstraintConsumer, -) { - let is_mul = local_values[IS_MUL]; - // TODO -} diff --git a/system_zero/src/registers/alu.rs b/system_zero/src/registers/alu.rs index b4f82dff..e678d8e4 100644 --- a/system_zero/src/registers/alu.rs +++ b/system_zero/src/registers/alu.rs @@ -10,7 +10,7 @@ const START_SHARED_COLS: usize = IS_DIV + 1; /// Within the ALU, there are shared columns which can be used by any arithmetic/logic /// circuit, depending on which one is active this cycle. // Can be increased as needed as other operations are implemented. -const NUM_SHARED_COLS: usize = 3; +const NUM_SHARED_COLS: usize = 4; const fn shared_col(i: usize) -> usize { debug_assert!(i < NUM_SHARED_COLS); @@ -18,20 +18,40 @@ const fn shared_col(i: usize) -> usize { } /// The first value to be added; treated as an unsigned u32. -pub(crate) const COL_ADD_INPUT_1: usize = shared_col(0); +pub(crate) const COL_ADD_INPUT_0: usize = shared_col(0); /// The second value to be added; treated as an unsigned u32. -pub(crate) const COL_ADD_INPUT_2: usize = shared_col(1); +pub(crate) const COL_ADD_INPUT_1: usize = shared_col(1); /// The third value to be added; treated as an unsigned u32. -pub(crate) const COL_ADD_INPUT_3: usize = shared_col(2); +pub(crate) const COL_ADD_INPUT_2: usize = shared_col(2); // Note: Addition outputs three 16-bit chunks, and since these values need to be range-checked // anyway, we might as well use the range check unit's columns as our addition outputs. So the // three proceeding columns are basically aliases, not columns owned by the ALU. /// The first 16-bit chunk of the output, based on little-endian ordering. 
-pub(crate) const COL_ADD_OUTPUT_1: usize = super::range_check_16::col_rc_16_input(0); +pub(crate) const COL_ADD_OUTPUT_0: usize = super::range_check_16::col_rc_16_input(0); /// The second 16-bit chunk of the output, based on little-endian ordering. -pub(crate) const COL_ADD_OUTPUT_2: usize = super::range_check_16::col_rc_16_input(1); +pub(crate) const COL_ADD_OUTPUT_1: usize = super::range_check_16::col_rc_16_input(1); /// The third 16-bit chunk of the output, based on little-endian ordering. -pub(crate) const COL_ADD_OUTPUT_3: usize = super::range_check_16::col_rc_16_input(2); +pub(crate) const COL_ADD_OUTPUT_2: usize = super::range_check_16::col_rc_16_input(2); + +/// The first value to be multiplied; treated as an unsigned u32. +pub(crate) const COL_MUL_ADD_FACTOR_0: usize = shared_col(0); +/// The second value to be multiplied; treated as an unsigned u32. +pub(crate) const COL_MUL_ADD_FACTOR_1: usize = shared_col(1); +/// The value to be added to the product; treated as an unsigned u32. +pub(crate) const COL_MUL_ADD_ADDEND: usize = shared_col(2); + +/// The inverse of `u32::MAX - result_hi`, where `output_hi` is the high 32-bits of the result. +/// See https://hackmd.io/NC-yRmmtRQSvToTHb96e8Q#Checking-element-validity +pub(crate) const COL_MUL_ADD_RESULT_CANONICAL_INV: usize = shared_col(3); + +/// The first 16-bit chunk of the output, based on little-endian ordering. +pub(crate) const COL_MUL_ADD_OUTPUT_0: usize = super::range_check_16::col_rc_16_input(0); +/// The second 16-bit chunk of the output, based on little-endian ordering. +pub(crate) const COL_MUL_ADD_OUTPUT_1: usize = super::range_check_16::col_rc_16_input(1); +/// The third 16-bit chunk of the output, based on little-endian ordering. +pub(crate) const COL_MUL_ADD_OUTPUT_2: usize = super::range_check_16::col_rc_16_input(2); +/// The fourth 16-bit chunk of the output, based on little-endian ordering. 
+pub(crate) const COL_MUL_ADD_OUTPUT_3: usize = super::range_check_16::col_rc_16_input(3); pub(super) const END: usize = super::START_ALU + NUM_SHARED_COLS; From d52fabaf26975be5d00ea520282b9dd23bd80045 Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Mon, 21 Feb 2022 10:18:05 +0100 Subject: [PATCH 10/32] First pass --- starky/src/lib.rs | 1 + starky/src/permutation.rs | 54 ++++++++++++++++++++------------ starky/src/vanishing_poly.rs | 60 ++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+), 20 deletions(-) create mode 100644 starky/src/vanishing_poly.rs diff --git a/starky/src/lib.rs b/starky/src/lib.rs index 1df9629e..51a73479 100644 --- a/starky/src/lib.rs +++ b/starky/src/lib.rs @@ -14,6 +14,7 @@ pub mod prover; pub mod recursive_verifier; pub mod stark; pub mod stark_testing; +pub mod vanishing_poly; pub mod vars; pub mod verifier; diff --git a/starky/src/permutation.rs b/starky/src/permutation.rs index 01cfa8bf..9306d0b2 100644 --- a/starky/src/permutation.rs +++ b/starky/src/permutation.rs @@ -62,26 +62,12 @@ where stark.permutation_batch_size(), ); - // Get a list of instances of our batch-permutation argument. These are permutation arguments - // where the same `Z(x)` polynomial is used to check more than one permutation. - // Before batching, each permutation pair leads to `num_challenges` permutation arguments, so we - // start with the cartesian product of `permutation_pairs` and `0..num_challenges`. Then we - // chunk these arguments based on our batch size. 
- let permutation_batches = permutation_pairs - .iter() - .cartesian_product(0..config.num_challenges) - .chunks(stark.permutation_batch_size()) - .into_iter() - .map(|batch| { - batch - .enumerate() - .map(|(i, (pair, chal))| { - let challenge = permutation_challenge_sets[i].challenges[chal]; - PermutationInstance { pair, challenge } - }) - .collect_vec() - }) - .collect_vec(); + let permutation_batches = get_permutation_batches( + &permutation_pairs, + &permutation_challenge_sets, + config.num_challenges, + stark.permutation_batch_size(), + ); permutation_batches .into_par_iter() @@ -178,3 +164,31 @@ pub(crate) fn get_n_permutation_challenge_sets>( .map(|_| get_permutation_challenge_set(challenger, num_challenges)) .collect() } + +/// Get a list of instances of our batch-permutation argument. These are permutation arguments +/// where the same `Z(x)` polynomial is used to check more than one permutation. +/// Before batching, each permutation pair leads to `num_challenges` permutation arguments, so we +/// start with the cartesian product of `permutation_pairs` and `0..num_challenges`. Then we +/// chunk these arguments based on our batch size. 
+pub(crate) fn get_permutation_batches<'a, F: Field>( + permutation_pairs: &'a [PermutationPair], + permutation_challenge_sets: &[PermutationChallengeSet], + num_challenges: usize, + batch_size: usize, +) -> Vec>> { + permutation_pairs + .iter() + .cartesian_product(0..num_challenges) + .chunks(batch_size) + .into_iter() + .map(|batch| { + batch + .enumerate() + .map(|(i, (pair, chal))| { + let challenge = permutation_challenge_sets[i].challenges[chal]; + PermutationInstance { pair, challenge } + }) + .collect_vec() + }) + .collect() +} diff --git a/starky/src/vanishing_poly.rs b/starky/src/vanishing_poly.rs new file mode 100644 index 00000000..6f7225b5 --- /dev/null +++ b/starky/src/vanishing_poly.rs @@ -0,0 +1,60 @@ +use plonky2::field::extension_field::Extendable; +use plonky2::field::packed_field::PackedField; +use plonky2::hash::hash_types::RichField; +use plonky2::plonk::config::GenericConfig; +use rayon::prelude::*; + +use crate::config::StarkConfig; +use crate::constraint_consumer::ConstraintConsumer; +use crate::permutation::{get_permutation_batches, PermutationChallenge}; +use crate::stark::Stark; +use crate::vars::StarkEvaluationVars; + +pub(crate) fn eval_vanishing_poly( + stark: S, + config: &StarkConfig, + vars: StarkEvaluationVars, + local_zs: &[F::Extension], + next_zs: &[F::Extension], + mut consumer: ConstraintConsumer, + permutation_challenge_sets: &[PermutationChallenge], +) where + F: RichField + Extendable, + C: GenericConfig, + S: Stark, + [(); S::COLUMNS]:, + [(); S::PUBLIC_INPUTS]:, +{ + stark.eval_packed_base(vars, &mut consumer); +} + +fn eval_permutation_checks( + stark: S, + config: &StarkConfig, + vars: StarkEvaluationVars, + local_zs: &[F::Extension], + next_zs: &[F::Extension], + mut consumer: ConstraintConsumer, + permutation_challenge_sets: &[PermutationChallenge], +) where + F: RichField + Extendable, + C: GenericConfig, + S: Stark, + [(); S::COLUMNS]:, + [(); S::PUBLIC_INPUTS]:, +{ + let permutation_pairs = 
stark.permutation_pairs(); + + let permutation_batches = get_permutation_batches( + &permutation_pairs, + &permutation_challenge_sets, + config.num_challenges, + stark.permutation_batch_size(), + ); + + // Each zs value corresponds to a permutation batch. + permutation_batches + .into_par_iter() + .map(|instances| compute_permutation_z_poly(&instances, trace_poly_values)) + .collect() +} From 79ba85eb088a41cda96b76ccf1ca5b50646a2597 Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Mon, 21 Feb 2022 10:52:04 +0100 Subject: [PATCH 11/32] Compiles --- starky/src/vanishing_poly.rs | 57 ++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/starky/src/vanishing_poly.rs b/starky/src/vanishing_poly.rs index 6f7225b5..a3323796 100644 --- a/starky/src/vanishing_poly.rs +++ b/starky/src/vanishing_poly.rs @@ -1,23 +1,24 @@ -use plonky2::field::extension_field::Extendable; -use plonky2::field::packed_field::PackedField; +use plonky2::field::extension_field::{Extendable, FieldExtension}; use plonky2::hash::hash_types::RichField; use plonky2::plonk::config::GenericConfig; -use rayon::prelude::*; use crate::config::StarkConfig; use crate::constraint_consumer::ConstraintConsumer; -use crate::permutation::{get_permutation_batches, PermutationChallenge}; +use crate::permutation::{ + get_permutation_batches, PermutationChallenge, PermutationChallengeSet, PermutationInstance, + PermutationPair, +}; use crate::stark::Stark; use crate::vars::StarkEvaluationVars; pub(crate) fn eval_vanishing_poly( stark: S, config: &StarkConfig, - vars: StarkEvaluationVars, + vars: StarkEvaluationVars, local_zs: &[F::Extension], next_zs: &[F::Extension], - mut consumer: ConstraintConsumer, - permutation_challenge_sets: &[PermutationChallenge], + mut consumer: ConstraintConsumer, + permutation_challenge_sets: &[PermutationChallengeSet], ) where F: RichField + Extendable, C: GenericConfig, @@ -25,17 +26,17 @@ pub(crate) fn eval_vanishing_poly( [(); S::COLUMNS]:, 
[(); S::PUBLIC_INPUTS]:, { - stark.eval_packed_base(vars, &mut consumer); + stark.eval_packed_generic(vars, &mut consumer); } fn eval_permutation_checks( stark: S, config: &StarkConfig, - vars: StarkEvaluationVars, + vars: StarkEvaluationVars, local_zs: &[F::Extension], next_zs: &[F::Extension], - mut consumer: ConstraintConsumer, - permutation_challenge_sets: &[PermutationChallenge], + consumer: &mut ConstraintConsumer, + permutation_challenge_sets: &[PermutationChallengeSet], ) where F: RichField + Extendable, C: GenericConfig, @@ -43,18 +44,42 @@ fn eval_permutation_checks( [(); S::COLUMNS]:, [(); S::PUBLIC_INPUTS]:, { + // TODO: Z_1 check. let permutation_pairs = stark.permutation_pairs(); let permutation_batches = get_permutation_batches( &permutation_pairs, - &permutation_challenge_sets, + permutation_challenge_sets, config.num_challenges, stark.permutation_batch_size(), ); // Each zs value corresponds to a permutation batch. - permutation_batches - .into_par_iter() - .map(|instances| compute_permutation_z_poly(&instances, trace_poly_values)) - .collect() + for (i, instances) in permutation_batches.iter().enumerate() { + // Z(gx) * down = Z x * up + let (reduced_lhs, reduced_rhs): (Vec, Vec) = instances + .iter() + .map(|instance| { + let PermutationInstance { + pair: PermutationPair { column_pairs }, + challenge: PermutationChallenge { beta, gamma }, + } = instance; + column_pairs.iter().rev().fold( + ( + F::Extension::from_basefield(*gamma), + F::Extension::from_basefield(*gamma), + ), + |(lhs, rhs), &(i, j)| { + ( + lhs.scalar_mul(*beta) + vars.local_values[i], + rhs.scalar_mul(*beta) + vars.local_values[j], + ) + }, + ) + }) + .unzip(); + let constraint = next_zs[i] * reduced_rhs.into_iter().product() + - local_zs[i] * reduced_lhs.into_iter().product(); + consumer.constraint(constraint); + } } From 5c1173379e4c6f111f84fcb03fa8fa354cb3f19e Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Mon, 21 Feb 2022 16:05:24 +0100 Subject: [PATCH 12/32] Compiles --- 
starky/src/permutation.rs | 72 ++++++++++++++++++++++++++--- starky/src/prover.rs | 66 +++++++++++++++++++++------ starky/src/vanishing_poly.rs | 87 +++++++++--------------------------- 3 files changed, 139 insertions(+), 86 deletions(-) diff --git a/starky/src/permutation.rs b/starky/src/permutation.rs index 9306d0b2..8a33eb41 100644 --- a/starky/src/permutation.rs +++ b/starky/src/permutation.rs @@ -2,7 +2,7 @@ use itertools::Itertools; use plonky2::field::batch_util::batch_multiply_inplace; -use plonky2::field::extension_field::Extendable; +use plonky2::field::extension_field::{Extendable, FieldExtension}; use plonky2::field::field_types::Field; use plonky2::field::polynomial::PolynomialValues; use plonky2::hash::hash_types::RichField; @@ -11,7 +11,9 @@ use plonky2::plonk::config::{GenericConfig, Hasher}; use rayon::prelude::*; use crate::config::StarkConfig; +use crate::constraint_consumer::ConstraintConsumer; use crate::stark::Stark; +use crate::vars::StarkEvaluationVars; /// A pair of lists of columns, `lhs` and `rhs`, that should be permutations of one another. /// In particular, there should exist some permutation `pi` such that for any `i`, @@ -39,6 +41,7 @@ pub(crate) struct PermutationChallenge { } /// Like `PermutationChallenge`, but with `num_challenges` copies to boost soundness. 
+#[derive(Clone)] pub(crate) struct PermutationChallengeSet { pub(crate) challenges: Vec>, } @@ -49,6 +52,7 @@ pub(crate) fn compute_permutation_z_polys( config: &StarkConfig, challenger: &mut Challenger, trace_poly_values: &[PolynomialValues], + permutation_challenge_sets: &[PermutationChallengeSet], ) -> Vec> where F: RichField + Extendable, @@ -56,12 +60,6 @@ where S: Stark, { let permutation_pairs = stark.permutation_pairs(); - let permutation_challenge_sets = get_n_permutation_challenge_sets( - challenger, - config.num_challenges, - stark.permutation_batch_size(), - ); - let permutation_batches = get_permutation_batches( &permutation_pairs, &permutation_challenge_sets, @@ -192,3 +190,63 @@ pub(crate) fn get_permutation_batches<'a, F: Field>( }) .collect() } + +// TODO: Use slices. +pub struct PermutationCheckData, const D2: usize> { + pub(crate) local_zs: Vec, + pub(crate) next_zs: Vec, + pub(crate) permutation_challenge_sets: Vec>, +} + +pub(crate) fn eval_permutation_checks( + stark: &S, + config: &StarkConfig, + vars: StarkEvaluationVars, + local_zs: &[FE], + next_zs: &[FE], + consumer: &mut ConstraintConsumer, + permutation_challenge_sets: &[PermutationChallengeSet], +) where + F: RichField + Extendable, + FE: FieldExtension, + C: GenericConfig, + S: Stark, + [(); S::COLUMNS]:, + [(); S::PUBLIC_INPUTS]:, +{ + // TODO: Z_1 check. + let permutation_pairs = stark.permutation_pairs(); + + let permutation_batches = get_permutation_batches( + &permutation_pairs, + permutation_challenge_sets, + config.num_challenges, + stark.permutation_batch_size(), + ); + + // Each zs value corresponds to a permutation batch. 
+ for (i, instances) in permutation_batches.iter().enumerate() { + // Z(gx) * down = Z x * up + let (reduced_lhs, reduced_rhs): (Vec, Vec) = instances + .iter() + .map(|instance| { + let PermutationInstance { + pair: PermutationPair { column_pairs }, + challenge: PermutationChallenge { beta, gamma }, + } = instance; + column_pairs.iter().rev().fold( + (FE::from_basefield(*gamma), FE::from_basefield(*gamma)), + |(lhs, rhs), &(i, j)| { + ( + lhs.scalar_mul(*beta) + vars.local_values[i], + rhs.scalar_mul(*beta) + vars.local_values[j], + ) + }, + ) + }) + .unzip(); + let constraint = next_zs[i] * reduced_rhs.into_iter().product() + - local_zs[i] * reduced_lhs.into_iter().product(); + consumer.constraint(constraint); + } +} diff --git a/starky/src/prover.rs b/starky/src/prover.rs index be1f198b..0206cb95 100644 --- a/starky/src/prover.rs +++ b/starky/src/prover.rs @@ -18,9 +18,13 @@ use rayon::prelude::*; use crate::config::StarkConfig; use crate::constraint_consumer::ConstraintConsumer; -use crate::permutation::compute_permutation_z_polys; +use crate::permutation::PermutationCheckData; +use crate::permutation::{ + compute_permutation_z_polys, get_n_permutation_challenge_sets, PermutationChallengeSet, +}; use crate::proof::{StarkOpeningSet, StarkProof, StarkProofWithPublicInputs}; use crate::stark::Stark; +use crate::vanishing_poly::eval_vanishing_poly; use crate::vars::StarkEvaluationVars; pub fn prove( @@ -80,28 +84,41 @@ where challenger.observe_cap(&trace_cap); // Permutation arguments. 
- let permutation_zs_commitment = if stark.uses_permutation_args() { + let permutation_zs_commitment_challenges = if stark.uses_permutation_args() { + let permutation_challenge_sets = get_n_permutation_challenge_sets( + &mut challenger, + config.num_challenges, + stark.permutation_batch_size(), + ); let permutation_z_polys = compute_permutation_z_polys::( &stark, config, &mut challenger, &trace_poly_values, + &permutation_challenge_sets, ); + timed!( timing, "compute permutation Z commitments", - Some(PolynomialBatch::from_values( - permutation_z_polys, - rate_bits, - false, - config.fri_config.cap_height, - timing, - None, + Some(( + PolynomialBatch::from_values( + permutation_z_polys, + rate_bits, + false, + config.fri_config.cap_height, + timing, + None, + ), + permutation_challenge_sets )) ) } else { None }; + let permutation_zs_commitment = permutation_zs_commitment_challenges + .as_ref() + .map(|(comm, _)| comm); let permutation_zs_cap = permutation_zs_commitment .as_ref() .map(|commit| commit.merkle_tree.cap.clone()); @@ -113,10 +130,11 @@ where let quotient_polys = compute_quotient_polys::( &stark, &trace_commitment, + &permutation_zs_commitment_challenges, public_inputs, alphas, degree_bits, - rate_bits, + config, ); let all_quotient_chunks = quotient_polys .into_par_iter() @@ -156,13 +174,13 @@ where zeta, g, &trace_commitment, - permutation_zs_commitment.as_ref(), + permutation_zs_commitment, "ient_commitment, ); challenger.observe_openings(&openings.to_fri_openings()); let initial_merkle_trees = once(&trace_commitment) - .chain(permutation_zs_commitment.as_ref()) + .chain(permutation_zs_commitment) .chain(once("ient_commitment)) .collect_vec(); @@ -196,10 +214,14 @@ where fn compute_quotient_polys( stark: &S, trace_commitment: &PolynomialBatch, + permutation_zs_commitment_challenges: &Option<( + PolynomialBatch, + Vec>, + )>, public_inputs: [F; S::PUBLIC_INPUTS], alphas: Vec, degree_bits: usize, - rate_bits: usize, + config: &StarkConfig, ) -> Vec> 
where F: RichField + Extendable, @@ -209,6 +231,7 @@ where [(); S::PUBLIC_INPUTS]:, { let degree = 1 << degree_bits; + let rate_bits = config.fri_config.rate_bits; let quotient_degree_bits = log2_ceil(stark.quotient_degree_factor()); assert!( @@ -255,7 +278,22 @@ where next_values: &get_at_index(trace_commitment, (i + next_step) % size), public_inputs: &public_inputs, }; - stark.eval_packed_base(vars, &mut consumer); + let permutation_check_data = permutation_zs_commitment_challenges.as_ref().map( + |(permutation_zs_commitment, permutation_challenge_sets)| PermutationCheckData { + local_zs: get_at_index(&permutation_zs_commitment, i).to_vec(), + next_zs: get_at_index(&permutation_zs_commitment, (i + next_step) % size) + .to_vec(), + permutation_challenge_sets: permutation_challenge_sets.to_vec(), + }, + ); + eval_vanishing_poly::( + stark, + config, + vars, + permutation_check_data, + &mut consumer, + ); + // stark.eval_packed_base(vars, &mut consumer); // TODO: Add in constraints for permutation arguments. // TODO: Fix this once we use a genuine `PackedField`. 
let mut constraints_evals = consumer.accumulators(); diff --git a/starky/src/vanishing_poly.rs b/starky/src/vanishing_poly.rs index a3323796..dc598167 100644 --- a/starky/src/vanishing_poly.rs +++ b/starky/src/vanishing_poly.rs @@ -4,82 +4,39 @@ use plonky2::plonk::config::GenericConfig; use crate::config::StarkConfig; use crate::constraint_consumer::ConstraintConsumer; -use crate::permutation::{ - get_permutation_batches, PermutationChallenge, PermutationChallengeSet, PermutationInstance, - PermutationPair, -}; +use crate::permutation::{eval_permutation_checks, PermutationCheckData}; use crate::stark::Stark; use crate::vars::StarkEvaluationVars; -pub(crate) fn eval_vanishing_poly( - stark: S, +pub(crate) fn eval_vanishing_poly( + stark: &S, config: &StarkConfig, - vars: StarkEvaluationVars, - local_zs: &[F::Extension], - next_zs: &[F::Extension], - mut consumer: ConstraintConsumer, - permutation_challenge_sets: &[PermutationChallengeSet], + vars: StarkEvaluationVars, + permutation_data: Option>, + consumer: &mut ConstraintConsumer, ) where F: RichField + Extendable, + FE: FieldExtension, C: GenericConfig, S: Stark, [(); S::COLUMNS]:, [(); S::PUBLIC_INPUTS]:, { - stark.eval_packed_generic(vars, &mut consumer); -} - -fn eval_permutation_checks( - stark: S, - config: &StarkConfig, - vars: StarkEvaluationVars, - local_zs: &[F::Extension], - next_zs: &[F::Extension], - consumer: &mut ConstraintConsumer, - permutation_challenge_sets: &[PermutationChallengeSet], -) where - F: RichField + Extendable, - C: GenericConfig, - S: Stark, - [(); S::COLUMNS]:, - [(); S::PUBLIC_INPUTS]:, -{ - // TODO: Z_1 check. 
- let permutation_pairs = stark.permutation_pairs(); - - let permutation_batches = get_permutation_batches( - &permutation_pairs, + stark.eval_packed_generic(vars, consumer); + if let Some(PermutationCheckData { + local_zs, + next_zs, permutation_challenge_sets, - config.num_challenges, - stark.permutation_batch_size(), - ); - - // Each zs value corresponds to a permutation batch. - for (i, instances) in permutation_batches.iter().enumerate() { - // Z(gx) * down = Z x * up - let (reduced_lhs, reduced_rhs): (Vec, Vec) = instances - .iter() - .map(|instance| { - let PermutationInstance { - pair: PermutationPair { column_pairs }, - challenge: PermutationChallenge { beta, gamma }, - } = instance; - column_pairs.iter().rev().fold( - ( - F::Extension::from_basefield(*gamma), - F::Extension::from_basefield(*gamma), - ), - |(lhs, rhs), &(i, j)| { - ( - lhs.scalar_mul(*beta) + vars.local_values[i], - rhs.scalar_mul(*beta) + vars.local_values[j], - ) - }, - ) - }) - .unzip(); - let constraint = next_zs[i] * reduced_rhs.into_iter().product() - - local_zs[i] * reduced_lhs.into_iter().product(); - consumer.constraint(constraint); + }) = permutation_data + { + eval_permutation_checks::( + stark, + config, + vars, + &local_zs, + &next_zs, + consumer, + &permutation_challenge_sets, + ); } } From 85c1e1d5e07bfc4c4cb34a85373f09392a18e5c1 Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Mon, 21 Feb 2022 18:00:03 +0100 Subject: [PATCH 13/32] Should work (does not) --- starky/src/fibonacci_stark.rs | 29 ++++++++++++++++++++++------- starky/src/prover.rs | 17 +++++++++-------- starky/src/verifier.rs | 16 +++++++++++++++- 3 files changed, 46 insertions(+), 16 deletions(-) diff --git a/starky/src/fibonacci_stark.rs b/starky/src/fibonacci_stark.rs index a0204359..2bbd333f 100644 --- a/starky/src/fibonacci_stark.rs +++ b/starky/src/fibonacci_stark.rs @@ -2,16 +2,21 @@ use std::marker::PhantomData; use plonky2::field::extension_field::{Extendable, FieldExtension}; use 
plonky2::field::packed_field::PackedField; +use plonky2::fri::structure::{FriInstanceInfo, FriInstanceInfoTarget}; use plonky2::hash::hash_types::RichField; +use plonky2::iop::ext_target::ExtensionTarget; use plonky2::plonk::circuit_builder::CircuitBuilder; +use crate::config::StarkConfig; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; +use crate::permutation::PermutationPair; use crate::stark::Stark; use crate::vars::{StarkEvaluationTargets, StarkEvaluationVars}; /// Toy STARK system used for testing. -/// Computes a Fibonacci sequence with state `[x0, x1]` using the state transition -/// `x0 <- x1, x1 <- x0 + x1`. +/// Computes a Fibonacci sequence with state `[x0, x1, i, j]` using the state transition +/// `x0' <- x1, x1' <- x0 + x1, i' <- i+1, j' <- j+1`. +/// Note: The `i, j` columns are used to test the permutation argument. #[derive(Copy, Clone)] struct FibonacciStark, const D: usize> { num_rows: usize, @@ -34,21 +39,25 @@ impl, const D: usize> FibonacciStark { } } - /// Generate the trace using `x0, x1` as inital state values. + /// Generate the trace using `x0, x1, 0, 1` as initial state values. 
fn generate_trace(&self, x0: F, x1: F) -> Vec<[F; Self::COLUMNS]> { - (0..self.num_rows) - .scan([x0, x1], |acc, _| { + let mut trace = (0..self.num_rows) + .scan([x0, x1, F::ZERO, F::ONE], |acc, _| { let tmp = *acc; acc[0] = tmp[1]; acc[1] = tmp[0] + tmp[1]; + acc[2] = tmp[2] + F::ONE; + acc[3] = tmp[3] + F::ONE; Some(tmp) }) - .collect() + .collect::>(); + trace[self.num_rows - 1][3] = F::ZERO; + trace } } impl, const D: usize> Stark for FibonacciStark { - const COLUMNS: usize = 2; + const COLUMNS: usize = 4; const PUBLIC_INPUTS: usize = 3; fn eval_packed_generic( @@ -105,6 +114,12 @@ impl, const D: usize> Stark for FibonacciStar fn constraint_degree(&self) -> usize { 2 } + + fn permutation_pairs(&self) -> Vec { + vec![PermutationPair { + column_pairs: vec![(2, 3)], + }] + } } #[cfg(test)] diff --git a/starky/src/prover.rs b/starky/src/prover.rs index 0206cb95..e0c14dde 100644 --- a/starky/src/prover.rs +++ b/starky/src/prover.rs @@ -211,10 +211,10 @@ where /// Computes the quotient polynomials `(sum alpha^i C_i(x)) / Z_H(x)` for `alpha` in `alphas`, /// where the `C_i`s are the Stark constraints. -fn compute_quotient_polys( +fn compute_quotient_polys<'a, F, C, S, const D: usize>( stark: &S, - trace_commitment: &PolynomialBatch, - permutation_zs_commitment_challenges: &Option<( + trace_commitment: &'a PolynomialBatch, + permutation_zs_commitment_challenges: &'a Option<( PolynomialBatch, Vec>, )>, @@ -251,9 +251,8 @@ where let z_h_on_coset = ZeroPolyOnCoset::::new(degree_bits, quotient_degree_bits); // Retrieve the LDE values at index `i`. - let get_at_index = |comm: &PolynomialBatch, i: usize| -> [F; S::COLUMNS] { - comm.get_lde_values(i * step).try_into().unwrap() - }; + let get_at_index = + |comm: &'a PolynomialBatch, i: usize| -> &'a [F] { comm.get_lde_values(i * step) }; // Last element of the subgroup. 
let last = F::primitive_root_of_unity(degree_bits).inverse(); let size = degree << quotient_degree_bits; @@ -274,8 +273,10 @@ where lagrange_last.values[i], ); let vars = StarkEvaluationVars:: { - local_values: &get_at_index(trace_commitment, i), - next_values: &get_at_index(trace_commitment, (i + next_step) % size), + local_values: &get_at_index(trace_commitment, i).try_into().unwrap(), + next_values: &get_at_index(trace_commitment, (i + next_step) % size) + .try_into() + .unwrap(), public_inputs: &public_inputs, }; let permutation_check_data = permutation_zs_commitment_challenges.as_ref().map( diff --git a/starky/src/verifier.rs b/starky/src/verifier.rs index 686ecd98..1603b208 100644 --- a/starky/src/verifier.rs +++ b/starky/src/verifier.rs @@ -11,8 +11,10 @@ use plonky2::plonk::plonk_common::reduce_with_powers; use crate::config::StarkConfig; use crate::constraint_consumer::ConstraintConsumer; +use crate::permutation::PermutationCheckData; use crate::proof::{StarkOpeningSet, StarkProofChallenges, StarkProofWithPublicInputs}; use crate::stark::Stark; +use crate::vanishing_poly::eval_vanishing_poly; use crate::vars::StarkEvaluationVars; pub fn verify_stark_proof< @@ -88,7 +90,19 @@ where l_1, l_last, ); - stark.eval_ext(vars, &mut consumer); + // stark.eval_ext(vars, &mut consumer); + let permutation_data = stark.uses_permutation_args().then(|| PermutationCheckData { + local_zs: permutation_zs.as_ref().unwrap().clone(), + next_zs: permutation_zs_right.as_ref().unwrap().clone(), + permutation_challenge_sets: challenges.permutation_challenge_sets, + }); + eval_vanishing_poly::( + &stark, + config, + vars, + permutation_data, + &mut consumer, + ); // TODO: Add in constraints for permutation arguments. 
let vanishing_polys_zeta = consumer.accumulators(); From c7af63957995d87c7527ceb0341bbfcc431dc413 Mon Sep 17 00:00:00 2001 From: Jakub Nabaglo Date: Mon, 21 Feb 2022 17:45:01 -0800 Subject: [PATCH 14/32] Restore vectorization to full Poseidon rounds on Aarch64 (#498) * Restore vectorization to full Poseidon layers on Aarch64 * Typos --- plonky2/src/hash/arch/aarch64/mod.rs | 4 +- .../arch/aarch64/poseidon_goldilocks_neon.rs | 588 +++++++----------- plonky2/src/hash/poseidon_goldilocks.rs | 28 +- 3 files changed, 250 insertions(+), 370 deletions(-) diff --git a/plonky2/src/hash/arch/aarch64/mod.rs b/plonky2/src/hash/arch/aarch64/mod.rs index ba86797d..b8ae14af 100644 --- a/plonky2/src/hash/arch/aarch64/mod.rs +++ b/plonky2/src/hash/arch/aarch64/mod.rs @@ -1,2 +1,2 @@ -// #[cfg(target_feature = "neon")] -// pub(crate) mod poseidon_goldilocks_neon; +#[cfg(target_feature = "neon")] +pub(crate) mod poseidon_goldilocks_neon; diff --git a/plonky2/src/hash/arch/aarch64/poseidon_goldilocks_neon.rs b/plonky2/src/hash/arch/aarch64/poseidon_goldilocks_neon.rs index f2276506..352456e7 100644 --- a/plonky2/src/hash/arch/aarch64/poseidon_goldilocks_neon.rs +++ b/plonky2/src/hash/arch/aarch64/poseidon_goldilocks_neon.rs @@ -2,37 +2,24 @@ use std::arch::aarch64::*; use std::arch::asm; +use std::mem::transmute; -use plonky2_field::field_types::Field64; use plonky2_field::goldilocks_field::GoldilocksField; use plonky2_util::branch_hint; use static_assertions::const_assert; use unroll::unroll_for_loops; -use crate::hash::poseidon::{ - Poseidon, ALL_ROUND_CONSTANTS, HALF_N_FULL_ROUNDS, N_PARTIAL_ROUNDS, N_ROUNDS, -}; +use crate::hash::poseidon::Poseidon; // ========================================== CONSTANTS =========================================== const WIDTH: usize = 12; -// The order below is arbitrary. Repeated coefficients have been removed so these constants fit in -// two registers. 
-// TODO: ensure this is aligned to 16 bytes (for vector loads), ideally on the same cacheline -const MDS_CONSTS: [u32; 8] = [ - 0xffffffff, - 1 << 1, - 1 << 3, - 1 << 5, - 1 << 8, - 1 << 10, - 1 << 12, - 1 << 16, -]; +const EPSILON: u64 = 0xffffffff; -// The round constants to be applied by the second set of full rounds. These are just the usual round constants, -// shifted by one round, with zeros shifted in. +// The round constants to be applied by the second set of full rounds. These are just the usual +// round constants, shifted by one round, with zeros shifted in. +/* const fn make_final_round_constants() -> [u64; WIDTH * HALF_N_FULL_ROUNDS] { let mut res = [0; WIDTH * HALF_N_FULL_ROUNDS]; let mut i: usize = 0; @@ -43,6 +30,7 @@ const fn make_final_round_constants() -> [u64; WIDTH * HALF_N_FULL_ROUNDS] { res } const FINAL_ROUND_CONSTANTS: [u64; WIDTH * HALF_N_FULL_ROUNDS] = make_final_round_constants(); +*/ // ===================================== COMPILE-TIME CHECKS ====================================== @@ -52,9 +40,12 @@ const FINAL_ROUND_CONSTANTS: [u64; WIDTH * HALF_N_FULL_ROUNDS] = make_final_roun const fn check_mds_matrix() -> bool { // Can't == two arrays in a const_assert! (: let mut i = 0; - let wanted_matrix_exps = [0, 0, 1, 0, 3, 5, 1, 8, 12, 3, 16, 10]; + let wanted_matrix_circ = [17, 15, 41, 16, 2, 28, 13, 13, 39, 18, 34, 20]; + let wanted_matrix_diag = [8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; while i < WIDTH { - if ::MDS_MATRIX_EXPS[i] != wanted_matrix_exps[i] { + if ::MDS_MATRIX_CIRC[i] != wanted_matrix_circ[i] + || ::MDS_MATRIX_DIAG[i] != wanted_matrix_diag[i] + { return false; } i += 1; @@ -63,37 +54,10 @@ const fn check_mds_matrix() -> bool { } const_assert!(check_mds_matrix()); -/// The maximum amount by which the MDS matrix will multiply the input. -/// i.e. max(MDS(state)) <= mds_matrix_inf_norm() * max(state). 
-const fn mds_matrix_inf_norm() -> u64 { - let mut cumul = 0; - let mut i = 0; - while i < WIDTH { - cumul += 1 << ::MDS_MATRIX_EXPS[i]; - i += 1; - } - cumul -} - -/// Ensure that adding round constants to the low result of the MDS multiplication can never -/// overflow. -#[allow(dead_code)] -const fn check_round_const_bounds_mds() -> bool { - let max_mds_res = mds_matrix_inf_norm() * (u32::MAX as u64); - let mut i = WIDTH; // First const layer is handled specially. - while i < WIDTH * N_ROUNDS { - if ALL_ROUND_CONSTANTS[i].overflowing_add(max_mds_res).1 { - return false; - } - i += 1; - } - true -} -const_assert!(check_round_const_bounds_mds()); - /// Ensure that the first WIDTH round constants are in canonical* form. This is required because /// the first constant layer does not handle double overflow. /// *: round_const == GoldilocksField::ORDER is safe. +/* #[allow(dead_code)] const fn check_round_const_bounds_init() -> bool { let mut i = 0; @@ -106,11 +70,9 @@ const fn check_round_const_bounds_init() -> bool { true } const_assert!(check_round_const_bounds_init()); - +*/ // ====================================== SCALAR ARITHMETIC ======================================= -const EPSILON: u64 = 0xffffffff; - /// Addition modulo ORDER accounting for wraparound. Correct only when a + b < 2**64 + ORDER. #[inline(always)] unsafe fn add_with_wraparound(a: u64, b: u64) -> u64 { @@ -133,7 +95,16 @@ unsafe fn add_with_wraparound(a: u64, b: u64) -> u64 { /// Subtraction of a and (b >> 32) modulo ORDER accounting for wraparound. #[inline(always)] unsafe fn sub_with_wraparound_lsr32(a: u64, b: u64) -> u64 { - let b_hi = b >> 32; + let mut b_hi = b >> 32; + // Make sure that LLVM emits two separate instructions for the shift and the subtraction. This + // reduces pressure on the execution units with access to the flags, as they are no longer + // responsible for the shift. 
The hack is to insert a fake computation between the two + // instructions with an `asm` block to make LLVM think that they can't be merged. + asm!( + "/* {0} */", // Make Rust think we're using the register. + inlateout(reg) b_hi, + options(nomem, nostack, preserves_flags, pure), + ); // This could be done with a.overflowing_add(b_hi), but `checked_sub` signals to the compiler // that overflow is unlikely (note: this is a standard library implementation detail, not part // of the spec). @@ -153,7 +124,8 @@ unsafe fn sub_with_wraparound_lsr32(a: u64, b: u64) -> u64 { unsafe fn mul_epsilon(x: u64) -> u64 { let res; asm!( - // Use UMULL to save one instruction. The compiler emits two: extract the low word and then multiply. + // Use UMULL to save one instruction. The compiler emits two: extract the low word and then + // multiply. "umull {res}, {x:w}, {epsilon:w}", x = in(reg) x, epsilon = in(reg) EPSILON, @@ -179,8 +151,9 @@ unsafe fn multiply(x: u64, y: u64) -> u64 { // ==================================== STANDALONE CONST LAYER ===================================== -/// Standalone const layer. Run only once, at the start of round 1. Remaining const layers are fused with the preceeding -/// MDS matrix multiplication. +/// Standalone const layer. Run only once, at the start of round 1. Remaining const layers are fused +/// with the preceeding MDS matrix multiplication. +/* #[inline(always)] #[unroll_for_loops] unsafe fn const_layer_full( @@ -195,15 +168,15 @@ unsafe fn const_layer_full( } state } - +*/ // ========================================== FULL ROUNDS ========================================== /// Full S-box. #[inline(always)] #[unroll_for_loops] unsafe fn sbox_layer_full(state: [u64; WIDTH]) -> [u64; WIDTH] { - // This is done in scalar. S-boxes in vector are only slightly slower throughput-wise but have an insane latency - // (~100 cycles) on the M1. + // This is done in scalar. 
S-boxes in vector are only slightly slower throughput-wise but have + // an insane latency (~100 cycles) on the M1. let mut state2 = [0u64; WIDTH]; assert!(WIDTH == 12); @@ -228,297 +201,227 @@ unsafe fn sbox_layer_full(state: [u64; WIDTH]) -> [u64; WIDTH] { state7 } -// Aliases for readability. E.g. MDS[5] can be found in mdsv5[MDSI5]. -const MDSI2: i32 = 1; // MDS[2] == 1 -const MDSI4: i32 = 2; // MDS[4] == 3 -const MDSI5: i32 = 3; // MDS[5] == 5 -const MDSI6: i32 = 1; // MDS[6] == 1 -const MDSI7: i32 = 0; // MDS[7] == 8 -const MDSI8: i32 = 2; // MDS[8] == 12 -const MDSI9: i32 = 2; // MDS[9] == 3 -const MDSI10: i32 = 3; // MDS[10] == 16 -const MDSI11: i32 = 1; // MDS[11] == 10 - #[inline(always)] unsafe fn mds_reduce( - [[cumul0_a, cumul0_b], [cumul1_a, cumul1_b]]: [[uint64x2_t; 2]; 2], + // `cumul_a` and `cumul_b` represent two separate field elements. We take advantage of + // vectorization by reducing them simultaneously. + [cumul_a, cumul_b]: [uint32x4_t; 2], ) -> uint64x2_t { - // mds_consts0 == [0xffffffff, 1 << 1, 1 << 3, 1 << 5] - let mds_consts0: uint32x4_t = vld1q_u32((&MDS_CONSTS[0..4]).as_ptr().cast::()); - - // Merge accumulators - let cumul0 = vaddq_u64(cumul0_a, cumul0_b); - let cumul1 = vaddq_u64(cumul1_a, cumul1_b); - - // Swizzle - let res_lo = vzip1q_u64(cumul0, cumul1); - let res_hi = vzip2q_u64(cumul0, cumul1); - - // Reduce from u96 - let res_hi = vsraq_n_u64::<32>(res_hi, res_lo); - let res_lo = vsliq_n_u64::<32>(res_lo, res_hi); - - // Extract high 32-bits. - let res_hi_hi = vget_low_u32(vuzp2q_u32( - vreinterpretq_u32_u64(res_hi), - vreinterpretq_u32_u64(res_hi), - )); - - // Multiply by EPSILON and accumulate. 
- let res_unadj = vmlal_laneq_u32::<0>(res_lo, res_hi_hi, mds_consts0); - let res_adj = vcgtq_u64(res_lo, res_unadj); - vsraq_n_u64::<32>(res_unadj, res_adj) + // Form: + // `lo = [cumul_a[0] + cumul_a[2] * 2**32, cumul_b[0] + cumul_b[2] * 2**32]` + // `hi = [cumul_a[1] + cumul_a[3] * 2**32, cumul_b[1] + cumul_b[3] * 2**32]` + // Observe that the result `== lo + hi * 2**16 (mod Goldilocks)`. + let mut lo = vreinterpretq_u64_u32(vuzp1q_u32(cumul_a, cumul_b)); + let mut hi = vreinterpretq_u64_u32(vuzp2q_u32(cumul_a, cumul_b)); + // Add the high 48 bits of `lo` to `hi`. This cannot overflow. + hi = vsraq_n_u64::<16>(hi, lo); + // Now, result `== lo.bits[0..16] + hi * 2**16 (mod Goldilocks)`. + // Set the high 48 bits of `lo` to the low 48 bits of `hi`. + lo = vsliq_n_u64::<16>(lo, hi); + // At this point, result `== lo + hi.bits[48..64] * 2**64 (mod Goldilocks)`. + // It remains to fold `hi.bits[48..64]` into `lo`. + let top = { + // Extract the top 16 bits of `hi` as a `u32`. + // Interpret `hi` as a vector of bytes, so we can use a table lookup instruction. + let hi_u8 = vreinterpretq_u8_u64(hi); + // Indices defining the permutation. `0xff` is out of bounds, producing `0`. + let top_idx = + transmute::<[u8; 8], uint8x8_t>([0x06, 0x07, 0xff, 0xff, 0x0e, 0x0f, 0xff, 0xff]); + let top_u8 = vqtbl1_u8(hi_u8, top_idx); + vreinterpret_u32_u8(top_u8) + }; + // result `== lo + top * 2**64 (mod Goldilocks)`. + let adj_lo = vmlal_n_u32(lo, top, EPSILON as u32); + let wraparound_mask = vcgtq_u64(lo, adj_lo); + vsraq_n_u64::<32>(adj_lo, wraparound_mask) // Add epsilon on overflow. 
} #[inline(always)] -unsafe fn mds_const_layers_full( - state: [u64; WIDTH], - round_constants: &[u64; WIDTH], -) -> [u64; WIDTH] { - // mds_consts0 == [0xffffffff, 1 << 1, 1 << 3, 1 << 5] - // mds_consts1 == [1 << 8, 1 << 10, 1 << 12, 1 << 16] - let mds_consts0: uint32x4_t = vld1q_u32((&MDS_CONSTS[0..4]).as_ptr().cast::()); - let mds_consts1: uint32x4_t = vld1q_u32((&MDS_CONSTS[4..8]).as_ptr().cast::()); +unsafe fn mds_layer_full(state: [u64; WIDTH]) -> [u64; WIDTH] { + // This function performs an MDS multiplication in complex FFT space. + // However, instead of performing a width-12 FFT, we perform three width-4 FFTs, which is + // cheaper. The 12x12 matrix-vector multiplication (a convolution) becomes two 3x3 real + // matrix-vector multiplications and one 3x3 complex matrix-vector multiplication. - // Aliases for readability. E.g. MDS[5] can be found in mdsv5[mdsi5]. MDS[0], MDS[1], and - // MDS[3] are 0, so they are not needed. - let mdsv2 = mds_consts0; // MDS[2] == 1 - let mdsv4 = mds_consts0; // MDS[4] == 3 - let mdsv5 = mds_consts0; // MDS[5] == 5 - let mdsv6 = mds_consts0; // MDS[6] == 1 - let mdsv7 = mds_consts1; // MDS[7] == 8 - let mdsv8 = mds_consts1; // MDS[8] == 12 - let mdsv9 = mds_consts0; // MDS[9] == 3 - let mdsv10 = mds_consts1; // MDS[10] == 16 - let mdsv11 = mds_consts1; // MDS[11] == 10 + // We split each 64-bit into four chunks of 16 bits. To prevent overflow, each chunk is 32 bits + // long. Each NEON vector below represents one field element and consists of four 32-bit chunks: + // `elem == vector[0] + vector[1] * 2**16 + vector[2] * 2**32 + vector[3] * 2**48`. - // For i even, we combine state[i] and state[i + 1] into one vector to save on registers. - // Thus, state1 actually contains state0 and state1 but is only used in the intrinsics that - // access the high high doubleword. 
- let state1: uint32x4_t = - vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(state[0]), vcreate_u64(state[1]))); - let state3: uint32x4_t = - vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(state[2]), vcreate_u64(state[3]))); - let state5: uint32x4_t = - vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(state[4]), vcreate_u64(state[5]))); - let state7: uint32x4_t = - vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(state[6]), vcreate_u64(state[7]))); - let state9: uint32x4_t = - vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(state[8]), vcreate_u64(state[9]))); - let state11: uint32x4_t = - vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(state[10]), vcreate_u64(state[11]))); - // state0 is an alias to the low doubleword of state1. The compiler should use one register for both. - let state0: uint32x2_t = vget_low_u32(state1); - let state2: uint32x2_t = vget_low_u32(state3); - let state4: uint32x2_t = vget_low_u32(state5); - let state6: uint32x2_t = vget_low_u32(state7); - let state8: uint32x2_t = vget_low_u32(state9); - let state10: uint32x2_t = vget_low_u32(state11); + // Constants that we multiply by. + let mut consts: uint32x4_t = transmute::<[u32; 4], _>([2, 4, 8, 16]); - // Two accumulators per output to hide latency. Each accumulator is a vector of two u64s, - // containing the result for the low 32 bits and the high 32 bits. Thus, the final result at - // index i is (cumuli_a[0] + cumuli_b[0]) + (cumuli_a[1] + cumuli_b[1]) * 2**32. + // Prevent LLVM from turning fused multiply (by power of 2)-add (1 instruction) into shift and + // add (two instructions). This fake `asm` block means that LLVM no longer knows the contents of + // `consts`. + asm!("/* {0:v} */", // Make Rust think the register is being used. + inout(vreg) consts, + options(pure, nomem, nostack, preserves_flags), + ); - // Start by loading the round constants. 
- let mut cumul0_a = vcombine_u64(vld1_u64(&round_constants[0]), vcreate_u64(0)); - let mut cumul1_a = vcombine_u64(vld1_u64(&round_constants[1]), vcreate_u64(0)); - let mut cumul2_a = vcombine_u64(vld1_u64(&round_constants[2]), vcreate_u64(0)); - let mut cumul3_a = vcombine_u64(vld1_u64(&round_constants[3]), vcreate_u64(0)); - let mut cumul4_a = vcombine_u64(vld1_u64(&round_constants[4]), vcreate_u64(0)); - let mut cumul5_a = vcombine_u64(vld1_u64(&round_constants[5]), vcreate_u64(0)); - let mut cumul6_a = vcombine_u64(vld1_u64(&round_constants[6]), vcreate_u64(0)); - let mut cumul7_a = vcombine_u64(vld1_u64(&round_constants[7]), vcreate_u64(0)); - let mut cumul8_a = vcombine_u64(vld1_u64(&round_constants[8]), vcreate_u64(0)); - let mut cumul9_a = vcombine_u64(vld1_u64(&round_constants[9]), vcreate_u64(0)); - let mut cumul10_a = vcombine_u64(vld1_u64(&round_constants[10]), vcreate_u64(0)); - let mut cumul11_a = vcombine_u64(vld1_u64(&round_constants[11]), vcreate_u64(0)); + // Four length-3 complex FFTs. + let mut state_fft = [vdupq_n_u32(0); 12]; + for i in 0..3 { + // Interpret each field element as a 4-vector of `u16`s. + let x0 = vcreate_u16(state[i]); + let x1 = vcreate_u16(state[i + 3]); + let x2 = vcreate_u16(state[i + 6]); + let x3 = vcreate_u16(state[i + 9]); - // Now the matrix multiplication. - // MDS exps: [0, 0, 1, 0, 3, 5, 1, 8, 12, 3, 16, 10] - // out[i] += in[j] << mds[j - i] + // `vaddl_u16` and `vsubl_u16` yield 4-vectors of `u32`s. 
+ let y0 = vaddl_u16(x0, x2); + let y1 = vaddl_u16(x1, x3); + let y2 = vsubl_u16(x0, x2); + let y3 = vsubl_u16(x1, x3); - let mut cumul0_b = vshll_n_u32::<0>(state0); // MDS[0] - let mut cumul1_b = vshll_n_u32::<10>(state0); // MDS[11] - let mut cumul2_b = vshll_n_u32::<16>(state0); // MDS[10] - let mut cumul3_b = vshll_n_u32::<3>(state0); // MDS[9] - let mut cumul4_b = vshll_n_u32::<12>(state0); // MDS[8] - let mut cumul5_b = vshll_n_u32::<8>(state0); // MDS[7] - let mut cumul6_b = vshll_n_u32::<1>(state0); // MDS[6] - let mut cumul7_b = vshll_n_u32::<5>(state0); // MDS[5] - let mut cumul8_b = vshll_n_u32::<3>(state0); // MDS[4] - let mut cumul9_b = vshll_n_u32::<0>(state0); // MDS[3] - let mut cumul10_b = vshll_n_u32::<1>(state0); // MDS[2] - let mut cumul11_b = vshll_n_u32::<0>(state0); // MDS[1] + let z0 = vaddq_u32(y0, y1); + let z1 = vsubq_u32(y0, y1); + let z2 = y2; + let z3 = y3; - cumul0_a = vaddw_high_u32(cumul0_a, state1); // MDS[1] - cumul1_a = vaddw_high_u32(cumul1_a, state1); // MDS[0] - cumul2_a = vmlal_high_laneq_u32::(cumul2_a, state1, mdsv11); // MDS[11] - cumul3_a = vmlal_high_laneq_u32::(cumul3_a, state1, mdsv10); // MDS[10] - cumul4_a = vmlal_high_laneq_u32::(cumul4_a, state1, mdsv9); // MDS[9] - cumul5_a = vmlal_high_laneq_u32::(cumul5_a, state1, mdsv8); // MDS[8] - cumul6_a = vmlal_high_laneq_u32::(cumul6_a, state1, mdsv7); // MDS[7] - cumul7_a = vmlal_high_laneq_u32::(cumul7_a, state1, mdsv6); // MDS[6] - cumul8_a = vmlal_high_laneq_u32::(cumul8_a, state1, mdsv5); // MDS[5] - cumul9_a = vmlal_high_laneq_u32::(cumul9_a, state1, mdsv4); // MDS[4] - cumul10_a = vaddw_high_u32(cumul10_a, state1); // MDS[3] - cumul11_a = vmlal_high_laneq_u32::(cumul11_a, state1, mdsv2); // MDS[2] + // The FFT is `[z0, z2 + z3 i, z1, z2 - z3 i]`. 
- cumul0_b = vmlal_laneq_u32::(cumul0_b, state2, mdsv2); // MDS[2] - cumul1_b = vaddw_u32(cumul1_b, state2); // MDS[1] - cumul2_b = vaddw_u32(cumul2_b, state2); // MDS[0] - cumul3_b = vmlal_laneq_u32::(cumul3_b, state2, mdsv11); // MDS[11] - cumul4_b = vmlal_laneq_u32::(cumul4_b, state2, mdsv10); // MDS[10] - cumul5_b = vmlal_laneq_u32::(cumul5_b, state2, mdsv9); // MDS[9] - cumul6_b = vmlal_laneq_u32::(cumul6_b, state2, mdsv8); // MDS[8] - cumul7_b = vmlal_laneq_u32::(cumul7_b, state2, mdsv7); // MDS[7] - cumul8_b = vmlal_laneq_u32::(cumul8_b, state2, mdsv6); // MDS[6] - cumul9_b = vmlal_laneq_u32::(cumul9_b, state2, mdsv5); // MDS[5] - cumul10_b = vmlal_laneq_u32::(cumul10_b, state2, mdsv4); // MDS[4] - cumul11_b = vaddw_u32(cumul11_b, state2); // MDS[3] + state_fft[i] = z0; + state_fft[i + 3] = z1; + state_fft[i + 6] = z2; + state_fft[i + 9] = z3; + } - cumul0_a = vaddw_high_u32(cumul0_a, state3); // MDS[3] - cumul1_a = vmlal_high_laneq_u32::(cumul1_a, state3, mdsv2); // MDS[2] - cumul2_a = vaddw_high_u32(cumul2_a, state3); // MDS[1] - cumul3_a = vaddw_high_u32(cumul3_a, state3); // MDS[0] - cumul4_a = vmlal_high_laneq_u32::(cumul4_a, state3, mdsv11); // MDS[11] - cumul5_a = vmlal_high_laneq_u32::(cumul5_a, state3, mdsv10); // MDS[10] - cumul6_a = vmlal_high_laneq_u32::(cumul6_a, state3, mdsv9); // MDS[9] - cumul7_a = vmlal_high_laneq_u32::(cumul7_a, state3, mdsv8); // MDS[8] - cumul8_a = vmlal_high_laneq_u32::(cumul8_a, state3, mdsv7); // MDS[7] - cumul9_a = vmlal_high_laneq_u32::(cumul9_a, state3, mdsv6); // MDS[6] - cumul10_a = vmlal_high_laneq_u32::(cumul10_a, state3, mdsv5); // MDS[5] - cumul11_a = vmlal_high_laneq_u32::(cumul11_a, state3, mdsv4); // MDS[4] + // 3x3 real matrix-vector mul for component 0 of the FFTs. + // Multiply the vector `[x0, x1, x2]` by the matrix + // `[[ 64, 64, 128],` + // ` [128, 64, 64],` + // ` [ 64, 128, 64]]` + // The results are divided by 4 (this ends up cancelling out some later computations). 
+ { + let x0 = state_fft[0]; + let x1 = state_fft[1]; + let x2 = state_fft[2]; - cumul0_b = vmlal_laneq_u32::(cumul0_b, state4, mdsv4); // MDS[4] - cumul1_b = vaddw_u32(cumul1_b, state4); // MDS[3] - cumul2_b = vmlal_laneq_u32::(cumul2_b, state4, mdsv2); // MDS[2] - cumul3_b = vaddw_u32(cumul3_b, state4); // MDS[1] - cumul4_b = vaddw_u32(cumul4_b, state4); // MDS[0] - cumul5_b = vmlal_laneq_u32::(cumul5_b, state4, mdsv11); // MDS[11] - cumul6_b = vmlal_laneq_u32::(cumul6_b, state4, mdsv10); // MDS[10] - cumul7_b = vmlal_laneq_u32::(cumul7_b, state4, mdsv9); // MDS[9] - cumul8_b = vmlal_laneq_u32::(cumul8_b, state4, mdsv8); // MDS[8] - cumul9_b = vmlal_laneq_u32::(cumul9_b, state4, mdsv7); // MDS[7] - cumul10_b = vmlal_laneq_u32::(cumul10_b, state4, mdsv6); // MDS[6] - cumul11_b = vmlal_laneq_u32::(cumul11_b, state4, mdsv5); // MDS[5] + let t = vshlq_n_u32::<4>(x0); + let u = vaddq_u32(x1, x2); - cumul0_a = vmlal_high_laneq_u32::(cumul0_a, state5, mdsv5); // MDS[5] - cumul1_a = vmlal_high_laneq_u32::(cumul1_a, state5, mdsv4); // MDS[4] - cumul2_a = vaddw_high_u32(cumul2_a, state5); // MDS[3] - cumul3_a = vmlal_high_laneq_u32::(cumul3_a, state5, mdsv2); // MDS[2] - cumul4_a = vaddw_high_u32(cumul4_a, state5); // MDS[1] - cumul5_a = vaddw_high_u32(cumul5_a, state5); // MDS[0] - cumul6_a = vmlal_high_laneq_u32::(cumul6_a, state5, mdsv11); // MDS[11] - cumul7_a = vmlal_high_laneq_u32::(cumul7_a, state5, mdsv10); // MDS[10] - cumul8_a = vmlal_high_laneq_u32::(cumul8_a, state5, mdsv9); // MDS[9] - cumul9_a = vmlal_high_laneq_u32::(cumul9_a, state5, mdsv8); // MDS[8] - cumul10_a = vmlal_high_laneq_u32::(cumul10_a, state5, mdsv7); // MDS[7] - cumul11_a = vmlal_high_laneq_u32::(cumul11_a, state5, mdsv6); // MDS[6] + let y0 = vshlq_n_u32::<4>(u); + let y1 = vmlaq_laneq_u32::<3>(t, x2, consts); + let y2 = vmlaq_laneq_u32::<3>(t, x1, consts); - cumul0_b = vmlal_laneq_u32::(cumul0_b, state6, mdsv6); // MDS[6] - cumul1_b = vmlal_laneq_u32::(cumul1_b, state6, mdsv5); // MDS[5] - 
cumul2_b = vmlal_laneq_u32::(cumul2_b, state6, mdsv4); // MDS[4] - cumul3_b = vaddw_u32(cumul3_b, state6); // MDS[3] - cumul4_b = vmlal_laneq_u32::(cumul4_b, state6, mdsv2); // MDS[2] - cumul5_b = vaddw_u32(cumul5_b, state6); // MDS[1] - cumul6_b = vaddw_u32(cumul6_b, state6); // MDS[0] - cumul7_b = vmlal_laneq_u32::(cumul7_b, state6, mdsv11); // MDS[11] - cumul8_b = vmlal_laneq_u32::(cumul8_b, state6, mdsv10); // MDS[10] - cumul9_b = vmlal_laneq_u32::(cumul9_b, state6, mdsv9); // MDS[9] - cumul10_b = vmlal_laneq_u32::(cumul10_b, state6, mdsv8); // MDS[8] - cumul11_b = vmlal_laneq_u32::(cumul11_b, state6, mdsv7); // MDS[7] + state_fft[0] = vaddq_u32(y0, y1); + state_fft[1] = vaddq_u32(y1, y2); + state_fft[2] = vaddq_u32(y0, y2); + } - cumul0_a = vmlal_high_laneq_u32::(cumul0_a, state7, mdsv7); // MDS[7] - cumul1_a = vmlal_high_laneq_u32::(cumul1_a, state7, mdsv6); // MDS[6] - cumul2_a = vmlal_high_laneq_u32::(cumul2_a, state7, mdsv5); // MDS[5] - cumul3_a = vmlal_high_laneq_u32::(cumul3_a, state7, mdsv4); // MDS[4] - cumul4_a = vaddw_high_u32(cumul4_a, state7); // MDS[3] - cumul5_a = vmlal_high_laneq_u32::(cumul5_a, state7, mdsv2); // MDS[2] - cumul6_a = vaddw_high_u32(cumul6_a, state7); // MDS[1] - cumul7_a = vaddw_high_u32(cumul7_a, state7); // MDS[0] - cumul8_a = vmlal_high_laneq_u32::(cumul8_a, state7, mdsv11); // MDS[11] - cumul9_a = vmlal_high_laneq_u32::(cumul9_a, state7, mdsv10); // MDS[10] - cumul10_a = vmlal_high_laneq_u32::(cumul10_a, state7, mdsv9); // MDS[9] - cumul11_a = vmlal_high_laneq_u32::(cumul11_a, state7, mdsv8); // MDS[8] + // 3x3 real matrix-vector mul for component 2 of the FFTs. + // Multiply the vector `[x0, x1, x2]` by the matrix + // `[[ -4, -8, 32],` + // ` [-32, -4, -8],` + // ` [ 8, -32, -4]]` + // The results are divided by 4 (this ends up cancelling out some later computations). 
+ { + let x0 = state_fft[3]; + let x1 = state_fft[4]; + let x2 = state_fft[5]; + state_fft[3] = vmlsq_laneq_u32::<2>(vmlaq_laneq_u32::<0>(x0, x1, consts), x2, consts); + state_fft[4] = vmlaq_laneq_u32::<0>(vmlaq_laneq_u32::<2>(x1, x0, consts), x2, consts); + state_fft[5] = vmlsq_laneq_u32::<0>(x2, vmlsq_laneq_u32::<1>(x0, x1, consts), consts); + } - cumul0_b = vmlal_laneq_u32::(cumul0_b, state8, mdsv8); // MDS[8] - cumul1_b = vmlal_laneq_u32::(cumul1_b, state8, mdsv7); // MDS[7] - cumul2_b = vmlal_laneq_u32::(cumul2_b, state8, mdsv6); // MDS[6] - cumul3_b = vmlal_laneq_u32::(cumul3_b, state8, mdsv5); // MDS[5] - cumul4_b = vmlal_laneq_u32::(cumul4_b, state8, mdsv4); // MDS[4] - cumul5_b = vaddw_u32(cumul5_b, state8); // MDS[3] - cumul6_b = vmlal_laneq_u32::(cumul6_b, state8, mdsv2); // MDS[2] - cumul7_b = vaddw_u32(cumul7_b, state8); // MDS[1] - cumul8_b = vaddw_u32(cumul8_b, state8); // MDS[0] - cumul9_b = vmlal_laneq_u32::(cumul9_b, state8, mdsv11); // MDS[11] - cumul10_b = vmlal_laneq_u32::(cumul10_b, state8, mdsv10); // MDS[10] - cumul11_b = vmlal_laneq_u32::(cumul11_b, state8, mdsv9); // MDS[9] + // 3x3 complex matrix-vector mul for components 1 and 3 of the FFTs. + // Multiply the vector `[x0r + x0i i, x1r + x1i i, x2r + x2i i]` by the matrix + // `[[ 4 + 2i, 2 + 32i, 2 - 8i],` + // ` [-8 - 2i, 4 + 2i, 2 + 32i],` + // ` [32 - 2i, -8 - 2i, 4 + 2i]]` + // The results are divided by 2 (this ends up cancelling out some later computations). 
+ { + let x0r = state_fft[6]; + let x1r = state_fft[7]; + let x2r = state_fft[8]; - cumul0_a = vmlal_high_laneq_u32::(cumul0_a, state9, mdsv9); // MDS[9] - cumul1_a = vmlal_high_laneq_u32::(cumul1_a, state9, mdsv8); // MDS[8] - cumul2_a = vmlal_high_laneq_u32::(cumul2_a, state9, mdsv7); // MDS[7] - cumul3_a = vmlal_high_laneq_u32::(cumul3_a, state9, mdsv6); // MDS[6] - cumul4_a = vmlal_high_laneq_u32::(cumul4_a, state9, mdsv5); // MDS[5] - cumul5_a = vmlal_high_laneq_u32::(cumul5_a, state9, mdsv4); // MDS[4] - cumul6_a = vaddw_high_u32(cumul6_a, state9); // MDS[3] - cumul7_a = vmlal_high_laneq_u32::(cumul7_a, state9, mdsv2); // MDS[2] - cumul8_a = vaddw_high_u32(cumul8_a, state9); // MDS[1] - cumul9_a = vaddw_high_u32(cumul9_a, state9); // MDS[0] - cumul10_a = vmlal_high_laneq_u32::(cumul10_a, state9, mdsv11); // MDS[11] - cumul11_a = vmlal_high_laneq_u32::(cumul11_a, state9, mdsv10); // MDS[10] + let x0i = state_fft[9]; + let x1i = state_fft[10]; + let x2i = state_fft[11]; - cumul0_b = vmlal_laneq_u32::(cumul0_b, state10, mdsv10); // MDS[10] - cumul1_b = vmlal_laneq_u32::(cumul1_b, state10, mdsv9); // MDS[9] - cumul2_b = vmlal_laneq_u32::(cumul2_b, state10, mdsv8); // MDS[8] - cumul3_b = vmlal_laneq_u32::(cumul3_b, state10, mdsv7); // MDS[7] - cumul4_b = vmlal_laneq_u32::(cumul4_b, state10, mdsv6); // MDS[6] - cumul5_b = vmlal_laneq_u32::(cumul5_b, state10, mdsv5); // MDS[5] - cumul6_b = vmlal_laneq_u32::(cumul6_b, state10, mdsv4); // MDS[4] - cumul7_b = vaddw_u32(cumul7_b, state10); // MDS[3] - cumul8_b = vmlal_laneq_u32::(cumul8_b, state10, mdsv2); // MDS[2] - cumul9_b = vaddw_u32(cumul9_b, state10); // MDS[1] - cumul10_b = vaddw_u32(cumul10_b, state10); // MDS[0] - cumul11_b = vmlal_laneq_u32::(cumul11_b, state10, mdsv11); // MDS[11] + // real part of result <- real part of input + let r0rr = vaddq_u32(vmlaq_laneq_u32::<0>(x1r, x0r, consts), x2r); + let r1rr = vmlaq_laneq_u32::<0>(x2r, vmlsq_laneq_u32::<0>(x1r, x0r, consts), consts); + let r2rr = 
vmlsq_laneq_u32::<0>(x2r, vmlsq_laneq_u32::<1>(x1r, x0r, consts), consts); - cumul0_a = vmlal_high_laneq_u32::(cumul0_a, state11, mdsv11); // MDS[11] - cumul1_a = vmlal_high_laneq_u32::(cumul1_a, state11, mdsv10); // MDS[10] - cumul2_a = vmlal_high_laneq_u32::(cumul2_a, state11, mdsv9); // MDS[9] - cumul3_a = vmlal_high_laneq_u32::(cumul3_a, state11, mdsv8); // MDS[8] - cumul4_a = vmlal_high_laneq_u32::(cumul4_a, state11, mdsv7); // MDS[7] - cumul5_a = vmlal_high_laneq_u32::(cumul5_a, state11, mdsv6); // MDS[6] - cumul6_a = vmlal_high_laneq_u32::(cumul6_a, state11, mdsv5); // MDS[5] - cumul7_a = vmlal_high_laneq_u32::(cumul7_a, state11, mdsv4); // MDS[4] - cumul8_a = vaddw_high_u32(cumul8_a, state11); // MDS[3] - cumul9_a = vmlal_high_laneq_u32::(cumul9_a, state11, mdsv2); // MDS[2] - cumul10_a = vaddw_high_u32(cumul10_a, state11); // MDS[1] - cumul11_a = vaddw_high_u32(cumul11_a, state11); // MDS[0] + // real part of result <- imaginary part of input + let r0ri = vmlsq_laneq_u32::<1>(vmlaq_laneq_u32::<3>(x0i, x1i, consts), x2i, consts); + let r1ri = vmlsq_laneq_u32::<3>(vsubq_u32(x0i, x1i), x2i, consts); + let r2ri = vsubq_u32(vaddq_u32(x0i, x1i), x2i); - let reduced = [ - mds_reduce([[cumul0_a, cumul0_b], [cumul1_a, cumul1_b]]), - mds_reduce([[cumul2_a, cumul2_b], [cumul3_a, cumul3_b]]), - mds_reduce([[cumul4_a, cumul4_b], [cumul5_a, cumul5_b]]), - mds_reduce([[cumul6_a, cumul6_b], [cumul7_a, cumul7_b]]), - mds_reduce([[cumul8_a, cumul8_b], [cumul9_a, cumul9_b]]), - mds_reduce([[cumul10_a, cumul10_b], [cumul11_a, cumul11_b]]), - ]; - [ - vgetq_lane_u64::<0>(reduced[0]), - vgetq_lane_u64::<1>(reduced[0]), - vgetq_lane_u64::<0>(reduced[1]), - vgetq_lane_u64::<1>(reduced[1]), - vgetq_lane_u64::<0>(reduced[2]), - vgetq_lane_u64::<1>(reduced[2]), - vgetq_lane_u64::<0>(reduced[3]), - vgetq_lane_u64::<1>(reduced[3]), - vgetq_lane_u64::<0>(reduced[4]), - vgetq_lane_u64::<1>(reduced[4]), - vgetq_lane_u64::<0>(reduced[5]), - vgetq_lane_u64::<1>(reduced[5]), - ] + // real 
part of result (total) + let r0r = vsubq_u32(r0rr, r0ri); + let r1r = vaddq_u32(r1rr, r1ri); + let r2r = vmlaq_laneq_u32::<0>(r2ri, r2rr, consts); + + // imaginary part of result <- real part of input + let r0ir = vmlsq_laneq_u32::<1>(vmlaq_laneq_u32::<3>(x0r, x1r, consts), x2r, consts); + let r1ir = vmlaq_laneq_u32::<3>(vsubq_u32(x1r, x0r), x2r, consts); + let r2ir = vsubq_u32(x2r, vaddq_u32(x0r, x1r)); + + // imaginary part of result <- imaginary part of input + let r0ii = vaddq_u32(vmlaq_laneq_u32::<0>(x1i, x0i, consts), x2i); + let r1ii = vmlaq_laneq_u32::<0>(x2i, vmlsq_laneq_u32::<0>(x1i, x0i, consts), consts); + let r2ii = vmlsq_laneq_u32::<0>(x2i, vmlsq_laneq_u32::<1>(x1i, x0i, consts), consts); + + // imaginary part of result (total) + let r0i = vaddq_u32(r0ir, r0ii); + let r1i = vaddq_u32(r1ir, r1ii); + let r2i = vmlaq_laneq_u32::<0>(r2ir, r2ii, consts); + + state_fft[6] = r0r; + state_fft[7] = r1r; + state_fft[8] = r2r; + + state_fft[9] = r0i; + state_fft[10] = r1i; + state_fft[11] = r2i; + } + + // Three length-4 inverse FFTs. + // Normally, such IFFT would divide by 4, but we've already taken care of that. + for i in 0..3 { + let z0 = state_fft[i]; + let z1 = state_fft[i + 3]; + let z2 = state_fft[i + 6]; + let z3 = state_fft[i + 9]; + + let y0 = vsubq_u32(z0, z1); + let y1 = vaddq_u32(z0, z1); + let y2 = z2; + let y3 = z3; + + let x0 = vaddq_u32(y0, y2); + let x1 = vaddq_u32(y1, y3); + let x2 = vsubq_u32(y0, y2); + let x3 = vsubq_u32(y1, y3); + + state_fft[i] = x0; + state_fft[i + 3] = x1; + state_fft[i + 6] = x2; + state_fft[i + 9] = x3; + } + + // Perform `res[0] += state[0] * 8` for the diagonal component of the MDS matrix. + state_fft[0] = vmlal_laneq_u16::<4>( + state_fft[0], + vcreate_u16(state[0]), // Each 16-bit chunk gets zero-extended. + vreinterpretq_u16_u32(consts), // Hack: these constants fit in `u16s`, so we can bit-cast. 
+ ); + + let mut res_arr = [0; 12]; + for i in 0..6 { + let res = mds_reduce([state_fft[2 * i], state_fft[2 * i + 1]]); + res_arr[2 * i] = vgetq_lane_u64::<0>(res); + res_arr[2 * i + 1] = vgetq_lane_u64::<1>(res); + } + + res_arr } // ======================================== PARTIAL ROUNDS ========================================= +/* #[rustfmt::skip] macro_rules! mds_reduce_asm { ($c0:literal, $c1:literal, $out:literal, $consts:literal) => { @@ -961,13 +864,15 @@ unsafe fn partial_round( [res23, res45, res67, res89, res1011], ) } +*/ // ========================================== GLUE CODE =========================================== +/* #[inline(always)] unsafe fn full_round(state: [u64; 12], round_constants: &[u64; WIDTH]) -> [u64; 12] { let state = sbox_layer_full(state); - mds_const_layers_full(state, round_constants) + mds_layer_full(state, round_constants) } #[inline] @@ -1001,43 +906,19 @@ unsafe fn partial_rounds( } state.0 } +*/ #[inline(always)] fn unwrap_state(state: [GoldilocksField; 12]) -> [u64; 12] { - [ - state[0].0, - state[1].0, - state[2].0, - state[3].0, - state[4].0, - state[5].0, - state[6].0, - state[7].0, - state[8].0, - state[9].0, - state[10].0, - state[11].0, - ] + state.map(|s| s.0) } #[inline(always)] fn wrap_state(state: [u64; 12]) -> [GoldilocksField; 12] { - [ - GoldilocksField(state[0]), - GoldilocksField(state[1]), - GoldilocksField(state[2]), - GoldilocksField(state[3]), - GoldilocksField(state[4]), - GoldilocksField(state[5]), - GoldilocksField(state[6]), - GoldilocksField(state[7]), - GoldilocksField(state[8]), - GoldilocksField(state[9]), - GoldilocksField(state[10]), - GoldilocksField(state[11]), - ] + state.map(GoldilocksField) } +/* #[inline(always)] pub unsafe fn poseidon(state: [GoldilocksField; 12]) -> [GoldilocksField; 12] { let state = unwrap_state(state); @@ -1058,6 +939,7 @@ pub unsafe fn poseidon(state: [GoldilocksField; 12]) -> [GoldilocksField; 12] { let state = full_rounds(state, &FINAL_ROUND_CONSTANTS); 
wrap_state(state) } +*/ #[inline(always)] pub unsafe fn sbox_layer(state: &mut [GoldilocksField; WIDTH]) { @@ -1067,8 +949,6 @@ pub unsafe fn sbox_layer(state: &mut [GoldilocksField; WIDTH]) { #[inline(always)] pub unsafe fn mds_layer(state: &[GoldilocksField; WIDTH]) -> [GoldilocksField; WIDTH] { let state = unwrap_state(*state); - // We want to do an MDS layer without the constant layer. - let round_consts = [0u64; WIDTH]; - let state = mds_const_layers_full(state, &round_consts); + let state = mds_layer_full(state); wrap_state(state) } diff --git a/plonky2/src/hash/poseidon_goldilocks.rs b/plonky2/src/hash/poseidon_goldilocks.rs index 971fda0f..177b30ff 100644 --- a/plonky2/src/hash/poseidon_goldilocks.rs +++ b/plonky2/src/hash/poseidon_goldilocks.rs @@ -252,21 +252,21 @@ impl Poseidon for GoldilocksField { // } // } - // #[cfg(all(target_arch="aarch64", target_feature="neon"))] - // #[inline(always)] - // fn sbox_layer(state: &mut [Self; 12]) { - // unsafe { - // crate::hash::arch::aarch64::poseidon_goldilocks_neon::sbox_layer(state); - // } - // } + #[cfg(all(target_arch="aarch64", target_feature="neon"))] + #[inline(always)] + fn sbox_layer(state: &mut [Self; 12]) { + unsafe { + crate::hash::arch::aarch64::poseidon_goldilocks_neon::sbox_layer(state); + } + } - // #[cfg(all(target_arch="aarch64", target_feature="neon"))] - // #[inline(always)] - // fn mds_layer(state: &[Self; 12]) -> [Self; 12] { - // unsafe { - // crate::hash::arch::aarch64::poseidon_goldilocks_neon::mds_layer(state) - // } - // } + #[cfg(all(target_arch="aarch64", target_feature="neon"))] + #[inline(always)] + fn mds_layer(state: &[Self; 12]) -> [Self; 12] { + unsafe { + crate::hash::arch::aarch64::poseidon_goldilocks_neon::mds_layer(state) + } + } } #[cfg(test)] From 56e269e27abaa50f33493b06cc105240e8cdd334 Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Tue, 22 Feb 2022 10:37:08 +0100 Subject: [PATCH 15/32] Working (not recursively) --- starky/src/fibonacci_stark.rs | 3 --- 
starky/src/lib.rs | 1 + starky/src/permutation.rs | 24 ++++++++++++++---------- starky/src/prover.rs | 4 ++-- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/starky/src/fibonacci_stark.rs b/starky/src/fibonacci_stark.rs index 2bbd333f..6ffbe858 100644 --- a/starky/src/fibonacci_stark.rs +++ b/starky/src/fibonacci_stark.rs @@ -2,12 +2,9 @@ use std::marker::PhantomData; use plonky2::field::extension_field::{Extendable, FieldExtension}; use plonky2::field::packed_field::PackedField; -use plonky2::fri::structure::{FriInstanceInfo, FriInstanceInfoTarget}; use plonky2::hash::hash_types::RichField; -use plonky2::iop::ext_target::ExtensionTarget; use plonky2::plonk::circuit_builder::CircuitBuilder; -use crate::config::StarkConfig; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::permutation::PermutationPair; use crate::stark::Stark; diff --git a/starky/src/lib.rs b/starky/src/lib.rs index 51a73479..8249d90b 100644 --- a/starky/src/lib.rs +++ b/starky/src/lib.rs @@ -3,6 +3,7 @@ #![allow(unused_variables)] #![allow(incomplete_features)] #![allow(clippy::too_many_arguments)] +#![allow(clippy::type_complexity)] #![feature(generic_const_exprs)] pub mod config; diff --git a/starky/src/permutation.rs b/starky/src/permutation.rs index 8a33eb41..75fa8400 100644 --- a/starky/src/permutation.rs +++ b/starky/src/permutation.rs @@ -62,7 +62,7 @@ where let permutation_pairs = stark.permutation_pairs(); let permutation_batches = get_permutation_batches( &permutation_pairs, - &permutation_challenge_sets, + permutation_challenge_sets, config.num_challenges, stark.permutation_batch_size(), ); @@ -234,15 +234,19 @@ pub(crate) fn eval_permutation_checks Date: Tue, 22 Feb 2022 10:46:51 +0100 Subject: [PATCH 16/32] Fill permutation todos --- starky/src/recursive_verifier.rs | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/starky/src/recursive_verifier.rs 
b/starky/src/recursive_verifier.rs index ea7ffb70..c1071b5a 100644 --- a/starky/src/recursive_verifier.rs +++ b/starky/src/recursive_verifier.rs @@ -187,24 +187,29 @@ pub fn add_virtual_stark_proof, S: Stark, con let fri_params = config.fri_params(degree_bits); let cap_height = fri_params.config.cap_height; - let num_leaves_per_oracle = &[ - S::COLUMNS, - // TODO: permutation polys - stark.quotient_degree_factor() * config.num_challenges, - ]; - - let permutation_zs_cap = if stark.uses_permutation_args() { - Some(builder.add_virtual_cap(cap_height)) + let num_leaves_per_oracle = if stark.uses_permutation_args() { + vec![ + S::COLUMNS, + stark.num_permutation_batches(config), + stark.quotient_degree_factor() * config.num_challenges, + ] } else { - None + vec![ + S::COLUMNS, + stark.quotient_degree_factor() * config.num_challenges, + ] }; + let permutation_zs_cap = stark + .uses_permutation_args() + .then(|| builder.add_virtual_cap(cap_height)); + StarkProofTarget { trace_cap: builder.add_virtual_cap(cap_height), permutation_zs_cap, quotient_polys_cap: builder.add_virtual_cap(cap_height), openings: add_stark_opening_set::(builder, stark, config), - opening_proof: builder.add_virtual_fri_proof(num_leaves_per_oracle, &fri_params), + opening_proof: builder.add_virtual_fri_proof(&num_leaves_per_oracle, &fri_params), } } @@ -217,8 +222,10 @@ fn add_stark_opening_set, S: Stark, const D: StarkOpeningSetTarget { local_values: builder.add_virtual_extension_targets(S::COLUMNS), next_values: builder.add_virtual_extension_targets(S::COLUMNS), - permutation_zs: vec![/*TODO*/], - permutation_zs_right: vec![/*TODO*/], + permutation_zs: builder + .add_virtual_extension_targets(stark.num_permutation_batches(config)), + permutation_zs_right: builder + .add_virtual_extension_targets(stark.num_permutation_batches(config)), quotient_polys: builder .add_virtual_extension_targets(stark.quotient_degree_factor() * num_challenges), } From 6cd2fc62b57714357b6e6dd21ed085dbb2f95b46 Mon Sep 17 
00:00:00 2001 From: wborgeaud Date: Tue, 22 Feb 2022 11:44:24 +0100 Subject: [PATCH 17/32] Should work (does not) --- plonky2/src/iop/challenger.rs | 2 +- starky/src/fibonacci_stark.rs | 2 +- starky/src/get_challenges.rs | 42 ++++++-- starky/src/permutation.rs | 160 +++++++++++++++++++++++++++---- starky/src/proof.rs | 29 +++--- starky/src/recursive_verifier.rs | 37 +++++-- starky/src/vanishing_poly.rs | 48 +++++++--- starky/src/verifier.rs | 1 - 8 files changed, 260 insertions(+), 61 deletions(-) diff --git a/plonky2/src/iop/challenger.rs b/plonky2/src/iop/challenger.rs index c3a4403a..5b374834 100644 --- a/plonky2/src/iop/challenger.rs +++ b/plonky2/src/iop/challenger.rs @@ -208,7 +208,7 @@ impl, H: AlgebraicHasher, const D: usize> } } - pub(crate) fn get_challenge(&mut self, builder: &mut CircuitBuilder) -> Target { + pub fn get_challenge(&mut self, builder: &mut CircuitBuilder) -> Target { self.absorb_buffered_inputs(builder); if self.output_buffer.is_empty() { diff --git a/starky/src/fibonacci_stark.rs b/starky/src/fibonacci_stark.rs index 6ffbe858..10b54d69 100644 --- a/starky/src/fibonacci_stark.rs +++ b/starky/src/fibonacci_stark.rs @@ -234,7 +234,7 @@ mod tests { let pt = add_virtual_stark_proof_with_pis(&mut builder, stark, inner_config, degree_bits); set_stark_proof_with_pis_target(&mut pw, &pt, &inner_proof); - recursively_verify_stark_proof::(&mut builder, stark, pt, inner_config); + recursively_verify_stark_proof::(&mut builder, stark, pt, inner_config)?; if print_gate_counts { builder.print_gate_counts(0); diff --git a/starky/src/get_challenges.rs b/starky/src/get_challenges.rs index 1cb1e633..8ee71667 100644 --- a/starky/src/get_challenges.rs +++ b/starky/src/get_challenges.rs @@ -11,7 +11,9 @@ use plonky2::plonk::circuit_builder::CircuitBuilder; use plonky2::plonk::config::{AlgebraicHasher, GenericConfig}; use crate::config::StarkConfig; -use crate::permutation::get_n_permutation_challenge_sets; +use crate::permutation::{ + 
get_n_permutation_challenge_sets, get_n_permutation_challenge_sets_target, +}; use crate::proof::*; use crate::stark::Stark; @@ -131,9 +133,11 @@ where pub(crate) fn get_challenges_target< F: RichField + Extendable, C: GenericConfig, + S: Stark, const D: usize, >( builder: &mut CircuitBuilder, + stark: &S, trace_cap: &MerkleCapTarget, permutation_zs_cap: Option<&MerkleCapTarget>, quotient_polys_cap: &MerkleCapTarget, @@ -142,7 +146,7 @@ pub(crate) fn get_challenges_target< final_poly: &PolynomialCoeffsExtTarget, pow_witness: Target, config: &StarkConfig, -) -> StarkProofChallengesTarget +) -> Result> where C::Hasher: AlgebraicHasher, { @@ -151,6 +155,23 @@ where let mut challenger = RecursiveChallenger::::new(builder); challenger.observe_cap(trace_cap); + + let permutation_challenge_sets = if stark.uses_permutation_args() { + get_n_permutation_challenge_sets_target( + builder, + &mut challenger, + num_challenges, + stark.permutation_batch_size(), + ) + } else { + vec![] + }; + if stark.uses_permutation_args() { + let cap = + permutation_zs_cap.ok_or_else(|| anyhow::Error::msg("expected permutation_zs_cap")); + challenger.observe_cap(cap?); + } + let stark_alphas = challenger.get_n_challenges(builder, num_challenges); challenger.observe_cap(quotient_polys_cap); @@ -158,7 +179,8 @@ where challenger.observe_openings(&openings.to_fri_openings()); - StarkProofChallengesTarget { + Ok(StarkProofChallengesTarget { + permutation_challenge_sets, stark_alphas, stark_zeta, fri_challenges: challenger.fri_challenges::( @@ -168,15 +190,20 @@ where pow_witness, &config.fri_config, ), - } + }) } impl StarkProofWithPublicInputsTarget { - pub(crate) fn get_challenges, C: GenericConfig>( + pub(crate) fn get_challenges< + F: RichField + Extendable, + C: GenericConfig, + S: Stark, + >( &self, builder: &mut CircuitBuilder, + stark: &S, config: &StarkConfig, - ) -> StarkProofChallengesTarget + ) -> Result> where C::Hasher: AlgebraicHasher, { @@ -194,8 +221,9 @@ impl 
StarkProofWithPublicInputsTarget { }, } = &self.proof; - get_challenges_target::( + get_challenges_target::( builder, + stark, trace_cap, permutation_zs_cap.as_ref(), quotient_polys_cap, diff --git a/starky/src/permutation.rs b/starky/src/permutation.rs index 75fa8400..1113094d 100644 --- a/starky/src/permutation.rs +++ b/starky/src/permutation.rs @@ -6,14 +6,17 @@ use plonky2::field::extension_field::{Extendable, FieldExtension}; use plonky2::field::field_types::Field; use plonky2::field::polynomial::PolynomialValues; use plonky2::hash::hash_types::RichField; -use plonky2::iop::challenger::Challenger; -use plonky2::plonk::config::{GenericConfig, Hasher}; +use plonky2::iop::challenger::{Challenger, RecursiveChallenger}; +use plonky2::iop::ext_target::ExtensionTarget; +use plonky2::iop::target::Target; +use plonky2::plonk::circuit_builder::CircuitBuilder; +use plonky2::plonk::config::{AlgebraicHasher, GenericConfig, Hasher}; use rayon::prelude::*; use crate::config::StarkConfig; -use crate::constraint_consumer::ConstraintConsumer; +use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::stark::Stark; -use crate::vars::StarkEvaluationVars; +use crate::vars::{StarkEvaluationTargets, StarkEvaluationVars}; /// A pair of lists of columns, `lhs` and `rhs`, that should be permutations of one another. /// In particular, there should exist some permutation `pi` such that for any `i`, @@ -26,24 +29,24 @@ pub struct PermutationPair { } /// A single instance of a permutation check protocol. -pub(crate) struct PermutationInstance<'a, F: Field> { +pub(crate) struct PermutationInstance<'a, T: Copy> { pub(crate) pair: &'a PermutationPair, - pub(crate) challenge: PermutationChallenge, + pub(crate) challenge: PermutationChallenge, } /// Randomness for a single instance of a permutation check protocol. 
#[derive(Copy, Clone)] -pub(crate) struct PermutationChallenge { +pub(crate) struct PermutationChallenge { /// Randomness used to combine multiple columns into one. - pub(crate) beta: F, + pub(crate) beta: T, /// Random offset that's added to the beta-reduced column values. - pub(crate) gamma: F, + pub(crate) gamma: T, } /// Like `PermutationChallenge`, but with `num_challenges` copies to boost soundness. #[derive(Clone)] -pub(crate) struct PermutationChallengeSet { - pub(crate) challenges: Vec>, +pub(crate) struct PermutationChallengeSet { + pub(crate) challenges: Vec>, } /// Compute all Z polynomials (for permutation arguments). @@ -163,17 +166,60 @@ pub(crate) fn get_n_permutation_challenge_sets>( .collect() } +fn get_permutation_challenge_target< + F: RichField + Extendable, + H: AlgebraicHasher, + const D: usize, +>( + builder: &mut CircuitBuilder, + challenger: &mut RecursiveChallenger, +) -> PermutationChallenge { + let beta = challenger.get_challenge(builder); + let gamma = challenger.get_challenge(builder); + PermutationChallenge { beta, gamma } +} + +fn get_permutation_challenge_set_target< + F: RichField + Extendable, + H: AlgebraicHasher, + const D: usize, +>( + builder: &mut CircuitBuilder, + challenger: &mut RecursiveChallenger, + num_challenges: usize, +) -> PermutationChallengeSet { + let challenges = (0..num_challenges) + .map(|_| get_permutation_challenge_target(builder, challenger)) + .collect(); + PermutationChallengeSet { challenges } +} + +pub(crate) fn get_n_permutation_challenge_sets_target< + F: RichField + Extendable, + H: AlgebraicHasher, + const D: usize, +>( + builder: &mut CircuitBuilder, + challenger: &mut RecursiveChallenger, + num_challenges: usize, + num_sets: usize, +) -> Vec> { + (0..num_sets) + .map(|_| get_permutation_challenge_set_target(builder, challenger, num_challenges)) + .collect() +} + /// Get a list of instances of our batch-permutation argument. 
These are permutation arguments /// where the same `Z(x)` polynomial is used to check more than one permutation. /// Before batching, each permutation pair leads to `num_challenges` permutation arguments, so we /// start with the cartesian product of `permutation_pairs` and `0..num_challenges`. Then we /// chunk these arguments based on our batch size. -pub(crate) fn get_permutation_batches<'a, F: Field>( +pub(crate) fn get_permutation_batches<'a, T: Copy>( permutation_pairs: &'a [PermutationPair], - permutation_challenge_sets: &[PermutationChallengeSet], + permutation_challenge_sets: &[PermutationChallengeSet], num_challenges: usize, batch_size: usize, -) -> Vec>> { +) -> Vec>> { permutation_pairs .iter() .cartesian_product(0..num_challenges) @@ -202,10 +248,8 @@ pub(crate) fn eval_permutation_checks, - local_zs: &[FE], - next_zs: &[FE], + permutation_data: PermutationCheckData, consumer: &mut ConstraintConsumer, - permutation_challenge_sets: &[PermutationChallengeSet], ) where F: RichField + Extendable, FE: FieldExtension, @@ -214,12 +258,17 @@ pub(crate) fn eval_permutation_checks { + pub(crate) local_zs: Vec>, + pub(crate) next_zs: Vec>, + pub(crate) permutation_challenge_sets: Vec>, +} + +pub(crate) fn eval_permutation_checks_recursively( + builder: &mut CircuitBuilder, + stark: &S, + config: &StarkConfig, + vars: StarkEvaluationTargets, + permutation_data: PermutationCheckDataTarget, + consumer: &mut RecursiveConstraintConsumer, +) where + F: RichField + Extendable, + S: Stark, + [(); S::COLUMNS]:, + [(); S::PUBLIC_INPUTS]:, +{ + let PermutationCheckDataTarget { + local_zs, + next_zs, + permutation_challenge_sets, + } = permutation_data; + // TODO: Z_1 check. + let permutation_pairs = stark.permutation_pairs(); + + let permutation_batches = get_permutation_batches( + &permutation_pairs, + &permutation_challenge_sets, + config.num_challenges, + stark.permutation_batch_size(), + ); + + // Each zs value corresponds to a permutation batch. 
+ for (i, instances) in permutation_batches.iter().enumerate() { + // Z(gx) * down = Z x * up + let (reduced_lhs, reduced_rhs): (Vec>, Vec>) = + instances + .iter() + .map(|instance| { + let PermutationInstance { + pair: PermutationPair { column_pairs }, + challenge: PermutationChallenge { beta, gamma }, + } = instance; + let zero = builder.zero_extension(); + let beta_ext = builder.convert_to_ext(*beta); + let gamma_ext = builder.convert_to_ext(*gamma); + let mut reduced = + column_pairs + .iter() + .rev() + .fold((zero, zero), |(lhs, rhs), &(i, j)| { + ( + builder.mul_add_extension(lhs, beta_ext, vars.local_values[i]), + builder.mul_add_extension(rhs, beta_ext, vars.local_values[j]), + ) + }); + reduced.0 = builder.add_extension(reduced.0, gamma_ext); + reduced.1 = builder.add_extension(reduced.1, gamma_ext); + reduced + }) + .unzip(); + let reduced_lhs_product = builder.mul_many_extension(&reduced_lhs); + let reduced_rhs_product = builder.mul_many_extension(&reduced_rhs); + // constraint = next_zs[i] * reduced_rhs_product - local_zs[i] * reduced_lhs_product + let constraint = { + let tmp = builder.mul_extension(local_zs[i], reduced_lhs_product); + builder.mul_sub_extension(next_zs[i], reduced_rhs_product, tmp) + }; + consumer.constraint(builder, constraint) + } +} diff --git a/starky/src/proof.rs b/starky/src/proof.rs index 4807b443..d1f86d7e 100644 --- a/starky/src/proof.rs +++ b/starky/src/proof.rs @@ -113,6 +113,7 @@ pub(crate) struct StarkProofChallenges, const D: us } pub(crate) struct StarkProofChallengesTarget { + pub permutation_challenge_sets: Vec>, pub stark_alphas: Vec, pub stark_zeta: ExtensionTarget, pub fri_challenges: FriChallengesTarget, @@ -179,27 +180,29 @@ impl, const D: usize> StarkOpeningSet { pub struct StarkOpeningSetTarget { pub local_values: Vec>, pub next_values: Vec>, - pub permutation_zs: Vec>, - pub permutation_zs_right: Vec>, + pub permutation_zs: Option>>, + pub permutation_zs_right: Option>>, pub quotient_polys: Vec>, } impl 
StarkOpeningSetTarget { pub(crate) fn to_fri_openings(&self) -> FriOpeningsTarget { let zeta_batch = FriOpeningBatchTarget { - values: [ - self.local_values.as_slice(), - self.quotient_polys.as_slice(), - self.permutation_zs.as_slice(), - ] - .concat(), + values: self + .local_values + .iter() + .chain(self.permutation_zs.iter().flatten()) + .chain(&self.quotient_polys) + .copied() + .collect_vec(), }; let zeta_right_batch = FriOpeningBatchTarget { - values: [ - self.next_values.as_slice(), - self.permutation_zs_right.as_slice(), - ] - .concat(), + values: self + .next_values + .iter() + .chain(self.permutation_zs_right.iter().flatten()) + .copied() + .collect_vec(), }; FriOpeningsTarget { batches: vec![zeta_batch, zeta_right_batch], diff --git a/starky/src/recursive_verifier.rs b/starky/src/recursive_verifier.rs index c1071b5a..cc547396 100644 --- a/starky/src/recursive_verifier.rs +++ b/starky/src/recursive_verifier.rs @@ -1,5 +1,6 @@ use std::iter::once; +use anyhow::Result; use itertools::Itertools; use plonky2::field::extension_field::Extendable; use plonky2::field::field_types::Field; @@ -13,11 +14,13 @@ use plonky2::util::reducing::ReducingFactorTarget; use crate::config::StarkConfig; use crate::constraint_consumer::RecursiveConstraintConsumer; +use crate::permutation::PermutationCheckDataTarget; use crate::proof::{ StarkOpeningSetTarget, StarkProof, StarkProofChallengesTarget, StarkProofTarget, StarkProofWithPublicInputs, StarkProofWithPublicInputsTarget, }; use crate::stark::Stark; +use crate::vanishing_poly::eval_vanishing_poly_recursively; use crate::vars::StarkEvaluationTargets; pub fn recursively_verify_stark_proof< @@ -30,14 +33,15 @@ pub fn recursively_verify_stark_proof< stark: S, proof_with_pis: StarkProofWithPublicInputsTarget, inner_config: &StarkConfig, -) where +) -> Result<()> +where C::Hasher: AlgebraicHasher, [(); S::COLUMNS]:, [(); S::PUBLIC_INPUTS]:, { assert_eq!(proof_with_pis.public_inputs.len(), S::PUBLIC_INPUTS); let degree_bits = 
proof_with_pis.proof.recover_degree_bits(inner_config); - let challenges = proof_with_pis.get_challenges::(builder, inner_config); + let challenges = proof_with_pis.get_challenges::(builder, &stark, inner_config)?; recursively_verify_stark_proof_with_challenges::( builder, @@ -47,6 +51,8 @@ pub fn recursively_verify_stark_proof< inner_config, degree_bits, ); + + Ok(()) } /// Recursively verifies an inner proof. @@ -104,8 +110,21 @@ fn recursively_verify_stark_proof_with_challenges< l_1, l_last, ); - stark.eval_ext_recursively(builder, vars, &mut consumer); - // TODO: Add in constraints for permutation arguments. + let permutation_data = stark + .uses_permutation_args() + .then(|| PermutationCheckDataTarget { + local_zs: permutation_zs.as_ref().unwrap().clone(), + next_zs: permutation_zs_right.as_ref().unwrap().clone(), + permutation_challenge_sets: challenges.permutation_challenge_sets, + }); + eval_vanishing_poly_recursively::( + builder, + &stark, + inner_config, + vars, + permutation_data, + &mut consumer, + ); let vanishing_polys_zeta = consumer.accumulators(); // Check each polynomial identity, of the form `vanishing(x) = Z_H(x) quotient(x)`, at zeta. 
@@ -222,10 +241,12 @@ fn add_stark_opening_set, S: Stark, const D: StarkOpeningSetTarget { local_values: builder.add_virtual_extension_targets(S::COLUMNS), next_values: builder.add_virtual_extension_targets(S::COLUMNS), - permutation_zs: builder - .add_virtual_extension_targets(stark.num_permutation_batches(config)), - permutation_zs_right: builder - .add_virtual_extension_targets(stark.num_permutation_batches(config)), + permutation_zs: stark + .uses_permutation_args() + .then(|| builder.add_virtual_extension_targets(stark.num_permutation_batches(config))), + permutation_zs_right: stark + .uses_permutation_args() + .then(|| builder.add_virtual_extension_targets(stark.num_permutation_batches(config))), quotient_polys: builder .add_virtual_extension_targets(stark.quotient_degree_factor() * num_challenges), } diff --git a/starky/src/vanishing_poly.rs b/starky/src/vanishing_poly.rs index dc598167..55ea7a5a 100644 --- a/starky/src/vanishing_poly.rs +++ b/starky/src/vanishing_poly.rs @@ -1,12 +1,16 @@ use plonky2::field::extension_field::{Extendable, FieldExtension}; use plonky2::hash::hash_types::RichField; +use plonky2::plonk::circuit_builder::CircuitBuilder; use plonky2::plonk::config::GenericConfig; use crate::config::StarkConfig; -use crate::constraint_consumer::ConstraintConsumer; -use crate::permutation::{eval_permutation_checks, PermutationCheckData}; +use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; +use crate::permutation::{ + eval_permutation_checks, eval_permutation_checks_recursively, PermutationCheckData, + PermutationCheckDataTarget, +}; use crate::stark::Stark; -use crate::vars::StarkEvaluationVars; +use crate::vars::{StarkEvaluationTargets, StarkEvaluationVars}; pub(crate) fn eval_vanishing_poly( stark: &S, @@ -23,20 +27,40 @@ pub(crate) fn eval_vanishing_poly( [(); S::PUBLIC_INPUTS]:, { stark.eval_packed_generic(vars, consumer); - if let Some(PermutationCheckData { - local_zs, - next_zs, - permutation_challenge_sets, - 
}) = permutation_data - { + if let Some(permutation_data) = permutation_data { eval_permutation_checks::( stark, config, vars, - &local_zs, - &next_zs, + permutation_data, + consumer, + ); + } +} + +pub(crate) fn eval_vanishing_poly_recursively( + builder: &mut CircuitBuilder, + stark: &S, + config: &StarkConfig, + vars: StarkEvaluationTargets, + permutation_data: Option>, + consumer: &mut RecursiveConstraintConsumer, +) where + F: RichField + Extendable, + C: GenericConfig, + S: Stark, + [(); S::COLUMNS]:, + [(); S::PUBLIC_INPUTS]:, +{ + stark.eval_ext_recursively(builder, vars, consumer); + if let Some(permutation_data) = permutation_data { + eval_permutation_checks_recursively::( + builder, + stark, + config, + vars, + permutation_data, consumer, - &permutation_challenge_sets, ); } } diff --git a/starky/src/verifier.rs b/starky/src/verifier.rs index 1603b208..6bb1ac4e 100644 --- a/starky/src/verifier.rs +++ b/starky/src/verifier.rs @@ -90,7 +90,6 @@ where l_1, l_last, ); - // stark.eval_ext(vars, &mut consumer); let permutation_data = stark.uses_permutation_args().then(|| PermutationCheckData { local_zs: permutation_zs.as_ref().unwrap().clone(), next_zs: permutation_zs_right.as_ref().unwrap().clone(), From 064b3c07a829deefadb5e71a974971da845e5ab8 Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Tue, 22 Feb 2022 16:18:41 +0100 Subject: [PATCH 18/32] Forgot to set permutation cap --- starky/src/prover.rs | 2 -- starky/src/recursive_verifier.rs | 6 ++++++ starky/src/stark.rs | 1 - starky/src/verifier.rs | 1 - 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/starky/src/prover.rs b/starky/src/prover.rs index 5cfcf6ea..ac6689c2 100644 --- a/starky/src/prover.rs +++ b/starky/src/prover.rs @@ -294,8 +294,6 @@ where permutation_check_data, &mut consumer, ); - // stark.eval_packed_base(vars, &mut consumer); - // TODO: Add in constraints for permutation arguments. // TODO: Fix this once we use a genuine `PackedField`. 
let mut constraints_evals = consumer.accumulators(); // We divide the constraints evaluations by `Z_H(x)`. diff --git a/starky/src/recursive_verifier.rs b/starky/src/recursive_verifier.rs index cc547396..608d6dc1 100644 --- a/starky/src/recursive_verifier.rs +++ b/starky/src/recursive_verifier.rs @@ -295,5 +295,11 @@ pub fn set_stark_proof_target, W, const D: usize>( &proof.openings.to_fri_openings(), ); + if let (Some(permutation_zs_cap_target), Some(permutation_zs_cap)) = + (&proof_target.permutation_zs_cap, &proof.permutation_zs_cap) + { + witness.set_cap_target(&permutation_zs_cap_target, &permutation_zs_cap); + } + set_fri_proof_target(witness, &proof_target.opening_proof, &proof.opening_proof); } diff --git a/starky/src/stark.rs b/starky/src/stark.rs index a2a2f7fd..72614574 100644 --- a/starky/src/stark.rs +++ b/starky/src/stark.rs @@ -16,7 +16,6 @@ use crate::vars::StarkEvaluationTargets; use crate::vars::StarkEvaluationVars; /// Represents a STARK system. -// TODO: Add a `constraint_degree` fn that returns the maximum constraint degree. pub trait Stark, const D: usize>: Sync { /// The total number of columns in the trace. const COLUMNS: usize; diff --git a/starky/src/verifier.rs b/starky/src/verifier.rs index 6bb1ac4e..44d3f9c7 100644 --- a/starky/src/verifier.rs +++ b/starky/src/verifier.rs @@ -102,7 +102,6 @@ where permutation_data, &mut consumer, ); - // TODO: Add in constraints for permutation arguments. let vanishing_polys_zeta = consumer.accumulators(); // Check each polynomial identity, of the form `vanishing(x) = Z_H(x) quotient(x)`, at zeta. 
From 4ea418a4865c7cb70b3bfdc17eab233e292a19f1 Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Tue, 22 Feb 2022 16:35:20 +0100 Subject: [PATCH 19/32] Clippy --- starky/src/recursive_verifier.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/starky/src/recursive_verifier.rs b/starky/src/recursive_verifier.rs index 608d6dc1..de01d39e 100644 --- a/starky/src/recursive_verifier.rs +++ b/starky/src/recursive_verifier.rs @@ -298,7 +298,7 @@ pub fn set_stark_proof_target, W, const D: usize>( if let (Some(permutation_zs_cap_target), Some(permutation_zs_cap)) = (&proof_target.permutation_zs_cap, &proof.permutation_zs_cap) { - witness.set_cap_target(&permutation_zs_cap_target, &permutation_zs_cap); + witness.set_cap_target(permutation_zs_cap_target, permutation_zs_cap); } set_fri_proof_target(witness, &proof_target.opening_proof, &proof.opening_proof); From 150d76444081563e454e4edd554a5128c1b13271 Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Tue, 22 Feb 2022 17:00:08 +0100 Subject: [PATCH 20/32] Simplification --- starky/src/fibonacci_stark.rs | 6 ++-- starky/src/get_challenges.rs | 58 +++++++++++++------------------- starky/src/proof.rs | 4 +-- starky/src/recursive_verifier.rs | 10 ++---- starky/src/verifier.rs | 4 +-- 5 files changed, 33 insertions(+), 49 deletions(-) diff --git a/starky/src/fibonacci_stark.rs b/starky/src/fibonacci_stark.rs index 10b54d69..7961ad50 100644 --- a/starky/src/fibonacci_stark.rs +++ b/starky/src/fibonacci_stark.rs @@ -13,7 +13,7 @@ use crate::vars::{StarkEvaluationTargets, StarkEvaluationVars}; /// Toy STARK system used for testing. /// Computes a Fibonacci sequence with state `[x0, x1, i, j]` using the state transition /// `x0' <- x1, x1' <- x0 + x1, i' <- i+1, j' <- j+1`. -/// Note: The `i, j` columns are used to test the permutation argument. +/// Note: The `i, j` columns are only used to test the permutation argument. 
#[derive(Copy, Clone)] struct FibonacciStark, const D: usize> { num_rows: usize, @@ -48,7 +48,7 @@ impl, const D: usize> FibonacciStark { Some(tmp) }) .collect::>(); - trace[self.num_rows - 1][3] = F::ZERO; + trace[self.num_rows - 1][3] = F::ZERO; // So that column 2 and 3 are permutation of one another. trace } } @@ -234,7 +234,7 @@ mod tests { let pt = add_virtual_stark_proof_with_pis(&mut builder, stark, inner_config, degree_bits); set_stark_proof_with_pis_target(&mut pw, &pt, &inner_proof); - recursively_verify_stark_proof::(&mut builder, stark, pt, inner_config)?; + recursively_verify_stark_proof::(&mut builder, stark, pt, inner_config); if print_gate_counts { builder.print_gate_counts(0); diff --git a/starky/src/get_challenges.rs b/starky/src/get_challenges.rs index 8ee71667..0f4aacee 100644 --- a/starky/src/get_challenges.rs +++ b/starky/src/get_challenges.rs @@ -1,4 +1,3 @@ -use anyhow::Result; use plonky2::field::extension_field::Extendable; use plonky2::field::polynomial::PolynomialCoeffs; use plonky2::fri::proof::{FriProof, FriProofTarget}; @@ -28,7 +27,7 @@ fn get_challenges( pow_witness: F, config: &StarkConfig, degree_bits: usize, -) -> Result> +) -> StarkProofChallenges where F: RichField + Extendable, C: GenericConfig, @@ -40,20 +39,15 @@ where challenger.observe_cap(trace_cap); - let permutation_challenge_sets = if stark.uses_permutation_args() { - get_n_permutation_challenge_sets( + let permutation_challenge_sets = permutation_zs_cap.map(|permutation_zs_cap| { + let tmp = get_n_permutation_challenge_sets( &mut challenger, num_challenges, stark.permutation_batch_size(), - ) - } else { - vec![] - }; - if stark.uses_permutation_args() { - let cap = - permutation_zs_cap.ok_or_else(|| anyhow::Error::msg("expected permutation_zs_cap")); - challenger.observe_cap(cap?); - } + ); + challenger.observe_cap(permutation_zs_cap); + tmp + }); let stark_alphas = challenger.get_n_challenges(num_challenges); @@ -62,7 +56,7 @@ where 
challenger.observe_openings(&openings.to_fri_openings()); - Ok(StarkProofChallenges { + StarkProofChallenges { permutation_challenge_sets, stark_alphas, stark_zeta, @@ -73,7 +67,7 @@ where degree_bits, &config.fri_config, ), - }) + } } impl StarkProofWithPublicInputs @@ -86,11 +80,10 @@ where stark: &S, config: &StarkConfig, degree_bits: usize, - ) -> anyhow::Result> { - Ok(self - .get_challenges(stark, config, degree_bits)? + ) -> Vec { + self.get_challenges(stark, config, degree_bits) .fri_challenges - .fri_query_indices) + .fri_query_indices } /// Computes all Fiat-Shamir challenges used in the STARK proof. @@ -99,7 +92,7 @@ where stark: &S, config: &StarkConfig, degree_bits: usize, - ) -> Result> { + ) -> StarkProofChallenges { let StarkProof { trace_cap, permutation_zs_cap, @@ -146,7 +139,7 @@ pub(crate) fn get_challenges_target< final_poly: &PolynomialCoeffsExtTarget, pow_witness: Target, config: &StarkConfig, -) -> Result> +) -> StarkProofChallengesTarget where C::Hasher: AlgebraicHasher, { @@ -156,21 +149,16 @@ where challenger.observe_cap(trace_cap); - let permutation_challenge_sets = if stark.uses_permutation_args() { - get_n_permutation_challenge_sets_target( + let permutation_challenge_sets = permutation_zs_cap.map(|permutation_zs_cap| { + let tmp = get_n_permutation_challenge_sets_target( builder, &mut challenger, num_challenges, stark.permutation_batch_size(), - ) - } else { - vec![] - }; - if stark.uses_permutation_args() { - let cap = - permutation_zs_cap.ok_or_else(|| anyhow::Error::msg("expected permutation_zs_cap")); - challenger.observe_cap(cap?); - } + ); + challenger.observe_cap(permutation_zs_cap); + tmp + }); let stark_alphas = challenger.get_n_challenges(builder, num_challenges); @@ -179,7 +167,7 @@ where challenger.observe_openings(&openings.to_fri_openings()); - Ok(StarkProofChallengesTarget { + StarkProofChallengesTarget { permutation_challenge_sets, stark_alphas, stark_zeta, @@ -190,7 +178,7 @@ where pow_witness, &config.fri_config, ), 
- }) + } } impl StarkProofWithPublicInputsTarget { @@ -203,7 +191,7 @@ impl StarkProofWithPublicInputsTarget { builder: &mut CircuitBuilder, stark: &S, config: &StarkConfig, - ) -> Result> + ) -> StarkProofChallengesTarget where C::Hasher: AlgebraicHasher, { diff --git a/starky/src/proof.rs b/starky/src/proof.rs index d1f86d7e..1975b1b9 100644 --- a/starky/src/proof.rs +++ b/starky/src/proof.rs @@ -101,7 +101,7 @@ pub struct CompressedStarkProofWithPublicInputs< pub(crate) struct StarkProofChallenges, const D: usize> { /// Randomness used in any permutation arguments. - pub permutation_challenge_sets: Vec>, + pub permutation_challenge_sets: Option>>, /// Random values used to combine STARK constraints. pub stark_alphas: Vec, @@ -113,7 +113,7 @@ pub(crate) struct StarkProofChallenges, const D: us } pub(crate) struct StarkProofChallengesTarget { - pub permutation_challenge_sets: Vec>, + pub permutation_challenge_sets: Option>>, pub stark_alphas: Vec, pub stark_zeta: ExtensionTarget, pub fri_challenges: FriChallengesTarget, diff --git a/starky/src/recursive_verifier.rs b/starky/src/recursive_verifier.rs index de01d39e..6a7363ae 100644 --- a/starky/src/recursive_verifier.rs +++ b/starky/src/recursive_verifier.rs @@ -1,6 +1,5 @@ use std::iter::once; -use anyhow::Result; use itertools::Itertools; use plonky2::field::extension_field::Extendable; use plonky2::field::field_types::Field; @@ -33,15 +32,14 @@ pub fn recursively_verify_stark_proof< stark: S, proof_with_pis: StarkProofWithPublicInputsTarget, inner_config: &StarkConfig, -) -> Result<()> -where +) where C::Hasher: AlgebraicHasher, [(); S::COLUMNS]:, [(); S::PUBLIC_INPUTS]:, { assert_eq!(proof_with_pis.public_inputs.len(), S::PUBLIC_INPUTS); let degree_bits = proof_with_pis.proof.recover_degree_bits(inner_config); - let challenges = proof_with_pis.get_challenges::(builder, &stark, inner_config)?; + let challenges = proof_with_pis.get_challenges::(builder, &stark, inner_config); 
recursively_verify_stark_proof_with_challenges::( builder, @@ -51,8 +49,6 @@ where inner_config, degree_bits, ); - - Ok(()) } /// Recursively verifies an inner proof. @@ -115,7 +111,7 @@ fn recursively_verify_stark_proof_with_challenges< .then(|| PermutationCheckDataTarget { local_zs: permutation_zs.as_ref().unwrap().clone(), next_zs: permutation_zs_right.as_ref().unwrap().clone(), - permutation_challenge_sets: challenges.permutation_challenge_sets, + permutation_challenge_sets: challenges.permutation_challenge_sets.unwrap(), }); eval_vanishing_poly_recursively::( builder, diff --git a/starky/src/verifier.rs b/starky/src/verifier.rs index 44d3f9c7..959cbc8e 100644 --- a/starky/src/verifier.rs +++ b/starky/src/verifier.rs @@ -34,7 +34,7 @@ where { ensure!(proof_with_pis.public_inputs.len() == S::PUBLIC_INPUTS); let degree_bits = proof_with_pis.proof.recover_degree_bits(config); - let challenges = proof_with_pis.get_challenges(&stark, config, degree_bits)?; + let challenges = proof_with_pis.get_challenges(&stark, config, degree_bits); verify_stark_proof_with_challenges(stark, proof_with_pis, challenges, degree_bits, config) } @@ -93,7 +93,7 @@ where let permutation_data = stark.uses_permutation_args().then(|| PermutationCheckData { local_zs: permutation_zs.as_ref().unwrap().clone(), next_zs: permutation_zs_right.as_ref().unwrap().clone(), - permutation_challenge_sets: challenges.permutation_challenge_sets, + permutation_challenge_sets: challenges.permutation_challenge_sets.unwrap(), }); eval_vanishing_poly::( &stark, From a31c58b69d17c8899f2e86fac4070a323493bbad Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Tue, 22 Feb 2022 17:23:55 +0100 Subject: [PATCH 21/32] Use ReducingFactor --- plonky2/src/util/reducing.rs | 15 +++++++++++- starky/src/permutation.rs | 47 ++++++++++++++++-------------------- 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/plonky2/src/util/reducing.rs b/plonky2/src/util/reducing.rs index f29c6d08..626668e6 100644 --- 
a/plonky2/src/util/reducing.rs +++ b/plonky2/src/util/reducing.rs @@ -1,6 +1,6 @@ use std::borrow::Borrow; -use plonky2_field::extension_field::Extendable; +use plonky2_field::extension_field::{Extendable, FieldExtension}; use plonky2_field::field_types::Field; use plonky2_field::polynomial::PolynomialCoeffs; @@ -35,6 +35,11 @@ impl ReducingFactor { self.base * x } + fn mul_ext, const D: usize>(&mut self, x: FE) -> FE { + self.count += 1; + x.scalar_mul(self.base) + } + fn mul_poly(&mut self, p: &mut PolynomialCoeffs) { self.count += 1; *p *= self.base; @@ -45,6 +50,14 @@ impl ReducingFactor { .fold(F::ZERO, |acc, x| self.mul(acc) + *x.borrow()) } + pub fn reduce_ext, const D: usize>( + &mut self, + iter: impl DoubleEndedIterator>, + ) -> FE { + iter.rev() + .fold(FE::ZERO, |acc, x| self.mul_ext(acc) + *x.borrow()) + } + pub fn reduce_polys( &mut self, polys: impl DoubleEndedIterator>>, diff --git a/starky/src/permutation.rs b/starky/src/permutation.rs index 1113094d..dad4b661 100644 --- a/starky/src/permutation.rs +++ b/starky/src/permutation.rs @@ -11,6 +11,7 @@ use plonky2::iop::ext_target::ExtensionTarget; use plonky2::iop::target::Target; use plonky2::plonk::circuit_builder::CircuitBuilder; use plonky2::plonk::config::{AlgebraicHasher, GenericConfig, Hasher}; +use plonky2::util::reducing::{ReducingFactor, ReducingFactorTarget}; use rayon::prelude::*; use crate::config::StarkConfig; @@ -283,19 +284,15 @@ pub(crate) fn eval_permutation_checks, Vec<_>) = column_pairs + .iter() + .map(|&(i, j)| (vars.local_values[i], vars.local_values[j])) + .unzip(); + ( + factor.reduce_ext(lhs.into_iter()) + FE::from_basefield(*gamma), + factor.reduce_ext(rhs.into_iter()) + FE::from_basefield(*gamma), + ) }) .unzip(); let constraint = next_zs[i] * reduced_rhs.into_iter().product() @@ -353,19 +350,17 @@ pub(crate) fn eval_permutation_checks_recursively( let zero = builder.zero_extension(); let beta_ext = builder.convert_to_ext(*beta); let gamma_ext = 
builder.convert_to_ext(*gamma); - let mut reduced = - column_pairs - .iter() - .rev() - .fold((zero, zero), |(lhs, rhs), &(i, j)| { - ( - builder.mul_add_extension(lhs, beta_ext, vars.local_values[i]), - builder.mul_add_extension(rhs, beta_ext, vars.local_values[j]), - ) - }); - reduced.0 = builder.add_extension(reduced.0, gamma_ext); - reduced.1 = builder.add_extension(reduced.1, gamma_ext); - reduced + let mut factor = ReducingFactorTarget::new(beta_ext); + let (lhs, rhs): (Vec<_>, Vec<_>) = column_pairs + .iter() + .map(|&(i, j)| (vars.local_values[i], vars.local_values[j])) + .unzip(); + let reduced_lhs = factor.reduce(&lhs, builder); + let reduced_rhs = factor.reduce(&rhs, builder); + ( + builder.add_extension(reduced_lhs, gamma_ext), + builder.add_extension(reduced_rhs, gamma_ext), + ) }) .unzip(); let reduced_lhs_product = builder.mul_many_extension(&reduced_lhs); From 17bbc6f3e403642296ca73be0147debb31e9e86e Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Tue, 22 Feb 2022 17:30:08 +0100 Subject: [PATCH 22/32] Minor --- starky/src/prover.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/starky/src/prover.rs b/starky/src/prover.rs index ac6689c2..fe007f05 100644 --- a/starky/src/prover.rs +++ b/starky/src/prover.rs @@ -84,7 +84,7 @@ where challenger.observe_cap(&trace_cap); // Permutation arguments. 
- let permutation_zs_commitment_challenges = if stark.uses_permutation_args() { + let permutation_zs_commitment_challenges = stark.uses_permutation_args().then(|| { let permutation_challenge_sets = get_n_permutation_challenge_sets( &mut challenger, config.num_challenges, @@ -101,7 +101,7 @@ where timed!( timing, "compute permutation Z commitments", - Some(( + ( PolynomialBatch::from_values( permutation_z_polys, rate_bits, @@ -111,11 +111,9 @@ where None, ), permutation_challenge_sets - )) + ) ) - } else { - None - }; + }); let permutation_zs_commitment = permutation_zs_commitment_challenges .as_ref() .map(|(comm, _)| comm); From 8c5cbbc7c66a51c805003dfcce5ed02d472cf99d Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Tue, 22 Feb 2022 17:40:48 +0100 Subject: [PATCH 23/32] Add first row Z check --- starky/src/permutation.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/starky/src/permutation.rs b/starky/src/permutation.rs index dad4b661..d2a16fc4 100644 --- a/starky/src/permutation.rs +++ b/starky/src/permutation.rs @@ -264,7 +264,12 @@ pub(crate) fn eval_permutation_checks( next_zs, permutation_challenge_sets, } = permutation_data; - // TODO: Z_1 check. 
+ + let one = builder.one_extension(); + // Check that Z(1) = 1; + for &z in &local_zs { + let z_1 = builder.sub_extension(z, one); + consumer.constraint_first_row(builder, z_1); + } + let permutation_pairs = stark.permutation_pairs(); let permutation_batches = get_permutation_batches( From dd4cc21309c860a7be2f2e1baa830bdba7e033a6 Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Wed, 23 Feb 2022 09:36:28 +0100 Subject: [PATCH 24/32] PR feedback --- starky/src/permutation.rs | 13 +++++----- starky/src/proof.rs | 2 ++ starky/src/prover.rs | 38 +++++++++++++-------------- starky/src/recursive_verifier.rs | 44 +++++++++++++++++++++++--------- starky/src/vanishing_poly.rs | 12 +++++---- starky/src/verifier.rs | 34 +++++++++++++++++++++--- 6 files changed, 95 insertions(+), 48 deletions(-) diff --git a/starky/src/permutation.rs b/starky/src/permutation.rs index d2a16fc4..2e1d603c 100644 --- a/starky/src/permutation.rs +++ b/starky/src/permutation.rs @@ -4,6 +4,7 @@ use itertools::Itertools; use plonky2::field::batch_util::batch_multiply_inplace; use plonky2::field::extension_field::{Extendable, FieldExtension}; use plonky2::field::field_types::Field; +use plonky2::field::packed_field::PackedField; use plonky2::field::polynomial::PolynomialValues; use plonky2::hash::hash_types::RichField; use plonky2::iop::challenger::{Challenger, RecursiveChallenger}; @@ -54,7 +55,6 @@ pub(crate) struct PermutationChallengeSet { pub(crate) fn compute_permutation_z_polys( stark: &S, config: &StarkConfig, - challenger: &mut Challenger, trace_poly_values: &[PolynomialValues], permutation_challenge_sets: &[PermutationChallengeSet], ) -> Vec> @@ -239,27 +239,28 @@ pub(crate) fn get_permutation_batches<'a, T: Copy>( } // TODO: Use slices. 
-pub struct PermutationCheckData, const D2: usize> { +pub struct PermutationCheckVars, const D2: usize> { pub(crate) local_zs: Vec, pub(crate) next_zs: Vec, pub(crate) permutation_challenge_sets: Vec>, } -pub(crate) fn eval_permutation_checks( +pub(crate) fn eval_permutation_checks( stark: &S, config: &StarkConfig, vars: StarkEvaluationVars, - permutation_data: PermutationCheckData, + permutation_data: PermutationCheckVars, consumer: &mut ConstraintConsumer, ) where F: RichField + Extendable, FE: FieldExtension, + P: PackedField, C: GenericConfig, S: Stark, [(); S::COLUMNS]:, [(); S::PUBLIC_INPUTS]:, { - let PermutationCheckData { + let PermutationCheckVars { local_zs, next_zs, permutation_challenge_sets, @@ -350,7 +351,6 @@ pub(crate) fn eval_permutation_checks_recursively( // Each zs value corresponds to a permutation batch. for (i, instances) in permutation_batches.iter().enumerate() { - // Z(gx) * down = Z x * up let (reduced_lhs, reduced_rhs): (Vec>, Vec>) = instances .iter() @@ -359,7 +359,6 @@ pub(crate) fn eval_permutation_checks_recursively( pair: PermutationPair { column_pairs }, challenge: PermutationChallenge { beta, gamma }, } = instance; - let zero = builder.zero_extension(); let beta_ext = builder.convert_to_ext(*beta); let gamma_ext = builder.convert_to_ext(*gamma); let mut factor = ReducingFactorTarget::new(beta_ext); diff --git a/starky/src/proof.rs b/starky/src/proof.rs index 1975b1b9..dba3db3e 100644 --- a/starky/src/proof.rs +++ b/starky/src/proof.rs @@ -32,6 +32,7 @@ pub struct StarkProof, C: GenericConfig, } impl, C: GenericConfig, const D: usize> StarkProof { + /// Recover the length of the trace from a STARK proof and a STARK config. 
pub(crate) fn recover_degree_bits(&self, config: &StarkConfig) -> usize { let initial_merkle_proof = &self.opening_proof.query_round_proofs[0] .initial_trees_proof @@ -51,6 +52,7 @@ pub struct StarkProofTarget { } impl StarkProofTarget { + /// Recover the length of the trace from a STARK proof and a STARK config. pub(crate) fn recover_degree_bits(&self, config: &StarkConfig) -> usize { let initial_merkle_proof = &self.opening_proof.query_round_proofs[0] .initial_trees_proof diff --git a/starky/src/prover.rs b/starky/src/prover.rs index fe007f05..336b9963 100644 --- a/starky/src/prover.rs +++ b/starky/src/prover.rs @@ -18,7 +18,7 @@ use rayon::prelude::*; use crate::config::StarkConfig; use crate::constraint_consumer::ConstraintConsumer; -use crate::permutation::PermutationCheckData; +use crate::permutation::PermutationCheckVars; use crate::permutation::{ compute_permutation_z_polys, get_n_permutation_challenge_sets, PermutationChallengeSet, }; @@ -93,26 +93,23 @@ where let permutation_z_polys = compute_permutation_z_polys::( &stark, config, - &mut challenger, &trace_poly_values, &permutation_challenge_sets, ); - timed!( + let permutation_zs_commitment = timed!( timing, "compute permutation Z commitments", - ( - PolynomialBatch::from_values( - permutation_z_polys, - rate_bits, - false, - config.fri_config.cap_height, - timing, - None, - ), - permutation_challenge_sets + PolynomialBatch::from_values( + permutation_z_polys, + rate_bits, + false, + config.fri_config.cap_height, + timing, + None, ) - ) + ); + (permutation_zs_commitment, permutation_challenge_sets) }); let permutation_zs_commitment = permutation_zs_commitment_challenges .as_ref() @@ -251,6 +248,8 @@ where // Retrieve the LDE values at index `i`. let get_at_index = |comm: &'a PolynomialBatch, i: usize| -> &'a [F] { comm.get_lde_values(i * step) }; + let get_trace_at_index = |i| get_at_index(trace_commitment, i).try_into().unwrap(); + // Last element of the subgroup. 
let last = F::primitive_root_of_unity(degree_bits).inverse(); let size = degree << quotient_degree_bits; @@ -271,21 +270,20 @@ where lagrange_last.values[i], ); let vars = StarkEvaluationVars:: { - local_values: &get_at_index(trace_commitment, i).try_into().unwrap(), - next_values: &get_at_index(trace_commitment, (i + next_step) % size) - .try_into() - .unwrap(), + local_values: &get_trace_at_index(i), + next_values: &get_trace_at_index((i + next_step) % size), public_inputs: &public_inputs, }; let permutation_check_data = permutation_zs_commitment_challenges.as_ref().map( - |(permutation_zs_commitment, permutation_challenge_sets)| PermutationCheckData { + |(permutation_zs_commitment, permutation_challenge_sets)| PermutationCheckVars { local_zs: get_at_index(permutation_zs_commitment, i).to_vec(), next_zs: get_at_index(permutation_zs_commitment, (i + next_step) % size) .to_vec(), permutation_challenge_sets: permutation_challenge_sets.to_vec(), }, ); - eval_vanishing_poly::( + // TODO: Use packed field for F. 
+ eval_vanishing_poly::( stark, config, vars, diff --git a/starky/src/recursive_verifier.rs b/starky/src/recursive_verifier.rs index 6a7363ae..c1abbdb0 100644 --- a/starky/src/recursive_verifier.rs +++ b/starky/src/recursive_verifier.rs @@ -1,5 +1,6 @@ use std::iter::once; +use anyhow::{ensure, Result}; use itertools::Itertools; use plonky2::field::extension_field::Extendable; use plonky2::field::field_types::Field; @@ -69,6 +70,7 @@ fn recursively_verify_stark_proof_with_challenges< [(); S::COLUMNS]:, [(); S::PUBLIC_INPUTS]:, { + check_permutation_options(&stark, &proof_with_pis, &challenges).unwrap(); let one = builder.one_extension(); let StarkProofWithPublicInputsTarget { @@ -202,18 +204,14 @@ pub fn add_virtual_stark_proof, S: Stark, con let fri_params = config.fri_params(degree_bits); let cap_height = fri_params.config.cap_height; - let num_leaves_per_oracle = if stark.uses_permutation_args() { - vec![ - S::COLUMNS, - stark.num_permutation_batches(config), - stark.quotient_degree_factor() * config.num_challenges, - ] - } else { - vec![ - S::COLUMNS, - stark.quotient_degree_factor() * config.num_challenges, - ] - }; + let num_leaves_per_oracle = once(S::COLUMNS) + .chain( + stark + .uses_permutation_args() + .then(|| stark.num_permutation_batches(config)), + ) + .chain(once(stark.quotient_degree_factor() * config.num_challenges)) + .collect_vec(); let permutation_zs_cap = stark .uses_permutation_args() @@ -299,3 +297,25 @@ pub fn set_stark_proof_target, W, const D: usize>( set_fri_proof_target(witness, &proof_target.opening_proof, &proof.opening_proof); } + +/// Utility function to check that all permutation data wrapped in `Option`s are `Some` iff +/// the Stark uses a permutation argument. 
+fn check_permutation_options, S: Stark, const D: usize>( + stark: &S, + proof_with_pis: &StarkProofWithPublicInputsTarget, + challenges: &StarkProofChallengesTarget, +) -> Result<()> { + let options_is_some = [ + proof_with_pis.proof.permutation_zs_cap.is_some(), + proof_with_pis.proof.openings.permutation_zs.is_some(), + proof_with_pis.proof.openings.permutation_zs_right.is_some(), + challenges.permutation_challenge_sets.is_some(), + ]; + ensure!( + options_is_some + .into_iter() + .all(|b| b == stark.uses_permutation_args()), + "Permutation data doesn't match with Stark configuration." + ); + Ok(()) +} diff --git a/starky/src/vanishing_poly.rs b/starky/src/vanishing_poly.rs index 55ea7a5a..c8c75730 100644 --- a/starky/src/vanishing_poly.rs +++ b/starky/src/vanishing_poly.rs @@ -1,4 +1,5 @@ use plonky2::field::extension_field::{Extendable, FieldExtension}; +use plonky2::field::packed_field::PackedField; use plonky2::hash::hash_types::RichField; use plonky2::plonk::circuit_builder::CircuitBuilder; use plonky2::plonk::config::GenericConfig; @@ -6,21 +7,22 @@ use plonky2::plonk::config::GenericConfig; use crate::config::StarkConfig; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::permutation::{ - eval_permutation_checks, eval_permutation_checks_recursively, PermutationCheckData, - PermutationCheckDataTarget, + eval_permutation_checks, eval_permutation_checks_recursively, PermutationCheckDataTarget, + PermutationCheckVars, }; use crate::stark::Stark; use crate::vars::{StarkEvaluationTargets, StarkEvaluationVars}; -pub(crate) fn eval_vanishing_poly( +pub(crate) fn eval_vanishing_poly( stark: &S, config: &StarkConfig, vars: StarkEvaluationVars, - permutation_data: Option>, + permutation_data: Option>, consumer: &mut ConstraintConsumer, ) where F: RichField + Extendable, FE: FieldExtension, + P: PackedField, C: GenericConfig, S: Stark, [(); S::COLUMNS]:, @@ -28,7 +30,7 @@ pub(crate) fn eval_vanishing_poly( { 
stark.eval_packed_generic(vars, consumer); if let Some(permutation_data) = permutation_data { - eval_permutation_checks::( + eval_permutation_checks::( stark, config, vars, diff --git a/starky/src/verifier.rs b/starky/src/verifier.rs index 959cbc8e..a9bf897c 100644 --- a/starky/src/verifier.rs +++ b/starky/src/verifier.rs @@ -11,7 +11,7 @@ use plonky2::plonk::plonk_common::reduce_with_powers; use crate::config::StarkConfig; use crate::constraint_consumer::ConstraintConsumer; -use crate::permutation::PermutationCheckData; +use crate::permutation::PermutationCheckVars; use crate::proof::{StarkOpeningSet, StarkProofChallenges, StarkProofWithPublicInputs}; use crate::stark::Stark; use crate::vanishing_poly::eval_vanishing_poly; @@ -55,6 +55,7 @@ where [(); S::PUBLIC_INPUTS]:, [(); C::Hasher::HASH_SIZE]:, { + check_permutation_options(&stark, &proof_with_pis, &challenges)?; let StarkProofWithPublicInputs { proof, public_inputs, @@ -90,12 +91,12 @@ where l_1, l_last, ); - let permutation_data = stark.uses_permutation_args().then(|| PermutationCheckData { + let permutation_data = stark.uses_permutation_args().then(|| PermutationCheckVars { local_zs: permutation_zs.as_ref().unwrap().clone(), next_zs: permutation_zs_right.as_ref().unwrap().clone(), permutation_challenge_sets: challenges.permutation_challenge_sets.unwrap(), }); - eval_vanishing_poly::( + eval_vanishing_poly::( &stark, config, vars, @@ -153,7 +154,32 @@ fn eval_l_1_and_l_last(log_n: usize, x: F) -> (F, F) { (z_x * invs[0], z_x * invs[1]) } -/// Recover the length of the trace from a STARK proof and a STARK config. +/// Utility function to check that all permutation data wrapped in `Option`s are `Some` iff +/// the Stark uses a permutation argument. 
+fn check_permutation_options< + F: RichField + Extendable, + C: GenericConfig, + S: Stark, + const D: usize, +>( + stark: &S, + proof_with_pis: &StarkProofWithPublicInputs, + challenges: &StarkProofChallenges, +) -> Result<()> { + let options_is_some = [ + proof_with_pis.proof.permutation_zs_cap.is_some(), + proof_with_pis.proof.openings.permutation_zs.is_some(), + proof_with_pis.proof.openings.permutation_zs_right.is_some(), + challenges.permutation_challenge_sets.is_some(), + ]; + ensure!( + options_is_some + .into_iter() + .all(|b| b == stark.uses_permutation_args()), + "Permutation data doesn't match with Stark configuration." + ); + Ok(()) +} #[cfg(test)] mod tests { From 383b8b68b3cc5410175ea148e86574a4cae67032 Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Thu, 24 Feb 2022 10:01:38 -0800 Subject: [PATCH 25/32] secret_to_public fn --- plonky2/src/curve/ecdsa.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/plonky2/src/curve/ecdsa.rs b/plonky2/src/curve/ecdsa.rs index cabe038a..52262830 100644 --- a/plonky2/src/curve/ecdsa.rs +++ b/plonky2/src/curve/ecdsa.rs @@ -16,6 +16,10 @@ pub struct ECDSASecretKey(pub C::ScalarField); #[derive(Copy, Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)] pub struct ECDSAPublicKey(pub AffinePoint); +pub fn secret_to_public(sk: ECDSASecretKey) -> ECDSAPublicKey { + ECDSAPublicKey((CurveScalar(sk.0) * C::GENERATOR_PROJECTIVE).to_affine()) +} + pub fn sign_message(msg: C::ScalarField, sk: ECDSASecretKey) -> ECDSASignature { let (k, rr) = { let mut k = C::ScalarField::rand(); @@ -57,8 +61,7 @@ pub fn verify_message( #[cfg(test)] mod tests { - use crate::curve::curve_types::{Curve, CurveScalar}; - use crate::curve::ecdsa::{sign_message, verify_message, ECDSAPublicKey, ECDSASecretKey}; + use crate::curve::ecdsa::{secret_to_public, sign_message, verify_message, ECDSASecretKey}; use crate::curve::secp256k1::Secp256K1; use crate::field::field_types::Field; use 
crate::field::secp256k1_scalar::Secp256K1Scalar; @@ -68,8 +71,8 @@ mod tests { type C = Secp256K1; let msg = Secp256K1Scalar::rand(); - let sk = ECDSASecretKey(Secp256K1Scalar::rand()); - let pk = ECDSAPublicKey((CurveScalar(sk.0) * C::GENERATOR_PROJECTIVE).to_affine()); + let sk = ECDSASecretKey::(Secp256K1Scalar::rand()); + let pk = secret_to_public(sk); let sig = sign_message(msg, sk); let result = verify_message(msg, sig, pk); From bd7f43adc2d314d2f3fd18415af089529d1e336b Mon Sep 17 00:00:00 2001 From: Nicholas Ward Date: Thu, 24 Feb 2022 10:19:16 -0800 Subject: [PATCH 26/32] visibility --- plonky2/src/gadgets/ecdsa.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plonky2/src/gadgets/ecdsa.rs b/plonky2/src/gadgets/ecdsa.rs index 0a95e189..64f37e1f 100644 --- a/plonky2/src/gadgets/ecdsa.rs +++ b/plonky2/src/gadgets/ecdsa.rs @@ -8,10 +8,10 @@ use crate::hash::hash_types::RichField; use crate::plonk::circuit_builder::CircuitBuilder; #[derive(Clone, Debug)] -pub struct ECDSASecretKeyTarget(NonNativeTarget); +pub struct ECDSASecretKeyTarget(pub NonNativeTarget); #[derive(Clone, Debug)] -pub struct ECDSAPublicKeyTarget(AffinePointTarget); +pub struct ECDSAPublicKeyTarget(pub AffinePointTarget); #[derive(Clone, Debug)] pub struct ECDSASignatureTarget { From 2644f5f74a241244de19e306ae75b7d07a029e2a Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law <426294+unzvfu@users.noreply.github.com> Date: Thu, 3 Mar 2022 14:18:19 +1100 Subject: [PATCH 27/32] System Zero subtraction operation (#508) * First draft of subtraction operation. * Daniel comments. * Fix constraint calculation. * cargo fmt * Align native and recursive eval functions; fix typo. 
--- system_zero/src/alu/subtraction.rs | 53 ++++++++++++++++++++++++++++-- system_zero/src/registers/alu.rs | 12 +++++++ 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/system_zero/src/alu/subtraction.rs b/system_zero/src/alu/subtraction.rs index 8f8bb810..8b795cbb 100644 --- a/system_zero/src/alu/subtraction.rs +++ b/system_zero/src/alu/subtraction.rs @@ -10,7 +10,18 @@ use crate::registers::alu::*; use crate::registers::NUM_COLUMNS; pub(crate) fn generate_subtraction(values: &mut [F; NUM_COLUMNS]) { - // TODO + let in_1 = values[COL_SUB_INPUT_0].to_canonical_u64() as u32; + let in_2 = values[COL_SUB_INPUT_1].to_canonical_u64() as u32; + + // in_1 - in_2 == diff - br*2^32 + let (diff, br) = in_1.overflowing_sub(in_2); + + let diff_1 = F::from_canonical_u16(diff as u16); + let diff_2 = F::from_canonical_u16((diff >> 16) as u16); + + values[COL_SUB_OUTPUT_0] = diff_1; + values[COL_SUB_OUTPUT_1] = diff_2; + values[COL_SUB_OUTPUT_BORROW] = F::from_canonical_u16(br as u16); } pub(crate) fn eval_subtraction>( @@ -18,7 +29,23 @@ pub(crate) fn eval_subtraction>( yield_constr: &mut ConstraintConsumer
<P>
, ) { let is_sub = local_values[IS_SUB]; - // TODO + let in_1 = local_values[COL_SUB_INPUT_0]; + let in_2 = local_values[COL_SUB_INPUT_1]; + let out_1 = local_values[COL_SUB_OUTPUT_0]; + let out_2 = local_values[COL_SUB_OUTPUT_1]; + let out_br = local_values[COL_SUB_OUTPUT_BORROW]; + + let base = F::from_canonical_u64(1 << 16); + let base_sqr = F::from_canonical_u64(1 << 32); + + let out_br = out_br * base_sqr; + let lhs = (out_br + in_1) - in_2; + let rhs = out_1 + out_2 * base; + + yield_constr.constraint(is_sub * (lhs - rhs)); + + // We don't need to check that out_br is in {0, 1} because it's + // checked by boolean::col_bit(0) in the ALU. } pub(crate) fn eval_subtraction_recursively, const D: usize>( @@ -27,5 +54,25 @@ pub(crate) fn eval_subtraction_recursively, const D yield_constr: &mut RecursiveConstraintConsumer, ) { let is_sub = local_values[IS_SUB]; - // TODO + let in_1 = local_values[COL_SUB_INPUT_0]; + let in_2 = local_values[COL_SUB_INPUT_1]; + let out_1 = local_values[COL_SUB_OUTPUT_0]; + let out_2 = local_values[COL_SUB_OUTPUT_1]; + let out_br = local_values[COL_SUB_OUTPUT_BORROW]; + + let base = builder.constant_extension(F::Extension::from_canonical_u64(1 << 16)); + let base_sqr = builder.constant_extension(F::Extension::from_canonical_u64(1 << 32)); + + // lhs = (out_br + in_1) - in_2 + let lhs = builder.add_extension(out_br, in_1); + let lhs = builder.sub_extension(lhs, in_2); + + // rhs = out_1 + base * out_2 + let rhs = builder.mul_add_extension(out_2, base, out_1); + + // filtered_diff = is_sub * (lhs - rhs) + let diff = builder.sub_extension(lhs, rhs); + let filtered_diff = builder.mul_extension(is_sub, diff); + + yield_constr.constraint(builder, filtered_diff); } diff --git a/system_zero/src/registers/alu.rs b/system_zero/src/registers/alu.rs index e678d8e4..6a9412a1 100644 --- a/system_zero/src/registers/alu.rs +++ b/system_zero/src/registers/alu.rs @@ -34,6 +34,18 @@ pub(crate) const COL_ADD_OUTPUT_1: usize = 
super::range_check_16::col_rc_16_inpu /// The third 16-bit chunk of the output, based on little-endian ordering. pub(crate) const COL_ADD_OUTPUT_2: usize = super::range_check_16::col_rc_16_input(2); +/// Inputs for subtraction; the second value is subtracted from the +/// first; inputs treated as an unsigned u32. +pub(crate) const COL_SUB_INPUT_0: usize = shared_col(0); +pub(crate) const COL_SUB_INPUT_1: usize = shared_col(1); + +/// The first 16-bit chunk of the output, based on little-endian ordering. +pub(crate) const COL_SUB_OUTPUT_0: usize = super::range_check_16::col_rc_16_input(0); +/// The second 16-bit chunk of the output, based on little-endian ordering. +pub(crate) const COL_SUB_OUTPUT_1: usize = super::range_check_16::col_rc_16_input(1); +/// The borrow output +pub(crate) const COL_SUB_OUTPUT_BORROW: usize = super::boolean::col_bit(0); + /// The first value to be multiplied; treated as an unsigned u32. pub(crate) const COL_MUL_ADD_FACTOR_0: usize = shared_col(0); /// The second value to be multiplied; treated as an unsigned u32. 
From 7329dade9490a247c532cce7dee6e91a03100070 Mon Sep 17 00:00:00 2001 From: Daniel Lubarov Date: Wed, 2 Mar 2022 22:49:57 -0800 Subject: [PATCH 28/32] IS_MUL -> IS_MUL_ADD (#510) --- system_zero/src/alu/mod.rs | 6 +++--- system_zero/src/alu/mul_add.rs | 4 ++-- system_zero/src/registers/alu.rs | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/system_zero/src/alu/mod.rs b/system_zero/src/alu/mod.rs index 730ca302..b1b05dc2 100644 --- a/system_zero/src/alu/mod.rs +++ b/system_zero/src/alu/mod.rs @@ -28,7 +28,7 @@ pub(crate) fn generate_alu(values: &mut [F; NUM_COLUMNS]) { generate_addition(values); } else if values[IS_SUB].is_one() { generate_subtraction(values); - } else if values[IS_MUL].is_one() { + } else if values[IS_MUL_ADD].is_one() { generate_mul_add(values); } else if values[IS_DIV].is_one() { generate_division(values); @@ -42,7 +42,7 @@ pub(crate) fn eval_alu>( let local_values = &vars.local_values; // Check that the operation flag values are binary. - for col in [IS_ADD, IS_SUB, IS_MUL, IS_DIV] { + for col in [IS_ADD, IS_SUB, IS_MUL_ADD, IS_DIV] { let val = local_values[col]; yield_constr.constraint(val * val - val); } @@ -61,7 +61,7 @@ pub(crate) fn eval_alu_recursively, const D: usize> let local_values = &vars.local_values; // Check that the operation flag values are binary. - for col in [IS_ADD, IS_SUB, IS_MUL, IS_DIV] { + for col in [IS_ADD, IS_SUB, IS_MUL_ADD, IS_DIV] { let val = local_values[col]; let constraint = builder.mul_sub_extension(val, val, val); yield_constr.constraint(builder, constraint); diff --git a/system_zero/src/alu/mul_add.rs b/system_zero/src/alu/mul_add.rs index 53ba34a2..b84cafbf 100644 --- a/system_zero/src/alu/mul_add.rs +++ b/system_zero/src/alu/mul_add.rs @@ -36,7 +36,7 @@ pub(crate) fn eval_mul_add>( local_values: &[P; NUM_COLUMNS], yield_constr: &mut ConstraintConsumer
<P>
, ) { - let is_mul = local_values[IS_MUL]; + let is_mul = local_values[IS_MUL_ADD]; let factor_0 = local_values[COL_MUL_ADD_FACTOR_0]; let factor_1 = local_values[COL_MUL_ADD_FACTOR_1]; let addend = local_values[COL_MUL_ADD_ADDEND]; @@ -63,7 +63,7 @@ pub(crate) fn eval_mul_add_recursively, const D: us local_values: &[ExtensionTarget; NUM_COLUMNS], yield_constr: &mut RecursiveConstraintConsumer, ) { - let is_mul = local_values[IS_MUL]; + let is_mul = local_values[IS_MUL_ADD]; let factor_0 = local_values[COL_MUL_ADD_FACTOR_0]; let factor_1 = local_values[COL_MUL_ADD_FACTOR_1]; let addend = local_values[COL_MUL_ADD_ADDEND]; diff --git a/system_zero/src/registers/alu.rs b/system_zero/src/registers/alu.rs index 6a9412a1..585ecab1 100644 --- a/system_zero/src/registers/alu.rs +++ b/system_zero/src/registers/alu.rs @@ -2,8 +2,8 @@ pub(crate) const IS_ADD: usize = super::START_ALU; pub(crate) const IS_SUB: usize = IS_ADD + 1; -pub(crate) const IS_MUL: usize = IS_SUB + 1; -pub(crate) const IS_DIV: usize = IS_MUL + 1; +pub(crate) const IS_MUL_ADD: usize = IS_SUB + 1; +pub(crate) const IS_DIV: usize = IS_MUL_ADD + 1; const START_SHARED_COLS: usize = IS_DIV + 1; From 310493c293436e73096a75d0e5755816fbfe3d58 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law <426294+unzvfu@users.noreply.github.com> Date: Fri, 4 Mar 2022 09:34:31 +1100 Subject: [PATCH 29/32] Faster extension field multiplication (#500) * Initial implementation of quintic extensions. * Update to/from_biguint() methods. * Draft of fast multiplication on quintic extensions over 64-bit base. * cargo fmt * Typo. * Document functions (a bit). * Refactor reduction step. * Change multiplication call so that LLVM generates better assembly. * Use one main accumulator instead of two minor ones; faster reduce. * Use one main accumulator in square too; clean up redundant code. * Call faster routines from Mul and Square impls. * Fix reduction function. * Fix square calculation. * Slightly faster reduction. 
* Clean up names and types. * cargo fmt * Move extension field mul/sqr specialisations to their own file. * Rename functions to have unique prefix. * Add faster quadratic multiplication/squaring. * Faster quartic multiplication and squaring. * cargo fmt * clippy * Alternative reduce160 function. * Typo. * Remove alternative reduction function. * Remove delayed reduction implementation of squaring. * Enforce assumptions about extension generators. * Make the accumulation variable a u32 instead of u64. * Add test to trigger carry branch in reduce160. * cargo fmt * Some documentation. * Clippy; improved comments. * cargo fmt * Remove redundant Square specialisations. * Fix reduce*() visibility. * Faster reduce160 from Jakub. * Change mul-by-const functions to operate on 160 bits instead of 128. * Move code for extensions of GoldilocksField to its own file. --- field/Cargo.toml | 1 + field/src/extension_field/quadratic.rs | 2 +- field/src/extension_field/quartic.rs | 2 +- field/src/extension_field/quintic.rs | 2 +- field/src/goldilocks_extensions.rs | 495 +++++++++++++++++++++++++ field/src/goldilocks_field.rs | 93 ++--- field/src/lib.rs | 1 + plonky2/benches/field_arithmetic.rs | 2 + 8 files changed, 530 insertions(+), 68 deletions(-) create mode 100644 field/src/goldilocks_extensions.rs diff --git a/field/Cargo.toml b/field/Cargo.toml index 6abffc5d..748b65ac 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -12,3 +12,4 @@ num = { version = "0.4", features = [ "rand" ] } rand = "0.8.4" serde = { version = "1.0", features = ["derive"] } unroll = "0.1.5" +static_assertions = "1.1.0" diff --git a/field/src/extension_field/quadratic.rs b/field/src/extension_field/quadratic.rs index 488304d2..9cdc01c3 100644 --- a/field/src/extension_field/quadratic.rs +++ b/field/src/extension_field/quadratic.rs @@ -170,7 +170,7 @@ impl> Mul for QuadraticExtension { type Output = Self; #[inline] - fn mul(self, rhs: Self) -> Self { + default fn mul(self, rhs: Self) -> Self { let 
Self([a0, a1]) = self; let Self([b0, b1]) = rhs; diff --git a/field/src/extension_field/quartic.rs b/field/src/extension_field/quartic.rs index 7b4a6950..09e35a4f 100644 --- a/field/src/extension_field/quartic.rs +++ b/field/src/extension_field/quartic.rs @@ -201,7 +201,7 @@ impl> Mul for QuarticExtension { type Output = Self; #[inline] - fn mul(self, rhs: Self) -> Self { + default fn mul(self, rhs: Self) -> Self { let Self([a0, a1, a2, a3]) = self; let Self([b0, b1, b2, b3]) = rhs; diff --git a/field/src/extension_field/quintic.rs b/field/src/extension_field/quintic.rs index d2c29ffe..1600107d 100644 --- a/field/src/extension_field/quintic.rs +++ b/field/src/extension_field/quintic.rs @@ -201,7 +201,7 @@ impl> Mul for QuinticExtension { type Output = Self; #[inline] - fn mul(self, rhs: Self) -> Self { + default fn mul(self, rhs: Self) -> Self { let Self([a0, a1, a2, a3, a4]) = self; let Self([b0, b1, b2, b3, b4]) = rhs; let w = >::W; diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs new file mode 100644 index 00000000..95265fe3 --- /dev/null +++ b/field/src/goldilocks_extensions.rs @@ -0,0 +1,495 @@ +use std::ops::Mul; + +use static_assertions::const_assert; + +use crate::extension_field::quadratic::QuadraticExtension; +use crate::extension_field::quartic::QuarticExtension; +use crate::extension_field::quintic::QuinticExtension; +use crate::extension_field::{Extendable, Frobenius}; +use crate::field_types::Field; +use crate::goldilocks_field::{reduce160, GoldilocksField}; + +impl Frobenius<1> for GoldilocksField {} + +impl Extendable<2> for GoldilocksField { + type Extension = QuadraticExtension; + + // Verifiable in Sage with + // `R. = GF(p)[]; assert (x^2 - 7).is_irreducible()`. 
+ const W: Self = Self(7); + + // DTH_ROOT = W^((ORDER - 1)/2) + const DTH_ROOT: Self = Self(18446744069414584320); + + const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = + [Self(18081566051660590251), Self(16121475356294670766)]; + + const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)]; +} + +impl Mul for QuadraticExtension { + #[inline] + fn mul(self, rhs: Self) -> Self { + let Self([a0, a1]) = self; + let Self([b0, b1]) = rhs; + let c = ext2_mul([a0.0, a1.0], [b0.0, b1.0]); + Self(c) + } +} + +impl Extendable<4> for GoldilocksField { + type Extension = QuarticExtension; + + const W: Self = Self(7); + + // DTH_ROOT = W^((ORDER - 1)/4) + const DTH_ROOT: Self = Self(281474976710656); + + const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [ + Self(5024755240244648895), + Self(13227474371289740625), + Self(3912887029498544536), + Self(3900057112666848848), + ]; + + const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] = + [Self(0), Self(0), Self(0), Self(12587610116473453104)]; +} + +impl Mul for QuarticExtension { + #[inline] + fn mul(self, rhs: Self) -> Self { + let Self([a0, a1, a2, a3]) = self; + let Self([b0, b1, b2, b3]) = rhs; + let c = ext4_mul([a0.0, a1.0, a2.0, a3.0], [b0.0, b1.0, b2.0, b3.0]); + Self(c) + } +} + +impl Extendable<5> for GoldilocksField { + type Extension = QuinticExtension; + + const W: Self = Self(3); + + // DTH_ROOT = W^((ORDER - 1)/5) + const DTH_ROOT: Self = Self(1041288259238279555); + + const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [ + Self(2899034827742553394), + Self(13012057356839176729), + Self(14593811582388663055), + Self(7722900811313895436), + Self(4557222484695340057), + ]; + + const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [ + Self::POWER_OF_TWO_GENERATOR, + Self(0), + Self(0), + Self(0), + Self(0), + ]; +} + +impl Mul for QuinticExtension { + #[inline] + fn mul(self, rhs: Self) -> Self { + let Self([a0, a1, a2, a3, a4]) = self; + let Self([b0, b1, b2, b3, b4]) = rhs; + let c = ext5_mul( + 
[a0.0, a1.0, a2.0, a3.0, a4.0], + [b0.0, b1.0, b2.0, b3.0, b4.0], + ); + Self(c) + } +} + +/* + * The functions extD_add_prods[0-4] are helper functions for + * computing products for extensions of degree D over the Goldilocks + * field. They are faster than the generic method because all + * reductions are delayed until the end which means only one per + * result coefficient is necessary. + */ + +/// Return a, b such that a + b*2^128 = 3*x with a < 2^128 and b < 2^32. +#[inline(always)] +fn u160_times_3(x: u128, y: u32) -> (u128, u32) { + let (s, cy) = x.overflowing_add(x << 1); + (s, 3 * y + (x >> 127) as u32 + cy as u32) +} + +/// Return a, b such that a + b*2^128 = 7*x with a < 2^128 and b < 2^32. +#[inline(always)] +fn u160_times_7(x: u128, y: u32) -> (u128, u32) { + let (d, br) = (x << 3).overflowing_sub(x); + // NB: subtracting the borrow can't underflow + (d, 7 * y + (x >> (128 - 3)) as u32 - br as u32) +} + +/* + * Quadratic multiplication and squaring + */ + +#[inline(always)] +fn ext2_add_prods0(a: &[u64; 2], b: &[u64; 2]) -> GoldilocksField { + // Computes a0 * b0 + W * a1 * b1; + let [a0, a1] = *a; + let [b0, b1] = *b; + + let cy; + + // W * a1 * b1 + let (mut cumul_lo, mut cumul_hi) = u160_times_7((a1 as u128) * (b1 as u128), 0u32); + + // a0 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext2_add_prods1(a: &[u64; 2], b: &[u64; 2]) -> GoldilocksField { + // Computes a0 * b1 + a1 * b0; + let [a0, a1] = *a; + let [b0, b1] = *b; + + let cy; + + // a0 * b1 + let mut cumul_lo = (a0 as u128) * (b1 as u128); + + // a1 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b0 as u128)); + let cumul_hi = cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +/// Multiply a and b considered as elements of GF(p^2). 
+#[inline(always)] +pub(crate) fn ext2_mul(a: [u64; 2], b: [u64; 2]) -> [GoldilocksField; 2] { + // The code in ext2_add_prods[01] assumes the quadratic extension + // generator is 7. + const_assert!(>::W.0 == 7u64); + + let c0 = ext2_add_prods0(&a, &b); + let c1 = ext2_add_prods1(&a, &b); + [c0, c1] +} + +/* + * Quartic multiplication and squaring + */ + +#[inline(always)] +fn ext4_add_prods0(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField { + // Computes c0 = a0 * b0 + W * (a1 * b3 + a2 * b2 + a3 * b1) + + let [a0, a1, a2, a3] = *a; + let [b0, b1, b2, b3] = *b; + + let mut cy; + + // a1 * b3 + let mut cumul_lo = (a1 as u128) * (b3 as u128); + + // a2 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b2 as u128)); + let mut cumul_hi = cy as u32; + + // a3 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // * W + (cumul_lo, cumul_hi) = u160_times_7(cumul_lo, cumul_hi); + + // a0 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext4_add_prods1(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField { + // Computes c1 = a0 * b1 + a1 * b0 + W * (a2 * b3 + a3 * b2); + + let [a0, a1, a2, a3] = *a; + let [b0, b1, b2, b3] = *b; + + let mut cy; + + // a2 * b3 + let mut cumul_lo = (a2 as u128) * (b3 as u128); + + // a3 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b2 as u128)); + let mut cumul_hi = cy as u32; + + // * W + (cumul_lo, cumul_hi) = u160_times_7(cumul_lo, cumul_hi); + + // a0 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // a1 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext4_add_prods2(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField { + // Computes c2 = a0 * b2 + 
a1 * b1 + a2 * b0 + W * a3 * b3; + + let [a0, a1, a2, a3] = *a; + let [b0, b1, b2, b3] = *b; + + let mut cy; + + // W * a3 * b3 + let (mut cumul_lo, mut cumul_hi) = u160_times_7((a3 as u128) * (b3 as u128), 0u32); + + // a0 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b2 as u128)); + cumul_hi += cy as u32; + + // a1 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // a2 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext4_add_prods3(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField { + // Computes c3 = a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0; + + let [a0, a1, a2, a3] = *a; + let [b0, b1, b2, b3] = *b; + + let mut cy; + + // a0 * b3 + let mut cumul_lo = (a0 as u128) * (b3 as u128); + + // a1 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b2 as u128)); + let mut cumul_hi = cy as u32; + + // a2 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // a3 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +/// Multiply a and b considered as elements of GF(p^4). +#[inline(always)] +pub(crate) fn ext4_mul(a: [u64; 4], b: [u64; 4]) -> [GoldilocksField; 4] { + // The code in ext4_add_prods[0-3] assumes the quartic extension + // generator is 7. 
+ const_assert!(>::W.0 == 7u64); + + let c0 = ext4_add_prods0(&a, &b); + let c1 = ext4_add_prods1(&a, &b); + let c2 = ext4_add_prods2(&a, &b); + let c3 = ext4_add_prods3(&a, &b); + [c0, c1, c2, c3] +} + +/* + * Quintic multiplication and squaring + */ + +#[inline(always)] +fn ext5_add_prods0(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField { + // Computes c0 = a0 * b0 + W * (a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1) + + let [a0, a1, a2, a3, a4] = *a; + let [b0, b1, b2, b3, b4] = *b; + + let mut cy; + + // a1 * b4 + let mut cumul_lo = (a1 as u128) * (b4 as u128); + + // a2 * b3 + (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b3 as u128)); + let mut cumul_hi = cy as u32; + + // a3 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b2 as u128)); + cumul_hi += cy as u32; + + // a4 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // * W + (cumul_lo, cumul_hi) = u160_times_3(cumul_lo, cumul_hi); + + // a0 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext5_add_prods1(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField { + // Computes c1 = a0 * b1 + a1 * b0 + W * (a2 * b4 + a3 * b3 + a4 * b2); + + let [a0, a1, a2, a3, a4] = *a; + let [b0, b1, b2, b3, b4] = *b; + + let mut cy; + + // a2 * b4 + let mut cumul_lo = (a2 as u128) * (b4 as u128); + + // a3 * b3 + (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b3 as u128)); + let mut cumul_hi = cy as u32; + + // a4 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b2 as u128)); + cumul_hi += cy as u32; + + // * W + (cumul_lo, cumul_hi) = u160_times_3(cumul_lo, cumul_hi); + + // a0 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // a1 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + 
unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext5_add_prods2(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField { + // Computes c2 = a0 * b2 + a1 * b1 + a2 * b0 + W * (a3 * b4 + a4 * b3); + + let [a0, a1, a2, a3, a4] = *a; + let [b0, b1, b2, b3, b4] = *b; + + let mut cy; + + // a3 * b4 + let mut cumul_lo = (a3 as u128) * (b4 as u128); + + // a4 * b3 + (cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b3 as u128)); + let mut cumul_hi = cy as u32; + + // * W + (cumul_lo, cumul_hi) = u160_times_3(cumul_lo, cumul_hi); + + // a0 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b2 as u128)); + cumul_hi += cy as u32; + + // a1 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // a2 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext5_add_prods3(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField { + // Computes c3 = a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0 + W * a4 * b4; + + let [a0, a1, a2, a3, a4] = *a; + let [b0, b1, b2, b3, b4] = *b; + + let mut cy; + + // W * a4 * b4 + let (mut cumul_lo, mut cumul_hi) = u160_times_3((a4 as u128) * (b4 as u128), 0u32); + + // a0 * b3 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b3 as u128)); + cumul_hi += cy as u32; + + // a1 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b2 as u128)); + cumul_hi += cy as u32; + + // a2 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // a3 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext5_add_prods4(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField { + // Computes c4 = a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0; + + let [a0, a1, a2, a3, a4] = *a; + let [b0, 
b1, b2, b3, b4] = *b; + + let mut cy; + + // a0 * b4 + let mut cumul_lo = (a0 as u128) * (b4 as u128); + + // a1 * b3 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b3 as u128)); + let mut cumul_hi = cy as u32; + + // a2 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b2 as u128)); + cumul_hi += cy as u32; + + // a3 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // a4 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +/// Multiply a and b considered as elements of GF(p^5). +#[inline(always)] +pub(crate) fn ext5_mul(a: [u64; 5], b: [u64; 5]) -> [GoldilocksField; 5] { + // The code in ext5_add_prods[0-4] assumes the quintic extension + // generator is 3. + const_assert!(>::W.0 == 3u64); + + let c0 = ext5_add_prods0(&a, &b); + let c1 = ext5_add_prods1(&a, &b); + let c2 = ext5_add_prods2(&a, &b); + let c3 = ext5_add_prods3(&a, &b); + let c4 = ext5_add_prods4(&a, &b); + [c0, c1, c2, c3, c4] +} diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs index af958629..4ed32a0d 100644 --- a/field/src/goldilocks_field.rs +++ b/field/src/goldilocks_field.rs @@ -9,10 +9,6 @@ use plonky2_util::{assume, branch_hint}; use rand::Rng; use serde::{Deserialize, Serialize}; -use crate::extension_field::quadratic::QuadraticExtension; -use crate::extension_field::quartic::QuarticExtension; -use crate::extension_field::quintic::QuinticExtension; -use crate::extension_field::{Extendable, Frobenius}; use crate::field_types::{Field, Field64, PrimeField, PrimeField64}; use crate::inversion::try_inverse_u64; @@ -283,66 +279,6 @@ impl DivAssign for GoldilocksField { } } -impl Extendable<2> for GoldilocksField { - type Extension = QuadraticExtension; - - // Verifiable in Sage with - // `R. = GF(p)[]; assert (x^2 - 7).is_irreducible()`. 
- const W: Self = Self(7); - - // DTH_ROOT = W^((ORDER - 1)/2) - const DTH_ROOT: Self = Self(18446744069414584320); - - const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = - [Self(18081566051660590251), Self(16121475356294670766)]; - - const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)]; -} - -impl Extendable<4> for GoldilocksField { - type Extension = QuarticExtension; - - const W: Self = Self(7); - - // DTH_ROOT = W^((ORDER - 1)/4) - const DTH_ROOT: Self = Self(281474976710656); - - const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [ - Self(5024755240244648895), - Self(13227474371289740625), - Self(3912887029498544536), - Self(3900057112666848848), - ]; - - const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] = - [Self(0), Self(0), Self(0), Self(12587610116473453104)]; -} - -impl Extendable<5> for GoldilocksField { - type Extension = QuinticExtension; - - const W: Self = Self(3); - - // DTH_ROOT = W^((ORDER - 1)/5) - const DTH_ROOT: Self = Self(1041288259238279555); - - const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [ - Self(2899034827742553394), - Self(13012057356839176729), - Self(14593811582388663055), - Self(7722900811313895436), - Self(4557222484695340057), - ]; - - const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [ - Self::POWER_OF_TWO_GENERATOR, - Self(0), - Self(0), - Self(0), - Self(0), - ]; -} - /// Fast addition modulo ORDER for x86-64. /// This function is marked unsafe for the following reasons: /// - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001. @@ -407,7 +343,34 @@ fn split(x: u128) -> (u64, u64) { (x as u64, (x >> 64) as u64) } -impl Frobenius<1> for GoldilocksField {} +/// Reduce the value x_lo + x_hi * 2^128 to an element in the +/// Goldilocks field. +/// +/// This function is marked 'unsafe' because correctness relies on the +/// unchecked assumption that x < 2^160 - 2^128 + 2^96. Further, +/// performance may degrade as x_hi increases beyond 2**40 or so. 
+#[inline(always)] +pub(crate) unsafe fn reduce160(x_lo: u128, x_hi: u32) -> GoldilocksField { + let x_hi = (x_lo >> 96) as u64 + ((x_hi as u64) << 32); // shld to form x_hi + let x_mid = (x_lo >> 64) as u32; // shr to form x_mid + let x_lo = x_lo as u64; + + // sub + jc (should fuse) + let (mut t0, borrow) = x_lo.overflowing_sub(x_hi); + if borrow { + // The maximum possible value of x is (2^64 - 1)^2 * 4 * 7 < 2^133, + // so x_hi < 2^37. A borrow will happen roughly one in 134 million + // times, so it's best to branch. + branch_hint(); + // NB: this assumes that x < 2^160 - 2^128 + 2^96. + t0 -= EPSILON; // Cannot underflow if x_hi is canonical. + } + // imul + let t1 = (x_mid as u64) * EPSILON; + // add, sbb, add + let t2 = add_no_canonicalize_trashing_input(t0, t1); + GoldilocksField(t2) +} #[cfg(test)] mod tests { diff --git a/field/src/lib.rs b/field/src/lib.rs index 2c89aab3..e54f2aa7 100644 --- a/field/src/lib.rs +++ b/field/src/lib.rs @@ -15,6 +15,7 @@ pub mod cosets; pub mod extension_field; pub mod fft; pub mod field_types; +pub mod goldilocks_extensions; pub mod goldilocks_field; pub mod interpolation; mod inversion; diff --git a/plonky2/benches/field_arithmetic.rs b/plonky2/benches/field_arithmetic.rs index 0e4383ee..7b74ae52 100644 --- a/plonky2/benches/field_arithmetic.rs +++ b/plonky2/benches/field_arithmetic.rs @@ -1,4 +1,5 @@ use criterion::{criterion_group, criterion_main, BatchSize, Criterion}; +use plonky2::field::extension_field::quadratic::QuadraticExtension; use plonky2::field::extension_field::quartic::QuarticExtension; use plonky2::field::extension_field::quintic::QuinticExtension; use plonky2::field::field_types::Field; @@ -175,6 +176,7 @@ pub(crate) fn bench_field(c: &mut Criterion) { fn criterion_benchmark(c: &mut Criterion) { bench_field::(c); + bench_field::>(c); bench_field::>(c); bench_field::>(c); } From cc9a43b57433f3c6724b1764bb8c64a9c3bdf395 Mon Sep 17 00:00:00 2001 From: wborgeaud Date: Tue, 15 Mar 2022 17:14:45 +0100 Subject: 
[PATCH 30/32] Fix salt issues --- plonky2/src/plonk/recursive_verifier.rs | 10 ++++++---- plonky2/src/util/serialization.rs | 11 +++++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/plonky2/src/plonk/recursive_verifier.rs b/plonky2/src/plonk/recursive_verifier.rs index 6210bb29..2fe7d648 100644 --- a/plonky2/src/plonk/recursive_verifier.rs +++ b/plonky2/src/plonk/recursive_verifier.rs @@ -4,6 +4,7 @@ use crate::hash::hash_types::{HashOutTarget, RichField}; use crate::plonk::circuit_builder::CircuitBuilder; use crate::plonk::circuit_data::{CommonCircuitData, VerifierCircuitTarget}; use crate::plonk::config::{AlgebraicHasher, GenericConfig}; +use crate::plonk::plonk_common::salt_size; use crate::plonk::proof::{ OpeningSetTarget, ProofChallengesTarget, ProofTarget, ProofWithPublicInputsTarget, }; @@ -141,11 +142,12 @@ impl, const D: usize> CircuitBuilder { let fri_params = &common_data.fri_params; let cap_height = fri_params.config.cap_height; + let salt = salt_size(common_data.fri_params.hiding); let num_leaves_per_oracle = &[ common_data.num_preprocessed_polys(), - config.num_wires, - common_data.num_zs_partial_products_polys(), - common_data.num_quotient_polys(), + config.num_wires + salt, + common_data.num_zs_partial_products_polys() + salt, + common_data.num_quotient_polys() + salt, ]; ProofTarget { @@ -200,7 +202,7 @@ mod tests { const D: usize = 2; type C = PoseidonGoldilocksConfig; type F = >::F; - let config = CircuitConfig::standard_recursion_config(); + let config = CircuitConfig::standard_recursion_zk_config(); let (proof, vd, cd) = dummy_proof::(&config, 4_000)?; let (proof, _vd, cd) = diff --git a/plonky2/src/util/serialization.rs b/plonky2/src/util/serialization.rs index d0326073..ce5b1270 100644 --- a/plonky2/src/util/serialization.rs +++ b/plonky2/src/util/serialization.rs @@ -15,6 +15,7 @@ use crate::hash::merkle_proofs::MerkleProof; use crate::hash::merkle_tree::MerkleCap; use crate::plonk::circuit_data::CommonCircuitData; use 
crate::plonk::config::{GenericConfig, GenericHashOut, Hasher}; +use crate::plonk::plonk_common::salt_size; use crate::plonk::proof::{ CompressedProof, CompressedProofWithPublicInputs, OpeningSet, Proof, ProofWithPublicInputs, }; @@ -235,6 +236,7 @@ impl Buffer { common_data: &CommonCircuitData, ) -> Result> { let config = &common_data.config; + let salt = salt_size(common_data.fri_params.hiding); let mut evals_proofs = Vec::with_capacity(4); let constants_sigmas_v = @@ -242,17 +244,18 @@ impl Buffer { let constants_sigmas_p = self.read_merkle_proof()?; evals_proofs.push((constants_sigmas_v, constants_sigmas_p)); - let wires_v = self.read_field_vec(config.num_wires)?; + let wires_v = self.read_field_vec(config.num_wires + salt)?; let wires_p = self.read_merkle_proof()?; evals_proofs.push((wires_v, wires_p)); - let zs_partial_v = - self.read_field_vec(config.num_challenges * (1 + common_data.num_partial_products))?; + let zs_partial_v = self.read_field_vec( + config.num_challenges * (1 + common_data.num_partial_products) + salt, + )?; let zs_partial_p = self.read_merkle_proof()?; evals_proofs.push((zs_partial_v, zs_partial_p)); let quotient_v = - self.read_field_vec(config.num_challenges * common_data.quotient_degree_factor)?; + self.read_field_vec(config.num_challenges * common_data.quotient_degree_factor + salt)?; let quotient_p = self.read_merkle_proof()?; evals_proofs.push((quotient_v, quotient_p)); From 627e80bfd502fb4add0af43b25a0a5a582ea5870 Mon Sep 17 00:00:00 2001 From: Daniel Lubarov Date: Tue, 15 Mar 2022 09:24:10 -0700 Subject: [PATCH 31/32] Filter mul-add constraints (#512) --- system_zero/src/alu/mul_add.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/system_zero/src/alu/mul_add.rs b/system_zero/src/alu/mul_add.rs index b84cafbf..5179faac 100644 --- a/system_zero/src/alu/mul_add.rs +++ b/system_zero/src/alu/mul_add.rs @@ -47,6 +47,7 @@ pub(crate) fn eval_mul_add>( let result_canonical_inv = 
local_values[COL_MUL_ADD_RESULT_CANONICAL_INV]; let computed_output = factor_0 * factor_1 + addend; + // TODO: Needs to be filtered by IS_MUL_ADD. let output = combine_u16s_check_canonical( output_1, output_2, @@ -55,7 +56,7 @@ result_canonical_inv, yield_constr, ); - yield_constr.constraint(computed_output - output); + yield_constr.constraint(is_mul * (computed_output - output)); } pub(crate) fn eval_mul_add_recursively, const D: usize>( @@ -74,6 +75,7 @@ let result_canonical_inv = local_values[COL_MUL_ADD_RESULT_CANONICAL_INV]; let computed_output = builder.mul_add_extension(factor_0, factor_1, addend); + // TODO: Needs to be filtered by IS_MUL_ADD. let output = combine_u16s_check_canonical_circuit( builder, output_1, @@ -84,5 +86,6 @@ yield_constr, ); let diff = builder.sub_extension(computed_output, output); + let filtered_diff = builder.mul_extension(is_mul, diff); - yield_constr.constraint(builder, diff); + yield_constr.constraint(builder, filtered_diff); } From 7d6c0a448ddb68f5c181f9440bf3213f898519aa Mon Sep 17 00:00:00 2001 From: Daniel Lubarov Date: Wed, 16 Mar 2022 17:37:34 -0700 Subject: [PATCH 32/32] Halo2 style lookup arguments in System Zero (#513) * Halo2 style lookup arguments in System Zero It's a really nice and simple protocol, particularly for the verifier since the constraints are trivial (aside from the underlying batched permutation checks, which we already support). See the [Halo2 book](https://zcash.github.io/halo2/design/proving-system/lookup.html) and this [talk](https://www.youtube.com/watch?v=YlTt12s7vGE&t=5237s) by @daira. Previously we generated the whole trace in row-wise form, but it's much more efficient to generate these "permuted" columns column-wise. So I changed our STARK framework to accept the trace in column-wise form.
STARK impls now have the flexibility to do some generation row-wise and some column-wise (without extra costs; there's a single transpose as before). * sorting * fixes * PR feedback * into_iter * timing --- field/src/field_types.rs | 5 + field/src/goldilocks_field.rs | 3 +- starky/src/fibonacci_stark.rs | 14 +- starky/src/lib.rs | 1 + starky/src/permutation.rs | 8 + starky/src/prover.rs | 16 +- starky/src/util.rs | 16 ++ starky/src/verifier.rs | 5 +- system_zero/Cargo.toml | 8 + system_zero/benches/lookup_permuted_cols.rs | 30 ++++ system_zero/src/lib.rs | 1 + system_zero/src/lookup.rs | 147 ++++++++++++++++++ system_zero/src/registers/lookup.rs | 24 ++- system_zero/src/registers/range_check_16.rs | 2 +- .../src/registers/range_check_degree.rs | 2 +- system_zero/src/system_zero.rs | 76 ++++++++- 16 files changed, 324 insertions(+), 34 deletions(-) create mode 100644 starky/src/util.rs create mode 100644 system_zero/benches/lookup_permuted_cols.rs create mode 100644 system_zero/src/lookup.rs diff --git a/field/src/field_types.rs b/field/src/field_types.rs index 83826b9f..4adfdbf4 100644 --- a/field/src/field_types.rs +++ b/field/src/field_types.rs @@ -462,6 +462,11 @@ pub trait PrimeField64: PrimeField + Field64 { fn to_canonical_u64(&self) -> u64; fn to_noncanonical_u64(&self) -> u64; + + #[inline(always)] + fn to_canonical(&self) -> Self { + Self::from_canonical_u64(self.to_canonical_u64()) + } } /// An iterator over the powers of a certain base element `b`: `b^0, b^1, b^2, ...`. 
diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs index 4ed32a0d..c3172991 100644 --- a/field/src/goldilocks_field.rs +++ b/field/src/goldilocks_field.rs @@ -95,7 +95,7 @@ impl Field for GoldilocksField { Self(n.mod_floor(&Self::order()).to_u64_digits()[0]) } - #[inline] + #[inline(always)] fn from_canonical_u64(n: u64) -> Self { debug_assert!(n < Self::ORDER); Self(n) @@ -156,6 +156,7 @@ impl PrimeField64 for GoldilocksField { c } + #[inline(always)] fn to_noncanonical_u64(&self) -> u64 { self.0 } diff --git a/starky/src/fibonacci_stark.rs b/starky/src/fibonacci_stark.rs index 7961ad50..fa9ccd87 100644 --- a/starky/src/fibonacci_stark.rs +++ b/starky/src/fibonacci_stark.rs @@ -2,12 +2,14 @@ use std::marker::PhantomData; use plonky2::field::extension_field::{Extendable, FieldExtension}; use plonky2::field::packed_field::PackedField; +use plonky2::field::polynomial::PolynomialValues; use plonky2::hash::hash_types::RichField; use plonky2::plonk::circuit_builder::CircuitBuilder; use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use crate::permutation::PermutationPair; use crate::stark::Stark; +use crate::util::trace_rows_to_poly_values; use crate::vars::{StarkEvaluationTargets, StarkEvaluationVars}; /// Toy STARK system used for testing. @@ -37,8 +39,8 @@ impl, const D: usize> FibonacciStark { } /// Generate the trace using `x0, x1, 0, 1` as initial state values. - fn generate_trace(&self, x0: F, x1: F) -> Vec<[F; Self::COLUMNS]> { - let mut trace = (0..self.num_rows) + fn generate_trace(&self, x0: F, x1: F) -> Vec> { + let mut trace_rows = (0..self.num_rows) .scan([x0, x1, F::ZERO, F::ONE], |acc, _| { let tmp = *acc; acc[0] = tmp[1]; @@ -48,8 +50,8 @@ impl, const D: usize> FibonacciStark { Some(tmp) }) .collect::>(); - trace[self.num_rows - 1][3] = F::ZERO; // So that column 2 and 3 are permutation of one another. 
- trace + trace_rows[self.num_rows - 1][3] = F::ZERO; // So that column 2 and 3 are permutation of one another. + trace_rows_to_poly_values(trace_rows) } } @@ -113,9 +115,7 @@ impl, const D: usize> Stark for FibonacciStar } fn permutation_pairs(&self) -> Vec { - vec![PermutationPair { - column_pairs: vec![(2, 3)], - }] + vec![PermutationPair::singletons(2, 3)] } } diff --git a/starky/src/lib.rs b/starky/src/lib.rs index 8249d90b..b2293443 100644 --- a/starky/src/lib.rs +++ b/starky/src/lib.rs @@ -15,6 +15,7 @@ pub mod prover; pub mod recursive_verifier; pub mod stark; pub mod stark_testing; +pub mod util; pub mod vanishing_poly; pub mod vars; pub mod verifier; diff --git a/starky/src/permutation.rs b/starky/src/permutation.rs index 2e1d603c..91b1be27 100644 --- a/starky/src/permutation.rs +++ b/starky/src/permutation.rs @@ -30,6 +30,14 @@ pub struct PermutationPair { pub column_pairs: Vec<(usize, usize)>, } +impl PermutationPair { + pub fn singletons(lhs: usize, rhs: usize) -> Self { + Self { + column_pairs: vec![(lhs, rhs)], + } + } +} + /// A single instance of a permutation check protocol. 
pub(crate) struct PermutationInstance<'a, T: Copy> { pub(crate) pair: &'a PermutationPair, diff --git a/starky/src/prover.rs b/starky/src/prover.rs index 336b9963..da1b5dd4 100644 --- a/starky/src/prover.rs +++ b/starky/src/prover.rs @@ -30,7 +30,7 @@ use crate::vars::StarkEvaluationVars; pub fn prove( stark: S, config: &StarkConfig, - trace: Vec<[F; S::COLUMNS]>, + trace_poly_values: Vec>, public_inputs: [F; S::PUBLIC_INPUTS], timing: &mut TimingTree, ) -> Result> @@ -42,7 +42,7 @@ where [(); S::PUBLIC_INPUTS]:, [(); C::Hasher::HASH_SIZE]:, { - let degree = trace.len(); + let degree = trace_poly_values[0].len(); let degree_bits = log2_strict(degree); let fri_params = config.fri_params(degree_bits); let rate_bits = config.fri_config.rate_bits; @@ -52,18 +52,6 @@ where "FRI total reduction arity is too large.", ); - let trace_vecs = trace.iter().map(|row| row.to_vec()).collect_vec(); - let trace_col_major: Vec> = transpose(&trace_vecs); - - let trace_poly_values: Vec> = timed!( - timing, - "compute trace polynomials", - trace_col_major - .par_iter() - .map(|column| PolynomialValues::new(column.clone())) - .collect() - ); - let trace_commitment = timed!( timing, "compute trace commitment", diff --git a/starky/src/util.rs b/starky/src/util.rs new file mode 100644 index 00000000..011a1add --- /dev/null +++ b/starky/src/util.rs @@ -0,0 +1,16 @@ +use itertools::Itertools; +use plonky2::field::field_types::Field; +use plonky2::field::polynomial::PolynomialValues; +use plonky2::util::transpose; + +/// A helper function to transpose a row-wise trace and put it in the format that `prove` expects. 
+pub fn trace_rows_to_poly_values( + trace_rows: Vec<[F; COLUMNS]>, +) -> Vec> { + let trace_row_vecs = trace_rows.into_iter().map(|row| row.to_vec()).collect_vec(); + let trace_col_vecs: Vec> = transpose(&trace_row_vecs); + trace_col_vecs + .into_iter() + .map(|column| PolynomialValues::new(column)) + .collect() +} diff --git a/starky/src/verifier.rs b/starky/src/verifier.rs index a9bf897c..d5071af7 100644 --- a/starky/src/verifier.rs +++ b/starky/src/verifier.rs @@ -118,7 +118,10 @@ where .chunks(stark.quotient_degree_factor()) .enumerate() { - ensure!(vanishing_polys_zeta[i] == z_h_zeta * reduce_with_powers(chunk, zeta_pow_deg)); + ensure!( + vanishing_polys_zeta[i] == z_h_zeta * reduce_with_powers(chunk, zeta_pow_deg), + "Mismatch between evaluation and opening of quotient polynomial" + ); } let merkle_caps = once(proof.trace_cap) diff --git a/system_zero/Cargo.toml b/system_zero/Cargo.toml index 032bfb53..a9029dad 100644 --- a/system_zero/Cargo.toml +++ b/system_zero/Cargo.toml @@ -10,6 +10,14 @@ plonky2_util = { path = "../util" } starky = { path = "../starky" } anyhow = "1.0.40" env_logger = "0.9.0" +itertools = "0.10.0" log = "0.4.14" rand = "0.8.4" rand_chacha = "0.3.1" + +[dev-dependencies] +criterion = "0.3.5" + +[[bench]] +name = "lookup_permuted_cols" +harness = false diff --git a/system_zero/benches/lookup_permuted_cols.rs b/system_zero/benches/lookup_permuted_cols.rs new file mode 100644 index 00000000..371b3470 --- /dev/null +++ b/system_zero/benches/lookup_permuted_cols.rs @@ -0,0 +1,30 @@ +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use itertools::Itertools; +use plonky2::field::field_types::Field; +use plonky2::field::goldilocks_field::GoldilocksField; +use rand::{thread_rng, Rng}; +use system_zero::lookup::permuted_cols; + +type F = GoldilocksField; + +fn criterion_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("lookup-permuted-cols"); + + for size_log in [16, 17, 18] { + let size = 1 << 
size_log; + group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, _| { + // We could benchmark a table of random values with + // let table = F::rand_vec(size); + // But in practice we currently use tables that are pre-sorted, which makes + // permuted_cols cheaper since it will sort the table. + let table = (0..size).map(F::from_canonical_usize).collect_vec(); + let input = (0..size) + .map(|_| table[thread_rng().gen_range(0..size)]) + .collect_vec(); + b.iter(|| permuted_cols(&input, &table)); + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/system_zero/src/lib.rs b/system_zero/src/lib.rs index 35576cd3..81e5e9b1 100644 --- a/system_zero/src/lib.rs +++ b/system_zero/src/lib.rs @@ -4,6 +4,7 @@ mod alu; mod core_registers; +pub mod lookup; mod memory; mod permutation_unit; mod public_input_layout; diff --git a/system_zero/src/lookup.rs b/system_zero/src/lookup.rs new file mode 100644 index 00000000..5a5f0da1 --- /dev/null +++ b/system_zero/src/lookup.rs @@ -0,0 +1,147 @@ +//! Implementation of the Halo2 lookup argument. +//! +//! References: +//! - https://zcash.github.io/halo2/design/proving-system/lookup.html +//! 
- https://www.youtube.com/watch?v=YlTt12s7vGE&t=5237s + +use std::cmp::Ordering; + +use itertools::Itertools; +use plonky2::field::extension_field::Extendable; +use plonky2::field::field_types::{Field, PrimeField64}; +use plonky2::field::packed_field::PackedField; +use plonky2::hash::hash_types::RichField; +use plonky2::plonk::circuit_builder::CircuitBuilder; +use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; +use starky::vars::StarkEvaluationTargets; +use starky::vars::StarkEvaluationVars; + +use crate::public_input_layout::NUM_PUBLIC_INPUTS; +use crate::registers::lookup::*; +use crate::registers::NUM_COLUMNS; + +pub(crate) fn generate_lookups(trace_cols: &mut [Vec]) { + for i in 0..NUM_LOOKUPS { + let inputs = &trace_cols[col_input(i)]; + let table = &trace_cols[col_table(i)]; + let (permuted_inputs, permuted_table) = permuted_cols(inputs, table); + trace_cols[col_permuted_input(i)] = permuted_inputs; + trace_cols[col_permuted_table(i)] = permuted_table; + } +} + +/// Given an input column and a table column, generate the permuted input and permuted table columns +/// used in the Halo2 permutation argument. +pub fn permuted_cols(inputs: &[F], table: &[F]) -> (Vec, Vec) { + let n = inputs.len(); + + // The permuted inputs do not have to be ordered, but we found that sorting was faster than + // hash-based grouping. We also sort the table, as this helps us identify "unused" table + // elements efficiently. + + // To compare elements, e.g. for sorting, we first need them in canonical form. It would be + // wasteful to canonicalize in each comparison, as a single element may be involved in many + // comparisons. So we will canonicalize once upfront, then use `to_noncanonical_u64` when + // comparing elements. 
+ + let sorted_inputs = inputs + .iter() + .map(|x| x.to_canonical()) + .sorted_unstable_by_key(|x| x.to_noncanonical_u64()) + .collect_vec(); + let sorted_table = table + .iter() + .map(|x| x.to_canonical()) + .sorted_unstable_by_key(|x| x.to_noncanonical_u64()) + .collect_vec(); + + let mut unused_table_inds = Vec::with_capacity(n); + let mut unused_table_vals = Vec::with_capacity(n); + let mut permuted_table = vec![F::ZERO; n]; + let mut i = 0; + let mut j = 0; + while (j < n) && (i < n) { + let input_val = sorted_inputs[i].to_noncanonical_u64(); + let table_val = sorted_table[j].to_noncanonical_u64(); + match input_val.cmp(&table_val) { + Ordering::Greater => { + unused_table_vals.push(sorted_table[j]); + j += 1; + } + Ordering::Less => { + if let Some(x) = unused_table_vals.pop() { + permuted_table[i] = x; + } else { + unused_table_inds.push(i); + } + i += 1; + } + Ordering::Equal => { + permuted_table[i] = sorted_table[j]; + i += 1; + j += 1; + } + } + } + + #[allow(clippy::needless_range_loop)] // indexing is just more natural here + for jj in j..n { + unused_table_vals.push(sorted_table[jj]); + } + for ii in i..n { + unused_table_inds.push(ii); + } + for (ind, val) in unused_table_inds.into_iter().zip_eq(unused_table_vals) { + permuted_table[ind] = val; + } + + (sorted_inputs, permuted_table) +} + +pub(crate) fn eval_lookups>( + vars: StarkEvaluationVars, + yield_constr: &mut ConstraintConsumer
<P>
, +) { + for i in 0..NUM_LOOKUPS { + let local_perm_input = vars.local_values[col_permuted_input(i)]; + let next_perm_table = vars.next_values[col_permuted_table(i)]; + let next_perm_input = vars.next_values[col_permuted_input(i)]; + + // A "vertical" diff between the local and next permuted inputs. + let diff_input_prev = next_perm_input - local_perm_input; + // A "horizontal" diff between the next permuted input and permuted table value. + let diff_input_table = next_perm_input - next_perm_table; + + yield_constr.constraint(diff_input_prev * diff_input_table); + + // This is actually constraining the first row, as per the spec, since `diff_input_table` + // is a diff of the next row's values. In the context of `constraint_last_row`, the next + // row is the first row. + yield_constr.constraint_last_row(diff_input_table); + } +} + +pub(crate) fn eval_lookups_recursively, const D: usize>( + builder: &mut CircuitBuilder, + vars: StarkEvaluationTargets, + yield_constr: &mut RecursiveConstraintConsumer, +) { + for i in 0..NUM_LOOKUPS { + let local_perm_input = vars.local_values[col_permuted_input(i)]; + let next_perm_table = vars.next_values[col_permuted_table(i)]; + let next_perm_input = vars.next_values[col_permuted_input(i)]; + + // A "vertical" diff between the local and next permuted inputs. + let diff_input_prev = builder.sub_extension(next_perm_input, local_perm_input); + // A "horizontal" diff between the next permuted input and permuted table value. + let diff_input_table = builder.sub_extension(next_perm_input, next_perm_table); + + let diff_product = builder.mul_extension(diff_input_prev, diff_input_table); + yield_constr.constraint(builder, diff_product); + + // This is actually constraining the first row, as per the spec, since `diff_input_table` + // is a diff of the next row's values. In the context of `constraint_last_row`, the next + // row is the first row. 
+ yield_constr.constraint_last_row(builder, diff_input_table); + } +} diff --git a/system_zero/src/registers/lookup.rs b/system_zero/src/registers/lookup.rs index eb773acf..fd0abd43 100644 --- a/system_zero/src/registers/lookup.rs +++ b/system_zero/src/registers/lookup.rs @@ -3,19 +3,35 @@ const START_UNIT: usize = super::START_LOOKUP; -const NUM_LOOKUPS: usize = +pub(crate) const NUM_LOOKUPS: usize = super::range_check_16::NUM_RANGE_CHECKS + super::range_check_degree::NUM_RANGE_CHECKS; +pub(crate) const fn col_input(i: usize) -> usize { + if i < super::range_check_16::NUM_RANGE_CHECKS { + super::range_check_16::col_rc_16_input(i) + } else { + super::range_check_degree::col_rc_degree_input(i - super::range_check_16::NUM_RANGE_CHECKS) + } +} + /// This column contains a permutation of the input values. -const fn col_permuted_input(i: usize) -> usize { +pub(crate) const fn col_permuted_input(i: usize) -> usize { debug_assert!(i < NUM_LOOKUPS); START_UNIT + 2 * i } +pub(crate) const fn col_table(i: usize) -> usize { + if i < super::range_check_16::NUM_RANGE_CHECKS { + super::core::COL_RANGE_16 + } else { + super::core::COL_CLOCK + } +} + /// This column contains a permutation of the table values. -const fn col_permuted_table(i: usize) -> usize { +pub(crate) const fn col_permuted_table(i: usize) -> usize { debug_assert!(i < NUM_LOOKUPS); START_UNIT + 2 * i + 1 } -pub(super) const END: usize = START_UNIT + NUM_LOOKUPS; +pub(super) const END: usize = START_UNIT + NUM_LOOKUPS * 2; diff --git a/system_zero/src/registers/range_check_16.rs b/system_zero/src/registers/range_check_16.rs index c44db494..674df302 100644 --- a/system_zero/src/registers/range_check_16.rs +++ b/system_zero/src/registers/range_check_16.rs @@ -1,6 +1,6 @@ //! Range check unit which checks that values are in `[0, 2^16)`. -pub(super) const NUM_RANGE_CHECKS: usize = 5; +pub(crate) const NUM_RANGE_CHECKS: usize = 5; /// The input of the `i`th range check, i.e. the value being range checked. 
pub(crate) const fn col_rc_16_input(i: usize) -> usize { diff --git a/system_zero/src/registers/range_check_degree.rs b/system_zero/src/registers/range_check_degree.rs index 6d61e6e2..caad705d 100644 --- a/system_zero/src/registers/range_check_degree.rs +++ b/system_zero/src/registers/range_check_degree.rs @@ -1,6 +1,6 @@ //! Range check unit which checks that values are in `[0, degree)`. -pub(super) const NUM_RANGE_CHECKS: usize = 5; +pub(crate) const NUM_RANGE_CHECKS: usize = 5; /// The input of the `i`th range check, i.e. the value being range checked. pub(crate) const fn col_rc_degree_input(i: usize) -> usize { diff --git a/system_zero/src/system_zero.rs b/system_zero/src/system_zero.rs index c42a04a8..32c49266 100644 --- a/system_zero/src/system_zero.rs +++ b/system_zero/src/system_zero.rs @@ -2,8 +2,12 @@ use std::marker::PhantomData; use plonky2::field::extension_field::{Extendable, FieldExtension}; use plonky2::field::packed_field::PackedField; +use plonky2::field::polynomial::PolynomialValues; use plonky2::hash::hash_types::RichField; use plonky2::plonk::circuit_builder::CircuitBuilder; +use plonky2::timed; +use plonky2::util::timing::TimingTree; +use plonky2::util::transpose; use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer}; use starky::permutation::PermutationPair; use starky::stark::Stark; @@ -15,12 +19,13 @@ use crate::core_registers::{ eval_core_registers, eval_core_registers_recursively, generate_first_row_core_registers, generate_next_row_core_registers, }; +use crate::lookup::{eval_lookups, eval_lookups_recursively, generate_lookups}; use crate::memory::TransactionMemory; use crate::permutation_unit::{ eval_permutation_unit, eval_permutation_unit_recursively, generate_permutation_unit, }; use crate::public_input_layout::NUM_PUBLIC_INPUTS; -use crate::registers::NUM_COLUMNS; +use crate::registers::{lookup, NUM_COLUMNS}; /// We require at least 2^16 rows as it helps support efficient 16-bit range checks. 
const MIN_TRACE_ROWS: usize = 1 << 16; @@ -31,7 +36,9 @@ pub struct SystemZero, const D: usize> { } impl, const D: usize> SystemZero { - fn generate_trace(&self) -> Vec<[F; NUM_COLUMNS]> { + /// Generate the rows of the trace. Note that this does not generate the permuted columns used + /// in our lookup arguments, as those are computed after transposing to column-wise form. + fn generate_trace_rows(&self) -> Vec<[F; NUM_COLUMNS]> { let memory = TransactionMemory::default(); let mut row = [F::ZERO; NUM_COLUMNS]; @@ -59,6 +66,45 @@ impl, const D: usize> SystemZero { trace.push(row); trace } + + fn generate_trace(&self) -> Vec> { + let mut timing = TimingTree::new("generate trace", log::Level::Debug); + + // Generate the witness, except for permuted columns in the lookup argument. + let trace_rows = timed!( + &mut timing, + "generate trace rows", + self.generate_trace_rows() + ); + + // Transpose from row-wise to column-wise. + let trace_row_vecs: Vec<_> = timed!( + &mut timing, + "convert to Vecs", + trace_rows.into_iter().map(|row| row.to_vec()).collect() + ); + let mut trace_col_vecs: Vec> = + timed!(&mut timing, "transpose", transpose(&trace_row_vecs)); + + // Generate permuted columns in the lookup argument. 
+ timed!( + &mut timing, + "generate lookup columns", + generate_lookups(&mut trace_col_vecs) + ); + + let trace_polys = timed!( + &mut timing, + "convert to PolynomialValues", + trace_col_vecs + .into_iter() + .map(|column| PolynomialValues::new(column)) + .collect() + ); + + timing.print(); + trace_polys + } } impl, const D: usize> Default for SystemZero { @@ -84,6 +130,7 @@ impl, const D: usize> Stark for SystemZero(vars, yield_constr); + eval_lookups(vars, yield_constr); // TODO: Other units } @@ -96,6 +143,7 @@ impl, const D: usize> Stark for SystemZero, const D: usize> Stark for SystemZero Vec { + let mut pairs = Vec::new(); + + for i in 0..lookup::NUM_LOOKUPS { + pairs.push(PermutationPair::singletons( + lookup::col_input(i), + lookup::col_permuted_input(i), + )); + pairs.push(PermutationPair::singletons( + lookup::col_table(i), + lookup::col_permuted_table(i), + )); + } + // TODO: Add permutation pairs for memory. - // TODO: Add permutation pairs for range checks. - vec![] + + pairs } } @@ -127,8 +188,9 @@ mod tests { use crate::system_zero::SystemZero; #[test] - #[ignore] // A bit slow. fn run() -> Result<()> { + init_logger(); + type F = GoldilocksField; type C = PoseidonGoldilocksConfig; const D: usize = 2; @@ -154,4 +216,8 @@ mod tests { let system = S::default(); test_stark_low_degree(system) } + + fn init_logger() { + let _ = env_logger::builder().format_timestamp(None).try_init(); + } }