From 310493c293436e73096a75d0e5755816fbfe3d58 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <426294+unzvfu@users.noreply.github.com>
Date: Fri, 4 Mar 2022 09:34:31 +1100
Subject: [PATCH] Faster extension field multiplication (#500)

* Initial implementation of quintic extensions.

* Update to/from_biguint() methods.

* Draft of fast multiplication on quintic extensions over 64-bit base.

* cargo fmt

* Typo.

* Document functions (a bit).

* Refactor reduction step.

* Change multiplication call so that LLVM generates better assembly.

* Use one main accumulator instead of two minor ones; faster reduce.

* Use one main accumulator in square too; clean up redundant code.

* Call faster routines from Mul and Square impls.

* Fix reduction function.

* Fix square calculation.

* Slightly faster reduction.

* Clean up names and types.

* cargo fmt

* Move extension field mul/sqr specialisations to their own file.

* Rename functions to have unique prefix.

* Add faster quadratic multiplication/squaring.

* Faster quartic multiplication and squaring.

* cargo fmt

* clippy

* Alternative reduce160 function.

* Typo.

* Remove alternative reduction function.

* Remove delayed reduction implementation of squaring.

* Enforce assumptions about extension generators.

* Make the accumulation variable a u32 instead of u64.

* Add test to trigger carry branch in reduce160.

* cargo fmt

* Some documentation.

* Clippy; improved comments.

* cargo fmt

* Remove redundant Square specialisations.

* Fix reduce*() visibility.

* Faster reduce160 from Jakub.

* Change mul-by-const functions to operate on 160 bits instead of 128.

* Move code for extensions of GoldilocksField to its own file.
---
 field/Cargo.toml                       |   1 +
 field/src/extension_field/quadratic.rs |   2 +-
 field/src/extension_field/quartic.rs   |   2 +-
 field/src/extension_field/quintic.rs   |   2 +-
 field/src/goldilocks_extensions.rs     | 495 +++++++++++++++++++++++++
 field/src/goldilocks_field.rs          |  93 ++---
 field/src/lib.rs                       |   1 +
 plonky2/benches/field_arithmetic.rs    |   2 +
 8 files changed, 530 insertions(+), 68 deletions(-)
 create mode 100644 field/src/goldilocks_extensions.rs
diff --git a/field/Cargo.toml b/field/Cargo.toml
index 6abffc5d..748b65ac 100644
--- a/field/Cargo.toml
+++ b/field/Cargo.toml
@@ -12,3 +12,4 @@ num = { version = "0.4", features = [ "rand" ] }
 rand = "0.8.4"
 serde = { version = "1.0", features = ["derive"] }
 unroll = "0.1.5"
+static_assertions = "1.1.0"
diff --git a/field/src/extension_field/quadratic.rs b/field/src/extension_field/quadratic.rs
index 488304d2..9cdc01c3 100644
--- a/field/src/extension_field/quadratic.rs
+++ b/field/src/extension_field/quadratic.rs
@@ -170,7 +170,7 @@ impl<F: Extendable<2>> Mul for QuadraticExtension<F> {
     type Output = Self;
 
     #[inline]
-    fn mul(self, rhs: Self) -> Self {
+    default fn mul(self, rhs: Self) -> Self {
         let Self([a0, a1]) = self;
         let Self([b0, b1]) = rhs;
 
diff --git a/field/src/extension_field/quartic.rs b/field/src/extension_field/quartic.rs
index 7b4a6950..09e35a4f 100644
--- a/field/src/extension_field/quartic.rs
+++ b/field/src/extension_field/quartic.rs
@@ -201,7 +201,7 @@ impl<F: Extendable<4>> Mul for QuarticExtension<F> {
     type Output = Self;
 
     #[inline]
-    fn mul(self, rhs: Self) -> Self {
+    default fn mul(self, rhs: Self) -> Self {
         let Self([a0, a1, a2, a3]) = self;
         let Self([b0, b1, b2, b3]) = rhs;
 
diff --git a/field/src/extension_field/quintic.rs b/field/src/extension_field/quintic.rs
index d2c29ffe..1600107d 100644
--- a/field/src/extension_field/quintic.rs
+++ b/field/src/extension_field/quintic.rs
@@ -201,7 +201,7 @@ impl<F: Extendable<5>> Mul for QuinticExtension<F> {
     type Output = Self;
 
     #[inline]
-    fn mul(self, rhs: Self) -> Self {
+    default fn mul(self, rhs: Self) -> Self {
         let Self([a0, a1, a2, a3, a4]) = self;
         let Self([b0, b1, b2, b3, b4]) = rhs;
         let w = <Self as OEF<5>>::W;
diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs
new file mode 100644
index 00000000..95265fe3
--- /dev/null
+++ b/field/src/goldilocks_extensions.rs
@@ -0,0 +1,495 @@
+use std::ops::Mul;
+
+use static_assertions::const_assert;
+
+use crate::extension_field::quadratic::QuadraticExtension;
+use crate::extension_field::quartic::QuarticExtension;
+use crate::extension_field::quintic::QuinticExtension;
+use crate::extension_field::{Extendable, Frobenius};
+use crate::field_types::Field;
+use crate::goldilocks_field::{reduce160, GoldilocksField};
+
+impl Frobenius<1> for GoldilocksField {}
+
+impl Extendable<2> for GoldilocksField {
+    type Extension = QuadraticExtension<Self>;
+
+    // Verifiable in Sage with
+    // `R.<x> = GF(p)[]; assert (x^2 - 7).is_irreducible()`.
+    const W: Self = Self(7);
+
+    // DTH_ROOT = W^((ORDER - 1)/2)
+    const DTH_ROOT: Self = Self(18446744069414584320);
+
+    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] =
+        [Self(18081566051660590251), Self(16121475356294670766)];
+
+    const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)];
+}
+
+impl Mul for QuadraticExtension<GoldilocksField> {
+    #[inline]
+    fn mul(self, rhs: Self) -> Self {
+        let Self([a0, a1]) = self;
+        let Self([b0, b1]) = rhs;
+        let c = ext2_mul([a0.0, a1.0], [b0.0, b1.0]);
+        Self(c)
+    }
+}
+
+impl Extendable<4> for GoldilocksField {
+    type Extension = QuarticExtension<Self>;
+
+    const W: Self = Self(7);
+
+    // DTH_ROOT = W^((ORDER - 1)/4)
+    const DTH_ROOT: Self = Self(281474976710656);
+
+    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [
+        Self(5024755240244648895),
+        Self(13227474371289740625),
+        Self(3912887029498544536),
+        Self(3900057112666848848),
+    ];
+
+    const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] =
+        [Self(0), Self(0), Self(0), Self(12587610116473453104)];
+}
+
+impl Mul for QuarticExtension<GoldilocksField> {
+    #[inline]
+    fn mul(self, rhs: Self) -> Self {
+        let Self([a0, a1, a2, a3]) = self;
+        let Self([b0, b1, b2, b3]) = rhs;
+        let c = ext4_mul([a0.0, a1.0, a2.0, a3.0], [b0.0, b1.0, b2.0, b3.0]);
+        Self(c)
+    }
+}
+
+impl Extendable<5> for GoldilocksField {
+    type Extension = QuinticExtension<Self>;
+
+    const W: Self = Self(3);
+
+    // DTH_ROOT = W^((ORDER - 1)/5)
+    const DTH_ROOT: Self = Self(1041288259238279555);
+
+    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [
+        Self(2899034827742553394),
+        Self(13012057356839176729),
+        Self(14593811582388663055),
+        Self(7722900811313895436),
+        Self(4557222484695340057),
+    ];
+
+    const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [
+        Self::POWER_OF_TWO_GENERATOR,
+        Self(0),
+        Self(0),
+        Self(0),
+        Self(0),
+    ];
+}
+
+impl Mul for QuinticExtension<GoldilocksField> {
+    #[inline]
+    fn mul(self, rhs: Self) -> Self {
+        let Self([a0, a1, a2, a3, a4]) = self;
+        let Self([b0, b1, b2, b3, b4]) = rhs;
+        let c = ext5_mul(
+            [a0.0, a1.0, a2.0, a3.0, a4.0],
+            [b0.0, b1.0, b2.0, b3.0, b4.0],
+        );
+        Self(c)
+    }
+}
+
+/*
+ * The functions extD_add_prods[0-4] are helper functions for
+ * computing products for extensions of degree D over the Goldilocks
+ * field. They are faster than the generic method because all
+ * reductions are delayed until the end which means only one per
+ * result coefficient is necessary.
+ */
+
+/// Return a, b such that a + b*2^128 = 3*x with a < 2^128 and b < 2^32.
+#[inline(always)]
+fn u160_times_3(x: u128, y: u32) -> (u128, u32) {
+    let (s, cy) = x.overflowing_add(x << 1);
+    (s, 3 * y + (x >> 127) as u32 + cy as u32)
+}
+
+/// Return a, b such that a + b*2^128 = 7*x with a < 2^128 and b < 2^32.
+#[inline(always)]
+fn u160_times_7(x: u128, y: u32) -> (u128, u32) {
+    let (d, br) = (x << 3).overflowing_sub(x);
+    // NB: subtracting the borrow can't underflow
+    (d, 7 * y + (x >> (128 - 3)) as u32 - br as u32)
+}
+
+/*
+ * Quadratic multiplication and squaring
+ */
+
+#[inline(always)]
+fn ext2_add_prods0(a: &[u64; 2], b: &[u64; 2]) -> GoldilocksField {
+    // Computes a0 * b0 + W * a1 * b1;
+    let [a0, a1] = *a;
+    let [b0, b1] = *b;
+
+    let cy;
+
+    // W * a1 * b1
+    let (mut cumul_lo, mut cumul_hi) = u160_times_7((a1 as u128) * (b1 as u128), 0u32);
+
+    // a0 * b0
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b0 as u128));
+    cumul_hi += cy as u32;
+
+    unsafe { reduce160(cumul_lo, cumul_hi) }
+}
+
+#[inline(always)]
+fn ext2_add_prods1(a: &[u64; 2], b: &[u64; 2]) -> GoldilocksField {
+    // Computes a0 * b1 + a1 * b0;
+    let [a0, a1] = *a;
+    let [b0, b1] = *b;
+
+    let cy;
+
+    // a0 * b1
+    let mut cumul_lo = (a0 as u128) * (b1 as u128);
+
+    // a1 * b0
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b0 as u128));
+    let cumul_hi = cy as u32;
+
+    unsafe { reduce160(cumul_lo, cumul_hi) }
+}
+
+/// Multiply a and b considered as elements of GF(p^2).
+#[inline(always)]
+pub(crate) fn ext2_mul(a: [u64; 2], b: [u64; 2]) -> [GoldilocksField; 2] {
+    // The code in ext2_add_prods[01] assumes the quadratic extension
+    // generator is 7.
+    const_assert!(<GoldilocksField as Extendable<2>>::W.0 == 7u64);
+
+    let c0 = ext2_add_prods0(&a, &b);
+    let c1 = ext2_add_prods1(&a, &b);
+    [c0, c1]
+}
+
+/*
+ * Quartic multiplication and squaring
+ */
+
+#[inline(always)]
+fn ext4_add_prods0(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField {
+    // Computes c0 = a0 * b0 + W * (a1 * b3 + a2 * b2 + a3 * b1)
+
+    let [a0, a1, a2, a3] = *a;
+    let [b0, b1, b2, b3] = *b;
+
+    let mut cy;
+
+    // a1 * b3
+    let mut cumul_lo = (a1 as u128) * (b3 as u128);
+
+    // a2 * b2
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b2 as u128));
+    let mut cumul_hi = cy as u32;
+
+    // a3 * b1
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b1 as u128));
+    cumul_hi += cy as u32;
+
+    // * W
+    (cumul_lo, cumul_hi) = u160_times_7(cumul_lo, cumul_hi);
+
+    // a0 * b0
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b0 as u128));
+    cumul_hi += cy as u32;
+
+    unsafe { reduce160(cumul_lo, cumul_hi) }
+}
+
+#[inline(always)]
+fn ext4_add_prods1(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField {
+    // Computes c1 = a0 * b1 + a1 * b0 + W * (a2 * b3 + a3 * b2);
+
+    let [a0, a1, a2, a3] = *a;
+    let [b0, b1, b2, b3] = *b;
+
+    let mut cy;
+
+    // a2 * b3
+    let mut cumul_lo = (a2 as u128) * (b3 as u128);
+
+    // a3 * b2
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b2 as u128));
+    let mut cumul_hi = cy as u32;
+
+    // * W
+    (cumul_lo, cumul_hi) = u160_times_7(cumul_lo, cumul_hi);
+
+    // a0 * b1
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b1 as u128));
+    cumul_hi += cy as u32;
+
+    // a1 * b0
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b0 as u128));
+    cumul_hi += cy as u32;
+
+    unsafe { reduce160(cumul_lo, cumul_hi) }
+}
+
+#[inline(always)]
+fn ext4_add_prods2(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField {
+    // Computes c2 = a0 * b2 + a1 * b1 + a2 * b0 + W * a3 * b3;
+
+    let [a0, a1, a2, a3] = *a;
+    let [b0, b1, b2, b3] = *b;
+
+    let mut cy;
+
+    // W * a3 * b3
+    let (mut cumul_lo, mut cumul_hi) = u160_times_7((a3 as u128) * (b3 as u128), 0u32);
+
+    // a0 * b2
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b2 as u128));
+    cumul_hi += cy as u32;
+
+    // a1 * b1
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b1 as u128));
+    cumul_hi += cy as u32;
+
+    // a2 * b0
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b0 as u128));
+    cumul_hi += cy as u32;
+
+    unsafe { reduce160(cumul_lo, cumul_hi) }
+}
+
+#[inline(always)]
+fn ext4_add_prods3(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField {
+    // Computes c3 = a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
+
+    let [a0, a1, a2, a3] = *a;
+    let [b0, b1, b2, b3] = *b;
+
+    let mut cy;
+
+    // a0 * b3
+    let mut cumul_lo = (a0 as u128) * (b3 as u128);
+
+    // a1 * b2
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b2 as u128));
+    let mut cumul_hi = cy as u32;
+
+    // a2 * b1
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b1 as u128));
+    cumul_hi += cy as u32;
+
+    // a3 * b0
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b0 as u128));
+    cumul_hi += cy as u32;
+
+    unsafe { reduce160(cumul_lo, cumul_hi) }
+}
+
+/// Multiply a and b considered as elements of GF(p^4).
+#[inline(always)]
+pub(crate) fn ext4_mul(a: [u64; 4], b: [u64; 4]) -> [GoldilocksField; 4] {
+    // The code in ext4_add_prods[0-3] assumes the quartic extension
+    // generator is 7.
+    const_assert!(<GoldilocksField as Extendable<4>>::W.0 == 7u64);
+
+    let c0 = ext4_add_prods0(&a, &b);
+    let c1 = ext4_add_prods1(&a, &b);
+    let c2 = ext4_add_prods2(&a, &b);
+    let c3 = ext4_add_prods3(&a, &b);
+    [c0, c1, c2, c3]
+}
+
+/*
+ * Quintic multiplication and squaring
+ */
+
+#[inline(always)]
+fn ext5_add_prods0(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField {
+    // Computes c0 = a0 * b0 + W * (a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1)
+
+    let [a0, a1, a2, a3, a4] = *a;
+    let [b0, b1, b2, b3, b4] = *b;
+
+    let mut cy;
+
+    // a1 * b4
+    let mut cumul_lo = (a1 as u128) * (b4 as u128);
+
+    // a2 * b3
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b3 as u128));
+    let mut cumul_hi = cy as u32;
+
+    // a3 * b2
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b2 as u128));
+    cumul_hi += cy as u32;
+
+    // a4 * b1
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b1 as u128));
+    cumul_hi += cy as u32;
+
+    // * W
+    (cumul_lo, cumul_hi) = u160_times_3(cumul_lo, cumul_hi);
+
+    // a0 * b0
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b0 as u128));
+    cumul_hi += cy as u32;
+
+    unsafe { reduce160(cumul_lo, cumul_hi) }
+}
+
+#[inline(always)]
+fn ext5_add_prods1(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField {
+    // Computes c1 = a0 * b1 + a1 * b0 + W * (a2 * b4 + a3 * b3 + a4 * b2);
+
+    let [a0, a1, a2, a3, a4] = *a;
+    let [b0, b1, b2, b3, b4] = *b;
+
+    let mut cy;
+
+    // a2 * b4
+    let mut cumul_lo = (a2 as u128) * (b4 as u128);
+
+    // a3 * b3
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b3 as u128));
+    let mut cumul_hi = cy as u32;
+
+    // a4 * b2
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b2 as u128));
+    cumul_hi += cy as u32;
+
+    // * W
+    (cumul_lo, cumul_hi) = u160_times_3(cumul_lo, cumul_hi);
+
+    // a0 * b1
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b1 as u128));
+    cumul_hi += cy as u32;
+
+    // a1 * b0
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b0 as u128));
+    cumul_hi += cy as u32;
+
+    unsafe { reduce160(cumul_lo, cumul_hi) }
+}
+
+#[inline(always)]
+fn ext5_add_prods2(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField {
+    // Computes c2 = a0 * b2 + a1 * b1 + a2 * b0 + W * (a3 * b4 + a4 * b3);
+
+    let [a0, a1, a2, a3, a4] = *a;
+    let [b0, b1, b2, b3, b4] = *b;
+
+    let mut cy;
+
+    // a3 * b4
+    let mut cumul_lo = (a3 as u128) * (b4 as u128);
+
+    // a4 * b3
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b3 as u128));
+    let mut cumul_hi = cy as u32;
+
+    // * W
+    (cumul_lo, cumul_hi) = u160_times_3(cumul_lo, cumul_hi);
+
+    // a0 * b2
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b2 as u128));
+    cumul_hi += cy as u32;
+
+    // a1 * b1
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b1 as u128));
+    cumul_hi += cy as u32;
+
+    // a2 * b0
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b0 as u128));
+    cumul_hi += cy as u32;
+
+    unsafe { reduce160(cumul_lo, cumul_hi) }
+}
+
+#[inline(always)]
+fn ext5_add_prods3(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField {
+    // Computes c3 = a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0 + W * a4 * b4;
+
+    let [a0, a1, a2, a3, a4] = *a;
+    let [b0, b1, b2, b3, b4] = *b;
+
+    let mut cy;
+
+    // W * a4 * b4
+    let (mut cumul_lo, mut cumul_hi) = u160_times_3((a4 as u128) * (b4 as u128), 0u32);
+
+    // a0 * b3
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b3 as u128));
+    cumul_hi += cy as u32;
+
+    // a1 * b2
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b2 as u128));
+    cumul_hi += cy as u32;
+
+    // a2 * b1
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b1 as u128));
+    cumul_hi += cy as u32;
+
+    // a3 * b0
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b0 as u128));
+    cumul_hi += cy as u32;
+
+    unsafe { reduce160(cumul_lo, cumul_hi) }
+}
+
+#[inline(always)]
+fn ext5_add_prods4(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField {
+    // Computes c4 = a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0;
+
+    let [a0, a1, a2, a3, a4] = *a;
+    let [b0, b1, b2, b3, b4] = *b;
+
+    let mut cy;
+
+    // a0 * b4
+    let mut cumul_lo = (a0 as u128) * (b4 as u128);
+
+    // a1 * b3
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b3 as u128));
+    let mut cumul_hi = cy as u32;
+
+    // a2 * b2
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b2 as u128));
+    cumul_hi += cy as u32;
+
+    // a3 * b1
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b1 as u128));
+    cumul_hi += cy as u32;
+
+    // a4 * b0
+    (cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b0 as u128));
+    cumul_hi += cy as u32;
+
+    unsafe { reduce160(cumul_lo, cumul_hi) }
+}
+
+/// Multiply a and b considered as elements of GF(p^5).
+#[inline(always)]
+pub(crate) fn ext5_mul(a: [u64; 5], b: [u64; 5]) -> [GoldilocksField; 5] {
+    // The code in ext5_add_prods[0-4] assumes the quintic extension
+    // generator is 3.
+    const_assert!(<GoldilocksField as Extendable<5>>::W.0 == 3u64);
+
+    let c0 = ext5_add_prods0(&a, &b);
+    let c1 = ext5_add_prods1(&a, &b);
+    let c2 = ext5_add_prods2(&a, &b);
+    let c3 = ext5_add_prods3(&a, &b);
+    let c4 = ext5_add_prods4(&a, &b);
+    [c0, c1, c2, c3, c4]
+}
diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs
index af958629..4ed32a0d 100644
--- a/field/src/goldilocks_field.rs
+++ b/field/src/goldilocks_field.rs
@@ -9,10 +9,6 @@ use plonky2_util::{assume, branch_hint};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
 
-use crate::extension_field::quadratic::QuadraticExtension;
-use crate::extension_field::quartic::QuarticExtension;
-use crate::extension_field::quintic::QuinticExtension;
-use crate::extension_field::{Extendable, Frobenius};
 use crate::field_types::{Field, Field64, PrimeField, PrimeField64};
 use crate::inversion::try_inverse_u64;
 
@@ -283,66 +279,6 @@ impl DivAssign for GoldilocksField {
     }
 }
 
-impl Extendable<2> for GoldilocksField {
-    type Extension = QuadraticExtension<Self>;
-
-    // Verifiable in Sage with
-    // `R.<x> = GF(p)[]; assert (x^2 - 7).is_irreducible()`.
-    const W: Self = Self(7);
-
-    // DTH_ROOT = W^((ORDER - 1)/2)
-    const DTH_ROOT: Self = Self(18446744069414584320);
-
-    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] =
-        [Self(18081566051660590251), Self(16121475356294670766)];
-
-    const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)];
-}
-
-impl Extendable<4> for GoldilocksField {
-    type Extension = QuarticExtension<Self>;
-
-    const W: Self = Self(7);
-
-    // DTH_ROOT = W^((ORDER - 1)/4)
-    const DTH_ROOT: Self = Self(281474976710656);
-
-    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [
-        Self(5024755240244648895),
-        Self(13227474371289740625),
-        Self(3912887029498544536),
-        Self(3900057112666848848),
-    ];
-
-    const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] =
-        [Self(0), Self(0), Self(0), Self(12587610116473453104)];
-}
-
-impl Extendable<5> for GoldilocksField {
-    type Extension = QuinticExtension<Self>;
-
-    const W: Self = Self(3);
-
-    // DTH_ROOT = W^((ORDER - 1)/5)
-    const DTH_ROOT: Self = Self(1041288259238279555);
-
-    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [
-        Self(2899034827742553394),
-        Self(13012057356839176729),
-        Self(14593811582388663055),
-        Self(7722900811313895436),
-        Self(4557222484695340057),
-    ];
-
-    const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [
-        Self::POWER_OF_TWO_GENERATOR,
-        Self(0),
-        Self(0),
-        Self(0),
-        Self(0),
-    ];
-}
-
 /// Fast addition modulo ORDER for x86-64.
 /// This function is marked unsafe for the following reasons:
 ///   - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001.
@@ -407,7 +343,34 @@ fn split(x: u128) -> (u64, u64) {
     (x as u64, (x >> 64) as u64)
 }
 
-impl Frobenius<1> for GoldilocksField {}
+/// Reduce the value x_lo + x_hi * 2^128 to an element in the
+/// Goldilocks field.
+///
+/// This function is marked 'unsafe' because correctness relies on the
+/// unchecked assumption that x < 2^160 - 2^128 + 2^96. Further,
+/// performance may degrade as x_hi increases beyond 2**40 or so.
+#[inline(always)]
+pub(crate) unsafe fn reduce160(x_lo: u128, x_hi: u32) -> GoldilocksField {
+    let x_hi = (x_lo >> 96) as u64 + ((x_hi as u64) << 32); // shld to form x_hi
+    let x_mid = (x_lo >> 64) as u32; // shr to form x_mid
+    let x_lo = x_lo as u64;
+
+    // sub + jc (should fuse)
+    let (mut t0, borrow) = x_lo.overflowing_sub(x_hi);
+    if borrow {
+        // The maximum possible value of x is (2^64 - 1)^2 * 4 * 7 < 2^133,
+        // so x_hi < 2^37. A borrow will happen roughly one in 134 million
+        // times, so it's best to branch.
+        branch_hint();
+        // NB: this assumes that x < 2^160 - 2^128 + 2^96.
+        t0 -= EPSILON; // Cannot underflow if x_hi is canonical.
+    }
+    // imul
+    let t1 = (x_mid as u64) * EPSILON;
+    // add, sbb, add
+    let t2 = add_no_canonicalize_trashing_input(t0, t1);
+    GoldilocksField(t2)
+}
 
 #[cfg(test)]
 mod tests {
diff --git a/field/src/lib.rs b/field/src/lib.rs
index 2c89aab3..e54f2aa7 100644
--- a/field/src/lib.rs
+++ b/field/src/lib.rs
@@ -15,6 +15,7 @@ pub mod cosets;
 pub mod extension_field;
 pub mod fft;
 pub mod field_types;
+pub mod goldilocks_extensions;
 pub mod goldilocks_field;
 pub mod interpolation;
 mod inversion;
diff --git a/plonky2/benches/field_arithmetic.rs b/plonky2/benches/field_arithmetic.rs
index 0e4383ee..7b74ae52 100644
--- a/plonky2/benches/field_arithmetic.rs
+++ b/plonky2/benches/field_arithmetic.rs
@@ -1,4 +1,5 @@
 use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
+use plonky2::field::extension_field::quadratic::QuadraticExtension;
 use plonky2::field::extension_field::quartic::QuarticExtension;
 use plonky2::field::extension_field::quintic::QuinticExtension;
 use plonky2::field::field_types::Field;
@@ -175,6 +176,7 @@ pub(crate) fn bench_field<F: Field>(c: &mut Criterion) {
 
 fn criterion_benchmark(c: &mut Criterion) {
     bench_field::<GoldilocksField>(c);
+    bench_field::<QuadraticExtension<GoldilocksField>>(c);
     bench_field::<QuarticExtension<GoldilocksField>>(c);
     bench_field::<QuinticExtension<GoldilocksField>>(c);
 }