From 310493c293436e73096a75d0e5755816fbfe3d58 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law <426294+unzvfu@users.noreply.github.com> Date: Fri, 4 Mar 2022 09:34:31 +1100 Subject: [PATCH] Faster extension field multiplication (#500) * Initial implementation of quintic extensions. * Update to/from_biguint() methods. * Draft of fast multiplication on quintic extensions over 64-bit base. * cargo fmt * Typo. * Document functions (a bit). * Refactor reduction step. * Change multiplication call so that LLVM generates better assembly. * Use one main accumulator instead of two minor ones; faster reduce. * Use one main accumulator in square too; clean up redundant code. * Call faster routines from Mul and Square impls. * Fix reduction function. * Fix square calculation. * Slightly faster reduction. * Clean up names and types. * cargo fmt * Move extension field mul/sqr specialisations to their own file. * Rename functions to have unique prefix. * Add faster quadratic multiplication/squaring. * Faster quartic multiplication and squaring. * cargo fmt * clippy * Alternative reduce160 function. * Typo. * Remove alternative reduction function. * Remove delayed reduction implementation of squaring. * Enforce assumptions about extension generators. * Make the accumulation variable a u32 instead of u64. * Add test to trigger carry branch in reduce160. * cargo fmt * Some documentation. * Clippy; improved comments. * cargo fmt * Remove redundant Square specialisations. * Fix reduce*() visibility. * Faster reduce160 from Jakub. * Change mul-by-const functions to operate on 160 bits instead of 128. * Move code for extensions of GoldilocksField to its own file. --- field/Cargo.toml | 1 + field/src/extension_field/quadratic.rs | 2 +- field/src/extension_field/quartic.rs | 2 +- field/src/extension_field/quintic.rs | 2 +- field/src/goldilocks_extensions.rs | 495 +++++++++++++++++++++++++ field/src/goldilocks_field.rs | 93 ++--- field/src/lib.rs | 1 + plonky2/benches/field_arithmetic.rs | 2 + 8 files changed, 530 insertions(+), 68 deletions(-) create mode 100644 field/src/goldilocks_extensions.rs diff --git a/field/Cargo.toml b/field/Cargo.toml index 6abffc5d..748b65ac 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -12,3 +12,4 @@ num = { version = "0.4", features = [ "rand" ] } rand = "0.8.4" serde = { version = "1.0", features = ["derive"] } unroll = "0.1.5" +static_assertions = "1.1.0" diff --git a/field/src/extension_field/quadratic.rs b/field/src/extension_field/quadratic.rs index 488304d2..9cdc01c3 100644 --- a/field/src/extension_field/quadratic.rs +++ b/field/src/extension_field/quadratic.rs @@ -170,7 +170,7 @@ impl> Mul for QuadraticExtension { type Output = Self; #[inline] - fn mul(self, rhs: Self) -> Self { + default fn mul(self, rhs: Self) -> Self { let Self([a0, a1]) = self; let Self([b0, b1]) = rhs; diff --git a/field/src/extension_field/quartic.rs b/field/src/extension_field/quartic.rs index 7b4a6950..09e35a4f 100644 --- a/field/src/extension_field/quartic.rs +++ b/field/src/extension_field/quartic.rs @@ -201,7 +201,7 @@ impl> Mul for QuarticExtension { type Output = Self; #[inline] - fn mul(self, rhs: Self) -> Self { + default fn mul(self, rhs: Self) -> Self { let Self([a0, a1, a2, a3]) = self; let Self([b0, b1, b2, b3]) = rhs; diff --git a/field/src/extension_field/quintic.rs b/field/src/extension_field/quintic.rs index d2c29ffe..1600107d 100644 --- a/field/src/extension_field/quintic.rs +++ b/field/src/extension_field/quintic.rs @@ -201,7 +201,7 @@ impl> Mul for QuinticExtension { type Output = Self; #[inline] - fn mul(self, rhs: Self) -> Self { + default fn mul(self, rhs: Self) -> Self { let Self([a0, a1, a2, a3, a4]) = self; let Self([b0, b1, b2, b3, b4]) = rhs; let w = >::W; diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs new file mode 100644 index 00000000..95265fe3 --- /dev/null +++ b/field/src/goldilocks_extensions.rs @@ -0,0 +1,495 @@ +use std::ops::Mul; + +use static_assertions::const_assert; + +use crate::extension_field::quadratic::QuadraticExtension; +use crate::extension_field::quartic::QuarticExtension; +use crate::extension_field::quintic::QuinticExtension; +use crate::extension_field::{Extendable, Frobenius}; +use crate::field_types::Field; +use crate::goldilocks_field::{reduce160, GoldilocksField}; + +impl Frobenius<1> for GoldilocksField {} + +impl Extendable<2> for GoldilocksField { + type Extension = QuadraticExtension; + + // Verifiable in Sage with + // `R. = GF(p)[]; assert (x^2 - 7).is_irreducible()`. + const W: Self = Self(7); + + // DTH_ROOT = W^((ORDER - 1)/2) + const DTH_ROOT: Self = Self(18446744069414584320); + + const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = + [Self(18081566051660590251), Self(16121475356294670766)]; + + const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)]; +} + +impl Mul for QuadraticExtension { + #[inline] + fn mul(self, rhs: Self) -> Self { + let Self([a0, a1]) = self; + let Self([b0, b1]) = rhs; + let c = ext2_mul([a0.0, a1.0], [b0.0, b1.0]); + Self(c) + } +} + +impl Extendable<4> for GoldilocksField { + type Extension = QuarticExtension; + + const W: Self = Self(7); + + // DTH_ROOT = W^((ORDER - 1)/4) + const DTH_ROOT: Self = Self(281474976710656); + + const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [ + Self(5024755240244648895), + Self(13227474371289740625), + Self(3912887029498544536), + Self(3900057112666848848), + ]; + + const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] = + [Self(0), Self(0), Self(0), Self(12587610116473453104)]; +} + +impl Mul for QuarticExtension { + #[inline] + fn mul(self, rhs: Self) -> Self { + let Self([a0, a1, a2, a3]) = self; + let Self([b0, b1, b2, b3]) = rhs; + let c = ext4_mul([a0.0, a1.0, a2.0, a3.0], [b0.0, b1.0, b2.0, b3.0]); + Self(c) + } +} + +impl Extendable<5> for GoldilocksField { + type Extension = QuinticExtension; + + const W: Self = Self(3); + + // DTH_ROOT = W^((ORDER - 1)/5) + const DTH_ROOT: Self = Self(1041288259238279555); + + const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [ + Self(2899034827742553394), + Self(13012057356839176729), + Self(14593811582388663055), + Self(7722900811313895436), + Self(4557222484695340057), + ]; + + const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [ + Self::POWER_OF_TWO_GENERATOR, + Self(0), + Self(0), + Self(0), + Self(0), + ]; +} + +impl Mul for QuinticExtension { + #[inline] + fn mul(self, rhs: Self) -> Self { + let Self([a0, a1, a2, a3, a4]) = self; + let Self([b0, b1, b2, b3, b4]) = rhs; + let c = ext5_mul( + [a0.0, a1.0, a2.0, a3.0, a4.0], + [b0.0, b1.0, b2.0, b3.0, b4.0], + ); + Self(c) + } +} + +/* + * The functions extD_add_prods[0-4] are helper functions for + * computing products for extensions of degree D over the Goldilocks + * field. They are faster than the generic method because all + * reductions are delayed until the end which means only one per + * result coefficient is necessary. + */ + +/// Return a, b such that a + b*2^128 = 3*x with a < 2^128 and b < 2^32. +#[inline(always)] +fn u160_times_3(x: u128, y: u32) -> (u128, u32) { + let (s, cy) = x.overflowing_add(x << 1); + (s, 3 * y + (x >> 127) as u32 + cy as u32) +} + +/// Return a, b such that a + b*2^128 = 7*x with a < 2^128 and b < 2^32. +#[inline(always)] +fn u160_times_7(x: u128, y: u32) -> (u128, u32) { + let (d, br) = (x << 3).overflowing_sub(x); + // NB: subtracting the borrow can't underflow + (d, 7 * y + (x >> (128 - 3)) as u32 - br as u32) +} + +/* + * Quadratic multiplication and squaring + */ + +#[inline(always)] +fn ext2_add_prods0(a: &[u64; 2], b: &[u64; 2]) -> GoldilocksField { + // Computes a0 * b0 + W * a1 * b1; + let [a0, a1] = *a; + let [b0, b1] = *b; + + let cy; + + // W * a1 * b1 + let (mut cumul_lo, mut cumul_hi) = u160_times_7((a1 as u128) * (b1 as u128), 0u32); + + // a0 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext2_add_prods1(a: &[u64; 2], b: &[u64; 2]) -> GoldilocksField { + // Computes a0 * b1 + a1 * b0; + let [a0, a1] = *a; + let [b0, b1] = *b; + + let cy; + + // a0 * b1 + let mut cumul_lo = (a0 as u128) * (b1 as u128); + + // a1 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b0 as u128)); + let cumul_hi = cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +/// Multiply a and b considered as elements of GF(p^2). +#[inline(always)] +pub(crate) fn ext2_mul(a: [u64; 2], b: [u64; 2]) -> [GoldilocksField; 2] { + // The code in ext2_add_prods[01] assumes the quadratic extension + // generator is 7. + const_assert!(>::W.0 == 7u64); + + let c0 = ext2_add_prods0(&a, &b); + let c1 = ext2_add_prods1(&a, &b); + [c0, c1] +} + +/* + * Quartic multiplication and squaring + */ + +#[inline(always)] +fn ext4_add_prods0(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField { + // Computes c0 = a0 * b0 + W * (a1 * b3 + a2 * b2 + a3 * b1) + + let [a0, a1, a2, a3] = *a; + let [b0, b1, b2, b3] = *b; + + let mut cy; + + // a1 * b3 + let mut cumul_lo = (a1 as u128) * (b3 as u128); + + // a2 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b2 as u128)); + let mut cumul_hi = cy as u32; + + // a3 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // * W + (cumul_lo, cumul_hi) = u160_times_7(cumul_lo, cumul_hi); + + // a0 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext4_add_prods1(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField { + // Computes c1 = a0 * b1 + a1 * b0 + W * (a2 * b3 + a3 * b2); + + let [a0, a1, a2, a3] = *a; + let [b0, b1, b2, b3] = *b; + + let mut cy; + + // a2 * b3 + let mut cumul_lo = (a2 as u128) * (b3 as u128); + + // a3 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b2 as u128)); + let mut cumul_hi = cy as u32; + + // * W + (cumul_lo, cumul_hi) = u160_times_7(cumul_lo, cumul_hi); + + // a0 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // a1 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext4_add_prods2(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField { + // Computes c2 = a0 * b2 + a1 * b1 + a2 * b0 + W * a3 * b3; + + let [a0, a1, a2, a3] = *a; + let [b0, b1, b2, b3] = *b; + + let mut cy; + + // W * a3 * b3 + let (mut cumul_lo, mut cumul_hi) = u160_times_7((a3 as u128) * (b3 as u128), 0u32); + + // a0 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b2 as u128)); + cumul_hi += cy as u32; + + // a1 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // a2 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext4_add_prods3(a: &[u64; 4], b: &[u64; 4]) -> GoldilocksField { + // Computes c3 = a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0; + + let [a0, a1, a2, a3] = *a; + let [b0, b1, b2, b3] = *b; + + let mut cy; + + // a0 * b3 + let mut cumul_lo = (a0 as u128) * (b3 as u128); + + // a1 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b2 as u128)); + let mut cumul_hi = cy as u32; + + // a2 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // a3 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +/// Multiply a and b considered as elements of GF(p^4). +#[inline(always)] +pub(crate) fn ext4_mul(a: [u64; 4], b: [u64; 4]) -> [GoldilocksField; 4] { + // The code in ext4_add_prods[0-3] assumes the quartic extension + // generator is 7. + const_assert!(>::W.0 == 7u64); + + let c0 = ext4_add_prods0(&a, &b); + let c1 = ext4_add_prods1(&a, &b); + let c2 = ext4_add_prods2(&a, &b); + let c3 = ext4_add_prods3(&a, &b); + [c0, c1, c2, c3] +} + +/* + * Quintic multiplication and squaring + */ + +#[inline(always)] +fn ext5_add_prods0(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField { + // Computes c0 = a0 * b0 + W * (a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1) + + let [a0, a1, a2, a3, a4] = *a; + let [b0, b1, b2, b3, b4] = *b; + + let mut cy; + + // a1 * b4 + let mut cumul_lo = (a1 as u128) * (b4 as u128); + + // a2 * b3 + (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b3 as u128)); + let mut cumul_hi = cy as u32; + + // a3 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b2 as u128)); + cumul_hi += cy as u32; + + // a4 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // * W + (cumul_lo, cumul_hi) = u160_times_3(cumul_lo, cumul_hi); + + // a0 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext5_add_prods1(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField { + // Computes c1 = a0 * b1 + a1 * b0 + W * (a2 * b4 + a3 * b3 + a4 * b2); + + let [a0, a1, a2, a3, a4] = *a; + let [b0, b1, b2, b3, b4] = *b; + + let mut cy; + + // a2 * b4 + let mut cumul_lo = (a2 as u128) * (b4 as u128); + + // a3 * b3 + (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b3 as u128)); + let mut cumul_hi = cy as u32; + + // a4 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b2 as u128)); + cumul_hi += cy as u32; + + // * W + (cumul_lo, cumul_hi) = u160_times_3(cumul_lo, cumul_hi); + + // a0 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // a1 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext5_add_prods2(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField { + // Computes c2 = a0 * b2 + a1 * b1 + a2 * b0 + W * (a3 * b4 + a4 * b3); + + let [a0, a1, a2, a3, a4] = *a; + let [b0, b1, b2, b3, b4] = *b; + + let mut cy; + + // a3 * b4 + let mut cumul_lo = (a3 as u128) * (b4 as u128); + + // a4 * b3 + (cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b3 as u128)); + let mut cumul_hi = cy as u32; + + // * W + (cumul_lo, cumul_hi) = u160_times_3(cumul_lo, cumul_hi); + + // a0 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b2 as u128)); + cumul_hi += cy as u32; + + // a1 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // a2 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext5_add_prods3(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField { + // Computes c3 = a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0 + W * a4 * b4; + + let [a0, a1, a2, a3, a4] = *a; + let [b0, b1, b2, b3, b4] = *b; + + let mut cy; + + // W * a4 * b4 + let (mut cumul_lo, mut cumul_hi) = u160_times_3((a4 as u128) * (b4 as u128), 0u32); + + // a0 * b3 + (cumul_lo, cy) = cumul_lo.overflowing_add((a0 as u128) * (b3 as u128)); + cumul_hi += cy as u32; + + // a1 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b2 as u128)); + cumul_hi += cy as u32; + + // a2 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // a3 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +#[inline(always)] +fn ext5_add_prods4(a: &[u64; 5], b: &[u64; 5]) -> GoldilocksField { + // Computes c4 = a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0; + + let [a0, a1, a2, a3, a4] = *a; + let [b0, b1, b2, b3, b4] = *b; + + let mut cy; + + // a0 * b4 + let mut cumul_lo = (a0 as u128) * (b4 as u128); + + // a1 * b3 + (cumul_lo, cy) = cumul_lo.overflowing_add((a1 as u128) * (b3 as u128)); + let mut cumul_hi = cy as u32; + + // a2 * b2 + (cumul_lo, cy) = cumul_lo.overflowing_add((a2 as u128) * (b2 as u128)); + cumul_hi += cy as u32; + + // a3 * b1 + (cumul_lo, cy) = cumul_lo.overflowing_add((a3 as u128) * (b1 as u128)); + cumul_hi += cy as u32; + + // a4 * b0 + (cumul_lo, cy) = cumul_lo.overflowing_add((a4 as u128) * (b0 as u128)); + cumul_hi += cy as u32; + + unsafe { reduce160(cumul_lo, cumul_hi) } +} + +/// Multiply a and b considered as elements of GF(p^5). +#[inline(always)] +pub(crate) fn ext5_mul(a: [u64; 5], b: [u64; 5]) -> [GoldilocksField; 5] { + // The code in ext5_add_prods[0-4] assumes the quintic extension + // generator is 3. + const_assert!(>::W.0 == 3u64); + + let c0 = ext5_add_prods0(&a, &b); + let c1 = ext5_add_prods1(&a, &b); + let c2 = ext5_add_prods2(&a, &b); + let c3 = ext5_add_prods3(&a, &b); + let c4 = ext5_add_prods4(&a, &b); + [c0, c1, c2, c3, c4] +} diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs index af958629..4ed32a0d 100644 --- a/field/src/goldilocks_field.rs +++ b/field/src/goldilocks_field.rs @@ -9,10 +9,6 @@ use plonky2_util::{assume, branch_hint}; use rand::Rng; use serde::{Deserialize, Serialize}; -use crate::extension_field::quadratic::QuadraticExtension; -use crate::extension_field::quartic::QuarticExtension; -use crate::extension_field::quintic::QuinticExtension; -use crate::extension_field::{Extendable, Frobenius}; use crate::field_types::{Field, Field64, PrimeField, PrimeField64}; use crate::inversion::try_inverse_u64; @@ -283,66 +279,6 @@ impl DivAssign for GoldilocksField { } } -impl Extendable<2> for GoldilocksField { - type Extension = QuadraticExtension; - - // Verifiable in Sage with - // `R. = GF(p)[]; assert (x^2 - 7).is_irreducible()`. - const W: Self = Self(7); - - // DTH_ROOT = W^((ORDER - 1)/2) - const DTH_ROOT: Self = Self(18446744069414584320); - - const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = - [Self(18081566051660590251), Self(16121475356294670766)]; - - const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)]; -} - -impl Extendable<4> for GoldilocksField { - type Extension = QuarticExtension; - - const W: Self = Self(7); - - // DTH_ROOT = W^((ORDER - 1)/4) - const DTH_ROOT: Self = Self(281474976710656); - - const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [ - Self(5024755240244648895), - Self(13227474371289740625), - Self(3912887029498544536), - Self(3900057112666848848), - ]; - - const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] = - [Self(0), Self(0), Self(0), Self(12587610116473453104)]; -} - -impl Extendable<5> for GoldilocksField { - type Extension = QuinticExtension; - - const W: Self = Self(3); - - // DTH_ROOT = W^((ORDER - 1)/5) - const DTH_ROOT: Self = Self(1041288259238279555); - - const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [ - Self(2899034827742553394), - Self(13012057356839176729), - Self(14593811582388663055), - Self(7722900811313895436), - Self(4557222484695340057), - ]; - - const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [ - Self::POWER_OF_TWO_GENERATOR, - Self(0), - Self(0), - Self(0), - Self(0), - ]; -} - /// Fast addition modulo ORDER for x86-64. /// This function is marked unsafe for the following reasons: /// - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001. @@ -407,7 +343,34 @@ fn split(x: u128) -> (u64, u64) { (x as u64, (x >> 64) as u64) } -impl Frobenius<1> for GoldilocksField {} +/// Reduce the value x_lo + x_hi * 2^128 to an element in the +/// Goldilocks field. +/// +/// This function is marked 'unsafe' because correctness relies on the +/// unchecked assumption that x < 2^160 - 2^128 + 2^96. Further, +/// performance may degrade as x_hi increases beyond 2**40 or so. +#[inline(always)] +pub(crate) unsafe fn reduce160(x_lo: u128, x_hi: u32) -> GoldilocksField { + let x_hi = (x_lo >> 96) as u64 + ((x_hi as u64) << 32); // shld to form x_hi + let x_mid = (x_lo >> 64) as u32; // shr to form x_mid + let x_lo = x_lo as u64; + + // sub + jc (should fuse) + let (mut t0, borrow) = x_lo.overflowing_sub(x_hi); + if borrow { + // The maximum possible value of x is (2^64 - 1)^2 * 4 * 7 < 2^133, + // so x_hi < 2^37. A borrow will happen roughly one in 134 million + // times, so it's best to branch. + branch_hint(); + // NB: this assumes that x < 2^160 - 2^128 + 2^96. + t0 -= EPSILON; // Cannot underflow if x_hi is canonical. + } + // imul + let t1 = (x_mid as u64) * EPSILON; + // add, sbb, add + let t2 = add_no_canonicalize_trashing_input(t0, t1); + GoldilocksField(t2) +} #[cfg(test)] mod tests { diff --git a/field/src/lib.rs b/field/src/lib.rs index 2c89aab3..e54f2aa7 100644 --- a/field/src/lib.rs +++ b/field/src/lib.rs @@ -15,6 +15,7 @@ pub mod cosets; pub mod extension_field; pub mod fft; pub mod field_types; +pub mod goldilocks_extensions; pub mod goldilocks_field; pub mod interpolation; mod inversion; diff --git a/plonky2/benches/field_arithmetic.rs b/plonky2/benches/field_arithmetic.rs index 0e4383ee..7b74ae52 100644 --- a/plonky2/benches/field_arithmetic.rs +++ b/plonky2/benches/field_arithmetic.rs @@ -1,4 +1,5 @@ use criterion::{criterion_group, criterion_main, BatchSize, Criterion}; +use plonky2::field::extension_field::quadratic::QuadraticExtension; use plonky2::field::extension_field::quartic::QuarticExtension; use plonky2::field::extension_field::quintic::QuinticExtension; use plonky2::field::field_types::Field; @@ -175,6 +176,7 @@ pub(crate) fn bench_field(c: &mut Criterion) { fn criterion_benchmark(c: &mut Criterion) { bench_field::(c); + bench_field::>(c); bench_field::>(c); bench_field::>(c); }