diff --git a/src/field/arch/mod.rs b/src/field/arch/mod.rs
new file mode 100644
index 00000000..832557ef
--- /dev/null
+++ b/src/field/arch/mod.rs
@@ -0,0 +1,2 @@
+#[cfg(target_arch = "x86_64")]
+pub mod x86_64;
diff --git a/src/field/arch/x86_64/avx2_goldilocks_field.rs b/src/field/arch/x86_64/avx2_goldilocks_field.rs
new file mode 100644
index 00000000..5cf7c1fa
--- /dev/null
+++ b/src/field/arch/x86_64/avx2_goldilocks_field.rs
@@ -0,0 +1,692 @@
+use core::arch::x86_64::*;
+use std::fmt;
+use std::fmt::{Debug, Formatter};
+use std::iter::{Product, Sum};
+use std::mem::transmute;
+use std::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
+
+use crate::field::field_types::{Field, PrimeField};
+use crate::field::goldilocks_field::GoldilocksField;
+use crate::field::packed_field::PackedField;
+
+// Ideally `Avx2GoldilocksField` would wrap `__m256i`. Unfortunately, `__m256i` has an alignment of
+// 32B, which would preclude us from casting `[GoldilocksField; 4]` (alignment 8B) to
+// `Avx2GoldilocksField`. We need to ensure that `Avx2GoldilocksField` has the same alignment as
+// `GoldilocksField`. Thus we wrap `[GoldilocksField; 4]` and use the `new` and `get` methods to
+// convert to and from `__m256i`.
+#[derive(Copy, Clone)]
+#[repr(transparent)]
+pub struct Avx2GoldilocksField(pub [GoldilocksField; 4]);
+
+impl Avx2GoldilocksField {
+    #[inline]
+    fn new(x: __m256i) -> Self {
+        unsafe { transmute(x) }
+    }
+    #[inline]
+    fn get(&self) -> __m256i {
+        unsafe { transmute(*self) }
+    }
+}
+
+impl Add for Avx2GoldilocksField {
+    type Output = Self;
+    #[inline]
+    fn add(self, rhs: Self) -> Self {
+        Self::new(unsafe { add(self.get(), rhs.get()) })
+    }
+}
+impl Add<GoldilocksField> for Avx2GoldilocksField {
+    type Output = Self;
+    #[inline]
+    fn add(self, rhs: GoldilocksField) -> Self {
+        self + Self::from(rhs)
+    }
+}
+impl Add<Avx2GoldilocksField> for GoldilocksField {
+    type Output = Avx2GoldilocksField;
+    #[inline]
+    fn add(self, rhs: Self::Output) -> Self::Output {
+        Self::Output::from(self) + rhs
+    }
+}
+impl AddAssign for Avx2GoldilocksField {
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        *self = *self + rhs;
+    }
+}
+impl AddAssign<GoldilocksField> for Avx2GoldilocksField {
+    #[inline]
+    fn add_assign(&mut self, rhs: GoldilocksField) {
+        *self = *self + rhs;
+    }
+}
+
+impl Debug for Avx2GoldilocksField {
+    #[inline]
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        write!(f, "({:?})", self.get())
+    }
+}
+
+impl Default for Avx2GoldilocksField {
+    #[inline]
+    fn default() -> Self {
+        Self::ZERO
+    }
+}
+
+impl Div<GoldilocksField> for Avx2GoldilocksField {
+    type Output = Self;
+    #[inline]
+    fn div(self, rhs: GoldilocksField) -> Self {
+        self * rhs.inverse()
+    }
+}
+impl DivAssign<GoldilocksField> for Avx2GoldilocksField {
+    #[inline]
+    fn div_assign(&mut self, rhs: GoldilocksField) {
+        *self *= rhs.inverse();
+    }
+}
+
+impl From<GoldilocksField> for Avx2GoldilocksField {
+    fn from(x: GoldilocksField) -> Self {
+        Self([x; 4])
+    }
+}
+
+impl Mul for Avx2GoldilocksField {
+    type Output = Self;
+    #[inline]
+    fn mul(self, rhs: Self) -> Self {
+        Self::new(unsafe { mul(self.get(), rhs.get()) })
+    }
+}
+impl Mul<GoldilocksField> for Avx2GoldilocksField {
+    type Output = Self;
+    #[inline]
+    fn mul(self, rhs: GoldilocksField) -> Self {
+        self * Self::from(rhs)
+    }
+}
+impl Mul<Avx2GoldilocksField> for GoldilocksField {
+    type Output = Avx2GoldilocksField;
+    #[inline]
+    fn mul(self, rhs: Avx2GoldilocksField) -> Self::Output {
+        Self::Output::from(self) * rhs
+    }
+}
+impl MulAssign for Avx2GoldilocksField {
+    #[inline]
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = *self * rhs;
+    }
+}
+impl MulAssign<GoldilocksField> for Avx2GoldilocksField {
+    #[inline]
+    fn mul_assign(&mut self, rhs: GoldilocksField) {
+        *self = *self * rhs;
+    }
+}
+
+impl Neg for Avx2GoldilocksField {
+    type Output = Self;
+    #[inline]
+    fn neg(self) -> Self {
+        Self::new(unsafe { neg(self.get()) })
+    }
+}
+
+impl Product for Avx2GoldilocksField {
+    #[inline]
+    fn product<I: Iterator<Item = Self>>(iter: I) -> Self {
+        iter.reduce(|x, y| x * y).unwrap_or(Self::ONE)
+    }
+}
+
+unsafe impl PackedField for Avx2GoldilocksField {
+    const WIDTH: usize = 4;
+
+    type Scalar = GoldilocksField;
+
+    const ZERO: Self = Self([<GoldilocksField as Field>::ZERO; 4]);
+    const ONE: Self = Self([<GoldilocksField as Field>::ONE; 4]);
+
+    #[inline]
+    fn from_arr(arr: [Self::Scalar; Self::WIDTH]) -> Self {
+        Self(arr)
+    }
+
+    #[inline]
+    fn as_arr(&self) -> [Self::Scalar; Self::WIDTH] {
+        self.0
+    }
+
+    #[inline]
+    fn from_slice(slice: &[Self::Scalar]) -> &Self {
+        assert_eq!(slice.len(), Self::WIDTH);
+        unsafe { &*slice.as_ptr().cast() }
+    }
+    #[inline]
+    fn from_slice_mut(slice: &mut [Self::Scalar]) -> &mut Self {
+        assert_eq!(slice.len(), Self::WIDTH);
+        unsafe { &mut *slice.as_mut_ptr().cast() }
+    }
+    #[inline]
+    fn as_slice(&self) -> &[Self::Scalar] {
+        &self.0[..]
+    }
+    #[inline]
+    fn as_slice_mut(&mut self) -> &mut [Self::Scalar] {
+        &mut self.0[..]
+    }
+
+    #[inline]
+    fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) {
+        let (v0, v1) = (self.get(), other.get());
+        let (res0, res1) = match block_len {
+            1 => unsafe { interleave1(v0, v1) },
+            2 => unsafe { interleave2(v0, v1) },
+            4 => (v0, v1),
+            _ => panic!("unsupported block_len"),
+        };
+        (Self::new(res0), Self::new(res1))
+    }
+
+    #[inline]
+    fn square(&self) -> Self {
+        Self::new(unsafe { square(self.get()) })
+    }
+}
+
+impl Sub for Avx2GoldilocksField {
+    type Output = Self;
+    #[inline]
+    fn sub(self, rhs: Self) -> Self {
+        Self::new(unsafe { sub(self.get(), rhs.get()) })
+    }
+}
+impl Sub<GoldilocksField> for Avx2GoldilocksField {
+    type Output = Self;
+    #[inline]
+    fn sub(self, rhs: GoldilocksField) -> Self {
+        self - Self::from(rhs)
+    }
+}
+impl Sub<Avx2GoldilocksField> for GoldilocksField {
+    type Output = Avx2GoldilocksField;
+    #[inline]
+    fn sub(self, rhs: Avx2GoldilocksField) -> Self::Output {
+        Self::Output::from(self) - rhs
+    }
+}
+impl SubAssign for Avx2GoldilocksField {
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        *self = *self - rhs;
+    }
+}
+impl SubAssign<GoldilocksField> for Avx2GoldilocksField {
+    #[inline]
+    fn sub_assign(&mut self, rhs: GoldilocksField) {
+        *self = *self - rhs;
+    }
+}
+
+impl Sum for Avx2GoldilocksField {
+    #[inline]
+    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
+        iter.reduce(|x, y| x + y).unwrap_or(Self::ZERO)
+    }
+}
+
+// Resources:
+// 1. Intel Intrinsics Guide for explanation of each intrinsic:
+//    https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+// 2. uops.info lists micro-ops for each instruction: https://uops.info/table.html
+// 3. Intel optimization manual for introduction to x86 vector extensions and best practices:
+//    https://software.intel.com/content/www/us/en/develop/download/intel-64-and-ia-32-architectures-optimization-reference-manual.html
+
+// Preliminary knowledge:
+// 1. Vector code usually avoids branching. Instead of branches, we can do input selection with
+//    _mm256_blendv_epi8 or a similar instruction. If all we're doing is conditionally zeroing a
+//    vector element then _mm256_and_si256 or _mm256_andnot_si256 may be used and are cheaper.
+//
+// 2. AVX does not support addition with carry but 128-bit (2-word) addition can be easily
+//    emulated. The method recognizes that a + b overflowed iff (a + b) < a:
+//      i.   res_lo = a_lo + b_lo
+//      ii.  carry_mask = res_lo < a_lo
+//      iii. res_hi = a_hi + b_hi - carry_mask
+//    Notice that carry_mask is subtracted, not added. This is because AVX comparison instructions
+//    return -1 (all bits 1) for true and 0 for false.
+//
+// 3. AVX does not have unsigned 64-bit comparisons. Those can be emulated with signed comparisons
+//    by recognizing that a <u b iff a + (1 << 63) <s b + (1 << 63), where the addition wraps around
+//    and <u (resp. <s) denotes an unsigned (resp. signed) comparison.
+
+const SIGN_BIT: __m256i = unsafe { transmute([1u64 << 63; 4]) };
+const SHIFTED_FIELD_ORDER: __m256i = unsafe { transmute([GoldilocksField::ORDER ^ (1 << 63); 4]) };
+const EPSILON: __m256i = unsafe { transmute([GoldilocksField::ORDER.wrapping_neg(); 4]) };
+
+/// Add 1 << 63 to every element. Needed to emulate unsigned comparisons (see point 3 above).
+#[inline]
+unsafe fn shift(x: __m256i) -> __m256i {
+    _mm256_xor_si256(x, SIGN_BIT)
+}
+
+/// Convert to canonical representation.
+/// The argument is assumed to be shifted by 1 << 63 (i.e. x_s = x + (1 << 63), where x is the field
+/// value). The returned value is similarly shifted by 1 << 63 (i.e. we return y_s = y + (1 << 63),
+/// where 0 <= y < FIELD_ORDER).
+#[inline]
+unsafe fn canonicalize_s(x_s: __m256i) -> __m256i {
+    // If x >= FIELD_ORDER then corresponding mask bits are all 0; otherwise all 1.
+    let mask = _mm256_cmpgt_epi64(SHIFTED_FIELD_ORDER, x_s);
+    // wrapback_amt is -FIELD_ORDER if mask is 0; otherwise 0.
+    let wrapback_amt = _mm256_andnot_si256(mask, EPSILON);
+    _mm256_add_epi64(x_s, wrapback_amt)
+}
+
+/// Addition u64 + u64 -> u64. Assumes that x + y < 2^64 + FIELD_ORDER. The second argument is
+/// pre-shifted by 1 << 63. The result is similarly shifted.
+#[inline]
+unsafe fn add_no_double_overflow_64_64s_s(x: __m256i, y_s: __m256i) -> __m256i {
+    let res_wrapped_s = _mm256_add_epi64(x, y_s);
+    let mask = _mm256_cmpgt_epi64(y_s, res_wrapped_s); // -1 if overflowed else 0.
+    let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0.
+    let res_s = _mm256_add_epi64(res_wrapped_s, wrapback_amt);
+    res_s
+}
+
+#[inline]
+unsafe fn add(x: __m256i, y: __m256i) -> __m256i {
+    let y_s = shift(y);
+    let res_s = add_no_double_overflow_64_64s_s(x, canonicalize_s(y_s));
+    shift(res_s)
+}
+
+#[inline]
+unsafe fn sub(x: __m256i, y: __m256i) -> __m256i {
+    let mut y_s = shift(y);
+    y_s = canonicalize_s(y_s);
+    let x_s = shift(x);
+    let mask = _mm256_cmpgt_epi64(y_s, x_s); // -1 if sub will underflow (y > x) else 0.
+    let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflow else 0.
+    let res_wrapped = _mm256_sub_epi64(x_s, y_s);
+    let res = _mm256_sub_epi64(res_wrapped, wrapback_amt);
+    res
+}
+
+#[inline]
+unsafe fn neg(y: __m256i) -> __m256i {
+    let y_s = shift(y);
+    _mm256_sub_epi64(SHIFTED_FIELD_ORDER, canonicalize_s(y_s))
+}
+
+/// Full 64-bit by 64-bit multiplication. This emulated multiplication is 1.33x slower than the
+/// scalar instruction, but may be worth it if we want our data to live in vector registers.
+#[inline]
+unsafe fn mul64_64(x: __m256i, y: __m256i) -> (__m256i, __m256i) {
+    // We want to move the high 32 bits to the low position. The multiplication instruction ignores
+    // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can
+    // be done on port 5; bitshifts run on ports 0 and 1, competing with multiplication.
+    //   This instruction is only provided for 32-bit floats, not integers. Idk why Intel makes the
+    // distinction; the casts are free and it guarantees that the exact bit pattern is preserved.
+    // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency
+    // since Haswell.
+    let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x)));
+    let y_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(y)));
+
+    // All four pairwise multiplications
+    let mul_ll = _mm256_mul_epu32(x, y);
+    let mul_lh = _mm256_mul_epu32(x, y_hi);
+    let mul_hl = _mm256_mul_epu32(x_hi, y);
+    let mul_hh = _mm256_mul_epu32(x_hi, y_hi);
+
+    // Bignum addition
+    // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow.
+    let mul_ll_hi = _mm256_srli_epi64::<32>(mul_ll);
+    let t0 = _mm256_add_epi64(mul_hl, mul_ll_hi);
+    // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow.
+    // Also, extract high 32 bits of t0 and add to mul_hh.
+    let t0_lo = _mm256_and_si256(t0, EPSILON);
+    let t0_hi = _mm256_srli_epi64::<32>(t0);
+    let t1 = _mm256_add_epi64(mul_lh, t0_lo);
+    let t2 = _mm256_add_epi64(mul_hh, t0_hi);
+    // Lastly, extract the high 32 bits of t1 and add to t2.
+    let t1_hi = _mm256_srli_epi64::<32>(t1);
+    let res_hi = _mm256_add_epi64(t2, t1_hi);
+
+    // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high
+    // position).
+    let t1_lo = _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(t1)));
+    let res_lo = _mm256_blend_epi32::<0xaa>(mul_ll, t1_lo);
+
+    (res_hi, res_lo)
+}
+
+/// Full 64-bit squaring. This routine is 1.2x faster than the scalar instruction.
+#[inline]
+unsafe fn square64(x: __m256i) -> (__m256i, __m256i) {
+    // Get high 32 bits of x. See comment in mul64_64_s.
+    let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x)));
+
+    // All pairwise multiplications.
+    let mul_ll = _mm256_mul_epu32(x, x);
+    let mul_lh = _mm256_mul_epu32(x, x_hi);
+    let mul_hh = _mm256_mul_epu32(x_hi, x_hi);
+
+    // Bignum addition, but mul_lh is shifted by 33 bits (not 32).
+    let mul_ll_hi = _mm256_srli_epi64::<33>(mul_ll);
+    let t0 = _mm256_add_epi64(mul_lh, mul_ll_hi);
+    let t0_hi = _mm256_srli_epi64::<31>(t0);
+    let res_hi = _mm256_add_epi64(mul_hh, t0_hi);
+
+    // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high
+    // position).
+    let mul_lh_lo = _mm256_slli_epi64::<33>(mul_lh);
+    let res_lo = _mm256_add_epi64(mul_ll, mul_lh_lo);
+
+    (res_hi, res_lo)
+}
+
+/// Goldilocks addition of a "small" number. `x_s` is pre-shifted by 2**63. `y` is assumed to be <=
+/// `0xffffffff00000000`. The result is shifted by 2**63.
+#[inline]
+unsafe fn add_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i {
+    let res_wrapped_s = _mm256_add_epi64(x_s, y);
+    // 32-bit compare is faster than 64-bit. It's safe as long as x > res_wrapped iff x >> 32 >
+    // res_wrapped >> 32. The case of x >> 32 > res_wrapped >> 32 is trivial and so is <. The case
+    // where x >> 32 = res_wrapped >> 32 remains. If x >> 32 = res_wrapped >> 32, then y >> 32 =
+    // 0xffffffff and the addition of the low 32 bits generated a carry. This can never occur if y
+    // <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no carry can occur.
+    let mask = _mm256_cmpgt_epi32(x_s, res_wrapped_s); // -1 if overflowed else 0.
+    // The mask contains 0xffffffff in the high 32 bits if wraparound occurred and 0 otherwise.
+    let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0.
+    let res_s = _mm256_add_epi64(res_wrapped_s, wrapback_amt);
+    res_s
+}
+
+/// Goldilocks subtraction of a "small" number. `x_s` is pre-shifted by 2**63. `y` is assumed to be
+/// <= `0xffffffff00000000`. The result is shifted by 2**63.
+#[inline]
+unsafe fn sub_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i {
+    let res_wrapped_s = _mm256_sub_epi64(x_s, y);
+    // 32-bit compare is faster than 64-bit. It's safe as long as res_wrapped > x iff res_wrapped >>
+    // 32 > x >> 32. The case of res_wrapped >> 32 > x >> 32 is trivial and so is <. The case where
+    // res_wrapped >> 32 = x >> 32 remains. If res_wrapped >> 32 = x >> 32, then y >> 32 =
+    // 0xffffffff and the subtraction of the low 32 bits generated a borrow. This can never occur if
+    // y <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no borrow can occur.
+    let mask = _mm256_cmpgt_epi32(res_wrapped_s, x_s); // -1 if underflowed else 0.
+    // The mask contains 0xffffffff in the high 32 bits if wraparound occurred and 0 otherwise.
+    let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflowed else 0.
+    let res_s = _mm256_sub_epi64(res_wrapped_s, wrapback_amt);
+    res_s
+}
+
+#[inline]
+unsafe fn reduce128(x: (__m256i, __m256i)) -> __m256i {
+    let (hi0, lo0) = x;
+    let lo0_s = shift(lo0);
+    let hi_hi0 = _mm256_srli_epi64::<32>(hi0);
+    let lo1_s = sub_small_64s_64_s(lo0_s, hi_hi0);
+    let t1 = _mm256_mul_epu32(hi0, EPSILON);
+    let lo2_s = add_small_64s_64_s(lo1_s, t1);
+    let lo2 = shift(lo2_s);
+    lo2
+}
+
+/// Multiply two integers modulo FIELD_ORDER.
+#[inline]
+unsafe fn mul(x: __m256i, y: __m256i) -> __m256i {
+    reduce128(mul64_64(x, y))
+}
+
+/// Square an integer modulo FIELD_ORDER.
+#[inline]
+unsafe fn square(x: __m256i) -> __m256i {
+    reduce128(square64(x))
+}
+
+#[inline]
+unsafe fn interleave1(x: __m256i, y: __m256i) -> (__m256i, __m256i) {
+    let a = _mm256_unpacklo_epi64(x, y);
+    let b = _mm256_unpackhi_epi64(x, y);
+    (a, b)
+}
+
+#[inline]
+unsafe fn interleave2(x: __m256i, y: __m256i) -> (__m256i, __m256i) {
+    let y_lo = _mm256_castsi256_si128(y); // This has 0 cost.
+
+    // 1 places y_lo in the high half of x; 0 would place it in the lower half.
+    let a = _mm256_inserti128_si256::<1>(x, y_lo);
+    // NB: _mm256_permute2x128_si256 could be used here as well but _mm256_inserti128_si256 has
+    // lower latency on Zen 3 processors.
+
+    // Each nibble of the constant has the following semantics:
+    //   0 => src1[low 128 bits]
+    //   1 => src1[high 128 bits]
+    //   2 => src2[low 128 bits]
+    //   3 => src2[high 128 bits]
+    // The low (resp. high) nibble chooses the low (resp. high) 128 bits of the result.
+    let b = _mm256_permute2x128_si256::<0x31>(x, y);
+
+    (a, b)
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::field::arch::x86_64::avx2_goldilocks_field::Avx2GoldilocksField;
+    use crate::field::field_types::PrimeField;
+    use crate::field::goldilocks_field::GoldilocksField;
+    use crate::field::packed_field::PackedField;
+
+    fn test_vals_a() -> [GoldilocksField; 4] {
+        [
+            GoldilocksField::from_noncanonical_u64(14479013849828404771),
+            GoldilocksField::from_noncanonical_u64(9087029921428221768),
+            GoldilocksField::from_noncanonical_u64(2441288194761790662),
+            GoldilocksField::from_noncanonical_u64(5646033492608483824),
+        ]
+    }
+    fn test_vals_b() -> [GoldilocksField; 4] {
+        [
+            GoldilocksField::from_noncanonical_u64(17891926589593242302),
+            GoldilocksField::from_noncanonical_u64(11009798273260028228),
+            GoldilocksField::from_noncanonical_u64(2028722748960791447),
+            GoldilocksField::from_noncanonical_u64(7929433601095175579),
+        ]
+    }
+
+    #[test]
+    fn test_add() {
+        let a_arr = test_vals_a();
+        let b_arr = test_vals_b();
+
+        let packed_a = Avx2GoldilocksField::from_arr(a_arr);
+        let packed_b = Avx2GoldilocksField::from_arr(b_arr);
+        let packed_res = packed_a + packed_b;
+        let arr_res = packed_res.as_arr();
+
+        let expected = a_arr.iter().zip(b_arr).map(|(&a, b)| a + b);
+        for (exp, res) in expected.zip(arr_res) {
+            assert_eq!(res, exp);
+        }
+    }
+
+    #[test]
+    fn test_mul() {
+        let a_arr = test_vals_a();
+        let b_arr = test_vals_b();
+
+        let packed_a = Avx2GoldilocksField::from_arr(a_arr);
+        let packed_b = Avx2GoldilocksField::from_arr(b_arr);
+        let packed_res = packed_a * packed_b;
+        let arr_res = packed_res.as_arr();
+
+        let expected = a_arr.iter().zip(b_arr).map(|(&a, b)| a * b);
+        for (exp, res) in expected.zip(arr_res) {
+            assert_eq!(res, exp);
+        }
+    }
+
+    #[test]
+    fn test_square() {
+        let a_arr = test_vals_a();
+
+        let packed_a = Avx2GoldilocksField::from_arr(a_arr);
+        let packed_res = packed_a.square();
+        let arr_res = packed_res.as_arr();
+
+        let expected = a_arr.iter().map(|&a| a.square());
+        for (exp, res) in expected.zip(arr_res) {
+            assert_eq!(res, exp);
+        }
+    }
+
+    #[test]
+    fn test_neg() {
+        let a_arr = test_vals_a();
+
+        let packed_a = Avx2GoldilocksField::from_arr(a_arr);
+        let packed_res = -packed_a;
+        let arr_res = packed_res.as_arr();
+
+        let expected = a_arr.iter().map(|&a| -a);
+        for (exp, res) in expected.zip(arr_res) {
+            assert_eq!(res, exp);
+        }
+    }
+
+    #[test]
+    fn test_sub() {
+        let a_arr = test_vals_a();
+        let b_arr = test_vals_b();
+
+        let packed_a = Avx2GoldilocksField::from_arr(a_arr);
+        let packed_b = Avx2GoldilocksField::from_arr(b_arr);
+        let packed_res = packed_a - packed_b;
+        let arr_res = packed_res.as_arr();
+
+        let expected = a_arr.iter().zip(b_arr).map(|(&a, b)| a - b);
+        for (exp, res) in expected.zip(arr_res) {
+            assert_eq!(res, exp);
+        }
+    }
+
+    #[test]
+    fn test_interleave_is_involution() {
+        let a_arr = test_vals_a();
+        let b_arr = test_vals_b();
+
+        let packed_a = Avx2GoldilocksField::from_arr(a_arr);
+        let packed_b = Avx2GoldilocksField::from_arr(b_arr);
+        {
+            // Interleave, then deinterleave.
+            let (x, y) = packed_a.interleave(packed_b, 1);
+            let (res_a, res_b) = x.interleave(y, 1);
+            assert_eq!(res_a.as_arr(), a_arr);
+            assert_eq!(res_b.as_arr(), b_arr);
+        }
+        {
+            let (x, y) = packed_a.interleave(packed_b, 2);
+            let (res_a, res_b) = x.interleave(y, 2);
+            assert_eq!(res_a.as_arr(), a_arr);
+            assert_eq!(res_b.as_arr(), b_arr);
+        }
+        {
+            let (x, y) = packed_a.interleave(packed_b, 4);
+            let (res_a, res_b) = x.interleave(y, 4);
+            assert_eq!(res_a.as_arr(), a_arr);
+            assert_eq!(res_b.as_arr(), b_arr);
+        }
+    }
+
+    #[test]
+    fn test_interleave() {
+        let in_a: [GoldilocksField; 4] = [
+            GoldilocksField::from_noncanonical_u64(00),
+            GoldilocksField::from_noncanonical_u64(01),
+            GoldilocksField::from_noncanonical_u64(02),
+            GoldilocksField::from_noncanonical_u64(03),
+        ];
+        let in_b: [GoldilocksField; 4] = [
+            GoldilocksField::from_noncanonical_u64(10),
+            GoldilocksField::from_noncanonical_u64(11),
+            GoldilocksField::from_noncanonical_u64(12),
+            GoldilocksField::from_noncanonical_u64(13),
+        ];
+        let int1_a: [GoldilocksField; 4] = [
+            GoldilocksField::from_noncanonical_u64(00),
+            GoldilocksField::from_noncanonical_u64(10),
+            GoldilocksField::from_noncanonical_u64(02),
+            GoldilocksField::from_noncanonical_u64(12),
+        ];
+        let int1_b: [GoldilocksField; 4] = [
+            GoldilocksField::from_noncanonical_u64(01),
+            GoldilocksField::from_noncanonical_u64(11),
+            GoldilocksField::from_noncanonical_u64(03),
+            GoldilocksField::from_noncanonical_u64(13),
+        ];
+        let int2_a: [GoldilocksField; 4] = [
+            GoldilocksField::from_noncanonical_u64(00),
+            GoldilocksField::from_noncanonical_u64(01),
+            GoldilocksField::from_noncanonical_u64(10),
+            GoldilocksField::from_noncanonical_u64(11),
+        ];
+        let int2_b: [GoldilocksField; 4] = [
+            GoldilocksField::from_noncanonical_u64(02),
+            GoldilocksField::from_noncanonical_u64(03),
+            GoldilocksField::from_noncanonical_u64(12),
+            GoldilocksField::from_noncanonical_u64(13),
+        ];
+
+        let packed_a = Avx2GoldilocksField::from_arr(in_a);
+        let packed_b = Avx2GoldilocksField::from_arr(in_b);
+        {
+            let (x1, y1) = packed_a.interleave(packed_b, 1);
+            assert_eq!(x1.as_arr(), int1_a);
+            assert_eq!(y1.as_arr(), int1_b);
+        }
+        {
+            let (x2, y2) = packed_a.interleave(packed_b, 2);
+            assert_eq!(x2.as_arr(), int2_a);
+            assert_eq!(y2.as_arr(), int2_b);
+        }
+        {
+            let (x4, y4) = packed_a.interleave(packed_b, 4);
+            assert_eq!(x4.as_arr(), in_a);
+            assert_eq!(y4.as_arr(), in_b);
+        }
+    }
+}
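Reviewer note (not part of the patch): the vectorized `reduce128` above relies on the Goldilocks identities 2^64 ≡ EPSILON and 2^96 ≡ -1 (mod P). A minimal scalar sketch of the same reduction, useful for cross-checking the lane-wise code, might look like this; the function name and final canonicalization step are illustrative only.

// Scalar model of the reduction performed by reduce128 (sketch, not part of the diff).
// For x = hi * 2^64 + lo, with P = 2^64 - 2^32 + 1 and EPSILON = 2^64 - P = 2^32 - 1:
//   x ≡ lo - (hi >> 32) + (hi & 0xffffffff) * EPSILON  (mod P),
// which mirrors the sub_small_64s_64_s / _mm256_mul_epu32 / add_small_64s_64_s sequence.
const P: u64 = 0xffff_ffff_0000_0001;
const EPSILON_SCALAR: u64 = 0xffff_ffff;

fn reduce128_scalar(hi: u64, lo: u64) -> u64 {
    let hi_hi = hi >> 32;
    let hi_lo = hi & EPSILON_SCALAR;
    // lo - hi_hi, with the same wraparound correction as sub_small_64s_64_s.
    let (mut t, borrow) = lo.overflowing_sub(hi_hi);
    if borrow {
        t = t.wrapping_sub(EPSILON_SCALAR); // subtracting EPSILON == adding P (mod 2^64)
    }
    // t + hi_lo * EPSILON; the product fits in 64 bits since both factors are < 2^32.
    let (mut res, carry) = t.overflowing_add(hi_lo * EPSILON_SCALAR);
    if carry {
        res = res.wrapping_add(EPSILON_SCALAR); // adding EPSILON == subtracting P (mod 2^64)
    }
    // The AVX2 code keeps non-canonical representatives; canonicalize here for clarity.
    if res >= P {
        res -= P;
    }
    res
}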
diff --git a/src/field/arch/x86_64/mod.rs b/src/field/arch/x86_64/mod.rs
new file mode 100644
index 00000000..bd9dccae
--- /dev/null
+++ b/src/field/arch/x86_64/mod.rs
@@ -0,0 +1,2 @@
+#[cfg(target_feature = "avx2")]
+pub mod avx2_goldilocks_field;
diff --git a/src/field/mod.rs b/src/field/mod.rs
index 3cf4cdfb..dac1c9be 100644
--- a/src/field/mod.rs
+++ b/src/field/mod.rs
@@ -1,3 +1,4 @@
+pub(crate) mod arch;
 pub(crate) mod batch_util;
 pub(crate) mod cosets;
 pub mod extension_field;
@@ -11,9 +12,6 @@ pub(crate) mod packed_field;
 pub mod secp256k1_base;
 pub mod secp256k1_scalar;
 
-#[cfg(target_feature = "avx2")]
-pub(crate) mod packed_avx2;
-
 #[cfg(test)]
 mod field_testing;
 #[cfg(test)]
diff --git a/src/field/packable.rs b/src/field/packable.rs
index a3f96197..05ab7db1 100644
--- a/src/field/packable.rs
+++ b/src/field/packable.rs
@@ -14,5 +14,5 @@ impl<F: Field> Packable for F {
 
 #[cfg(target_feature = "avx2")]
 impl Packable for crate::field::goldilocks_field::GoldilocksField {
-    type Packing = crate::field::packed_avx2::PackedGoldilocksAvx2;
+    type Packing = crate::field::arch::x86_64::avx2_goldilocks_field::Avx2GoldilocksField;
 }
diff --git a/src/field/packed_avx2/avx2_prime_field.rs b/src/field/packed_avx2/avx2_prime_field.rs
deleted file mode 100644
index fd153cf6..00000000
--- a/src/field/packed_avx2/avx2_prime_field.rs
+++ /dev/null
@@ -1,452 +0,0 @@
-use core::arch::x86_64::*;
-use std::fmt;
-use std::fmt::{Debug, Formatter};
-use std::iter::{Product, Sum};
-use std::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
-
-use crate::field::field_types::PrimeField;
-use crate::field::packed_avx2::common::{
-    add_no_canonicalize_64_64s_s, epsilon, field_order, shift, ReducibleAvx2,
-};
-use crate::field::packed_field::PackedField;
-
-// Avx2PrimeField wraps an array of four u64s, with the new and get methods to convert that
-// array to and from __m256i, which is the type we actually operate on. This indirection is a
-// terrible trick to change Avx2PrimeField's alignment.
-// We'd like to be able to cast slices of PrimeField to slices of Avx2PrimeField. Rust
-// aligns __m256i to 32 bytes but PrimeField has a lower alignment. That alignment extends to
-// Avx2PrimeField and it appears that it cannot be lowered with #[repr(C, blah)]. It is
-// important for Rust not to assume 32-byte alignment, so we cannot wrap __m256i directly.
-// There are two versions of vectorized load/store instructions on x86: aligned (vmovaps and
-// friends) and unaligned (vmovups etc.). The difference between them is that aligned loads and
-// stores are permitted to segfault on unaligned accesses. Historically, the aligned instructions
-// were faster, and although this is no longer the case, compilers prefer the aligned versions if
-// they know that the address is aligned. Using aligned instructions on unaligned addresses leads to
-// bugs that can be frustrating to diagnose. Hence, we can't have Rust assuming alignment, and
-// therefore Avx2PrimeField wraps [F; 4] and not __m256i.
-#[derive(Copy, Clone)]
-#[repr(transparent)]
-pub struct Avx2PrimeField<F: PrimeField>(pub [F; 4]);
-
-impl<F: PrimeField> Avx2PrimeField<F> {
-    #[inline]
-    fn new(x: __m256i) -> Self {
-        let mut obj = Self([F::ZERO; 4]);
-        let ptr = (&mut obj.0).as_mut_ptr().cast::<__m256i>();
-        unsafe {
-            _mm256_storeu_si256(ptr, x);
-        }
-        obj
-    }
-    #[inline]
-    fn get(&self) -> __m256i {
-        let ptr = (&self.0).as_ptr().cast::<__m256i>();
-        unsafe { _mm256_loadu_si256(ptr) }
-    }
-}
-
-impl<F: ReducibleAvx2> Add for Avx2PrimeField<F> {
-    type Output = Self;
-    #[inline]
-    fn add(self, rhs: Self) -> Self {
-        Self::new(unsafe { add::<F>(self.get(), rhs.get()) })
-    }
-}
-impl<F: ReducibleAvx2> Add<F> for Avx2PrimeField<F> {
-    type Output = Self;
-    #[inline]
-    fn add(self, rhs: F) -> Self {
-        self + Self::from(rhs)
-    }
-}
-impl<F: ReducibleAvx2> Add<Avx2PrimeField<F>> for <Avx2PrimeField<F> as PackedField>::Scalar {
-    type Output = Avx2PrimeField<F>;
-    #[inline]
-    fn add(self, rhs: Self::Output) -> Self::Output {
-        Self::Output::from(self) + rhs
-    }
-}
-impl<F: ReducibleAvx2> AddAssign for Avx2PrimeField<F> {
-    #[inline]
-    fn add_assign(&mut self, rhs: Self) {
-        *self = *self + rhs;
-    }
-}
-impl<F: ReducibleAvx2> AddAssign<F> for Avx2PrimeField<F> {
-    #[inline]
-    fn add_assign(&mut self, rhs: F) {
-        *self = *self + rhs;
-    }
-}
-
-impl<F: ReducibleAvx2> Debug for Avx2PrimeField<F> {
-    #[inline]
-    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        write!(f, "({:?})", self.get())
-    }
-}
-
-impl<F: ReducibleAvx2> Default for Avx2PrimeField<F> {
-    #[inline]
-    fn default() -> Self {
-        Self::ZERO
-    }
-}
-
-impl<F: ReducibleAvx2> Div<F> for Avx2PrimeField<F> {
-    type Output = Self;
-    #[inline]
-    fn div(self, rhs: F) -> Self {
-        self * rhs.inverse()
-    }
-}
-impl<F: ReducibleAvx2> DivAssign<F> for Avx2PrimeField<F> {
-    #[inline]
-    fn div_assign(&mut self, rhs: F) {
-        *self *= rhs.inverse();
-    }
-}
-
-impl<F: ReducibleAvx2> From<F> for Avx2PrimeField<F> {
-    fn from(x: F) -> Self {
-        Self([x; 4])
-    }
-}
-
-impl<F: ReducibleAvx2> Mul for Avx2PrimeField<F> {
-    type Output = Self;
-    #[inline]
-    fn mul(self, rhs: Self) -> Self {
-        Self::new(unsafe { mul::<F>(self.get(), rhs.get()) })
-    }
-}
-impl<F: ReducibleAvx2> Mul<F> for Avx2PrimeField<F> {
-    type Output = Self;
-    #[inline]
-    fn mul(self, rhs: F) -> Self {
-        self * Self::from(rhs)
-    }
-}
-impl<F: ReducibleAvx2> Mul<Avx2PrimeField<F>> for <Avx2PrimeField<F> as PackedField>::Scalar {
-    type Output = Avx2PrimeField<F>;
-    #[inline]
-    fn mul(self, rhs: Avx2PrimeField<F>) -> Self::Output {
-        Self::Output::from(self) * rhs
-    }
-}
-impl<F: ReducibleAvx2> MulAssign for Avx2PrimeField<F> {
-    #[inline]
-    fn mul_assign(&mut self, rhs: Self) {
-        *self = *self * rhs;
-    }
-}
-impl<F: ReducibleAvx2> MulAssign<F> for Avx2PrimeField<F> {
-    #[inline]
-    fn mul_assign(&mut self, rhs: F) {
-        *self = *self * rhs;
-    }
-}
-
-impl<F: ReducibleAvx2> Neg for Avx2PrimeField<F> {
-    type Output = Self;
-    #[inline]
-    fn neg(self) -> Self {
-        Self::new(unsafe { neg::<F>(self.get()) })
-    }
-}
-
-impl<F: ReducibleAvx2> Product for Avx2PrimeField<F> {
-    #[inline]
-    fn product<I: Iterator<Item = Self>>(iter: I) -> Self {
-        iter.reduce(|x, y| x * y).unwrap_or(Self::ONE)
-    }
-}
-
-unsafe impl<F: ReducibleAvx2> PackedField for Avx2PrimeField<F> {
-    const WIDTH: usize = 4;
-
-    type Scalar = F;
-
-    const ZERO: Self = Self([F::ZERO; 4]);
-    const ONE: Self = Self([F::ONE; 4]);
-
-    #[inline]
-    fn from_arr(arr: [Self::Scalar; Self::WIDTH]) -> Self {
-        Self(arr)
-    }
-
-    #[inline]
-    fn as_arr(&self) -> [Self::Scalar; Self::WIDTH] {
-        self.0
-    }
-
-    #[inline]
-    fn from_slice(slice: &[Self::Scalar]) -> &Self {
-        assert_eq!(slice.len(), Self::WIDTH);
-        unsafe { &*slice.as_ptr().cast() }
-    }
-    #[inline]
-    fn from_slice_mut(slice: &mut [Self::Scalar]) -> &mut Self {
-        assert_eq!(slice.len(), Self::WIDTH);
-        unsafe { &mut *slice.as_mut_ptr().cast() }
-    }
-    #[inline]
-    fn as_slice(&self) -> &[Self::Scalar] {
-        &self.0[..]
-    }
-    #[inline]
-    fn as_slice_mut(&mut self) -> &mut [Self::Scalar] {
-        &mut self.0[..]
-    }
-
-    #[inline]
-    fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) {
-        let (v0, v1) = (self.get(), other.get());
-        let (res0, res1) = match block_len {
-            1 => unsafe { interleave1(v0, v1) },
-            2 => unsafe { interleave2(v0, v1) },
-            4 => (v0, v1),
-            _ => panic!("unsupported block_len"),
-        };
-        (Self::new(res0), Self::new(res1))
-    }
-
-    #[inline]
-    fn square(&self) -> Self {
-        Self::new(unsafe { square::<F>(self.get()) })
-    }
-}
-
-impl<F: ReducibleAvx2> Sub for Avx2PrimeField<F> {
-    type Output = Self;
-    #[inline]
-    fn sub(self, rhs: Self) -> Self {
-        Self::new(unsafe { sub::<F>(self.get(), rhs.get()) })
-    }
-}
-impl<F: ReducibleAvx2> Sub<F> for Avx2PrimeField<F> {
-    type Output = Self;
-    #[inline]
-    fn sub(self, rhs: F) -> Self {
-        self - Self::from(rhs)
-    }
-}
-impl<F: ReducibleAvx2> Sub<Avx2PrimeField<F>> for <Avx2PrimeField<F> as PackedField>::Scalar {
-    type Output = Avx2PrimeField<F>;
-    #[inline]
-    fn sub(self, rhs: Avx2PrimeField<F>) -> Self::Output {
-        Self::Output::from(self) - rhs
-    }
-}
-impl<F: ReducibleAvx2> SubAssign for Avx2PrimeField<F> {
-    #[inline]
-    fn sub_assign(&mut self, rhs: Self) {
-        *self = *self - rhs;
-    }
-}
-impl<F: ReducibleAvx2> SubAssign<F> for Avx2PrimeField<F> {
-    #[inline]
-    fn sub_assign(&mut self, rhs: F) {
-        *self = *self - rhs;
-    }
-}
-
-impl<F: ReducibleAvx2> Sum for Avx2PrimeField<F> {
-    #[inline]
-    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
-        iter.reduce(|x, y| x + y).unwrap_or(Self::ZERO)
-    }
-}
-
-// Resources:
-// 1. Intel Intrinsics Guide for explanation of each intrinsic:
-//    https://software.intel.com/sites/landingpage/IntrinsicsGuide/
-// 2. uops.info lists micro-ops for each instruction: https://uops.info/table.html
-// 3. Intel optimization manual for introduction to x86 vector extensions and best practices:
-//    https://software.intel.com/content/www/us/en/develop/download/intel-64-and-ia-32-architectures-optimization-reference-manual.html
-
-// Preliminary knowledge:
-// 1. Vector code usually avoids branching. Instead of branches, we can do input selection with
-//    _mm256_blendv_epi8 or a similar instruction. If all we're doing is conditionally zeroing a
-//    vector element then _mm256_and_si256 or _mm256_andnot_si256 may be used and are cheaper.
-//
-// 2. AVX does not support addition with carry but 128-bit (2-word) addition can be easily
-//    emulated. The method recognizes that a + b overflowed iff (a + b) < a:
-//      i.   res_lo = a_lo + b_lo
-//      ii.  carry_mask = res_lo < a_lo
-//      iii. res_hi = a_hi + b_hi - carry_mask
-//    Notice that carry_mask is subtracted, not added. This is because AVX comparison instructions
-//    return -1 (all bits 1) for true and 0 for false.
-//
-// 3. AVX does not have unsigned 64-bit comparisons. Those can be emulated with signed comparisons
-//    by recognizing that a <u b iff a + (1 << 63) <s b + (1 << 63), where the addition wraps around
-//    and <u (resp. <s) denotes an unsigned (resp. signed) comparison.
-
-/// Convert to canonical representation.
-#[inline]
-unsafe fn canonicalize_s<F: PrimeField>(x_s: __m256i) -> __m256i {
-    // If x >= FIELD_ORDER then corresponding mask bits are all 0; otherwise all 1.
-    let mask = _mm256_cmpgt_epi64(shift(field_order::<F>()), x_s);
-    // wrapback_amt is -FIELD_ORDER if mask is 0; otherwise 0.
-    let wrapback_amt = _mm256_andnot_si256(mask, epsilon::<F>());
-    _mm256_add_epi64(x_s, wrapback_amt)
-}
-
-#[inline]
-unsafe fn add<F: PrimeField>(x: __m256i, y: __m256i) -> __m256i {
-    let y_s = shift(y);
-    let res_s = add_no_canonicalize_64_64s_s::<F>(x, canonicalize_s::<F>(y_s));
-    shift(res_s)
-}
-
-#[inline]
-unsafe fn sub<F: PrimeField>(x: __m256i, y: __m256i) -> __m256i {
-    let mut y_s = shift(y);
-    y_s = canonicalize_s::<F>(y_s);
-    let x_s = shift(x);
-    let mask = _mm256_cmpgt_epi64(y_s, x_s); // -1 if sub will underflow (y > x) else 0.
-    let wrapback_amt = _mm256_and_si256(mask, epsilon::<F>()); // -FIELD_ORDER if underflow else 0.
-    let res_wrapped = _mm256_sub_epi64(x_s, y_s);
-    let res = _mm256_sub_epi64(res_wrapped, wrapback_amt);
-    res
-}
-
-#[inline]
-unsafe fn neg<F: PrimeField>(y: __m256i) -> __m256i {
-    let y_s = shift(y);
-    _mm256_sub_epi64(shift(field_order::<F>()), canonicalize_s::<F>(y_s))
-}
-
-/// Full 64-bit by 64-bit multiplication. This emulated multiplication is 1.33x slower than the
-/// scalar instruction, but may be worth it if we want our data to live in vector registers.
-#[inline]
-unsafe fn mul64_64(x: __m256i, y: __m256i) -> (__m256i, __m256i) {
-    // We want to move the high 32 bits to the low position. The multiplication instruction ignores
-    // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can
-    // be done on port 5; bitshifts run on ports 0 and 1, competing with multiplication.
-    //   This instruction is only provided for 32-bit floats, not integers. Idk why Intel makes the
-    // distinction; the casts are free and it guarantees that the exact bit pattern is preserved.
-    // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency
-    // since Haswell.
-    let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x)));
-    let y_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(y)));
-
-    // All four pairwise multiplications
-    let mul_ll = _mm256_mul_epu32(x, y);
-    let mul_lh = _mm256_mul_epu32(x, y_hi);
-    let mul_hl = _mm256_mul_epu32(x_hi, y);
-    let mul_hh = _mm256_mul_epu32(x_hi, y_hi);
-
-    // Bignum addition
-    // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow.
-    let mul_ll_hi = _mm256_srli_epi64::<32>(mul_ll);
-    let t0 = _mm256_add_epi64(mul_hl, mul_ll_hi);
-    // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow.
-    // Also, extract high 32 bits of t0 and add to mul_hh.
-    let t0_lo = _mm256_and_si256(t0, _mm256_set1_epi64x(u32::MAX.into()));
-    let t0_hi = _mm256_srli_epi64::<32>(t0);
-    let t1 = _mm256_add_epi64(mul_lh, t0_lo);
-    let t2 = _mm256_add_epi64(mul_hh, t0_hi);
-    // Lastly, extract the high 32 bits of t1 and add to t2.
-    let t1_hi = _mm256_srli_epi64::<32>(t1);
-    let res_hi = _mm256_add_epi64(t2, t1_hi);
-
-    // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high
-    // position).
-    let t1_lo = _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(t1)));
-    let res_lo = _mm256_blend_epi32::<0xaa>(mul_ll, t1_lo);
-
-    (res_hi, res_lo)
-}
-
-/// Full 64-bit squaring. This routine is 1.2x faster than the scalar instruction.
-#[inline]
-unsafe fn square64(x: __m256i) -> (__m256i, __m256i) {
-    // Get high 32 bits of x. See comment in mul64_64_s.
-    let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x)));
-
-    // All pairwise multiplications.
-    let mul_ll = _mm256_mul_epu32(x, x);
-    let mul_lh = _mm256_mul_epu32(x, x_hi);
-    let mul_hh = _mm256_mul_epu32(x_hi, x_hi);
-
-    // Bignum addition, but mul_lh is shifted by 33 bits (not 32).
-    let mul_ll_hi = _mm256_srli_epi64::<33>(mul_ll);
-    let t0 = _mm256_add_epi64(mul_lh, mul_ll_hi);
-    let t0_hi = _mm256_srli_epi64::<31>(t0);
-    let res_hi = _mm256_add_epi64(mul_hh, t0_hi);
-
-    // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high
-    // position).
-    let mul_lh_lo = _mm256_slli_epi64::<33>(mul_lh);
-    let res_lo = _mm256_add_epi64(mul_ll, mul_lh_lo);
-
-    (res_hi, res_lo)
-}
-
-/// Multiply two integers modulo FIELD_ORDER.
-#[inline]
-unsafe fn mul<F: ReducibleAvx2>(x: __m256i, y: __m256i) -> __m256i {
-    F::reduce128(mul64_64(x, y))
-}
-
-/// Square an integer modulo FIELD_ORDER.
-#[inline]
-unsafe fn square<F: ReducibleAvx2>(x: __m256i) -> __m256i {
-    F::reduce128(square64(x))
-}
-
-#[inline]
-unsafe fn interleave1(x: __m256i, y: __m256i) -> (__m256i, __m256i) {
-    let a = _mm256_unpacklo_epi64(x, y);
-    let b = _mm256_unpackhi_epi64(x, y);
-    (a, b)
-}
-
-#[inline]
-unsafe fn interleave2(x: __m256i, y: __m256i) -> (__m256i, __m256i) {
-    let y_lo = _mm256_castsi256_si128(y); // This has 0 cost.
-
-    // 1 places y_lo in the high half of x; 0 would place it in the lower half.
-    let a = _mm256_inserti128_si256::<1>(x, y_lo);
-    // NB: _mm256_permute2x128_si256 could be used here as well but _mm256_inserti128_si256 has
-    // lower latency on Zen 3 processors.
-
-    // Each nibble of the constant has the following semantics:
-    //   0 => src1[low 128 bits]
-    //   1 => src1[high 128 bits]
-    //   2 => src2[low 128 bits]
-    //   3 => src2[high 128 bits]
-    // The low (resp. high) nibble chooses the low (resp. high) 128 bits of the result.
-    let b = _mm256_permute2x128_si256::<0x31>(x, y);
-
-    (a, b)
-}
diff --git a/src/field/packed_avx2/common.rs b/src/field/packed_avx2/common.rs
deleted file mode 100644
index 48f9524d..00000000
--- a/src/field/packed_avx2/common.rs
+++ /dev/null
@@ -1,53 +0,0 @@
-use core::arch::x86_64::*;
-
-use crate::field::field_types::PrimeField;
-
-pub trait ReducibleAvx2: PrimeField {
-    unsafe fn reduce128(x: (__m256i, __m256i)) -> __m256i;
-}
-
-const SIGN_BIT: u64 = 1 << 63;
-
-#[inline]
-unsafe fn sign_bit() -> __m256i {
-    _mm256_set1_epi64x(SIGN_BIT as i64)
-}
-
-/// Add 2^63 with overflow. Needed to emulate unsigned comparisons (see point 3. in
-/// packed_prime_field.rs).
-#[inline]
-pub unsafe fn shift(x: __m256i) -> __m256i {
-    _mm256_xor_si256(x, sign_bit())
-}
-
-#[inline]
-pub unsafe fn field_order<F: PrimeField>() -> __m256i {
-    _mm256_set1_epi64x(F::ORDER as i64)
-}
-
-#[inline]
-pub unsafe fn epsilon<F: PrimeField>() -> __m256i {
-    _mm256_set1_epi64x(0u64.wrapping_sub(F::ORDER) as i64)
-}
-
-/// Addition u64 + u64 -> u64. Assumes that x + y < 2^64 + FIELD_ORDER. The second argument is
-/// pre-shifted by 1 << 63. The result is similarly shifted.
-#[inline]
-pub unsafe fn add_no_canonicalize_64_64s_s<F: PrimeField>(x: __m256i, y_s: __m256i) -> __m256i {
-    let res_wrapped_s = _mm256_add_epi64(x, y_s);
-    let mask = _mm256_cmpgt_epi64(y_s, res_wrapped_s); // -1 if overflowed else 0.
-    let wrapback_amt = _mm256_and_si256(mask, epsilon::<F>()); // -FIELD_ORDER if overflowed else 0.
-    let res_s = _mm256_add_epi64(res_wrapped_s, wrapback_amt);
-    res_s
-}
-
-/// Subtraction u64 - u64 -> u64. Assumes that double overflow cannot occur. The first argument is
-/// pre-shifted by 1 << 63 and the result is similarly shifted.
-#[inline]
-pub unsafe fn sub_no_canonicalize_64s_64_s<F: PrimeField>(x_s: __m256i, y: __m256i) -> __m256i {
-    let res_wrapped_s = _mm256_sub_epi64(x_s, y);
-    let mask = _mm256_cmpgt_epi64(res_wrapped_s, x_s); // -1 if overflowed else 0.
-    let wrapback_amt = _mm256_and_si256(mask, epsilon::<F>()); // -FIELD_ORDER if overflowed else 0.
-    let res_s = _mm256_sub_epi64(res_wrapped_s, wrapback_amt);
-    res_s
-}
diff --git a/src/field/packed_avx2/goldilocks.rs b/src/field/packed_avx2/goldilocks.rs
deleted file mode 100644
index 954516b8..00000000
--- a/src/field/packed_avx2/goldilocks.rs
+++ /dev/null
@@ -1,22 +0,0 @@
-use core::arch::x86_64::*;
-
-use crate::field::goldilocks_field::GoldilocksField;
-use crate::field::packed_avx2::common::{
-    add_no_canonicalize_64_64s_s, epsilon, shift, sub_no_canonicalize_64s_64_s, ReducibleAvx2,
-};
-
-/// Reduce a u128 modulo FIELD_ORDER. The input is (u64, u64), pre-shifted by 2^63. The result is
-/// similarly shifted.
-impl ReducibleAvx2 for GoldilocksField {
-    #[inline]
-    unsafe fn reduce128(x: (__m256i, __m256i)) -> __m256i {
-        let (hi0, lo0) = x;
-        let lo0_s = shift(lo0);
-        let hi_hi0 = _mm256_srli_epi64(hi0, 32);
-        let lo1_s = sub_no_canonicalize_64s_64_s::<Self>(lo0_s, hi_hi0);
-        let t1 = _mm256_mul_epu32(hi0, epsilon::<Self>());
-        let lo2_s = add_no_canonicalize_64_64s_s::<Self>(t1, lo1_s);
-        let lo2 = shift(lo2_s);
-        lo2
-    }
-}
diff --git a/src/field/packed_avx2/mod.rs b/src/field/packed_avx2/mod.rs
deleted file mode 100644
index 5f6294a4..00000000
--- a/src/field/packed_avx2/mod.rs
+++ /dev/null
@@ -1,239 +0,0 @@
-mod avx2_prime_field;
-mod common;
-mod goldilocks;
-
-use avx2_prime_field::Avx2PrimeField;
-
-use crate::field::goldilocks_field::GoldilocksField;
-
-pub type PackedGoldilocksAvx2 = Avx2PrimeField<GoldilocksField>;
-
-#[cfg(test)]
-mod tests {
-    use crate::field::goldilocks_field::GoldilocksField;
-    use crate::field::packed_avx2::avx2_prime_field::Avx2PrimeField;
-    use crate::field::packed_avx2::common::ReducibleAvx2;
-    use crate::field::packed_field::PackedField;
-
-    fn test_vals_a<F: ReducibleAvx2>() -> [F; 4] {
-        [
-            F::from_noncanonical_u64(14479013849828404771),
-            F::from_noncanonical_u64(9087029921428221768),
-            F::from_noncanonical_u64(2441288194761790662),
-            F::from_noncanonical_u64(5646033492608483824),
-        ]
-    }
-    fn test_vals_b<F: ReducibleAvx2>() -> [F; 4] {
-        [
-            F::from_noncanonical_u64(17891926589593242302),
-            F::from_noncanonical_u64(11009798273260028228),
-            F::from_noncanonical_u64(2028722748960791447),
-            F::from_noncanonical_u64(7929433601095175579),
-        ]
-    }
-
-    fn test_add<F: ReducibleAvx2>()
-    where
-        [(); Avx2PrimeField::<F>::WIDTH]:,
-    {
-        let a_arr = test_vals_a::<F>();
-        let b_arr = test_vals_b::<F>();
-
-        let packed_a = Avx2PrimeField::<F>::from_arr(a_arr);
-        let packed_b = Avx2PrimeField::<F>::from_arr(b_arr);
-        let packed_res = packed_a + packed_b;
-        let arr_res = packed_res.as_arr();
-
-        let expected = a_arr.iter().zip(b_arr).map(|(&a, b)| a + b);
-        for (exp, res) in expected.zip(arr_res) {
-            assert_eq!(res, exp);
-        }
-    }
-
-    fn test_mul<F: ReducibleAvx2>()
-    where
-        [(); Avx2PrimeField::<F>::WIDTH]:,
-    {
-        let a_arr = test_vals_a::<F>();
-        let b_arr = test_vals_b::<F>();
-
-        let packed_a = Avx2PrimeField::<F>::from_arr(a_arr);
-        let packed_b = Avx2PrimeField::<F>::from_arr(b_arr);
-        let packed_res = packed_a * packed_b;
-        let arr_res = packed_res.as_arr();
-
-        let expected = a_arr.iter().zip(b_arr).map(|(&a, b)| a * b);
-        for (exp, res) in expected.zip(arr_res) {
-            assert_eq!(res, exp);
-        }
-    }
-
-    fn test_square<F: ReducibleAvx2>()
-    where
-        [(); Avx2PrimeField::<F>::WIDTH]:,
-    {
-        let a_arr = test_vals_a::<F>();
-
-        let packed_a = Avx2PrimeField::<F>::from_arr(a_arr);
-        let packed_res = packed_a.square();
-        let arr_res = packed_res.as_arr();
-
-        let expected = a_arr.iter().map(|&a| a.square());
-        for (exp, res) in expected.zip(arr_res) {
-            assert_eq!(res, exp);
-        }
-    }
-
-    fn test_neg<F: ReducibleAvx2>()
-    where
-        [(); Avx2PrimeField::<F>::WIDTH]:,
-    {
-        let a_arr = test_vals_a::<F>();
-
-        let packed_a = Avx2PrimeField::<F>::from_arr(a_arr);
-        let packed_res = -packed_a;
-        let arr_res = packed_res.as_arr();
-
-        let expected = a_arr.iter().map(|&a| -a);
-        for (exp, res) in expected.zip(arr_res) {
-            assert_eq!(res, exp);
-        }
-    }
-
-    fn test_sub<F: ReducibleAvx2>()
-    where
-        [(); Avx2PrimeField::<F>::WIDTH]:,
-    {
-        let a_arr = test_vals_a::<F>();
-        let b_arr = test_vals_b::<F>();
-
-        let packed_a = Avx2PrimeField::<F>::from_arr(a_arr);
-        let packed_b = Avx2PrimeField::<F>::from_arr(b_arr);
-        let packed_res = packed_a - packed_b;
-        let arr_res = packed_res.as_arr();
-
-        let expected = a_arr.iter().zip(b_arr).map(|(&a, b)| a - b);
-        for (exp, res) in expected.zip(arr_res) {
-            assert_eq!(res, exp);
-        }
-    }
-
-    fn test_interleave_is_involution<F: ReducibleAvx2>()
-    where
-        [(); Avx2PrimeField::<F>::WIDTH]:,
-    {
-        let a_arr = test_vals_a::<F>();
-        let b_arr = test_vals_b::<F>();
-
-        let packed_a = Avx2PrimeField::<F>::from_arr(a_arr);
-        let packed_b = Avx2PrimeField::<F>::from_arr(b_arr);
-        {
-            // Interleave, then deinterleave.
-            let (x, y) = packed_a.interleave(packed_b, 1);
-            let (res_a, res_b) = x.interleave(y, 1);
-            assert_eq!(res_a.as_arr(), a_arr);
-            assert_eq!(res_b.as_arr(), b_arr);
-        }
-        {
-            let (x, y) = packed_a.interleave(packed_b, 2);
-            let (res_a, res_b) = x.interleave(y, 2);
-            assert_eq!(res_a.as_arr(), a_arr);
-            assert_eq!(res_b.as_arr(), b_arr);
-        }
-        {
-            let (x, y) = packed_a.interleave(packed_b, 4);
-            let (res_a, res_b) = x.interleave(y, 4);
-            assert_eq!(res_a.as_arr(), a_arr);
-            assert_eq!(res_b.as_arr(), b_arr);
-        }
-    }
-
-    fn test_interleave<F: ReducibleAvx2>()
-    where
-        [(); Avx2PrimeField::<F>::WIDTH]:,
-    {
-        let in_a: [F; 4] = [
-            F::from_noncanonical_u64(00),
-            F::from_noncanonical_u64(01),
-            F::from_noncanonical_u64(02),
-            F::from_noncanonical_u64(03),
-        ];
-        let in_b: [F; 4] = [
-            F::from_noncanonical_u64(10),
-            F::from_noncanonical_u64(11),
-            F::from_noncanonical_u64(12),
-            F::from_noncanonical_u64(13),
-        ];
-        let int1_a: [F; 4] = [
-            F::from_noncanonical_u64(00),
-            F::from_noncanonical_u64(10),
-            F::from_noncanonical_u64(02),
-            F::from_noncanonical_u64(12),
-        ];
-        let int1_b: [F; 4] = [
-            F::from_noncanonical_u64(01),
-            F::from_noncanonical_u64(11),
-            F::from_noncanonical_u64(03),
-            F::from_noncanonical_u64(13),
-        ];
-        let int2_a: [F; 4] = [
-            F::from_noncanonical_u64(00),
-            F::from_noncanonical_u64(01),
-            F::from_noncanonical_u64(10),
-            F::from_noncanonical_u64(11),
-        ];
-        let int2_b: [F; 4] = [
-            F::from_noncanonical_u64(02),
-            F::from_noncanonical_u64(03),
-            F::from_noncanonical_u64(12),
-            F::from_noncanonical_u64(13),
-        ];
-
-        let packed_a = Avx2PrimeField::<F>::from_arr(in_a);
-        let packed_b = Avx2PrimeField::<F>::from_arr(in_b);
-        {
-            let (x1, y1) = packed_a.interleave(packed_b, 1);
-            assert_eq!(x1.as_arr(), int1_a);
-            assert_eq!(y1.as_arr(), int1_b);
-        }
-        {
-            let (x2, y2) = packed_a.interleave(packed_b, 2);
-            assert_eq!(x2.as_arr(), int2_a);
-            assert_eq!(y2.as_arr(), int2_b);
-        }
-        {
-            let (x4, y4) = packed_a.interleave(packed_b, 4);
-            assert_eq!(x4.as_arr(), in_a);
-            assert_eq!(y4.as_arr(), in_b);
-        }
-    }
-
-    #[test]
-    fn test_add_goldilocks() {
-        test_add::<GoldilocksField>();
-    }
-    #[test]
-    fn test_mul_goldilocks() {
-        test_mul::<GoldilocksField>();
-    }
-    #[test]
-    fn test_square_goldilocks() {
-        test_square::<GoldilocksField>();
-    }
-    #[test]
-    fn test_neg_goldilocks() {
-        test_neg::<GoldilocksField>();
-    }
-    #[test]
-    fn test_sub_goldilocks() {
-        test_sub::<GoldilocksField>();
-    }
-    #[test]
-    fn test_interleave_is_involution_goldilocks() {
-        test_interleave_is_involution::<GoldilocksField>();
-    }
-    #[test]
-    fn test_interleave_goldilocks() {
-        test_interleave::<GoldilocksField>();
-    }
-}
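Reviewer note (not part of the patch): a minimal sketch of how downstream code can consume the packing selected by `Packable` after this change. It assumes the crate-internal module paths shown in the diff, that the `Packable` trait lives in `crate::field::packable`, and that packed values support `+` as the AVX2 implementation above does; the function and local names are hypothetical.

// Sketch: process a slice of Goldilocks elements WIDTH at a time via the Packable/PackedField API.
use crate::field::goldilocks_field::GoldilocksField;
use crate::field::packable::Packable;
use crate::field::packed_field::PackedField;

fn add_in_place(xs: &mut [GoldilocksField], ys: &[GoldilocksField]) {
    type P = <GoldilocksField as Packable>::Packing;
    assert_eq!(xs.len(), ys.len());
    let width = P::WIDTH;
    let packed_len = xs.len() / width * width;
    // Vectorized portion: reinterpret width-sized chunks as packed values (from_slice asserts the length).
    for (x_chunk, y_chunk) in xs[..packed_len]
        .chunks_exact_mut(width)
        .zip(ys[..packed_len].chunks_exact(width))
    {
        let x = P::from_slice_mut(x_chunk);
        let y = P::from_slice(y_chunk);
        *x = *x + *y;
    }
    // Scalar tail for lengths that are not a multiple of WIDTH.
    for (x, y) in xs[packed_len..].iter_mut().zip(&ys[packed_len..]) {
        *x = *x + *y;
    }
}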