From f072d09ae43a525e72cff8e1dfbc3f8276393fc0 Mon Sep 17 00:00:00 2001 From: Jakub Nabaglo Date: Thu, 6 Jan 2022 09:19:32 -0800 Subject: [PATCH] AVX-512 packed Goldilocks (#400) * WIP AVX-512 Goldilocks * Fix tests * fmt * Hamish PR comment --- .../arch/x86_64/avx512_goldilocks_field.rs | 656 ++++++++++++++++++ field/src/arch/x86_64/mod.rs | 20 +- field/src/lib.rs | 1 + field/src/packable.rs | 24 +- 4 files changed, 699 insertions(+), 2 deletions(-) create mode 100644 field/src/arch/x86_64/avx512_goldilocks_field.rs diff --git a/field/src/arch/x86_64/avx512_goldilocks_field.rs b/field/src/arch/x86_64/avx512_goldilocks_field.rs new file mode 100644 index 00000000..ede87626 --- /dev/null +++ b/field/src/arch/x86_64/avx512_goldilocks_field.rs @@ -0,0 +1,656 @@ +use core::arch::x86_64::*; +use std::fmt; +use std::fmt::{Debug, Formatter}; +use std::iter::{Product, Sum}; +use std::mem::transmute; +use std::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign}; + +use crate::field_types::{Field, PrimeField}; +use crate::goldilocks_field::GoldilocksField; +use crate::ops::Square; +use crate::packed_field::PackedField; + +// Ideally `Avx512GoldilocksField` would wrap `__m512i`. Unfortunately, `__m512i` has an alignment +// of 64B, which would preclude us from casting `[GoldilocksField; 8]` (alignment 8B) to +// `Avx512GoldilocksField`. We need to ensure that `Avx512GoldilocksField` has the same alignment as +// `GoldilocksField`. Thus we wrap `[GoldilocksField; 8]` and use the `new` and `get` methods to +// convert to and from `__m512i`. 
+#[derive(Copy, Clone)] +#[repr(transparent)] +pub struct Avx512GoldilocksField(pub [GoldilocksField; 8]); + +impl Avx512GoldilocksField { + #[inline] + fn new(x: __m512i) -> Self { + unsafe { transmute(x) } + } + #[inline] + fn get(&self) -> __m512i { + unsafe { transmute(*self) } + } +} + +unsafe impl PackedField for Avx512GoldilocksField { + const WIDTH: usize = 8; + + type Scalar = GoldilocksField; + + const ZEROS: Self = Self([GoldilocksField::ZERO; 8]); + const ONES: Self = Self([GoldilocksField::ONE; 8]); + + #[inline] + fn from_arr(arr: [Self::Scalar; Self::WIDTH]) -> Self { + Self(arr) + } + + #[inline] + fn as_arr(&self) -> [Self::Scalar; Self::WIDTH] { + self.0 + } + + #[inline] + fn from_slice(slice: &[Self::Scalar]) -> &Self { + assert_eq!(slice.len(), Self::WIDTH); + unsafe { &*slice.as_ptr().cast() } + } + #[inline] + fn from_slice_mut(slice: &mut [Self::Scalar]) -> &mut Self { + assert_eq!(slice.len(), Self::WIDTH); + unsafe { &mut *slice.as_mut_ptr().cast() } + } + #[inline] + fn as_slice(&self) -> &[Self::Scalar] { + &self.0[..] + } + #[inline] + fn as_slice_mut(&mut self) -> &mut [Self::Scalar] { + &mut self.0[..] 
+ } + + #[inline] + fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) { + let (v0, v1) = (self.get(), other.get()); + let (res0, res1) = match block_len { + 1 => unsafe { interleave1(v0, v1) }, + 2 => unsafe { interleave2(v0, v1) }, + 4 => unsafe { interleave4(v0, v1) }, + 8 => (v0, v1), + _ => panic!("unsupported block_len"), + }; + (Self::new(res0), Self::new(res1)) + } +} + +impl Add for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn add(self, rhs: Self) -> Self { + Self::new(unsafe { add(self.get(), rhs.get()) }) + } +} +impl Add<GoldilocksField> for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn add(self, rhs: GoldilocksField) -> Self { + self + Self::from(rhs) + } +} +impl Add<Avx512GoldilocksField> for GoldilocksField { + type Output = Avx512GoldilocksField; + #[inline] + fn add(self, rhs: Self::Output) -> Self::Output { + Self::Output::from(self) + rhs + } +} +impl AddAssign for Avx512GoldilocksField { + #[inline] + fn add_assign(&mut self, rhs: Self) { + *self = *self + rhs; + } +} +impl AddAssign<GoldilocksField> for Avx512GoldilocksField { + #[inline] + fn add_assign(&mut self, rhs: GoldilocksField) { + *self = *self + rhs; + } +} + +impl Debug for Avx512GoldilocksField { + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "({:?})", self.get()) + } +} + +impl Default for Avx512GoldilocksField { + #[inline] + fn default() -> Self { + Self::ZEROS + } +} + +impl Div<GoldilocksField> for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn div(self, rhs: GoldilocksField) -> Self { + self * rhs.inverse() + } +} +impl DivAssign<GoldilocksField> for Avx512GoldilocksField { + #[inline] + fn div_assign(&mut self, rhs: GoldilocksField) { + *self *= rhs.inverse(); + } +} + +impl From<GoldilocksField> for Avx512GoldilocksField { + fn from(x: GoldilocksField) -> Self { + Self([x; 8]) + } +} + +impl Mul for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn mul(self, rhs: Self) -> Self { + Self::new(unsafe { mul(self.get(), rhs.get()) }) + } +} +impl Mul<GoldilocksField> for 
Avx512GoldilocksField { + type Output = Self; + #[inline] + fn mul(self, rhs: GoldilocksField) -> Self { + self * Self::from(rhs) + } +} +impl Mul<Avx512GoldilocksField> for GoldilocksField { + type Output = Avx512GoldilocksField; + #[inline] + fn mul(self, rhs: Avx512GoldilocksField) -> Self::Output { + Self::Output::from(self) * rhs + } +} +impl MulAssign for Avx512GoldilocksField { + #[inline] + fn mul_assign(&mut self, rhs: Self) { + *self = *self * rhs; + } +} +impl MulAssign<GoldilocksField> for Avx512GoldilocksField { + #[inline] + fn mul_assign(&mut self, rhs: GoldilocksField) { + *self = *self * rhs; + } +} + +impl Neg for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn neg(self) -> Self { + Self::new(unsafe { neg(self.get()) }) + } +} + +impl Product for Avx512GoldilocksField { + #[inline] + fn product<I: Iterator<Item = Self>>(iter: I) -> Self { + iter.reduce(|x, y| x * y).unwrap_or(Self::ONES) + } +} + +impl Square for Avx512GoldilocksField { + #[inline] + fn square(&self) -> Self { + Self::new(unsafe { square(self.get()) }) + } +} + +impl Sub for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn sub(self, rhs: Self) -> Self { + Self::new(unsafe { sub(self.get(), rhs.get()) }) + } +} +impl Sub<GoldilocksField> for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn sub(self, rhs: GoldilocksField) -> Self { + self - Self::from(rhs) + } +} +impl Sub<Avx512GoldilocksField> for GoldilocksField { + type Output = Avx512GoldilocksField; + #[inline] + fn sub(self, rhs: Avx512GoldilocksField) -> Self::Output { + Self::Output::from(self) - rhs + } +} +impl SubAssign for Avx512GoldilocksField { + #[inline] + fn sub_assign(&mut self, rhs: Self) { + *self = *self - rhs; + } +} +impl SubAssign<GoldilocksField> for Avx512GoldilocksField { + #[inline] + fn sub_assign(&mut self, rhs: GoldilocksField) { + *self = *self - rhs; + } +} + +impl Sum for Avx512GoldilocksField { + #[inline] + fn sum<I: Iterator<Item = Self>>(iter: I) -> Self { + iter.reduce(|x, y| x + y).unwrap_or(Self::ZEROS) + } +} + +const FIELD_ORDER: __m512i = unsafe { transmute([GoldilocksField::ORDER; 8]) }; 
+const EPSILON: __m512i = unsafe { transmute([GoldilocksField::ORDER.wrapping_neg(); 8]) }; + +#[inline] +unsafe fn canonicalize(x: __m512i) -> __m512i { + let mask = _mm512_cmpge_epu64_mask(x, FIELD_ORDER); + _mm512_mask_sub_epi64(x, mask, x, FIELD_ORDER) +} + +#[inline] +unsafe fn add_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i { + let res_wrapped = _mm512_add_epi64(x, y); + let mask = _mm512_cmplt_epu64_mask(res_wrapped, y); // mask set if add overflowed + let res = _mm512_mask_sub_epi64(res_wrapped, mask, res_wrapped, FIELD_ORDER); + res +} + +#[inline] +unsafe fn sub_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i { + let mask = _mm512_cmplt_epu64_mask(x, y); // mask set if sub will underflow (x < y) + let res_wrapped = _mm512_sub_epi64(x, y); + let res = _mm512_mask_add_epi64(res_wrapped, mask, res_wrapped, FIELD_ORDER); + res +} + +#[inline] +unsafe fn add(x: __m512i, y: __m512i) -> __m512i { + add_no_double_overflow_64_64(x, canonicalize(y)) +} + +#[inline] +unsafe fn sub(x: __m512i, y: __m512i) -> __m512i { + sub_no_double_overflow_64_64(x, canonicalize(y)) +} + +#[inline] +unsafe fn neg(y: __m512i) -> __m512i { + _mm512_sub_epi64(FIELD_ORDER, canonicalize(y)) +} + +const LO_32_BITS_MASK: __mmask16 = unsafe { transmute(0b0101010101010101u16) }; + +#[inline] +unsafe fn mul64_64(x: __m512i, y: __m512i) -> (__m512i, __m512i) { + // We want to move the high 32 bits to the low position. The multiplication instruction ignores + // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can + // be done on port 5; bitshifts run on port 0, competing with multiplication. + // This instruction is only provided for 32-bit floats, not integers. Idk why Intel makes the + // distinction; the casts are free and it guarantees that the exact bit pattern is preserved. + // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency + // since Haswell. 
+ let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x))); + let y_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(y))); + + // All four pairwise multiplications + let mul_ll = _mm512_mul_epu32(x, y); + let mul_lh = _mm512_mul_epu32(x, y_hi); + let mul_hl = _mm512_mul_epu32(x_hi, y); + let mul_hh = _mm512_mul_epu32(x_hi, y_hi); + + // Bignum addition + // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow. + let mul_ll_hi = _mm512_srli_epi64::<32>(mul_ll); + let t0 = _mm512_add_epi64(mul_hl, mul_ll_hi); + // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow. + // Also, extract high 32 bits of t0 and add to mul_hh. + let t0_lo = _mm512_and_si512(t0, EPSILON); + let t0_hi = _mm512_srli_epi64::<32>(t0); + let t1 = _mm512_add_epi64(mul_lh, t0_lo); + let t2 = _mm512_add_epi64(mul_hh, t0_hi); + // Lastly, extract the high 32 bits of t1 and add to t2. + let t1_hi = _mm512_srli_epi64::<32>(t1); + let res_hi = _mm512_add_epi64(t2, t1_hi); + + // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high + // position). + let t1_lo = _mm512_castps_si512(_mm512_moveldup_ps(_mm512_castsi512_ps(t1))); + let res_lo = _mm512_mask_blend_epi32(LO_32_BITS_MASK, t1_lo, mul_ll); + + (res_hi, res_lo) +} + +#[inline] +unsafe fn square64(x: __m512i) -> (__m512i, __m512i) { + // Get high 32 bits of x. See comment in mul64_64_s. + let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x))); + + // All pairwise multiplications. + let mul_ll = _mm512_mul_epu32(x, x); + let mul_lh = _mm512_mul_epu32(x, x_hi); + let mul_hh = _mm512_mul_epu32(x_hi, x_hi); + + // Bignum addition, but mul_lh is shifted by 33 bits (not 32). 
+ let mul_ll_hi = _mm512_srli_epi64::<33>(mul_ll); + let t0 = _mm512_add_epi64(mul_lh, mul_ll_hi); + let t0_hi = _mm512_srli_epi64::<31>(t0); + let res_hi = _mm512_add_epi64(mul_hh, t0_hi); + + // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high + // position). + let mul_lh_lo = _mm512_slli_epi64::<33>(mul_lh); + let res_lo = _mm512_add_epi64(mul_ll, mul_lh_lo); + + (res_hi, res_lo) +} + +#[inline] +unsafe fn reduce128(x: (__m512i, __m512i)) -> __m512i { + let (hi0, lo0) = x; + let hi_hi0 = _mm512_srli_epi64::<32>(hi0); + let lo1 = sub_no_double_overflow_64_64(lo0, hi_hi0); + let t1 = _mm512_mul_epu32(hi0, EPSILON); + let lo2 = add_no_double_overflow_64_64(lo1, t1); + lo2 +} + +#[inline] +unsafe fn mul(x: __m512i, y: __m512i) -> __m512i { + reduce128(mul64_64(x, y)) +} + +#[inline] +unsafe fn square(x: __m512i) -> __m512i { + reduce128(square64(x)) +} + +#[inline] +unsafe fn interleave1(x: __m512i, y: __m512i) -> (__m512i, __m512i) { + let a = _mm512_unpacklo_epi64(x, y); + let b = _mm512_unpackhi_epi64(x, y); + (a, b) +} + +const INTERLEAVE2_IDX_A: __m512i = unsafe { + transmute([ + 0o00u64, 0o01u64, 0o10u64, 0o11u64, 0o04u64, 0o05u64, 0o14u64, 0o15u64, + ]) +}; +const INTERLEAVE2_IDX_B: __m512i = unsafe { + transmute([ + 0o02u64, 0o03u64, 0o12u64, 0o13u64, 0o06u64, 0o07u64, 0o16u64, 0o17u64, + ]) +}; + +#[inline] +unsafe fn interleave2(x: __m512i, y: __m512i) -> (__m512i, __m512i) { + let a = _mm512_permutex2var_epi64(x, INTERLEAVE2_IDX_A, y); + let b = _mm512_permutex2var_epi64(x, INTERLEAVE2_IDX_B, y); + (a, b) +} + +#[inline] +unsafe fn interleave4(x: __m512i, y: __m512i) -> (__m512i, __m512i) { + let a = _mm512_shuffle_i64x2::<0x44>(x, y); + let b = _mm512_shuffle_i64x2::<0xee>(x, y); + (a, b) +} + +#[cfg(test)] +mod tests { + use crate::arch::x86_64::avx512_goldilocks_field::Avx512GoldilocksField; + use crate::field_types::PrimeField; + use crate::goldilocks_field::GoldilocksField; + use crate::ops::Square; + use 
crate::packed_field::PackedField; + + fn test_vals_a() -> [GoldilocksField; 8] { + [ + GoldilocksField::from_noncanonical_u64(14479013849828404771), + GoldilocksField::from_noncanonical_u64(9087029921428221768), + GoldilocksField::from_noncanonical_u64(2441288194761790662), + GoldilocksField::from_noncanonical_u64(5646033492608483824), + GoldilocksField::from_noncanonical_u64(2779181197214900072), + GoldilocksField::from_noncanonical_u64(2989742820063487116), + GoldilocksField::from_noncanonical_u64(727880025589250743), + GoldilocksField::from_noncanonical_u64(3803926346107752679), + ] + } + fn test_vals_b() -> [GoldilocksField; 8] { + [ + GoldilocksField::from_noncanonical_u64(17891926589593242302), + GoldilocksField::from_noncanonical_u64(11009798273260028228), + GoldilocksField::from_noncanonical_u64(2028722748960791447), + GoldilocksField::from_noncanonical_u64(7929433601095175579), + GoldilocksField::from_noncanonical_u64(6632528436085461172), + GoldilocksField::from_noncanonical_u64(2145438710786785567), + GoldilocksField::from_noncanonical_u64(11821483668392863016), + GoldilocksField::from_noncanonical_u64(15638272883309521929), + ] + } + + #[test] + fn test_add() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + + let packed_a = Avx512GoldilocksField::from_arr(a_arr); + let packed_b = Avx512GoldilocksField::from_arr(b_arr); + let packed_res = packed_a + packed_b; + let arr_res = packed_res.as_arr(); + + let expected = a_arr.iter().zip(b_arr).map(|(&a, b)| a + b); + for (exp, res) in expected.zip(arr_res) { + assert_eq!(res, exp); + } + } + + #[test] + fn test_mul() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + + let packed_a = Avx512GoldilocksField::from_arr(a_arr); + let packed_b = Avx512GoldilocksField::from_arr(b_arr); + let packed_res = packed_a * packed_b; + let arr_res = packed_res.as_arr(); + + let expected = a_arr.iter().zip(b_arr).map(|(&a, b)| a * b); + for (exp, res) in expected.zip(arr_res) { + assert_eq!(res, exp); + 
} + } + + #[test] + fn test_square() { + let a_arr = test_vals_a(); + + let packed_a = Avx512GoldilocksField::from_arr(a_arr); + let packed_res = packed_a.square(); + let arr_res = packed_res.as_arr(); + + let expected = a_arr.iter().map(|&a| a.square()); + for (exp, res) in expected.zip(arr_res) { + assert_eq!(res, exp); + } + } + + #[test] + fn test_neg() { + let a_arr = test_vals_a(); + + let packed_a = Avx512GoldilocksField::from_arr(a_arr); + let packed_res = -packed_a; + let arr_res = packed_res.as_arr(); + + let expected = a_arr.iter().map(|&a| -a); + for (exp, res) in expected.zip(arr_res) { + assert_eq!(res, exp); + } + } + + #[test] + fn test_sub() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + + let packed_a = Avx512GoldilocksField::from_arr(a_arr); + let packed_b = Avx512GoldilocksField::from_arr(b_arr); + let packed_res = packed_a - packed_b; + let arr_res = packed_res.as_arr(); + + let expected = a_arr.iter().zip(b_arr).map(|(&a, b)| a - b); + for (exp, res) in expected.zip(arr_res) { + assert_eq!(res, exp); + } + } + + #[test] + fn test_interleave_is_involution() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + + let packed_a = Avx512GoldilocksField::from_arr(a_arr); + let packed_b = Avx512GoldilocksField::from_arr(b_arr); + { + // Interleave, then deinterleave. 
+ let (x, y) = packed_a.interleave(packed_b, 1); + let (res_a, res_b) = x.interleave(y, 1); + assert_eq!(res_a.as_arr(), a_arr); + assert_eq!(res_b.as_arr(), b_arr); + } + { + let (x, y) = packed_a.interleave(packed_b, 2); + let (res_a, res_b) = x.interleave(y, 2); + assert_eq!(res_a.as_arr(), a_arr); + assert_eq!(res_b.as_arr(), b_arr); + } + { + let (x, y) = packed_a.interleave(packed_b, 4); + let (res_a, res_b) = x.interleave(y, 4); + assert_eq!(res_a.as_arr(), a_arr); + assert_eq!(res_b.as_arr(), b_arr); + } + { + let (x, y) = packed_a.interleave(packed_b, 8); + let (res_a, res_b) = x.interleave(y, 8); + assert_eq!(res_a.as_arr(), a_arr); + assert_eq!(res_b.as_arr(), b_arr); + } + } + + #[test] + fn test_interleave() { + let in_a: [GoldilocksField; 8] = [ + GoldilocksField::from_noncanonical_u64(00), + GoldilocksField::from_noncanonical_u64(01), + GoldilocksField::from_noncanonical_u64(02), + GoldilocksField::from_noncanonical_u64(03), + GoldilocksField::from_noncanonical_u64(04), + GoldilocksField::from_noncanonical_u64(05), + GoldilocksField::from_noncanonical_u64(06), + GoldilocksField::from_noncanonical_u64(07), + ]; + let in_b: [GoldilocksField; 8] = [ + GoldilocksField::from_noncanonical_u64(10), + GoldilocksField::from_noncanonical_u64(11), + GoldilocksField::from_noncanonical_u64(12), + GoldilocksField::from_noncanonical_u64(13), + GoldilocksField::from_noncanonical_u64(14), + GoldilocksField::from_noncanonical_u64(15), + GoldilocksField::from_noncanonical_u64(16), + GoldilocksField::from_noncanonical_u64(17), + ]; + let int1_a: [GoldilocksField; 8] = [ + GoldilocksField::from_noncanonical_u64(00), + GoldilocksField::from_noncanonical_u64(10), + GoldilocksField::from_noncanonical_u64(02), + GoldilocksField::from_noncanonical_u64(12), + GoldilocksField::from_noncanonical_u64(04), + GoldilocksField::from_noncanonical_u64(14), + GoldilocksField::from_noncanonical_u64(06), + GoldilocksField::from_noncanonical_u64(16), + ]; + let int1_b: [GoldilocksField; 8] 
= [ + GoldilocksField::from_noncanonical_u64(01), + GoldilocksField::from_noncanonical_u64(11), + GoldilocksField::from_noncanonical_u64(03), + GoldilocksField::from_noncanonical_u64(13), + GoldilocksField::from_noncanonical_u64(05), + GoldilocksField::from_noncanonical_u64(15), + GoldilocksField::from_noncanonical_u64(07), + GoldilocksField::from_noncanonical_u64(17), + ]; + let int2_a: [GoldilocksField; 8] = [ + GoldilocksField::from_noncanonical_u64(00), + GoldilocksField::from_noncanonical_u64(01), + GoldilocksField::from_noncanonical_u64(10), + GoldilocksField::from_noncanonical_u64(11), + GoldilocksField::from_noncanonical_u64(04), + GoldilocksField::from_noncanonical_u64(05), + GoldilocksField::from_noncanonical_u64(14), + GoldilocksField::from_noncanonical_u64(15), + ]; + let int2_b: [GoldilocksField; 8] = [ + GoldilocksField::from_noncanonical_u64(02), + GoldilocksField::from_noncanonical_u64(03), + GoldilocksField::from_noncanonical_u64(12), + GoldilocksField::from_noncanonical_u64(13), + GoldilocksField::from_noncanonical_u64(06), + GoldilocksField::from_noncanonical_u64(07), + GoldilocksField::from_noncanonical_u64(16), + GoldilocksField::from_noncanonical_u64(17), + ]; + let int4_a: [GoldilocksField; 8] = [ + GoldilocksField::from_noncanonical_u64(00), + GoldilocksField::from_noncanonical_u64(01), + GoldilocksField::from_noncanonical_u64(02), + GoldilocksField::from_noncanonical_u64(03), + GoldilocksField::from_noncanonical_u64(10), + GoldilocksField::from_noncanonical_u64(11), + GoldilocksField::from_noncanonical_u64(12), + GoldilocksField::from_noncanonical_u64(13), + ]; + let int4_b: [GoldilocksField; 8] = [ + GoldilocksField::from_noncanonical_u64(04), + GoldilocksField::from_noncanonical_u64(05), + GoldilocksField::from_noncanonical_u64(06), + GoldilocksField::from_noncanonical_u64(07), + GoldilocksField::from_noncanonical_u64(14), + GoldilocksField::from_noncanonical_u64(15), + GoldilocksField::from_noncanonical_u64(16), + 
GoldilocksField::from_noncanonical_u64(17), + ]; + + let packed_a = Avx512GoldilocksField::from_arr(in_a); + let packed_b = Avx512GoldilocksField::from_arr(in_b); + { + let (x1, y1) = packed_a.interleave(packed_b, 1); + assert_eq!(x1.as_arr(), int1_a); + assert_eq!(y1.as_arr(), int1_b); + } + { + let (x2, y2) = packed_a.interleave(packed_b, 2); + assert_eq!(x2.as_arr(), int2_a); + assert_eq!(y2.as_arr(), int2_b); + } + { + let (x4, y4) = packed_a.interleave(packed_b, 4); + assert_eq!(x4.as_arr(), int4_a); + assert_eq!(y4.as_arr(), int4_b); + } + { + let (x8, y8) = packed_a.interleave(packed_b, 8); + assert_eq!(x8.as_arr(), in_a); + assert_eq!(y8.as_arr(), in_b); + } + } +} diff --git a/field/src/arch/x86_64/mod.rs b/field/src/arch/x86_64/mod.rs index bd9dccae..326deb78 100644 --- a/field/src/arch/x86_64/mod.rs +++ b/field/src/arch/x86_64/mod.rs @@ -1,2 +1,20 @@ -#[cfg(target_feature = "avx2")] +#[cfg(all( + target_feature = "avx2", + not(all( + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512vl" + )) +))] pub mod avx2_goldilocks_field; + +#[cfg(all( + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512vl" +))] +pub mod avx512_goldilocks_field; diff --git a/field/src/lib.rs b/field/src/lib.rs index 47dd9ccb..f190bcbc 100644 --- a/field/src/lib.rs +++ b/field/src/lib.rs @@ -7,6 +7,7 @@ #![allow(clippy::return_self_not_must_use)] #![feature(generic_const_exprs)] #![feature(specialization)] +#![feature(stdsimd)] pub(crate) mod arch; pub mod batch_util; diff --git a/field/src/packable.rs b/field/src/packable.rs index 754a7fb6..18fe07f7 100644 --- a/field/src/packable.rs +++ b/field/src/packable.rs @@ -12,7 +12,29 @@ impl Packable for F { default type Packing = Self; } -#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))] +#[cfg(all( + target_arch = "x86_64", + 
target_feature = "avx2", + not(all( + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512vl" + )) +))] impl Packable for crate::goldilocks_field::GoldilocksField { type Packing = crate::arch::x86_64::avx2_goldilocks_field::Avx2GoldilocksField; } + +#[cfg(all( + target_arch = "x86_64", + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512vl" +))] +impl Packable for crate::goldilocks_field::GoldilocksField { + type Packing = crate::arch::x86_64::avx512_goldilocks_field::Avx512GoldilocksField; +}