Many small optimizations to scalar Poseidon (#253)

* Many small optimizations

* Lints

* Hamish comments

* Inlining
This commit is contained in:
Jakub Nabaglo 2021-09-20 10:52:12 -07:00 committed by GitHub
parent 471ace6d85
commit 1369dd7c76
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 160 additions and 93 deletions

View File

@ -93,6 +93,11 @@ impl Field for CrandallField {
reduce128(n)
}
#[inline]
fn from_noncanonical_u96(n: (u64, u32)) -> Self {
reduce96(n)
}
fn rand_from_rng<R: Rng>(rng: &mut R) -> Self {
Self::from_canonical_u64(rng.gen_range(0..Self::ORDER))
}
@ -173,6 +178,12 @@ impl Field for CrandallField {
let x74 = x73 * x39;
x74
}
#[inline]
fn multiply_accumulate(&self, x: Self, y: Self) -> Self {
// u64 + u64 * u64 cannot overflow.
reduce128((self.0 as u128) + (x.0 as u128) * (y.0 as u128))
}
}
impl PrimeField for CrandallField {
@ -197,6 +208,16 @@ impl PrimeField for CrandallField {
fn from_noncanonical_u64(n: u64) -> Self {
Self(n)
}
#[inline]
/// Faster addition for when we know that rhs <= ORDER. If this is the case, then the
/// .to_canonical_u64() that addition usually performs is unnecessary. Omitting it saves three
/// instructions. This function is marked unsafe because it may yield incorrect results if the
/// condition is not satisfied.
unsafe fn add_canonical_u64(&self, rhs: u64) -> Self {
let (sum, over) = self.0.overflowing_add(rhs);
Self(sum.wrapping_sub((over as u64) * Self::ORDER))
}
}
impl Neg for CrandallField {
@ -218,8 +239,9 @@ impl Add for CrandallField {
#[inline]
#[allow(clippy::suspicious_arithmetic_impl)]
fn add(self, rhs: Self) -> Self {
let (sum, over) = self.0.overflowing_add(rhs.to_canonical_u64());
Self(sum.wrapping_sub((over as u64) * Self::ORDER))
let rhs_canonical = rhs.to_canonical_u64();
// rhs_canonical is definitely canonical, so below is safe.
unsafe { self.add_canonical_u64(rhs_canonical) }
}
}
@ -323,17 +345,6 @@ impl Extendable<4> for CrandallField {
impl RichField for CrandallField {}
/// Faster addition for when we know that lhs.0 + rhs.0 < 2^64 + FIELD_ORDER. If this is the case,
/// then the .to_canonical_u64() that addition usually performs is unnecessary. Omitting it saves
/// three instructions.
/// This function is marked unsafe because it may yield incorrect result if the condition is not
/// satisfied.
#[inline]
unsafe fn add_no_canonicalize(lhs: CrandallField, rhs: CrandallField) -> CrandallField {
let (sum, over) = lhs.0.overflowing_add(rhs.0);
CrandallField(sum.wrapping_sub((over as u64) * CrandallField::ORDER))
}
/// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the
/// field order and `2^64`.
#[inline]
@ -345,13 +356,21 @@ fn reduce128(x: u128) -> CrandallField {
// product will fit in 64 bits.
let (lo_1, hi_1) = split(x);
let (lo_2, hi_2) = split((EPSILON as u128) * (hi_1 as u128) + (lo_1 as u128));
let lo_3 = hi_2 * EPSILON;
reduce96((lo_2, hi_2 as u32)) // hi_2 will always fit in a u32.
}
/// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the
/// field order and `2^64`.
#[inline]
fn reduce96((x_lo, x_hi): (u64, u32)) -> CrandallField {
// This is Crandall's algorithm. See reduce128.
let t = (x_hi as u64) * EPSILON;
unsafe {
// This is safe to do because lo_2 + lo_3 < 2^64 + Self::ORDER. Notice that hi_2 <=
// 2^32 - 1. Then lo_3 = hi_2 * EPSILON <= (2^32 - 1) * EPSILON < Self::ORDER.
// This is safe to do because x_lo + t < 2^64 + Self::ORDER. Notice that x_hi <= 2^32 - 1.
// Then t = x_hi * EPSILON <= (2^32 - 1) * EPSILON < Self::ORDER.
// Use of standard addition here would make multiplication 20% more expensive.
add_no_canonicalize(CrandallField(lo_2), CrandallField(lo_3))
CrandallField(x_lo).add_canonical_u64(t)
}
}

View File

@ -204,6 +204,15 @@ pub trait Field:
/// Returns `n % Self::CHARACTERISTIC`.
fn from_noncanonical_u128(n: u128) -> Self;
/// Returns `n % Self::CHARACTERISTIC`. May be cheaper than from_noncanonical_u128 when we know
/// that n < 2 ** 96.
#[inline]
fn from_noncanonical_u96((n_lo, n_hi): (u64, u32)) -> Self {
// Default implementation.
let n: u128 = ((n_hi as u128) << 64) + (n_lo as u128);
Self::from_noncanonical_u128(n)
}
fn rand_from_rng<R: Rng>(rng: &mut R) -> Self;
fn exp_power_of_2(&self, power_log: usize) -> Self {
@ -299,6 +308,13 @@ pub trait Field:
fn coset_shift() -> Self {
Self::MULTIPLICATIVE_GROUP_GENERATOR
}
/// Equivalent to *self + x * y, but may be cheaper.
#[inline]
fn multiply_accumulate(&self, x: Self, y: Self) -> Self {
// Default implementation.
*self + x * y
}
}
/// A finite field of prime order less than 2^64.
@ -310,6 +326,15 @@ pub trait PrimeField: Field {
fn to_noncanonical_u64(&self) -> u64;
fn from_noncanonical_u64(n: u64) -> Self;
/// Equivalent to *self + Self::from_canonical_u64(rhs), but may be cheaper. The caller must
/// ensure that 0 <= rhs < Self::ORDER. The function may return incorrect results if this
/// precondition is not met. It is marked unsafe for this reason.
#[inline]
unsafe fn add_canonical_u64(&self, rhs: u64) -> Self {
// Default implementation.
*self + Self::from_canonical_u64(rhs)
}
}
/// An iterator over the powers of a certain base element `b`: `b^0, b^1, b^2, ...`.

View File

@ -24,6 +24,22 @@ pub(crate) const N_PARTIAL_ROUNDS: usize = 22;
const N_ROUNDS: usize = N_FULL_ROUNDS_TOTAL + N_PARTIAL_ROUNDS;
const MAX_WIDTH: usize = 12; // we only have width 8 and 12, and 12 is bigger. :)
#[inline(always)]
fn add_u160_u128((x_lo, x_hi): (u128, u32), y: u128) -> (u128, u32) {
let (res_lo, over) = x_lo.overflowing_add(y);
let res_hi = x_hi + (over as u32);
(res_lo, res_hi)
}
#[inline(always)]
fn reduce_u160<F: PrimeField>((n_lo, n_hi): (u128, u32)) -> F {
let n_lo_hi = (n_lo >> 64) as u64;
let n_lo_lo = n_lo as u64;
let reduced_hi: u64 = F::from_noncanonical_u96((n_lo_hi, n_hi)).to_noncanonical_u64();
let reduced128: u128 = ((reduced_hi as u128) << 64) + (n_lo_lo as u128);
F::from_noncanonical_u128(reduced128)
}
/// Note that these work for the Crandall and Goldilocks fields, but not necessarily others. See
/// `generate_constants` about how these were generated. We include enough for a WIDTH of 12;
/// smaller widths just use a subset.
@ -140,7 +156,7 @@ where
// Precomputed constants for the fast Poseidon calculation. See
// the paper.
const FAST_PARTIAL_FIRST_ROUND_CONSTANT: [u64; WIDTH];
const FAST_PARTIAL_ROUND_CONSTANTS: [u64; N_PARTIAL_ROUNDS - 1];
const FAST_PARTIAL_ROUND_CONSTANTS: [u64; N_PARTIAL_ROUNDS];
const FAST_PARTIAL_ROUND_VS: [[u64; WIDTH - 1]; N_PARTIAL_ROUNDS];
const FAST_PARTIAL_ROUND_W_HATS: [[u64; WIDTH - 1]; N_PARTIAL_ROUNDS];
const FAST_PARTIAL_ROUND_INITIAL_MATRIX: [[u64; WIDTH - 1]; WIDTH - 1];
@ -220,7 +236,10 @@ where
assert!(WIDTH <= 12);
for r in 0..12 {
if r < WIDTH {
result[r] = Self::from_noncanonical_u128(Self::mds_row_shf(r, &state));
let sum = Self::mds_row_shf(r, &state);
let sum_lo = sum as u64;
let sum_hi = (sum >> 64) as u32;
result[r] = Self::from_noncanonical_u96((sum_lo, sum_hi));
}
}
@ -297,16 +316,16 @@ where
result[0] = state[0];
assert!(WIDTH <= 12);
for c in 1..12 {
if c < WIDTH {
for r in 1..12 {
if r < WIDTH {
assert!(WIDTH <= 12);
for r in 1..12 {
if r < WIDTH {
for c in 1..12 {
if c < WIDTH {
// NB: FAST_PARTIAL_ROUND_INITIAL_MATRIX is stored in
// column-major order so that this dot product is cache
// row-major order so that this dot product is cache
// friendly.
let t = F::from_canonical_u64(
Self::FAST_PARTIAL_ROUND_INITIAL_MATRIX[c - 1][r - 1],
Self::FAST_PARTIAL_ROUND_INITIAL_MATRIX[r - 1][c - 1],
);
result[c] += state[r] * t;
}
@ -326,10 +345,10 @@ where
result[0] = state[0];
for c in 1..WIDTH {
for r in 1..WIDTH {
for r in 1..WIDTH {
for c in 1..WIDTH {
let t =
F::from_canonical_u64(Self::FAST_PARTIAL_ROUND_INITIAL_MATRIX[c - 1][r - 1]);
F::from_canonical_u64(Self::FAST_PARTIAL_ROUND_INITIAL_MATRIX[r - 1][c - 1]);
result[c] = builder.arithmetic_extension(t, F::ONE, one, state[r], result[c]);
}
}
@ -349,15 +368,18 @@ where
fn mds_partial_layer_fast(state: &[Self; WIDTH], r: usize) -> [Self; WIDTH] {
// Set d = [M_00 | w^] dot [state]
let s0 = state[0].to_noncanonical_u64() as u128;
let mut d = Self::from_noncanonical_u128(s0 << Self::MDS_MATRIX_EXPS[0]);
let mut d_sum = (0u128, 0u32); // u160 accumulator
assert!(WIDTH <= 12);
for i in 1..12 {
if i < WIDTH {
let t = Self::from_canonical_u64(Self::FAST_PARTIAL_ROUND_W_HATS[r][i - 1]);
d += state[i] * t;
let t = Self::FAST_PARTIAL_ROUND_W_HATS[r][i - 1] as u128;
let si = state[i].to_noncanonical_u64() as u128;
d_sum = add_u160_u128(d_sum, si * t);
}
}
let s0 = state[0].to_noncanonical_u64() as u128;
d_sum = add_u160_u128(d_sum, s0 << Self::MDS_MATRIX_EXPS[0]);
let d = reduce_u160::<Self>(d_sum);
// result = [d] concat [state[0] * v + state[shift up by 1]]
let mut result = [Self::ZERO; WIDTH];
@ -366,7 +388,7 @@ where
for i in 1..12 {
if i < WIDTH {
let t = Self::from_canonical_u64(Self::FAST_PARTIAL_ROUND_VS[r][i - 1]);
result[i] = state[0] * t + state[i];
result[i] = state[i].multiply_accumulate(state[0], t);
}
}
result
@ -431,7 +453,10 @@ where
assert!(WIDTH <= 12);
for i in 0..12 {
if i < WIDTH {
state[i] += Self::from_canonical_u64(ALL_ROUND_CONSTANTS[i + WIDTH * round_ctr]);
let round_constant = ALL_ROUND_CONSTANTS[i + WIDTH * round_ctr];
unsafe {
state[i] = state[i].add_canonical_u64(round_constant);
}
}
}
}
@ -527,15 +552,13 @@ where
Self::partial_first_constant_layer(state);
*state = Self::mds_partial_layer_init(state);
// One less than N_PARTIAL_ROUNDS because we do the last one
// separately at the end.
for i in 0..(N_PARTIAL_ROUNDS - 1) {
for i in 0..N_PARTIAL_ROUNDS {
state[0] = Self::sbox_monomial(state[0]);
state[0] += Self::from_canonical_u64(Self::FAST_PARTIAL_ROUND_CONSTANTS[i]);
unsafe {
state[0] = state[0].add_canonical_u64(Self::FAST_PARTIAL_ROUND_CONSTANTS[i]);
}
*state = Self::mds_partial_layer_fast(state, i);
}
state[0] = Self::sbox_monomial(state[0]);
*state = Self::mds_partial_layer_fast(state, N_PARTIAL_ROUNDS - 1);
*round_ctr += N_PARTIAL_ROUNDS;
}
@ -594,13 +617,13 @@ impl Poseidon<8> for CrandallField {
0x2c943b9a8d9ee4f4, 0x6d70fcb874f05f57, 0xf48e800880a87878, 0x24b1eb418f3994c3,
];
const FAST_PARTIAL_ROUND_CONSTANTS: [u64; N_PARTIAL_ROUNDS - 1] = [
const FAST_PARTIAL_ROUND_CONSTANTS: [u64; N_PARTIAL_ROUNDS] = [
0x6d69d39f98b01c69, 0x7694ae5bbd92de89, 0x0b9bfb9fbb252451, 0xf547651a6893f655,
0x44f4e70e9f77cd03, 0xd8e2801a322a6f39, 0xbd0f7e1bc9649171, 0x2eda14ffc32245e4,
0x296e04e8222b9265, 0x9aa740fd9cf504ea, 0xe5e868a6d4315bcc, 0x7d430efe75c6ece5,
0x37ca54f0b49f6214, 0xde83a9f01bfa62d2, 0xb0831b529dbb5b9c, 0xa1d590c3b2b945b5,
0xa130846268961080, 0x79e6e27330006b7a, 0xba12695bd255613b, 0x0091d7aaf86c0e15,
0xe9028a7b418aa9f2,
0xe9028a7b418aa9f2, 0x0000000000000000,
];
const FAST_PARTIAL_ROUND_VS: [[u64; 8 - 1]; N_PARTIAL_ROUNDS] = [
@ -697,22 +720,22 @@ impl Poseidon<8> for CrandallField {
0x1e93b91509607640, 0x5ed95be1c52cf97e, 0x359f0220d53d82f4, ],
];
// NB: This is in COLUMN-major order to support cache-friendly pre-multiplication.
// NB: This is in ROW-major order to support cache-friendly pre-multiplication.
const FAST_PARTIAL_ROUND_INITIAL_MATRIX: [[u64; 8 - 1]; 8 - 1] = [
[0x3fc702a71c42c8df, 0xdc5d5c2cec372bd8, 0x61e9415bfc0d135a, 0x9b7a25991a49b57f,
0xaaee943e6eccf7b8, 0x2be97f5416341131, 0x3f3fd62d28872386, ],
[0xda6cfb436cf6973e, 0x5ed3accc77ae85d0, 0xd63481d84fa12429, 0x38d80c86e3eb1887,
0xf8ad1187508f709c, 0xd0b8c098bdcf7407, 0x2be97f5416341131, ],
[0x800fc4e2c9f585d8, 0xc768961eecdcb554, 0xc8e4a9f96ab57c10, 0xeae1feb52d6eb09a,
0x7ffbbc7ce8823d72, 0xf8ad1187508f709c, 0xaaee943e6eccf7b8, ],
[0x3864e0e53027baf7, 0x95af3551b40289ce, 0x29d0d07fd9b6e9ea, 0xda20f5c812c60b4e,
0xeae1feb52d6eb09a, 0x38d80c86e3eb1887, 0x9b7a25991a49b57f, ],
[0x44ae739518db1d10, 0xa3ae8c5444f37d9a, 0xa5aac4ccc8b791cc, 0x29d0d07fd9b6e9ea,
0xc8e4a9f96ab57c10, 0xd63481d84fa12429, 0x61e9415bfc0d135a, ],
[0x1d46b66c2ad3ef0c, 0x53c070eae0ad0c38, 0xa3ae8c5444f37d9a, 0x95af3551b40289ce,
0xc768961eecdcb554, 0x5ed3accc77ae85d0, 0xdc5d5c2cec372bd8, ],
[0xbc75b7bb6f92fb6b, 0x1d46b66c2ad3ef0c, 0x44ae739518db1d10, 0x3864e0e53027baf7,
0x800fc4e2c9f585d8, 0xda6cfb436cf6973e, 0x3fc702a71c42c8df, ],
[0x3fc702a71c42c8df, 0xda6cfb436cf6973e, 0x800fc4e2c9f585d8, 0x3864e0e53027baf7,
0x44ae739518db1d10, 0x1d46b66c2ad3ef0c, 0xbc75b7bb6f92fb6b, ],
[0xdc5d5c2cec372bd8, 0x5ed3accc77ae85d0, 0xc768961eecdcb554, 0x95af3551b40289ce,
0xa3ae8c5444f37d9a, 0x53c070eae0ad0c38, 0x1d46b66c2ad3ef0c, ],
[0x61e9415bfc0d135a, 0xd63481d84fa12429, 0xc8e4a9f96ab57c10, 0x29d0d07fd9b6e9ea,
0xa5aac4ccc8b791cc, 0xa3ae8c5444f37d9a, 0x44ae739518db1d10, ],
[0x9b7a25991a49b57f, 0x38d80c86e3eb1887, 0xeae1feb52d6eb09a, 0xda20f5c812c60b4e,
0x29d0d07fd9b6e9ea, 0x95af3551b40289ce, 0x3864e0e53027baf7, ],
[0xaaee943e6eccf7b8, 0xf8ad1187508f709c, 0x7ffbbc7ce8823d72, 0xeae1feb52d6eb09a,
0xc8e4a9f96ab57c10, 0xc768961eecdcb554, 0x800fc4e2c9f585d8, ],
[0x2be97f5416341131, 0xd0b8c098bdcf7407, 0xf8ad1187508f709c, 0x38d80c86e3eb1887,
0xd63481d84fa12429, 0x5ed3accc77ae85d0, 0xda6cfb436cf6973e, ],
[0x3f3fd62d28872386, 0x2be97f5416341131, 0xaaee943e6eccf7b8, 0x9b7a25991a49b57f,
0x61e9415bfc0d135a, 0xdc5d5c2cec372bd8, 0x3fc702a71c42c8df, ],
];
#[cfg(target_feature="avx2")]
@ -770,13 +793,13 @@ impl Poseidon<12> for CrandallField {
];
const FAST_PARTIAL_ROUND_CONSTANTS: [u64; N_PARTIAL_ROUNDS - 1] = [
const FAST_PARTIAL_ROUND_CONSTANTS: [u64; N_PARTIAL_ROUNDS] = [
0xa00e150786abac6c, 0xe71901e012a81740, 0x8c4517d65a4d4813, 0x62b1661b06dafd6b,
0x25b991b65a886452, 0x51bcd73c6aaabd6e, 0xb8956d71320d9266, 0x62e603408b7b7092,
0x9839210869008dc0, 0xc6b3ebc672dd2b86, 0x816bd6d0838e9e05, 0x0e80e96e5f3cc3fd,
0x4c8ea37c218378c9, 0x21a24a8087e0e306, 0x30c877124f60bdfa, 0x8e92578bf67f43f3,
0x79089cd2893d3cfa, 0x4a2da1f7351fe5b1, 0x7941de449fea07f0, 0x9f9fe970f90fe0b9,
0x8aff5500f81c1181,
0x8aff5500f81c1181, 0x0000000000000000,
];
const FAST_PARTIAL_ROUND_VS: [[u64; 12 - 1]; N_PARTIAL_ROUNDS] = [
@ -917,41 +940,41 @@ impl Poseidon<12> for CrandallField {
0x2e97f1cb8689f623, 0xa2a9961db0d614d8, 0xf87c2101134b253c, ],
];
// NB: This is in COLUMN-major order to support cache-friendly pre-multiplication.
// NB: This is in ROW-major order to support cache-friendly pre-multiplication.
const FAST_PARTIAL_ROUND_INITIAL_MATRIX: [[u64; 12 - 1]; 12 - 1] = [
[0x8a041eb885fb24f5, 0xeb159cc540fb5e78, 0xf2bc5f8a1eb47c5f, 0x029914d117a17af3,
0xf2bfc6a0100f3c6d, 0x2d506a5bb5b7480c, 0xda5e708c57dfe9f9, 0xd7b5feb73cd3a335,
0xeab0a50ac0fa5244, 0xad929b347785656d, 0xa344593dadcaf3de, ],
[0xb5d5efb12203ef9a, 0xe2afdb22f2e0801a, 0xac34f93c00842bef, 0xb908389bbeee3c9d,
0xf88cbe5484d71f29, 0x3e815f5ac59316cf, 0xaa5a5bcedc8ce58c, 0x2f1dcb0b29bcce64,
0x22f96387ab3046d8, 0x87b1d6bd50b96399, 0xad929b347785656d, ],
[0xe7ad8152c5d50bed, 0x2b68f22b6b414b24, 0x397f9c162cea9170, 0x33ab6239c8b237f3,
0x110365276c97b11f, 0x68bc309864072be8, 0xc0e1cb8013a75747, 0x10a1b57ff824d1f1,
0x45a7029a0a30d66f, 0x22f96387ab3046d8, 0xeab0a50ac0fa5244, ],
[0x685ef5a6b9e241d3, 0x9e085548eeb422b1, 0x3fd7af7763f724a2, 0x337d2955b1c463ae,
0x1927309728b02b2c, 0x56a37505b7b907a7, 0xfd7c623553723df6, 0x35b7afed907102a9,
0x10a1b57ff824d1f1, 0x2f1dcb0b29bcce64, 0xd7b5feb73cd3a335, ],
[0x7e60116e98d5e20c, 0xb561d1111e413cce, 0xeb3a58c29ba7f596, 0xb2ec0e3facb37f22,
0x95dc8a5e61463c07, 0xecd49c2975d75106, 0x7b298b0fc4b796ab, 0xfd7c623553723df6,
0xc0e1cb8013a75747, 0xaa5a5bcedc8ce58c, 0xda5e708c57dfe9f9, ],
[0xfdb6ca0a6d5cc865, 0xe351dee9a4f90434, 0x9fedefd5f6653e80, 0xb7f57d2afbb79622,
0xa84c2200dfb57d3e, 0x0cdf9734cbbc0e07, 0xecd49c2975d75106, 0x56a37505b7b907a7,
0x68bc309864072be8, 0x3e815f5ac59316cf, 0x2d506a5bb5b7480c, ],
[0x857f31827fb3fe60, 0x6aa96c0125bddef7, 0xe629261862a9a8e1, 0xf285b4aa369079a1,
0x01838a8c1d92d250, 0xa84c2200dfb57d3e, 0x95dc8a5e61463c07, 0x1927309728b02b2c,
0x110365276c97b11f, 0xf88cbe5484d71f29, 0xf2bfc6a0100f3c6d, ],
[0xe31988229a5fcb6e, 0x06f4e7db60b9d3b3, 0x4e093de640582a4f, 0xa3a167ee9469e711,
0xf285b4aa369079a1, 0xb7f57d2afbb79622, 0xb2ec0e3facb37f22, 0x337d2955b1c463ae,
0x33ab6239c8b237f3, 0xb908389bbeee3c9d, 0x029914d117a17af3, ],
[0xc06fefcd7cea8405, 0xa7b2498836972dc4, 0x4e3662cf34ca1a70, 0x4e093de640582a4f,
0xe629261862a9a8e1, 0x9fedefd5f6653e80, 0xeb3a58c29ba7f596, 0x3fd7af7763f724a2,
0x397f9c162cea9170, 0xac34f93c00842bef, 0xf2bc5f8a1eb47c5f, ],
[0x05adcaa7427c172c, 0xdc59b1fe6c753a07, 0xa7b2498836972dc4, 0x06f4e7db60b9d3b3,
0x6aa96c0125bddef7, 0xe351dee9a4f90434, 0xb561d1111e413cce, 0x9e085548eeb422b1,
0x2b68f22b6b414b24, 0xe2afdb22f2e0801a, 0xeb159cc540fb5e78, ],
[0x4ff536f518f675c7, 0x05adcaa7427c172c, 0xc06fefcd7cea8405, 0xe31988229a5fcb6e,
0x857f31827fb3fe60, 0xfdb6ca0a6d5cc865, 0x7e60116e98d5e20c, 0x685ef5a6b9e241d3,
0xe7ad8152c5d50bed, 0xb5d5efb12203ef9a, 0x8a041eb885fb24f5, ],
[0x8a041eb885fb24f5, 0xb5d5efb12203ef9a, 0xe7ad8152c5d50bed, 0x685ef5a6b9e241d3,
0x7e60116e98d5e20c, 0xfdb6ca0a6d5cc865, 0x857f31827fb3fe60, 0xe31988229a5fcb6e,
0xc06fefcd7cea8405, 0x05adcaa7427c172c, 0x4ff536f518f675c7, ],
[0xeb159cc540fb5e78, 0xe2afdb22f2e0801a, 0x2b68f22b6b414b24, 0x9e085548eeb422b1,
0xb561d1111e413cce, 0xe351dee9a4f90434, 0x6aa96c0125bddef7, 0x06f4e7db60b9d3b3,
0xa7b2498836972dc4, 0xdc59b1fe6c753a07, 0x05adcaa7427c172c, ],
[0xf2bc5f8a1eb47c5f, 0xac34f93c00842bef, 0x397f9c162cea9170, 0x3fd7af7763f724a2,
0xeb3a58c29ba7f596, 0x9fedefd5f6653e80, 0xe629261862a9a8e1, 0x4e093de640582a4f,
0x4e3662cf34ca1a70, 0xa7b2498836972dc4, 0xc06fefcd7cea8405, ],
[0x029914d117a17af3, 0xb908389bbeee3c9d, 0x33ab6239c8b237f3, 0x337d2955b1c463ae,
0xb2ec0e3facb37f22, 0xb7f57d2afbb79622, 0xf285b4aa369079a1, 0xa3a167ee9469e711,
0x4e093de640582a4f, 0x06f4e7db60b9d3b3, 0xe31988229a5fcb6e, ],
[0xf2bfc6a0100f3c6d, 0xf88cbe5484d71f29, 0x110365276c97b11f, 0x1927309728b02b2c,
0x95dc8a5e61463c07, 0xa84c2200dfb57d3e, 0x01838a8c1d92d250, 0xf285b4aa369079a1,
0xe629261862a9a8e1, 0x6aa96c0125bddef7, 0x857f31827fb3fe60, ],
[0x2d506a5bb5b7480c, 0x3e815f5ac59316cf, 0x68bc309864072be8, 0x56a37505b7b907a7,
0xecd49c2975d75106, 0x0cdf9734cbbc0e07, 0xa84c2200dfb57d3e, 0xb7f57d2afbb79622,
0x9fedefd5f6653e80, 0xe351dee9a4f90434, 0xfdb6ca0a6d5cc865, ],
[0xda5e708c57dfe9f9, 0xaa5a5bcedc8ce58c, 0xc0e1cb8013a75747, 0xfd7c623553723df6,
0x7b298b0fc4b796ab, 0xecd49c2975d75106, 0x95dc8a5e61463c07, 0xb2ec0e3facb37f22,
0xeb3a58c29ba7f596, 0xb561d1111e413cce, 0x7e60116e98d5e20c, ],
[0xd7b5feb73cd3a335, 0x2f1dcb0b29bcce64, 0x10a1b57ff824d1f1, 0x35b7afed907102a9,
0xfd7c623553723df6, 0x56a37505b7b907a7, 0x1927309728b02b2c, 0x337d2955b1c463ae,
0x3fd7af7763f724a2, 0x9e085548eeb422b1, 0x685ef5a6b9e241d3, ],
[0xeab0a50ac0fa5244, 0x22f96387ab3046d8, 0x45a7029a0a30d66f, 0x10a1b57ff824d1f1,
0xc0e1cb8013a75747, 0x68bc309864072be8, 0x110365276c97b11f, 0x33ab6239c8b237f3,
0x397f9c162cea9170, 0x2b68f22b6b414b24, 0xe7ad8152c5d50bed, ],
[0xad929b347785656d, 0x87b1d6bd50b96399, 0x22f96387ab3046d8, 0x2f1dcb0b29bcce64,
0xaa5a5bcedc8ce58c, 0x3e815f5ac59316cf, 0xf88cbe5484d71f29, 0xb908389bbeee3c9d,
0xac34f93c00842bef, 0xe2afdb22f2e0801a, 0xb5d5efb12203ef9a, ],
[0xa344593dadcaf3de, 0xad929b347785656d, 0xeab0a50ac0fa5244, 0xd7b5feb73cd3a335,
0xda5e708c57dfe9f9, 0x2d506a5bb5b7480c, 0xf2bfc6a0100f3c6d, 0x029914d117a17af3,
0xf2bc5f8a1eb47c5f, 0xeb159cc540fb5e78, 0x8a041eb885fb24f5, ],
];
#[cfg(target_feature="avx2")]