From f3003488daf2c5a195e53db9d3fdd7a32deee09e Mon Sep 17 00:00:00 2001
From: Christopher Taylor
Date: Fri, 26 May 2017 20:30:48 -0700
Subject: [PATCH] Fix build errors

---
 LeopardCommon.cpp     |  85 ++--------
 LeopardCommon.h       |  15 +--
 LeopardFF16.cpp       | 179 ++------------------------
 LeopardFF16.h         |  28 ++----
 LeopardFF8.cpp        | 222 +++++---------------------------
 LeopardFF8.h          |  28 ++----
 tests/experiments.cpp |  32 +++---
 7 files changed, 105 insertions(+), 484 deletions(-)

diff --git a/LeopardCommon.cpp b/LeopardCommon.cpp
index 55850bc..6251b4c 100644
--- a/LeopardCommon.cpp
+++ b/LeopardCommon.cpp
@@ -144,7 +144,7 @@ void InitializeCPUArch()
 void xor_mem(
     void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy,
-    unsigned bytes)
+    uint64_t bytes)
 {
 #if defined(LEO_TRY_AVX2)
     if (CpuHasAVX2)
     {
@@ -189,88 +189,15 @@
     } while (bytes > 0);
 }
 
-void xor_mem2(
-    void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
-    void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
-    unsigned bytes)
-{
-#if defined(LEO_TRY_AVX2)
-    if (CpuHasAVX2)
-    {
-        LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(vx_0);
-        const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<const LEO_M256 *>(vy_0);
-        LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(vx_1);
-        const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1);
-        do
-        {
-            const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
-            const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
-            const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2));
-            const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3));
-            const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
-            const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
-            const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2));
-            const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3));
-            _mm256_storeu_si256(x32_0, x0_0);
-            _mm256_storeu_si256(x32_0 + 1, x1_0);
-            _mm256_storeu_si256(x32_0 + 2, x2_0);
-            _mm256_storeu_si256(x32_0 + 3, x3_0);
-            _mm256_storeu_si256(x32_1, x0_1);
-            _mm256_storeu_si256(x32_1 + 1, x1_1);
-            _mm256_storeu_si256(x32_1 + 2, x2_1);
-            _mm256_storeu_si256(x32_1 + 3, x3_1);
-            x32_0 += 4, y32_0 += 4;
-            x32_1 += 4, y32_1 += 4;
-            bytes -= 128;
-        } while (bytes >= 128);
-        if (bytes > 0)
-        {
-            const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
-            const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
-            const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
-            const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
-            _mm256_storeu_si256(x32_0, x0_0);
-            _mm256_storeu_si256(x32_0 + 1, x1_0);
-            _mm256_storeu_si256(x32_1, x0_1);
-            _mm256_storeu_si256(x32_1 + 1, x1_1);
-        }
-        return;
-    }
-#endif // LEO_TRY_AVX2
-    LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(vx_0);
-    const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<const LEO_M128 *>(vy_0);
-    LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(vx_1);
-    const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1);
-    do
-    {
-        const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0));
-        const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1));
-        const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2));
-        const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3));
-        const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1));
-        const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1));
-        const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2));
-        const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3));
-        _mm_storeu_si128(x16_0, x0_0);
-        _mm_storeu_si128(x16_0 + 1, x1_0);
-        _mm_storeu_si128(x16_0 + 2, x2_0);
-        _mm_storeu_si128(x16_0 + 3, x3_0);
-        _mm_storeu_si128(x16_1, x0_1);
-        _mm_storeu_si128(x16_1 + 1, x1_1);
-        _mm_storeu_si128(x16_1 + 2, x2_1);
-        _mm_storeu_si128(x16_1 + 3, x3_1);
-        x16_0 += 4, y16_0 += 4;
-        x16_1 += 4, y16_1 += 4;
-        bytes -= 64;
-    } while (bytes > 0);
-}
-
-void xor_mem3(
+void xor_mem4(
     void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
     void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
     void * LEO_RESTRICT vx_2, const void * LEO_RESTRICT vy_2,
-    unsigned bytes)
+    void * LEO_RESTRICT vx_3, const void * LEO_RESTRICT vy_3,
+    uint64_t bytes)
 {
+    // FIXME: Add args
+
 #if defined(LEO_TRY_AVX2)
     if (CpuHasAVX2)
     {
diff --git a/LeopardCommon.h b/LeopardCommon.h
index d8b4011..1581629 100644
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@@ -208,20 +208,15 @@ LEO_FORCE_INLINE unsigned NextPow2(unsigned n)
 // x[] ^= y[]
 void xor_mem(
     void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
-    unsigned bytes);
+    uint64_t bytes);
 
-// For i = {0, 1}: x_i[] ^= x_i[]
-void xor_mem2(
-    void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
-    unsigned bytes);
-
-// For i = {0, 1, 2}: x_i[] ^= x_i[]
-void xor_mem3(
+// For i = {0, 1, 2, 3}: x_i[] ^= y_i[]
+void xor_mem4(
     void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
     void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
     void * LEO_RESTRICT x_2, const void * LEO_RESTRICT y_2,
-    unsigned bytes);
+    void * LEO_RESTRICT x_3, const void * LEO_RESTRICT y_3,
+    uint64_t bytes);
 
 
 } // namespace leopard
diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp
index 76934f4..6f717ff 100644
--- a/LeopardFF16.cpp
+++ b/LeopardFF16.cpp
@@ -523,131 +523,6 @@ void mul_mem_set(
     } while (bytes > 0);
 }
 
-// vx0[] *= m, vx1[] *= m
-void mul_mem2_inplace(
-    void * LEO_RESTRICT vx_0,
-    void * LEO_RESTRICT vx_1,
-    ffe_t m, uint64_t bytes)
-{
-    if (m <= 1)
-    {
-        if (m == 0)
-        {
-            memset(vx_0, 0, bytes);
-            memset(vx_1, 0, bytes);
-        }
-        return;
-    }
-
-#if defined(LEO_TRY_AVX2)
-    if (CpuHasAVX2)
-    {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m);
-
-        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
-
-        LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(vx_0);
-        LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(vx_1);
-
-        do
-        {
-            LEO_M256 x0_0 = _mm256_loadu_si256(x32_0 + 1);
-            LEO_M256 l0_0 = _mm256_and_si256(x0_0, clr_mask);
-            x0_0 = _mm256_srli_epi64(x0_0, 4);
-            LEO_M256 h0_0 = _mm256_and_si256(x0_0, clr_mask);
-            l0_0 = _mm256_shuffle_epi8(table_lo_y, l0_0);
-            h0_0 = _mm256_shuffle_epi8(table_hi_y, h0_0);
-            l0_0 = _mm256_xor_si256(l0_0, h0_0);
-
-            LEO_M256 x1_0 = _mm256_loadu_si256(x32_0);
-            LEO_M256 l1_0 = _mm256_and_si256(x1_0, clr_mask);
-            x1_0 = _mm256_srli_epi64(x1_0, 4);
-            LEO_M256 h1_0 = _mm256_and_si256(x1_0, clr_mask);
-            l1_0 = _mm256_shuffle_epi8(table_lo_y, l1_0);
-            h1_0 = _mm256_shuffle_epi8(table_hi_y, h1_0);
-            l1_0 = _mm256_xor_si256(l1_0, h1_0);
-
-            LEO_M256 x0_1 = _mm256_loadu_si256(x32_1 + 1);
-            LEO_M256 l0_1 = _mm256_and_si256(x0_1, clr_mask);
-            x0_1 = _mm256_srli_epi64(x0_1, 4);
-            LEO_M256 h0_1 = _mm256_and_si256(x0_1, clr_mask);
-            l0_1 = _mm256_shuffle_epi8(table_lo_y, l0_1);
-            h0_1 = _mm256_shuffle_epi8(table_hi_y, h0_1);
-            l0_1 = _mm256_xor_si256(l0_1, h0_1);
-
-            LEO_M256 x1_1 = _mm256_loadu_si256(x32_1);
-            LEO_M256 l1_1 = _mm256_and_si256(x1_1, clr_mask);
-            x1_1 = _mm256_srli_epi64(x1_1, 4);
-            LEO_M256 h1_1 = _mm256_and_si256(x1_1, clr_mask);
-            l1_1 = _mm256_shuffle_epi8(table_lo_y, l1_1);
-            h1_1 = _mm256_shuffle_epi8(table_hi_y, h1_1);
-            l1_1 = _mm256_xor_si256(l1_1, h1_1);
-
-            _mm256_storeu_si256(x32_0 + 1, l0_0);
-            _mm256_storeu_si256(x32_0, l1_0);
-            _mm256_storeu_si256(x32_1 + 1, l0_1);
-            _mm256_storeu_si256(x32_1, l1_1);
-
-            x32_0 += 2;
-            x32_1 += 2;
-            bytes -= 64;
-        } while (bytes > 0);
-        return;
-    }
-#endif // LEO_TRY_AVX2
-
-    const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m);
-
-    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
-
-    LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(vx_0);
-    LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(vx_1);
-
-    do
-    {
-        LEO_M128 x3 = _mm_loadu_si128(x16_0 + 3);
-        LEO_M128 l3 = _mm_and_si128(x3, clr_mask);
-        x3 = _mm_srli_epi64(x3, 4);
-        LEO_M128 h3 = _mm_and_si128(x3, clr_mask);
-        l3 = _mm_shuffle_epi8(table_lo_y, l3);
-        h3 = _mm_shuffle_epi8(table_hi_y, h3);
-
-        LEO_M128 x2 = _mm_loadu_si128(x16_0 + 2);
-        LEO_M128 l2 = _mm_and_si128(x2, clr_mask);
-        x2 = _mm_srli_epi64(x2, 4);
-        LEO_M128 h2 = _mm_and_si128(x2, clr_mask);
-        l2 = _mm_shuffle_epi8(table_lo_y, l2);
-        h2 = _mm_shuffle_epi8(table_hi_y, h2);
-
-        LEO_M128 x1 = _mm_loadu_si128(x16_0 + 1);
-        LEO_M128 l1 = _mm_and_si128(x1, clr_mask);
-        x1 = _mm_srli_epi64(x1, 4);
-        LEO_M128 h1 = _mm_and_si128(x1, clr_mask);
-        l1 = _mm_shuffle_epi8(table_lo_y, l1);
-        h1 = _mm_shuffle_epi8(table_hi_y, h1);
-
-        LEO_M128 x0 = _mm_loadu_si128(x16_0);
-        LEO_M128 l0 = _mm_and_si128(x0, clr_mask);
-        x0 = _mm_srli_epi64(x0, 4);
-        LEO_M128 h0 = _mm_and_si128(x0, clr_mask);
-        l0 = _mm_shuffle_epi8(table_lo_y, l0);
-        h0 = _mm_shuffle_epi8(table_hi_y, h0);
-
-        _mm_storeu_si128(x16_0 + 3, _mm_xor_si128(l3, h3));
-        _mm_storeu_si128(x16_0 + 2, _mm_xor_si128(l2, h2));
-        _mm_storeu_si128(x16_0 + 1, _mm_xor_si128(l1, h1));
-        _mm_storeu_si128(x16_0, _mm_xor_si128(l0, h0));
-
-        // FIXME: Add second one here
-
-        x16_0 += 4;
-        x16_1 += 4;
-        bytes -= 64;
-    } while (bytes > 0);
-}
-
 
 //------------------------------------------------------------------------------
 // FFT Operations
@@ -660,20 +535,12 @@ void fft_butterfly(
 
 }
 
-// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
-void fft_butterfly2(
-    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    ffe_t m, uint64_t bytes)
-{
-
-}
-
-// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
-void fft_butterfly3(
+// For i = {0, 1, 2, 3}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
+void fft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
     void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
     void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
+    void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
     ffe_t m, uint64_t bytes)
 {
 
@@ -691,20 +558,12 @@ void ifft_butterfly(
 
 }
 
-// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
-void ifft_butterfly2(
-    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    ffe_t m, uint64_t bytes)
-{
-
-}
-
-// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
-void ifft_butterfly3(
+// For i = {0, 1, 2, 3}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
+void ifft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
     void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
     void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
+    void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
     ffe_t m, uint64_t bytes)
 {
 
@@ -714,7 +573,7 @@
 //------------------------------------------------------------------------------
 // FFT
 
-static ffe_t FFTSkew[kFieldModulus]; // twisted factors used in FFT
+static ffe_t FFTSkew[kModulus]; // twisted factors used in FFT
 static ffe_t LogWalsh[kOrder]; // factors used in the evaluation of the error locator polynomial
 
 void FFTInitialize()
@@ -739,19 +598,19 @@ void FFTInitialize()
         }
 
         // TBD: This can be cleaned up
-        temp[m] = kFieldModulus - LogLUT[FFEMultiply(temp[m], temp[m] ^ 1)];
+        temp[m] = kModulus - LogLUT[FFEMultiply(temp[m], temp[m] ^ 1)];
 
         for (unsigned i = m + 1; i < (kBits - 1); ++i)
-            temp[i] = FFEMultiplyLog(temp[i], (LogLUT[temp[i] ^ 1] + temp[m]) % kFieldModulus);
+            temp[i] = FFEMultiplyLog(temp[i], (LogLUT[temp[i] ^ 1] + temp[m]) % kModulus);
     }
 
     for (unsigned i = 0; i < kOrder; ++i)
         FFTSkew[i] = LogLUT[FFTSkew[i]];
 
-    temp[0] = kFieldModulus - temp[0];
+    temp[0] = kModulus - temp[0];
 
     for (unsigned i = 1; i < (kBits - 1); ++i)
-        temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus;
+        temp[i] = (kModulus - temp[i] + temp[i - 1]) % kModulus;
 
     for (unsigned i = 0; i < kOrder; ++i)
         LogWalsh[i] = LogLUT[i];
@@ -787,7 +646,7 @@
         {
             const ffe_t skew = FFTSkew[j + m - 1];
 
-            if (skew != kFieldModulus)
+            if (skew != kModulus)
             {
                 for (unsigned i = j - width; i < j; ++i)
                     ifft_butterfly(work[i], work[i + width], skew, buffer_bytes);
@@ -818,7 +677,7 @@
             {
                 const ffe_t skew = FFTSkew[j + m + i - 1];
 
-                if (skew != kFieldModulus)
+                if (skew != kModulus)
                 {
                     for (unsigned k = j - width; k < j; ++k)
                         ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes);
@@ -863,7 +722,7 @@
             {
                 const ffe_t skew = FFTSkew[j + m + i - 1];
 
-                if (skew != kFieldModulus)
+                if (skew != kModulus)
                 {
                     for (unsigned k = j - width; k < j; ++k)
                         ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes);
@@ -894,7 +753,7 @@
         {
            const ffe_t skew = skewLUT[j];
 
-            if (skew != kFieldModulus)
+            if (skew != kModulus)
             {
                 for (unsigned k = j, count = j + width; k < count; ++k)
                     fft_butterfly(data[k], data[k + width], skew, buffer_bytes);
@@ -938,7 +797,7 @@ void Decode(
     FWHT(ErrorLocations, kBits);
 
     for (unsigned i = 0; i < kOrder; ++i)
-        ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kFieldModulus;
+        ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kModulus;
 
     FWHT(ErrorLocations, kBits);
 
@@ -974,7 +833,7 @@
        {
             const ffe_t skew = FFTSkew[j - 1];
 
-            if (skew != kFieldModulus)
+            if (skew != kModulus)
             {
                 for (unsigned i = j - width; i < j; ++i)
                     ifft_butterfly(work[i], work[i + width], skew, buffer_bytes);
@@ -1010,7 +869,7 @@
         {
             const ffe_t skew = skewLUT[j];
 
-            if (skew != kFieldModulus)
+            if (skew != kModulus)
             {
                 for (unsigned i = j; i < j + width; ++i)
                     fft_butterfly(work[i], work[i + width], skew, buffer_bytes);
@@ -1027,7 +886,7 @@
 
     for (unsigned i = 0; i < original_count; ++i)
         if (!original[i])
-            mul_mem_set(work[i], work[i + m], kFieldModulus - ErrorLocations[i], buffer_bytes);
+            mul_mem_set(work[i], work[i + m], kModulus - ErrorLocations[i], buffer_bytes);
 }
 
diff --git a/LeopardFF16.h b/LeopardFF16.h
index b43c7f1..b2dc3f5 100644
--- a/LeopardFF16.h
+++ b/LeopardFF16.h
@@ -73,12 +73,6 @@ void mul_mem_set(
     void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
     ffe_t m, uint64_t bytes);
 
-// For i = {0, 1}: x_i[] *= m
-void mul_mem2_inplace(
-    void * LEO_RESTRICT x_0,
-    void * LEO_RESTRICT x_1,
-    ffe_t m, uint64_t bytes);
-
 
 //------------------------------------------------------------------------------
 // FFT Operations
@@ -88,17 +82,12 @@ void fft_butterfly(
     void * LEO_RESTRICT x, void * LEO_RESTRICT y,
     ffe_t m, uint64_t bytes);
 
-// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
-void fft_butterfly2(
-    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    ffe_t m, uint64_t bytes);
-
-// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
-void fft_butterfly3(
+// For i = {0, 1, 2, 3}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
+void fft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
     void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
     void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
+    void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
     ffe_t m, uint64_t bytes);
 
 
@@ -110,17 +99,12 @@ void ifft_butterfly(
     void * LEO_RESTRICT x, void * LEO_RESTRICT y,
     ffe_t m, uint64_t bytes);
 
-// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
-void ifft_butterfly2(
-    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    ffe_t m, uint64_t bytes);
-
-// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
-void ifft_butterfly3(
+// For i = {0, 1, 2, 3}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
+void ifft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
     void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
     void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
+    void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
     ffe_t m, uint64_t bytes);
 
 
diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp
index ee01299..d4d4258 100644
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@@ -282,7 +282,7 @@ static ffe_t FFEMultiplyLog(ffe_t a, ffe_t log_b)
 {
     if (a == 0)
         return 0;
-    return ExpLUT[AddMod(LogLUT[a], b)];
+    return ExpLUT[AddMod(LogLUT[a], log_b)];
 }
 
 bool InitializeMultiplyTables()
@@ -341,25 +341,29 @@ void mul_mem_set(
         LEO_M256 * LEO_RESTRICT z32 = reinterpret_cast<LEO_M256 *>(vx);
         const LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<const LEO_M256 *>(vy);
 
-        const unsigned count = bytes / 64;
-        for (unsigned i = 0; i < count; ++i)
+        do
         {
-            LEO_M256 x0 = _mm256_loadu_si256(x32 + i * 2);
+            LEO_M256 x0 = _mm256_loadu_si256(x32);
             LEO_M256 l0 = _mm256_and_si256(x0, clr_mask);
             x0 = _mm256_srli_epi64(x0, 4);
             LEO_M256 h0 = _mm256_and_si256(x0, clr_mask);
             l0 = _mm256_shuffle_epi8(table_lo_y, l0);
             h0 = _mm256_shuffle_epi8(table_hi_y, h0);
-            _mm256_storeu_si256(z32 + i * 2, _mm256_xor_si256(l0, h0));
+            _mm256_storeu_si256(z32, _mm256_xor_si256(l0, h0));
 
-            LEO_M256 x1 = _mm256_loadu_si256(x32 + i * 2 + 1);
+            LEO_M256 x1 = _mm256_loadu_si256(x32 + 1);
             LEO_M256 l1 = _mm256_and_si256(x1, clr_mask);
             x1 = _mm256_srli_epi64(x1, 4);
             LEO_M256 h1 = _mm256_and_si256(x1, clr_mask);
             l1 = _mm256_shuffle_epi8(table_lo_y, l1);
             h1 = _mm256_shuffle_epi8(table_hi_y, h1);
-            _mm256_storeu_si256(z32 + i * 2 + 1, _mm256_xor_si256(l1, h1));
-        }
+            _mm256_storeu_si256(z32 + 1, _mm256_xor_si256(l1, h1));
+
+            x32 += 2;
+            z32 += 2;
+            bytes -= 64;
+        } while (bytes > 0);
+
         return;
     }
 #endif // LEO_TRY_AVX2
@@ -412,131 +416,6 @@ void mul_mem_set(
     } while (bytes > 0);
 }
 
-// vx0[] *= m, vx1[] *= m
-void mul_mem2_inplace(
-    void * LEO_RESTRICT vx_0,
-    void * LEO_RESTRICT vx_1,
-    ffe_t m, uint64_t bytes)
-{
-    if (m <= 1)
-    {
-        if (m == 0)
-        {
-            memset(vx_0, 0, bytes);
-            memset(vx_1, 0, bytes);
-        }
-        return;
-    }
-
-#if defined(LEO_TRY_AVX2)
-    if (CpuHasAVX2)
-    {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m);
-
-        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
-
-        LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(vx_0);
-        LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(vx_1);
-
-        do
-        {
-            LEO_M256 x0_0 = _mm256_loadu_si256(x32_0 + 1);
-            LEO_M256 l0_0 = _mm256_and_si256(x0_0, clr_mask);
-            x0_0 = _mm256_srli_epi64(x0_0, 4);
-            LEO_M256 h0_0 = _mm256_and_si256(x0_0, clr_mask);
-            l0_0 = _mm256_shuffle_epi8(table_lo_y, l0_0);
-            h0_0 = _mm256_shuffle_epi8(table_hi_y, h0_0);
-            l0_0 = _mm256_xor_si256(l0_0, h0_0);
-
-            LEO_M256 x1_0 = _mm256_loadu_si256(x32_0);
-            LEO_M256 l1_0 = _mm256_and_si256(x1_0, clr_mask);
-            x1_0 = _mm256_srli_epi64(x1_0, 4);
-            LEO_M256 h1_0 = _mm256_and_si256(x1_0, clr_mask);
-            l1_0 = _mm256_shuffle_epi8(table_lo_y, l1_0);
-            h1_0 = _mm256_shuffle_epi8(table_hi_y, h1_0);
-            l1_0 = _mm256_xor_si256(l1_0, h1_0);
-
-            LEO_M256 x0_1 = _mm256_loadu_si256(x32_1 + 1);
-            LEO_M256 l0_1 = _mm256_and_si256(x0_1, clr_mask);
-            x0_1 = _mm256_srli_epi64(x0_1, 4);
-            LEO_M256 h0_1 = _mm256_and_si256(x0_1, clr_mask);
-            l0_1 = _mm256_shuffle_epi8(table_lo_y, l0_1);
-            h0_1 = _mm256_shuffle_epi8(table_hi_y, h0_1);
-            l0_1 = _mm256_xor_si256(l0_1, h0_1);
-
-            LEO_M256 x1_1 = _mm256_loadu_si256(x32_1);
-            LEO_M256 l1_1 = _mm256_and_si256(x1_1, clr_mask);
-            x1_1 = _mm256_srli_epi64(x1_1, 4);
-            LEO_M256 h1_1 = _mm256_and_si256(x1_1, clr_mask);
-            l1_1 = _mm256_shuffle_epi8(table_lo_y, l1_1);
-            h1_1 = _mm256_shuffle_epi8(table_hi_y, h1_1);
-            l1_1 = _mm256_xor_si256(l1_1, h1_1);
-
-            _mm256_storeu_si256(x32_0 + 1, l0_0);
-            _mm256_storeu_si256(x32_0, l1_0);
-            _mm256_storeu_si256(x32_1 + 1, l0_1);
-            _mm256_storeu_si256(x32_1, l1_1);
-
-            x32_0 += 2;
-            x32_1 += 2;
-            bytes -= 64;
-        } while (bytes > 0);
-        return;
-    }
-#endif // LEO_TRY_AVX2
-
-    const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m);
-
-    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
-
-    LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(vx_0);
-    LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(vx_1);
-
-    do
-    {
-        LEO_M128 x3 = _mm_loadu_si128(x16_0 + 3);
-        LEO_M128 l3 = _mm_and_si128(x3, clr_mask);
-        x3 = _mm_srli_epi64(x3, 4);
-        LEO_M128 h3 = _mm_and_si128(x3, clr_mask);
-        l3 = _mm_shuffle_epi8(table_lo_y, l3);
-        h3 = _mm_shuffle_epi8(table_hi_y, h3);
-
-        LEO_M128 x2 = _mm_loadu_si128(x16_0 + 2);
-        LEO_M128 l2 = _mm_and_si128(x2, clr_mask);
-        x2 = _mm_srli_epi64(x2, 4);
-        LEO_M128 h2 = _mm_and_si128(x2, clr_mask);
-        l2 = _mm_shuffle_epi8(table_lo_y, l2);
-        h2 = _mm_shuffle_epi8(table_hi_y, h2);
-
-        LEO_M128 x1 = _mm_loadu_si128(x16_0 + 1);
-        LEO_M128 l1 = _mm_and_si128(x1, clr_mask);
-        x1 = _mm_srli_epi64(x1, 4);
-        LEO_M128 h1 = _mm_and_si128(x1, clr_mask);
-        l1 = _mm_shuffle_epi8(table_lo_y, l1);
-        h1 = _mm_shuffle_epi8(table_hi_y, h1);
-
-        LEO_M128 x0 = _mm_loadu_si128(x16_0);
-        LEO_M128 l0 = _mm_and_si128(x0, clr_mask);
-        x0 = _mm_srli_epi64(x0, 4);
-        LEO_M128 h0 = _mm_and_si128(x0, clr_mask);
-        l0 = _mm_shuffle_epi8(table_lo_y, l0);
-        h0 = _mm_shuffle_epi8(table_hi_y, h0);
-
-        _mm_storeu_si128(x16_0 + 3, _mm_xor_si128(l3, h3));
-        _mm_storeu_si128(x16_0 + 2, _mm_xor_si128(l2, h2));
-        _mm_storeu_si128(x16_0 + 1, _mm_xor_si128(l1, h1));
-        _mm_storeu_si128(x16_0, _mm_xor_si128(l0, h0));
-
-        // FIXME: Add second one here
-
-        x16_0 += 4;
-        x16_1 += 4;
-        bytes -= 64;
-    } while (bytes > 0);
-}
-
 
 //------------------------------------------------------------------------------
 // FFT Operations
@@ -549,20 +428,12 @@ void fft_butterfly(
 
 }
 
-// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
-void fft_butterfly2(
-    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    ffe_t m, uint64_t bytes)
-{
-
-}
-
-// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
-void fft_butterfly3(
+// For i = {0, 1, 2, 3}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
+void fft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
     void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
     void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
+    void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
     ffe_t m, uint64_t bytes)
 {
 
@@ -580,20 +451,12 @@ void ifft_butterfly(
 
 }
 
-// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
-void ifft_butterfly2(
-    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    ffe_t m, uint64_t bytes)
-{
-
-}
-
-// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
-void ifft_butterfly3(
+// For i = {0, 1, 2, 3}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
+void ifft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
     void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
     void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
+    void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
     ffe_t m, uint64_t bytes)
 {
 
@@ -603,7 +466,7 @@
 //------------------------------------------------------------------------------
 // FFT
 
-static ffe_t FFTSkew[kFieldModulus]; // twisted factors used in FFT
+static ffe_t FFTSkew[kModulus]; // twisted factors used in FFT
 static ffe_t LogWalsh[kOrder]; // factors used in the evaluation of the error locator polynomial
 
 void FFTInitialize()
@@ -628,10 +491,10 @@ void FFTInitialize()
         }
 
         // TBD: This can be cleaned up
-        temp[m] = kFieldModulus - LogLUT[FFEMultiply(temp[m], temp[m] ^ 1)];
+        temp[m] = kModulus - LogLUT[FFEMultiply(temp[m], temp[m] ^ 1)];
 
         for (unsigned i = m + 1; i < (kBits - 1); ++i)
-            temp[i] = FFEMultiplyLog(temp[i], (LogLUT[temp[i] ^ 1] + temp[m]) % kFieldModulus);
+            temp[i] = FFEMultiplyLog(temp[i], (LogLUT[temp[i] ^ 1] + temp[m]) % kModulus);
     }
 
     for (unsigned i = 0; i < kOrder; ++i)
@@ -667,11 +530,14 @@ void Encode(
 
     for (unsigned width = 1; width < m; width <<= 1)
     {
-        for (unsigned j = width; j < m; j += (width << 1))
-        {
-            const ffe_t skew = FFTSkew[j + m - 1];
+        const unsigned range = width << 1;
+        const ffe_t* skewLUT = FFTSkew + m - 1;
 
-            if (skew != kFieldModulus)
+        for (unsigned j = width; j < m; j += range)
+        {
+            const ffe_t skew = skewLUT[j];
+
+            if (skew != kModulus)
             {
                 for (unsigned i = j - width; i < j; ++i)
                     ifft_butterfly(work[i], work[i + width], skew, buffer_bytes);
@@ -696,13 +562,15 @@ void Encode(
 
         // temp <- IFFT(temp, m, m + i)
 
+        const ffe_t* skewLUT = FFTSkew + m + i - 1;
+
         for (unsigned width = 1; width < m; width <<= 1)
         {
             for (unsigned j = width; j < m; j += (width << 1))
             {
-                const ffe_t skew = FFTSkew[j + m + i - 1];
+                const ffe_t skew = skewLUT[j];
 
-                if (skew != kFieldModulus)
+                if (skew != kModulus)
                 {
                     for (unsigned k = j - width; k < j; ++k)
                         ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes);
@@ -742,12 +610,14 @@
         {
             // Calculate stop considering that the right is all zeroes
            const unsigned stop = ((last_count + width - 1) >> shift) << shift;
+            const unsigned range = width << 1;
+            const ffe_t* skewLUT = FFTSkew + m + i - 1;
 
-            for (unsigned j = width; j < stop; j += (width << 1))
+            for (unsigned j = width; j < stop; j += range)
             {
-                const ffe_t skew = FFTSkew[j + m + i - 1];
+                const ffe_t skew = skewLUT[j];
 
-                if (skew != kFieldModulus)
+                if (skew != kModulus)
                 {
                     for (unsigned k = j - width; k < j; ++k)
                         ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes);
@@ -778,7 +648,7 @@
         {
             const ffe_t skew = skewLUT[j];
 
-            if (skew != kFieldModulus)
+            if (skew != kModulus)
             {
                 for (unsigned k = j, count = j + width; k < count; ++k)
                     fft_butterfly(data[k], data[k + width], skew, buffer_bytes);
@@ -822,7 +692,7 @@ void Decode(
     FWHT(ErrorLocations, kBits);
 
     for (unsigned i = 0; i < kOrder; ++i)
-        ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kFieldModulus;
+        ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kModulus;
 
     FWHT(ErrorLocations, kBits);
 
@@ -854,11 +724,13 @@ void Decode(
 
     for (unsigned width = 1; width < n; width <<= 1)
     {
-        for (unsigned j = width; j < n; j += (width << 1))
+        const unsigned range = width << 1;
+
+        for (unsigned j = width; j < n; j += range)
         {
             const ffe_t skew = FFTSkew[j - 1];
 
-            if (skew != kFieldModulus)
+            if (skew != kModulus)
             {
                 for (unsigned i = j - width; i < j; ++i)
                     ifft_butterfly(work[i], work[i + width], skew, buffer_bytes);
@@ -894,14 +766,14 @@
         {
             const ffe_t skew = skewLUT[j];
 
-            if (skew != kFieldModulus)
+            if (skew != kModulus)
             {
-                for (unsigned i = j; i < j + width; ++i)
+                for (unsigned i = j, count = j + width; i < count; ++i)
                     fft_butterfly(work[i], work[i + width], skew, buffer_bytes);
             }
             else
             {
-                for (unsigned i = j; i < j + width; ++i)
+                for (unsigned i = j, count = j + width; i < count; ++i)
                     xor_mem(work[i + width], work[i], buffer_bytes);
             }
         }
@@ -911,7 +783,7 @@
 
     for (unsigned i = 0; i < original_count; ++i)
         if (!original[i])
-            mul_mem_set(work[i], work[i + m], kFieldModulus - ErrorLocations[i], buffer_bytes);
+            mul_mem_set(work[i], work[i + m], kModulus - ErrorLocations[i], buffer_bytes);
 }
 
diff --git a/LeopardFF8.h b/LeopardFF8.h
index 2f11ff4..11be30e 100644
--- a/LeopardFF8.h
+++ b/LeopardFF8.h
@@ -73,12 +73,6 @@ void mul_mem_set(
     void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
     ffe_t m, uint64_t bytes);
 
-// For i = {0, 1}: x_i[] *= m
-void mul_mem2_inplace(
-    void * LEO_RESTRICT x_0,
-    void * LEO_RESTRICT x_1,
-    ffe_t m, uint64_t bytes);
-
 
 //------------------------------------------------------------------------------
 // FFT Operations
@@ -88,17 +82,12 @@ void fft_butterfly(
     void * LEO_RESTRICT x, void * LEO_RESTRICT y,
     ffe_t m, uint64_t bytes);
 
-// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
-void fft_butterfly2(
-    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    ffe_t m, uint64_t bytes);
-
-// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
-void fft_butterfly3(
+// For i = {0, 1, 2, 3}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
+void fft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
     void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
     void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
+    void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
     ffe_t m, uint64_t bytes);
 
 
@@ -110,17 +99,12 @@ void ifft_butterfly(
     void * LEO_RESTRICT x, void * LEO_RESTRICT y,
     ffe_t m, uint64_t bytes);
 
-// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
-void ifft_butterfly2(
-    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    ffe_t m, uint64_t bytes);
-
-// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
-void ifft_butterfly3(
+// For i = {0, 1, 2, 3}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
+void ifft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
     void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
     void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
+    void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
     ffe_t m, uint64_t bytes);
 
 
diff --git a/tests/experiments.cpp b/tests/experiments.cpp
index f057147..c17e5b6 100644
--- a/tests/experiments.cpp
+++ b/tests/experiments.cpp
@@ -98,7 +98,7 @@ ffe_t kGFBasis[kGFBits] = {
 */
 
 static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size
-static const unsigned kFieldModulus = kFieldSize - 1;
+static const unsigned kModulus = kFieldSize - 1;
 
 static ffe_t GFLog[kFieldSize];
 static ffe_t GFExp[kFieldSize];
@@ -107,14 +107,14 @@ static ffe_t GFExp[kFieldSize];
 static void InitField()
 {
     unsigned state = 1;
-    for (unsigned i = 0; i < kFieldModulus; ++i)
+    for (unsigned i = 0; i < kModulus; ++i)
     {
         GFExp[state] = static_cast<ffe_t>(i);
         state <<= 1;
         if (state >= kFieldSize)
             state ^= kGFPolynomial;
     }
-    GFExp[0] = kFieldModulus;
+    GFExp[0] = kModulus;
 
     // Conversion to chosen basis:
 
@@ -134,7 +134,7 @@ static void InitField()
     for (unsigned i = 0; i < kFieldSize; ++i)
         GFExp[GFLog[i]] = i;
 
-    GFExp[kFieldModulus] = GFExp[0];
+    GFExp[kModulus] = GFExp[0];
 }
 
 
@@ -239,7 +239,7 @@ static void formal_derivative(ffe_t* cos, const unsigned size)
 //------------------------------------------------------------------------------
 // Fast Fourier Transform
 
-static ffe_t skewVec[kFieldModulus]; // twisted factors used in FFT
+static ffe_t skewVec[kModulus]; // twisted factors used in FFT
 
 static LEO_FORCE_INLINE void ifft_butterfly(ffe_t& a, ffe_t& b, ffe_t skew)
 {
@@ -256,7 +256,7 @@ static void IFLT(ffe_t* data, const unsigned size, const unsigned index)
         {
             const ffe_t skew = skewVec[j + index - 1];
 
-            if (skew != kFieldModulus)
+            if (skew != kModulus)
             {
                 for (unsigned i = j - width; i < j; ++i)
                     ifft_butterfly(data[i], data[i + width], skew);
@@ -287,7 +287,7 @@ static void FLT(ffe_t* data, const unsigned size, const unsigned skewIndex, cons
         {
             const ffe_t skew = skewLUT[j];
 
-            if (skew != kFieldModulus)
+            if (skew != kModulus)
             {
                 for (unsigned i = j; i < j + width; ++i)
                     fft_butterfly(data[i], data[i + width], skew);
@@ -330,20 +330,20 @@ static void InitFieldOperations()
                 skewVec[j + s] = skewVec[j] ^ temp[i];
         }
 
-        temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])];
+        temp[m] = kModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])];
 
         for (unsigned i = m + 1; i < (kGFBits - 1); ++i)
-            temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus);
+            temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kModulus);
     }
 
     for (unsigned i = 0; i < kFieldSize; ++i)
         skewVec[i] = GFLog[skewVec[i]];
 
 #if 0
-    temp[0] = kFieldModulus - temp[0];
+    temp[0] = kModulus - temp[0];
 
     for (unsigned i = 1; i < (kGFBits - 1); ++i)
-        temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus;
+        temp[i] = (kModulus - temp[i] + temp[i - 1]) % kModulus;
 
     B[0] = 0;
     for (unsigned i = 0; i < (kGFBits - 1); ++i)
@@ -351,7 +351,7 @@ static void InitFieldOperations()
         const unsigned depart = ((unsigned)1 << i);
 
         for (unsigned j = 0; j < depart; ++j)
-            B[j + depart] = (B[j] + temp[i]) % kFieldModulus;
+            B[j + depart] = (B[j] + temp[i]) % kModulus;
     }
 #endif
 
@@ -419,7 +419,7 @@ static void decode(ffe_t* codeword, const unsigned m, const unsigned original_co
     FWHT(log_walsh2, kGFBits);
 
     for (unsigned i = 0; i < kFieldSize; ++i)
-        log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus;
+        log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kModulus;
 
     FWHT(log_walsh2, kGFBits);
 
@@ -449,8 +449,8 @@ static void decode(ffe_t* codeword, const unsigned m, const unsigned original_co
     // Note: Preserves zeroes on the right
     for (unsigned i = 0; i < m + original_count; i += 2)
     {
-        codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]);
-        codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]);
+        codeword[i] = mulE(codeword[i], kModulus - B[i >> 1]);
+        codeword[i + 1] = mulE(codeword[i + 1], kModulus - B[i >> 1]);
     }
 #endif
 
@@ -471,7 +471,7 @@ static void decode(ffe_t* codeword, const unsigned m, const unsigned original_co
     {
         if (erasure[i])
         {
-            codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]);
+            codeword[i] = mulE(codeword[i], kModulus - log_walsh2[i]);
         }
     }
 }
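
Note: the 4-stream kernels introduced above (xor_mem4, fft_butterfly4, ifft_butterfly4) land in this patch as empty FIXME stubs, so the commit only restores the build around them. For reference, xor_mem4 is declared as four independent x_i[] ^= y_i[] streams of equal length. The sketch below is illustrative only, assuming nothing beyond the xor_mem kernel in LeopardCommon.cpp; the name xor_mem4_reference is hypothetical, and this is not the interleaved SSE/AVX2 implementation the stub is reserved for:

    // Illustrative sketch only: produces the same result as a finished
    // xor_mem4 by running the four streams through the scalar entry point.
    // A production version would interleave the four streams in one loop
    // (as the removed xor_mem2/xor_mem3 did) to hide memory latency.
    static void xor_mem4_reference(
        void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
        void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
        void * LEO_RESTRICT x_2, const void * LEO_RESTRICT y_2,
        void * LEO_RESTRICT x_3, const void * LEO_RESTRICT y_3,
        uint64_t bytes)
    {
        xor_mem(x_0, y_0, bytes);
        xor_mem(x_1, y_1, bytes);
        xor_mem(x_2, y_2, bytes);
        xor_mem(x_3, y_3, bytes);
    }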