Refactor multiply table code

Christopher Taylor 2017-05-29 15:01:01 -07:00
parent 283c1aac22
commit 49960e90f3
4 changed files with 790 additions and 317 deletions

File diff suppressed because it is too large


@@ -76,44 +76,90 @@ void FWHT(ffe_t data[kOrder]);
 //------------------------------------------------------------------------------
 // Multiplies
 
-// x[] = y[] * m
-void mul_mem_set(
+// x[] = exp(log(y[]) + log_m)
+void mul_mem(
     void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
-    ffe_t m, uint64_t bytes);
+    ffe_t log_m, uint64_t bytes);
 
 //------------------------------------------------------------------------------
 // FFT Operations
 
-// x[] ^= y[] * m, y[] ^= x[]
+/*
+    Precondition: log_m != kModulus
+
+    x[] ^= exp(log(y[]) + log_m)
+    y[] ^= x[]
+*/
 void fft_butterfly(
     void * LEO_RESTRICT x, void * LEO_RESTRICT y,
-    ffe_t m, uint64_t bytes);
+    ffe_t log_m, uint64_t bytes);
 
-// For i = {0, 1, 2, 3}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
+#ifdef LEO_USE_VECTOR4_OPT
+
+// Unroll 4 rows at a time
 void fft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
     void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
     void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
     void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
-    ffe_t m, uint64_t bytes);
+    ffe_t log_m, uint64_t bytes);
+
+#endif // LEO_USE_VECTOR4_OPT
 
 //------------------------------------------------------------------------------
 // IFFT Operations
 
-// y[] ^= x[], x[] ^= y[] * m
+/*
+    Precondition: log_m != kModulus
+
+    y[] ^= x[]
+    x[] ^= exp(log(y[]) + log_m)
+*/
 void ifft_butterfly(
     void * LEO_RESTRICT x, void * LEO_RESTRICT y,
-    ffe_t m, uint64_t bytes);
+    ffe_t log_m, uint64_t bytes);
 
-// For i = {0, 1, 2, 3}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
+#ifdef LEO_USE_VECTOR4_OPT
+
+// Unroll 4 rows at a time
 void ifft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
     void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
     void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
     void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
-    ffe_t m, uint64_t bytes);
+    ffe_t log_m, uint64_t bytes);
+
+#endif // LEO_USE_VECTOR4_OPT
+
+//------------------------------------------------------------------------------
+// FFT
+
+/*
+    if (log_m != kModulus)
+        x[] ^= exp(log(y[]) + log_m)
+    y[] ^= x[]
+*/
+void VectorFFTButterfly(
+    const uint64_t bytes,
+    unsigned count,
+    void** x,
+    void** y,
+    const ffe_t log_m);
+
+/*
+    y[] ^= x[]
+    if (log_m != kModulus)
+        x[] ^= exp(log(y[]) + log_m)
+*/
+void VectorIFFTButterfly(
+    const uint64_t bytes,
+    unsigned count,
+    void** x,
+    void** y,
+    const ffe_t log_m);
 
 //------------------------------------------------------------------------------
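
For reference, the per-byte semantics promised by the new comments above can be modeled in scalar C++. This is an illustrative sketch only, not code from this commit: RefMulLog and the ref_* names are hypothetical, an 8-bit field (ffe_t = uint8_t, modulus 255) is assumed, and LogLUT/ExpLUT are the tables built by InitializeLogarithmTables() in the source file below.

    #include <cstdint>

    typedef uint8_t ffe_t;                    // assumption: 8-bit field element, GF(2^8)
    static const unsigned kRefModulus = 255;  // assumption: kOrder - 1

    extern ffe_t LogLUT[256];                 // log/antilog tables from the .cpp file
    extern ffe_t ExpLUT[256];

    // Multiply a by the constant whose logarithm is log_m; zero maps to zero.
    static ffe_t RefMulLog(ffe_t a, ffe_t log_m)
    {
        if (a == 0)
            return 0;
        const unsigned sum = LogLUT[a] + log_m;
        return ExpLUT[sum >= kRefModulus ? sum - kRefModulus : sum];
    }

    // mul_mem: x[i] = y[i] * exp(log_m)
    static void ref_mul_mem(ffe_t* x, const ffe_t* y, ffe_t log_m, uint64_t bytes)
    {
        for (uint64_t i = 0; i < bytes; ++i)
            x[i] = RefMulLog(y[i], log_m);
    }

    // fft_butterfly: x[] ^= y[] * exp(log_m), then y[] ^= x[]
    static void ref_fft_butterfly(ffe_t* x, ffe_t* y, ffe_t log_m, uint64_t bytes)
    {
        for (uint64_t i = 0; i < bytes; ++i)
        {
            x[i] ^= RefMulLog(y[i], log_m);
            y[i] ^= x[i];
        }
    }

    // ifft_butterfly: y[] ^= x[], then x[] ^= y[] * exp(log_m)
    static void ref_ifft_butterfly(ffe_t* x, ffe_t* y, ffe_t log_m, uint64_t bytes)
    {
        for (uint64_t i = 0; i < bytes; ++i)
        {
            y[i] ^= x[i];
            x[i] ^= RefMulLog(y[i], log_m);
        }
    }

Per the header comments, the new VectorFFTButterfly / VectorIFFTButterfly entry points differ only in taking arrays of row pointers plus a count, and in skipping the multiply step when log_m == kModulus.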


@@ -253,20 +253,17 @@ static void InitializeLogarithmTables()
     ExpLUT[kModulus] = ExpLUT[0];
 }
 
 
 //------------------------------------------------------------------------------
 // Multiplies
 
+// We require memory to be aligned since the SIMD instructions benefit from
+// or require aligned accesses to the table data.
 struct {
-    LEO_ALIGNED LEO_M128 Lo[256];
-    LEO_ALIGNED LEO_M128 Hi[256];
-} static Multiply128LUT;
+    LEO_ALIGNED LEO_M128 Value[kBits / 4];
+} static Multiply128LUT[kOrder];
 
 #if defined(LEO_TRY_AVX2)
 struct {
-    LEO_ALIGNED LEO_M256 Lo[256];
-    LEO_ALIGNED LEO_M256 Hi[256];
-} static Multiply256LUT;
+    LEO_ALIGNED LEO_M256 Value[kBits / 4];
+} static Multiply256LUT[kOrder];
 #endif // LEO_TRY_AVX2
 
 
 // Returns a * Log(b)
@@ -285,33 +282,33 @@ static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
     return ExpLUT[AddMod(LogLUT[a], log_b)];
 }
 
 
 void InitializeMultiplyTables()
 {
-    for (int log_y = 0; log_y < 256; ++log_y)
+    // For each value we could multiply by:
+    for (unsigned log_m = 0; log_m < kOrder; ++log_m)
     {
-        uint8_t lo[16], hi[16];
-        for (uint8_t x = 0; x < 16; ++x)
-        {
-            lo[x] = MultiplyLog(x, static_cast<uint8_t>(log_y));
-            hi[x] = MultiplyLog(x << 4, static_cast<uint8_t>(log_y));
-        }
-
-        const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo);
-        const LEO_M128 table_hi = _mm_loadu_si128((LEO_M128*)hi);
-
-        _mm_storeu_si128(Multiply128LUT.Lo + log_y, table_lo);
-        _mm_storeu_si128(Multiply128LUT.Hi + log_y, table_hi);
-
+        // For each 4 bits of the finite field width in bits:
+        for (unsigned i = 0, shift = 0; i < kBits / 4; ++i, shift += 4)
+        {
+            // Construct 16 entry LUT for PSHUFB
+            ffe_t lut[16];
+            for (uint8_t x = 0; x < 16; ++x)
+                lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));
+
+            // Store in 128-bit wide table
+            const LEO_M128 *v_ptr = reinterpret_cast<const LEO_M128 *>(&lut[0]);
+            const LEO_M128 value = _mm_loadu_si128(v_ptr);
+            _mm_storeu_si128(&Multiply128LUT[log_m].Value[i], value);
+
+            // Store in 256-bit wide table
 #if defined(LEO_TRY_AVX2)
-        if (CpuHasAVX2)
-        {
-            _mm256_storeu_si256(Multiply256LUT.Lo + log_y,
-                _mm256_broadcastsi128_si256(table_lo));
-            _mm256_storeu_si256(Multiply256LUT.Hi + log_y,
-                _mm256_broadcastsi128_si256(table_hi));
-        }
+            if (CpuHasAVX2)
+            {
+                _mm256_storeu_si256(&Multiply256LUT[log_m].Value[i],
+                    _mm256_broadcastsi128_si256(value));
+            }
 #endif // LEO_TRY_AVX2
+        }
     }
 }
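
The rewritten InitializeMultiplyTables() stores, for every possible log_m, one 16-entry lookup table per 4-bit nibble of the input byte (kBits / 4 tables), sized so PSHUFB can perform the lookups. The idea: field addition is XOR and multiplication distributes over it, so y * m can be rebuilt from per-nibble partial products. A scalar model of that recombination, assuming the 8-bit field (the helper name is hypothetical):

    // For kBits = 8 there are two nibble tables per multiplier:
    //   lut_lo[n] = n * m          (low-nibble contribution)
    //   lut_hi[n] = (n << 4) * m   (high-nibble contribution)
    // which is exactly what the loop above packs into Multiply128LUT[log_m].Value[0..1].
    static uint8_t RefMulViaNibbleLUT(uint8_t y,
                                      const uint8_t lut_lo[16],
                                      const uint8_t lut_hi[16])
    {
        // y * m = (y_lo * m) ^ ((y_hi << 4) * m): field addition is XOR,
        // so the two partial products combine with a single XOR.
        return lut_lo[y & 0x0f] ^ lut_hi[y >> 4];
    }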
@@ -323,8 +320,8 @@ void mul_mem(
 #if defined(LEO_TRY_AVX2)
     if (CpuHasAVX2)
     {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
+        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
+        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
 
         const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
 
@@ -353,8 +350,8 @@ void mul_mem(
     }
 #endif // LEO_TRY_AVX2
 
-    const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
+    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
+    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
 
     const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
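
The table_lo_y / table_hi_y vectors loaded above feed a PSHUFB kernel. The loop body itself falls outside this hunk, so the following is a hedged sketch of one 16-byte SSSE3 step rather than the file's actual code; RefMulVec is a hypothetical name and LEO_M128 is assumed to be __m128i.

    #include <tmmintrin.h>  // SSSE3 intrinsics (_mm_shuffle_epi8)

    // Multiply 16 field bytes at once by the constant whose nibble tables
    // are table_lo_y / table_hi_y.
    static __m128i RefMulVec(__m128i data, __m128i table_lo_y, __m128i table_hi_y)
    {
        const __m128i clr_mask = _mm_set1_epi8(0x0f);

        // Split every byte into its low and high nibble.
        const __m128i lo = _mm_and_si128(data, clr_mask);
        const __m128i hi = _mm_and_si128(_mm_srli_epi64(data, 4), clr_mask);

        // PSHUFB: each nibble selects an entry from its 16-entry table.
        const __m128i prod_lo = _mm_shuffle_epi8(table_lo_y, lo);
        const __m128i prod_hi = _mm_shuffle_epi8(table_hi_y, hi);

        // Field addition is XOR, so the partial products combine with one XOR.
        return _mm_xor_si128(prod_lo, prod_hi);
    }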
@@ -393,8 +390,8 @@ void fft_butterfly(
 #if defined(LEO_TRY_AVX2)
     if (CpuHasAVX2)
     {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
+        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
+        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
 
         const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
 
@@ -427,8 +424,8 @@ void fft_butterfly(
     }
 #endif // LEO_TRY_AVX2
 
-    const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
+    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
+    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
 
     const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
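
fft_butterfly applies the same nibble-table multiply and then folds the result into both rows. A sketch of one 16-byte step, reusing the hypothetical RefMulVec helper above and the same <tmmintrin.h> intrinsics (an illustration, not the loop body hidden between these hunks):

    // x[] ^= y[] * m, then y[] ^= x[], for one 16-byte block.
    static void RefFFTButterflyStep(__m128i* x_block, __m128i* y_block,
                                    __m128i table_lo_y, __m128i table_hi_y)
    {
        __m128i x_data = _mm_loadu_si128(x_block);
        __m128i y_data = _mm_loadu_si128(y_block);

        x_data = _mm_xor_si128(x_data, RefMulVec(y_data, table_lo_y, table_hi_y));
        y_data = _mm_xor_si128(y_data, x_data);

        _mm_storeu_si128(x_block, x_data);
        _mm_storeu_si128(y_block, y_data);
    }

The IFFT butterfly below is the same step with the two updates performed in the opposite order.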
@@ -472,8 +469,8 @@ void fft_butterfly4(
 #if defined(LEO_TRY_AVX2)
     if (CpuHasAVX2)
     {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
+        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
+        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
 
         const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
 
@@ -511,8 +508,8 @@ void fft_butterfly4(
     }
 #endif // LEO_TRY_AVX2
 
-    const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
+    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
+    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
 
     const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
@@ -568,8 +565,8 @@ void ifft_butterfly(
 #if defined(LEO_TRY_AVX2)
     if (CpuHasAVX2)
     {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
+        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
+        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
 
         const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
 
@@ -602,8 +599,8 @@ void ifft_butterfly(
     }
 #endif // LEO_TRY_AVX2
 
-    const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
+    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
+    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
 
     const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
@@ -647,8 +644,8 @@ void ifft_butterfly4(
 #if defined(LEO_TRY_AVX2)
     if (CpuHasAVX2)
     {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
+        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
+        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
 
         const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
 
@@ -686,8 +683,8 @@ void ifft_butterfly4(
     }
 #endif // LEO_TRY_AVX2
 
-    const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
+    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
+    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
 
     const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
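
The new VectorFFTButterfly / VectorIFFTButterfly entry points declared in the header take arrays of row pointers plus a count. Their bodies are not part of the hunks shown here, so the following is only one plausible shape consistent with the declarations and comments: dispatch groups of four rows to the unrolled kernels when LEO_USE_VECTOR4_OPT is enabled, fall back to the single-row kernels, and skip the multiply entirely when log_m == kModulus.

    // Hypothetical dispatch loop; not the commit's actual implementation.
    void RefVectorIFFTButterfly(
        const uint64_t bytes, unsigned count,
        void** x, void** y, const ffe_t log_m)
    {
        unsigned i = 0;
    #ifdef LEO_USE_VECTOR4_OPT
        if (log_m != kModulus)
            for (; i + 4 <= count; i += 4)
                ifft_butterfly4(x[i], y[i], x[i + 1], y[i + 1],
                                x[i + 2], y[i + 2], x[i + 3], y[i + 3],
                                log_m, bytes);
    #endif
        for (; i < count; ++i)
        {
            if (log_m != kModulus)
                ifft_butterfly(x[i], y[i], log_m, bytes);
            else
                xor_mem(y[i], x[i], bytes);  // assumption: an XOR-only helper exists
        }
    }

The FFT-direction variant would mirror this loop with fft_butterfly / fft_butterfly4 and the multiply applied before the XOR, per the header comments.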


@@ -45,11 +45,11 @@ struct TestParameters
     unsigned original_count = 1000; // under 65536
     unsigned recovery_count = 100; // under 65536 - original_count
 #else
-    unsigned original_count = 200; // under 65536
-    unsigned recovery_count = 20; // under 65536 - original_count
+    unsigned original_count = 128; // under 65536
+    unsigned recovery_count = 128; // under 65536 - original_count
 #endif
     unsigned buffer_bytes = 64000; // multiple of 64 bytes
-    unsigned loss_count = 20; // some fraction of original_count
+    unsigned loss_count = 128; // some fraction of original_count
     unsigned seed = 0;
     bool multithreaded = true;
 };
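
With these defaults the non-FF16 benchmark loses all 128 originals and repairs them from the 128 recovery pieces, i.e. it runs at the largest loss an erasure code with this recovery_count can repair. A small sanity check one might keep next to these defaults, reflecting the constraints already stated in the comments plus the recovery limit (illustrative only, not part of the benchmark source):

    // Illustrative parameter check; names beyond TestParameters are hypothetical.
    static bool RefParametersAreSane(const TestParameters& p)
    {
        return p.original_count >= 1
            && p.recovery_count >= 1
            && p.original_count + p.recovery_count < 65536 // total piece limit
            && p.loss_count <= p.recovery_count            // cannot repair more than this
            && p.loss_count <= p.original_count
            && p.buffer_bytes % 64 == 0;                   // "multiple of 64 bytes"
    }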