mirror of https://github.com/status-im/leopard.git
Refactor multiply table code
This commit is contained in:
parent
283c1aac22
commit
49960e90f3
942
LeopardFF16.cpp
942
LeopardFF16.cpp
File diff suppressed because it is too large
Load Diff
|
@ -76,44 +76,90 @@ void FWHT(ffe_t data[kOrder]);
|
|||
//------------------------------------------------------------------------------
|
||||
// Multiplies
|
||||
|
||||
// x[] = y[] * m
|
||||
void mul_mem_set(
|
||||
// x[] = exp(log(y[]) + log_m)
|
||||
void mul_mem(
|
||||
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
|
||||
ffe_t m, uint64_t bytes);
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// FFT Operations
|
||||
|
||||
// x[] ^= y[] * m, y[] ^= x[]
|
||||
/*
|
||||
Precondition: log_m != kModulus
|
||||
|
||||
x[] ^= exp(log(y[]) + log_m)
|
||||
y[] ^= x[]
|
||||
*/
|
||||
void fft_butterfly(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t m, uint64_t bytes);
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
|
||||
// For i = {0, 1, 2, 3}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
|
||||
#ifdef LEO_USE_VECTOR4_OPT
|
||||
|
||||
// Unroll 4 rows at a time
|
||||
void fft_butterfly4(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
||||
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
|
||||
ffe_t m, uint64_t bytes);
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
|
||||
#endif // LEO_USE_VECTOR4_OPT
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// IFFT Operations
|
||||
|
||||
// y[] ^= x[], x[] ^= y[] * m
|
||||
/*
|
||||
Precondition: log_m != kModulus
|
||||
|
||||
y[] ^= x[]
|
||||
x[] ^= exp(log(y[]) + log_m)
|
||||
*/
|
||||
void ifft_butterfly(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t m, uint64_t bytes);
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
|
||||
// For i = {0, 1, 2, 3}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
|
||||
#ifdef LEO_USE_VECTOR4_OPT
|
||||
|
||||
// Unroll 4 rows at a time
|
||||
void ifft_butterfly4(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
||||
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
|
||||
ffe_t m, uint64_t bytes);
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
|
||||
#endif // LEO_USE_VECTOR4_OPT
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// FFT
|
||||
|
||||
/*
|
||||
if (log_m != kModulus)
|
||||
x[] ^= exp(log(y[]) + log_m)
|
||||
y[] ^= x[]
|
||||
*/
|
||||
void VectorFFTButterfly(
|
||||
const uint64_t bytes,
|
||||
unsigned count,
|
||||
void** x,
|
||||
void** y,
|
||||
const ffe_t log_m);
|
||||
|
||||
/*
|
||||
y[] ^= x[]
|
||||
if (log_m != kModulus)
|
||||
x[] ^= exp(log(y[]) + log_m)
|
||||
*/
|
||||
void VectorIFFTButterfly(
|
||||
const uint64_t bytes,
|
||||
unsigned count,
|
||||
void** x,
|
||||
void** y,
|
||||
const ffe_t log_m);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
|
|
@ -253,20 +253,17 @@ static void InitializeLogarithmTables()
|
|||
ExpLUT[kModulus] = ExpLUT[0];
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Multiplies
|
||||
|
||||
// We require memory to be aligned since the SIMD instructions benefit from
|
||||
// or require aligned accesses to the table data.
|
||||
struct {
|
||||
LEO_ALIGNED LEO_M128 Lo[256];
|
||||
LEO_ALIGNED LEO_M128 Hi[256];
|
||||
} static Multiply128LUT;
|
||||
LEO_ALIGNED LEO_M128 Value[kBits / 4];
|
||||
} static Multiply128LUT[kOrder];
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
struct {
|
||||
LEO_ALIGNED LEO_M256 Lo[256];
|
||||
LEO_ALIGNED LEO_M256 Hi[256];
|
||||
} static Multiply256LUT;
|
||||
LEO_ALIGNED LEO_M256 Value[kBits / 4];
|
||||
} static Multiply256LUT[kOrder];
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
// Returns a * b, where log_b = Log(b)
|
||||
|
@ -285,33 +282,33 @@ static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
|
|||
return ExpLUT[AddMod(LogLUT[a], log_b)];
|
||||
}
|
||||
|
||||
|
||||
void InitializeMultiplyTables()
|
||||
{
|
||||
for (int log_y = 0; log_y < 256; ++log_y)
|
||||
// For each value we could multiply by:
|
||||
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
|
||||
{
|
||||
uint8_t lo[16], hi[16];
|
||||
for (uint8_t x = 0; x < 16; ++x)
|
||||
// For each 4-bit nibble of a field element (kBits / 4 nibbles):
|
||||
for (unsigned i = 0, shift = 0; i < kBits / 4; ++i, shift += 4)
|
||||
{
|
||||
lo[x] = MultiplyLog(x, static_cast<uint8_t>(log_y));
|
||||
hi[x] = MultiplyLog(x << 4, static_cast<uint8_t>(log_y));
|
||||
}
|
||||
// Construct 16 entry LUT for PSHUFB
|
||||
ffe_t lut[16];
|
||||
for (uint8_t x = 0; x < 16; ++x)
|
||||
lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));
|
||||
|
||||
const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo);
|
||||
const LEO_M128 table_hi = _mm_loadu_si128((LEO_M128*)hi);
|
||||
|
||||
_mm_storeu_si128(Multiply128LUT.Lo + log_y, table_lo);
|
||||
_mm_storeu_si128(Multiply128LUT.Hi + log_y, table_hi);
|
||||
// Store in 128-bit wide table
|
||||
const LEO_M128 *v_ptr = reinterpret_cast<const LEO_M128 *>(&lut[0]);
|
||||
const LEO_M128 value = _mm_loadu_si128(v_ptr);
|
||||
_mm_storeu_si128(&Multiply128LUT[log_m].Value[i], value);
|
||||
|
||||
// Store in 256-bit wide table
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
_mm256_storeu_si256(Multiply256LUT.Lo + log_y,
|
||||
_mm256_broadcastsi128_si256(table_lo));
|
||||
_mm256_storeu_si256(Multiply256LUT.Hi + log_y,
|
||||
_mm256_broadcastsi128_si256(table_hi));
|
||||
}
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
_mm256_storeu_si256(&Multiply256LUT[log_m].Value[i],
|
||||
_mm256_broadcastsi128_si256(value));
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -323,8 +320,8 @@ void mul_mem(
|
|||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
|
||||
const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
|
||||
const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
|
||||
const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
|
||||
|
||||
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
|
||||
|
||||
|
@ -353,8 +350,8 @@ void mul_mem(
|
|||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
|
||||
const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
|
||||
const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
|
||||
const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
|
||||
|
||||
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
|
||||
|
||||
|
@ -393,8 +390,8 @@ void fft_butterfly(
|
|||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
|
||||
const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
|
||||
const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
|
||||
const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
|
||||
|
||||
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
|
||||
|
||||
|
@ -427,8 +424,8 @@ void fft_butterfly(
|
|||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
|
||||
const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
|
||||
const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
|
||||
const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
|
||||
|
||||
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
|
||||
|
||||
|
@ -472,8 +469,8 @@ void fft_butterfly4(
|
|||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
|
||||
const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
|
||||
const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
|
||||
const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
|
||||
|
||||
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
|
||||
|
||||
|
@ -511,8 +508,8 @@ void fft_butterfly4(
|
|||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
|
||||
const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
|
||||
const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
|
||||
const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
|
||||
|
||||
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
|
||||
|
||||
|
@ -568,8 +565,8 @@ void ifft_butterfly(
|
|||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
|
||||
const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
|
||||
const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
|
||||
const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
|
||||
|
||||
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
|
||||
|
||||
|
@ -602,8 +599,8 @@ void ifft_butterfly(
|
|||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
|
||||
const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
|
||||
const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
|
||||
const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
|
||||
|
||||
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
|
||||
|
||||
|
@ -647,8 +644,8 @@ void ifft_butterfly4(
|
|||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
|
||||
const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
|
||||
const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
|
||||
const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
|
||||
|
||||
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
|
||||
|
||||
|
@ -686,8 +683,8 @@ void ifft_butterfly4(
|
|||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
|
||||
const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
|
||||
const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
|
||||
const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
|
||||
|
||||
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
|
||||
|
||||
|
|
|
@ -45,11 +45,11 @@ struct TestParameters
|
|||
unsigned original_count = 1000; // under 65536
|
||||
unsigned recovery_count = 100; // under 65536 - original_count
|
||||
#else
|
||||
unsigned original_count = 200; // under 65536
|
||||
unsigned recovery_count = 20; // under 65536 - original_count
|
||||
unsigned original_count = 128; // under 65536
|
||||
unsigned recovery_count = 128; // under 65536 - original_count
|
||||
#endif
|
||||
unsigned buffer_bytes = 64000; // multiple of 64 bytes
|
||||
unsigned loss_count = 20; // some fraction of original_count
|
||||
unsigned loss_count = 128; // some fraction of original_count
|
||||
unsigned seed = 0;
|
||||
bool multithreaded = true;
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue