diff --git a/LeopardCommon.h b/LeopardCommon.h index 75fb93f..f12f7b3 100644 --- a/LeopardCommon.h +++ b/LeopardCommon.h @@ -150,7 +150,7 @@ // Constants // Unroll inner loops 4 times -#define LEO_USE_VECTOR4_OPT +//#define LEO_USE_VECTOR4_OPT // Define this to enable the optimized version of FWHT() #define LEO_FWHT_OPT @@ -158,6 +158,9 @@ // Avoid scheduling reduced FFT operations that are unneeded #define LEO_SCHEDULE_OPT +// Avoid calculating final FFT values in decoder using bitfield +//#define LEO_ERROR_BITFIELD_OPT + // Optimize M=1 case #define LEO_M1_OPT diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp index 8a0ab34..d030ac0 100644 --- a/LeopardFF16.cpp +++ b/LeopardFF16.cpp @@ -75,8 +75,6 @@ static inline ffe_t SubMod(const ffe_t a, const ffe_t b) //------------------------------------------------------------------------------ // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus) -#if defined(LEO_FWHT_OPT) - // {a, b} = {a + b, a - b} (Mod Q) static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b) { @@ -86,6 +84,8 @@ static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b b = dif; } +#if defined(LEO_FWHT_OPT) + static LEO_FORCE_INLINE void FWHT_4(ffe_t* data) { ffe_t t0 = data[0]; @@ -304,6 +304,23 @@ static ffe_t LogLUT[kOrder]; static ffe_t ExpLUT[kOrder]; +// Returns a * Log(b) +static ffe_t MultiplyLog(ffe_t a, ffe_t log_b) +{ + /* + Note that this operation is not a normal multiplication in a finite + field because the right operand is already a logarithm. This is done + because it moves K table lookups from the Decode() method into the + initialization step that is less performance critical. The LogWalsh[] + table below contains precalculated logarithms so it is easier to do + all the other multiplies in that form as well. + */ + if (a == 0) + return 0; + return ExpLUT[AddMod(LogLUT[a], log_b)]; +} + + // Initialize LogLUT[], ExpLUT[] static void InitializeLogarithmTables() { @@ -344,63 +361,59 @@ static void InitializeLogarithmTables() //------------------------------------------------------------------------------ // Multiplies +/* + The multiplication algorithm used follows the approach outlined in {4}. + Specifically section 7 outlines the algorithm used here for 16-bit fields. + I use the ALTMAP memory layout since I do not need to convert in/out of it. +*/ + struct { - LEO_ALIGNED LEO_M128 Value[kBits / 4]; + LEO_ALIGNED LEO_M128 Lo[4]; + LEO_ALIGNED LEO_M128 Hi[4]; } static Multiply128LUT[kOrder]; #if defined(LEO_TRY_AVX2) struct { - LEO_ALIGNED LEO_M256 Value[kBits / 4]; + LEO_ALIGNED LEO_M256 Lo[4]; + LEO_ALIGNED LEO_M256 Hi[4]; } static Multiply256LUT[kOrder]; #endif // LEO_TRY_AVX2 -// Returns a * Log(b) -static ffe_t MultiplyLog(ffe_t a, ffe_t log_b) -{ - /* - Note that this operation is not a normal multiplication in a finite - field because the right operand is already a logarithm. This is done - because it moves K table lookups from the Decode() method into the - initialization step that is less performance critical. The LogWalsh[] - table below contains precalculated logarithms so it is easier to do - all the other multiplies in that form as well. - */ - if (a == 0) - return 0; - return ExpLUT[AddMod(LogLUT[a], log_b)]; -} - void InitializeMultiplyTables() { + // For each value we could multiply by: for (unsigned log_m = 0; log_m < kOrder; ++log_m) { - uint8_t table0[16], table1[16], table2[16], table3[16]; - - for (uint8_t x = 0; x < 16; ++x) + // For each 4 bits of the finite field width in bits: + for (unsigned i = 0, shift = 0; i < 4; ++i, shift += 4) { - table0[x] = MultiplyLog(x, static_cast(log_m)); - table1[x] = MultiplyLog(x << 4, static_cast(log_m)); - table2[x] = MultiplyLog(x << 8, static_cast(log_m)); - table3[x] = MultiplyLog(x << 12, static_cast(log_m)); - } + // Construct 16 entry LUT for PSHUFB + uint8_t prod_lo[16], prod_hi[16]; + for (ffe_t x = 0; x < 16; ++x) + { + const ffe_t prod = MultiplyLog(x << shift, static_cast(log_m)); + prod_lo[x] = static_cast(prod); + prod_hi[x] = static_cast(prod >> 8); + } - const LEO_M128 value0 = _mm_loadu_si128((LEO_M128*)table0); - const LEO_M128 value1 = _mm_loadu_si128((LEO_M128*)table1); - const LEO_M128 value2 = _mm_loadu_si128((LEO_M128*)table2); - const LEO_M128 value3 = _mm_loadu_si128((LEO_M128*)table3); + const LEO_M128 value_lo = _mm_loadu_si128((LEO_M128*)prod_lo); + const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi); - _mm_storeu_si128(&Multiply128LUT[log_m].Value[0], value0); - _mm_storeu_si128(&Multiply128LUT[log_m].Value[1], value1); + // Store in 128-bit wide table + _mm_storeu_si128(&Multiply128LUT[log_m].Lo[i], value_lo); + _mm_storeu_si128(&Multiply128LUT[log_m].Hi[i], value_hi); + // Store in 256-bit wide table #if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) - { - _mm256_storeu_si256(&Multiply256LUT[log_m].Value[0], - _mm256_broadcastsi128_si256(table_lo)); - _mm256_storeu_si256(&Multiply256LUT[log_m].Value[1], - _mm256_broadcastsi128_si256(table_hi)); - } + if (CpuHasAVX2) + { + _mm256_storeu_si256(&Multiply256LUT[log_m].Lo[i], + _mm256_broadcastsi128_si256(value_lo)); + _mm256_storeu_si256(&Multiply256LUT[log_m].Hi[i], + _mm256_broadcastsi128_si256(value_hi)); + } #endif // LEO_TRY_AVX2 + } } } @@ -412,8 +425,17 @@ void mul_mem( #if defined(LEO_TRY_AVX2) if (CpuHasAVX2) { - const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]); - const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]); +#define LEO_MUL_TABLES_256() \ + const LEO_M256 T0_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[0]); \ + const LEO_M256 T1_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[1]); \ + const LEO_M256 T2_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[2]); \ + const LEO_M256 T3_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[3]); \ + const LEO_M256 T0_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[0]); \ + const LEO_M256 T1_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[1]); \ + const LEO_M256 T2_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[2]); \ + const LEO_M256 T3_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[3]); + + LEO_MUL_TABLES_256(); const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f); @@ -423,15 +445,25 @@ void mul_mem( do { #define LEO_MUL_256(x_ptr, y_ptr) { \ - LEO_M256 data = _mm256_loadu_si256(y_ptr); \ - LEO_M256 lo = _mm256_and_si256(data, clr_mask); \ - lo = _mm256_shuffle_epi8(table_lo_y, lo); \ - LEO_M256 hi = _mm256_srli_epi64(data, 4); \ - hi = _mm256_and_si256(hi, clr_mask); \ - hi = _mm256_shuffle_epi8(table_hi_y, hi); \ - _mm256_storeu_si256(x_ptr, _mm256_xor_si256(lo, hi)); } + const LEO_M256 A_lo = _mm256_loadu_si256(y_ptr); \ + const LEO_M256 A_hi = _mm256_loadu_si256(y_ptr + 1); \ + LEO_M256 data_0 = _mm256_and_si256(A_lo, clr_mask); \ + LEO_M256 data_1 = _mm256_srli_epi64(A_lo, 4); \ + data_1 = _mm256_and_si256(data_1, clr_mask); \ + LEO_M256 data_2 = _mm256_and_si256(A_hi, clr_mask); \ + LEO_M256 data_3 = _mm256_srli_epi64(A_hi, 4); \ + data_3 = _mm256_and_si256(data_3, clr_mask); \ + LEO_M256 output_lo = _mm256_shuffle_epi8(T0_lo, data_0); \ + output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \ + output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T2_lo, data_2)); \ + output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T3_lo, data_3)); \ + LEO_M256 output_hi = _mm256_shuffle_epi8(T0_hi, data_0); \ + output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \ + output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T2_hi, data_2)); \ + output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T3_hi, data_3)); \ + _mm256_storeu_si256(x_ptr, output_lo); \ + _mm256_storeu_si256(x_ptr + 1, output_hi); } - LEO_MUL_256(x32 + 1, y32 + 1); LEO_MUL_256(x32, y32); y32 += 2, x32 += 2; @@ -442,8 +474,17 @@ void mul_mem( } #endif // LEO_TRY_AVX2 - const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]); - const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]); +#define LEO_MUL_TABLES_128() \ + const LEO_M128 T0_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[0]); \ + const LEO_M128 T1_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[1]); \ + const LEO_M128 T2_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[2]); \ + const LEO_M128 T3_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[3]); \ + const LEO_M128 T0_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[0]); \ + const LEO_M128 T1_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[1]); \ + const LEO_M128 T2_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[2]); \ + const LEO_M128 T3_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[3]); + + LEO_MUL_TABLES_128(); const LEO_M128 clr_mask = _mm_set1_epi8(0x0f); @@ -453,16 +494,25 @@ void mul_mem( do { #define LEO_MUL_128(x_ptr, y_ptr) { \ - LEO_M128 data = _mm_loadu_si128(y_ptr); \ - LEO_M128 lo = _mm_and_si128(data, clr_mask); \ - lo = _mm_shuffle_epi8(table_lo_y, lo); \ - LEO_M128 hi = _mm_srli_epi64(data, 4); \ - hi = _mm_and_si128(hi, clr_mask); \ - hi = _mm_shuffle_epi8(table_hi_y, hi); \ - _mm_storeu_si128(x_ptr, _mm_xor_si128(lo, hi)); } + const LEO_M128 A_lo = _mm_loadu_si128(y_ptr); \ + const LEO_M128 A_hi = _mm_loadu_si128(y_ptr + 2); \ + LEO_M128 data_0 = _mm_and_si128(A_lo, clr_mask); \ + LEO_M128 data_1 = _mm_srli_epi64(A_lo, 4); \ + data_1 = _mm_and_si128(data_1, clr_mask); \ + LEO_M128 data_2 = _mm_and_si128(A_hi, clr_mask); \ + LEO_M128 data_3 = _mm_srli_epi64(A_hi, 4); \ + data_3 = _mm_and_si128(data_3, clr_mask); \ + LEO_M128 output_lo = _mm_shuffle_epi8(T0_lo, data_0); \ + output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T1_lo, data_1)); \ + output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T2_lo, data_2)); \ + output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T3_lo, data_3)); \ + LEO_M128 output_hi = _mm_shuffle_epi8(T0_hi, data_0); \ + output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T1_hi, data_1)); \ + output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T2_hi, data_2)); \ + output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T3_hi, data_3)); \ + _mm_storeu_si128(x_ptr, output_lo); \ + _mm_storeu_si128(x_ptr + 2, output_hi); } - LEO_MUL_128(x16 + 3, y16 + 3); - LEO_MUL_128(x16 + 2, y16 + 2); LEO_MUL_128(x16 + 1, y16 + 1); LEO_MUL_128(x16, y16); x16 += 4, y16 += 4; @@ -482,8 +532,7 @@ void fft_butterfly( #if defined(LEO_TRY_AVX2) if (CpuHasAVX2) { - const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]); - const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]); + LEO_MUL_TABLES_256(); const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f); @@ -493,19 +542,33 @@ void fft_butterfly( do { #define LEO_FFTB_256(x_ptr, y_ptr) { \ - LEO_M256 y_data = _mm256_loadu_si256(y_ptr); \ - LEO_M256 lo = _mm256_and_si256(y_data, clr_mask); \ - lo = _mm256_shuffle_epi8(table_lo_y, lo); \ - LEO_M256 hi = _mm256_srli_epi64(y_data, 4); \ - hi = _mm256_and_si256(hi, clr_mask); \ - hi = _mm256_shuffle_epi8(table_hi_y, hi); \ - LEO_M256 x_data = _mm256_loadu_si256(x_ptr); \ - x_data = _mm256_xor_si256(x_data, _mm256_xor_si256(lo, hi)); \ - y_data = _mm256_xor_si256(y_data, x_data); \ - _mm256_storeu_si256(x_ptr, x_data); \ - _mm256_storeu_si256(y_ptr, y_data); } + LEO_M256 y_lo = _mm256_loadu_si256(y_ptr); \ + LEO_M256 data_0 = _mm256_and_si256(y_lo, clr_mask); \ + LEO_M256 data_1 = _mm256_srli_epi64(y_lo, 4); \ + data_1 = _mm256_and_si256(data_1, clr_mask); \ + LEO_M256 prod_lo = _mm256_shuffle_epi8(T0_lo, data_0); \ + prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \ + LEO_M256 prod_hi = _mm256_shuffle_epi8(T0_hi, data_0); \ + prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \ + LEO_M256 y_hi = _mm256_loadu_si256(y_ptr + 1); \ + data_0 = _mm256_and_si256(y_hi, clr_mask); \ + data_1 = _mm256_srli_epi64(y_hi, 4); \ + data_1 = _mm256_and_si256(data_1, clr_mask); \ + prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T2_lo, data_0)); \ + prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T3_lo, data_1)); \ + prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T2_hi, data_0)); \ + prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T3_hi, data_1)); \ + LEO_M256 x_lo = _mm256_loadu_si256(x_ptr); \ + LEO_M256 x_hi = _mm256_loadu_si256(x_ptr + 1); \ + x_lo = _mm256_xor_si256(prod_lo, x_lo); \ + _mm256_storeu_si256(x_ptr, x_lo); \ + x_hi = _mm256_xor_si256(prod_hi, x_hi); \ + _mm256_storeu_si256(x_ptr + 1, x_hi); \ + y_lo = _mm256_xor_si256(y_lo, x_lo); \ + _mm256_storeu_si256(y_ptr, y_lo); \ + y_hi = _mm256_xor_si256(y_hi, x_hi); \ + _mm256_storeu_si256(y_ptr + 1, y_hi); } - LEO_FFTB_256(x32 + 1, y32 + 1); LEO_FFTB_256(x32, y32); y32 += 2, x32 += 2; @@ -516,8 +579,7 @@ void fft_butterfly( } #endif // LEO_TRY_AVX2 - const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]); - const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]); + LEO_MUL_TABLES_128(); const LEO_M128 clr_mask = _mm_set1_epi8(0x0f); @@ -527,20 +589,33 @@ void fft_butterfly( do { #define LEO_FFTB_128(x_ptr, y_ptr) { \ - LEO_M128 y_data = _mm_loadu_si128(y_ptr); \ - LEO_M128 lo = _mm_and_si128(y_data, clr_mask); \ - lo = _mm_shuffle_epi8(table_lo_y, lo); \ - LEO_M128 hi = _mm_srli_epi64(y_data, 4); \ - hi = _mm_and_si128(hi, clr_mask); \ - hi = _mm_shuffle_epi8(table_hi_y, hi); \ - LEO_M128 x_data = _mm_loadu_si128(x_ptr); \ - x_data = _mm_xor_si128(x_data, _mm_xor_si128(lo, hi)); \ - y_data = _mm_xor_si128(y_data, x_data); \ - _mm_storeu_si128(x_ptr, x_data); \ - _mm_storeu_si128(y_ptr, y_data); } + LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \ + LEO_M128 data_0 = _mm_and_si128(y_lo, clr_mask); \ + LEO_M128 data_1 = _mm_srli_epi64(y_lo, 4); \ + data_1 = _mm_and_si128(data_1, clr_mask); \ + LEO_M128 prod_lo = _mm_shuffle_epi8(T0_lo, data_0); \ + prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T1_lo, data_1)); \ + LEO_M128 prod_hi = _mm_shuffle_epi8(T0_hi, data_0); \ + prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T1_hi, data_1)); \ + LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \ + data_0 = _mm_and_si128(y_hi, clr_mask); \ + data_1 = _mm_srli_epi64(y_hi, 4); \ + data_1 = _mm_and_si128(data_1, clr_mask); \ + prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T2_lo, data_0)); \ + prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T3_lo, data_1)); \ + prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T2_hi, data_0)); \ + prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T3_hi, data_1)); \ + LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \ + LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \ + x_lo = _mm_xor_si128(prod_lo, x_lo); \ + _mm_storeu_si128(x_ptr, x_lo); \ + x_hi = _mm_xor_si128(prod_hi, x_hi); \ + _mm_storeu_si128(x_ptr + 2, x_hi); \ + y_lo = _mm_xor_si128(y_lo, x_lo); \ + _mm_storeu_si128(y_ptr, y_lo); \ + y_hi = _mm_xor_si128(y_hi, x_hi); \ + _mm_storeu_si128(y_ptr + 2, y_hi); } - LEO_FFTB_128(x16 + 3, y16 + 3); - LEO_FFTB_128(x16 + 2, y16 + 2); LEO_FFTB_128(x16 + 1, y16 + 1); LEO_FFTB_128(x16, y16); x16 += 4, y16 += 4; @@ -600,8 +675,7 @@ void fft_butterfly4( } #endif // LEO_TRY_AVX2 - const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]); - const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]); + LEO_MUL_TABLES_128(); const LEO_M128 clr_mask = _mm_set1_epi8(0x0f); @@ -657,8 +731,7 @@ void ifft_butterfly( #if defined(LEO_TRY_AVX2) if (CpuHasAVX2) { - const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]); - const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]); + LEO_MUL_TABLES_256(); const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f); @@ -668,19 +741,33 @@ void ifft_butterfly( do { #define LEO_IFFTB_256(x_ptr, y_ptr) { \ - LEO_M256 x_data = _mm256_loadu_si256(x_ptr); \ - LEO_M256 y_data = _mm256_loadu_si256(y_ptr); \ - y_data = _mm256_xor_si256(y_data, x_data); \ - _mm256_storeu_si256(y_ptr, y_data); \ - LEO_M256 lo = _mm256_and_si256(y_data, clr_mask); \ - lo = _mm256_shuffle_epi8(table_lo_y, lo); \ - LEO_M256 hi = _mm256_srli_epi64(y_data, 4); \ - hi = _mm256_and_si256(hi, clr_mask); \ - hi = _mm256_shuffle_epi8(table_hi_y, hi); \ - x_data = _mm256_xor_si256(x_data, _mm256_xor_si256(lo, hi)); \ - _mm256_storeu_si256(x_ptr, x_data); } + LEO_M256 x_lo = _mm256_loadu_si256(x_ptr); \ + LEO_M256 y_lo = _mm256_loadu_si256(y_ptr); \ + y_lo = _mm256_xor_si256(y_lo, x_lo); \ + _mm256_storeu_si256(y_ptr, y_lo); \ + LEO_M256 data_0 = _mm256_and_si256(y_lo, clr_mask); \ + LEO_M256 data_1 = _mm256_srli_epi64(y_lo, 4); \ + data_1 = _mm256_and_si256(data_1, clr_mask); \ + LEO_M256 prod_lo = _mm256_shuffle_epi8(T0_lo, data_0); \ + prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \ + LEO_M256 prod_hi = _mm256_shuffle_epi8(T0_hi, data_0); \ + prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \ + LEO_M256 x_hi = _mm256_loadu_si256(x_ptr + 1); \ + LEO_M256 y_hi = _mm256_loadu_si256(y_ptr + 1); \ + y_hi = _mm256_xor_si256(y_hi, x_hi); \ + _mm256_storeu_si256(y_ptr + 1, y_hi); \ + data_0 = _mm256_and_si256(y_hi, clr_mask); \ + data_1 = _mm256_srli_epi64(y_hi, 4); \ + data_1 = _mm256_and_si256(data_1, clr_mask); \ + prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T2_lo, data_0)); \ + prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T3_lo, data_1)); \ + prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T2_hi, data_0)); \ + prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T3_hi, data_1)); \ + x_lo = _mm256_xor_si256(prod_lo, x_lo); \ + _mm256_storeu_si256(x_ptr, x_lo); \ + x_hi = _mm256_xor_si256(prod_hi, x_hi); \ + _mm256_storeu_si256(x_ptr + 1, x_hi); } - LEO_IFFTB_256(x32 + 1, y32 + 1); LEO_IFFTB_256(x32, y32); y32 += 2, x32 += 2; @@ -691,8 +778,7 @@ void ifft_butterfly( } #endif // LEO_TRY_AVX2 - const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]); - const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]); + LEO_MUL_TABLES_128(); const LEO_M128 clr_mask = _mm_set1_epi8(0x0f); @@ -702,20 +788,33 @@ void ifft_butterfly( do { #define LEO_IFFTB_128(x_ptr, y_ptr) { \ - LEO_M128 x_data = _mm_loadu_si128(x_ptr); \ - LEO_M128 y_data = _mm_loadu_si128(y_ptr); \ - y_data = _mm_xor_si128(y_data, x_data); \ - _mm_storeu_si128(y_ptr, y_data); \ - LEO_M128 lo = _mm_and_si128(y_data, clr_mask); \ - lo = _mm_shuffle_epi8(table_lo_y, lo); \ - LEO_M128 hi = _mm_srli_epi64(y_data, 4); \ - hi = _mm_and_si128(hi, clr_mask); \ - hi = _mm_shuffle_epi8(table_hi_y, hi); \ - x_data = _mm_xor_si128(x_data, _mm_xor_si128(lo, hi)); \ - _mm_storeu_si128(x_ptr, x_data); } + LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \ + LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \ + y_lo = _mm_xor_si128(y_lo, x_lo); \ + _mm_storeu_si128(y_ptr, y_lo); \ + LEO_M128 data_0 = _mm_and_si128(y_lo, clr_mask); \ + LEO_M128 data_1 = _mm_srli_epi64(y_lo, 4); \ + data_1 = _mm_and_si128(data_1, clr_mask); \ + LEO_M128 prod_lo = _mm_shuffle_epi8(T0_lo, data_0); \ + prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T1_lo, data_1)); \ + LEO_M128 prod_hi = _mm_shuffle_epi8(T0_hi, data_0); \ + prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T1_hi, data_1)); \ + LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \ + LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \ + y_hi = _mm_xor_si128(y_hi, x_hi); \ + _mm_storeu_si128(y_ptr + 2, y_hi); \ + data_0 = _mm_and_si128(y_hi, clr_mask); \ + data_1 = _mm_srli_epi64(y_hi, 4); \ + data_1 = _mm_and_si128(data_1, clr_mask); \ + prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T2_lo, data_0)); \ + prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T3_lo, data_1)); \ + prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T2_hi, data_0)); \ + prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T3_hi, data_1)); \ + x_lo = _mm_xor_si128(prod_lo, x_lo); \ + _mm_storeu_si128(x_ptr, x_lo); \ + x_hi = _mm_xor_si128(prod_hi, x_hi); \ + _mm_storeu_si128(x_ptr + 2, x_hi); } - LEO_IFFTB_128(x16 + 3, y16 + 3); - LEO_IFFTB_128(x16 + 2, y16 + 2); LEO_IFFTB_128(x16 + 1, y16 + 1); LEO_IFFTB_128(x16, y16); x16 += 4, y16 += 4; @@ -1105,13 +1204,21 @@ skip_body: //------------------------------------------------------------------------------ // ErrorBitfield -#ifdef LEO_SCHEDULE_OPT +#ifdef LEO_ERROR_BITFIELD_OPT // Used in decoding to decide which final FFT operations to perform class ErrorBitfield { + static const unsigned kWordMips = 5; static const unsigned kWords = kOrder / 64; - uint64_t Words[7][kWords] = {}; + uint64_t Words[kWordMips][kWords] = {}; + + static const unsigned kBigMips = 5; + static const unsigned kBigWords = (kWords + 63) / 64; + uint64_t BigWords[kBigMips][kBigWords] = {}; + + static const unsigned kBiggestMips = 5; + uint64_t BiggestWords[kBiggestMips] = {}; public: LEO_FORCE_INLINE void Set(unsigned i) @@ -1123,8 +1230,18 @@ public: LEO_FORCE_INLINE bool IsNeeded(unsigned mip_level, unsigned bit) const { - if (mip_level >= 8) + if (mip_level >= 16) return true; + if (mip_level >= 11) + { + bit /= 4096; + return 0 != (BiggestWords[mip_level - 11] & ((uint64_t)1 << bit)); + } + if (mip_level >= 6) + { + bit /= 64; + return 0 != (BigWords[mip_level - 6][bit / 64] & ((uint64_t)1 << (bit % 64))); + } return 0 != (Words[mip_level - 1][bit / 64] & ((uint64_t)1 << (bit % 64))); } }; @@ -1142,33 +1259,57 @@ void ErrorBitfield::Prepare() // First mip level is for final layer of FFT: pairs of data for (unsigned i = 0; i < kWords; ++i) { - const uint64_t w0 = Words[0][i]; - const uint64_t hi2lo0 = w0 | ((w0 & kHiMasks[0]) >> 1); - const uint64_t lo2hi0 = ((w0 & (kHiMasks[0] >> 1)) << 1); - Words[0][i] = hi2lo0 | lo2hi0; + uint64_t w_i = Words[0][i]; + const uint64_t hi2lo0 = w_i | ((w_i & kHiMasks[0]) >> 1); + const uint64_t lo2hi0 = ((w_i & (kHiMasks[0] >> 1)) << 1); + Words[0][i] = w_i = hi2lo0 | lo2hi0; - for (unsigned j = 1, bits = 2; j < 5; ++j, bits <<= 1) + for (unsigned j = 1, bits = 2; j < kWordMips; ++j, bits <<= 1) { - const uint64_t w_j = Words[j - 1][i]; - const uint64_t hi2lo_j = w_j | ((w_j & kHiMasks[j]) >> bits); - const uint64_t lo2hi_j = ((w_j & (kHiMasks[j] >> bits)) << bits); - Words[j][i] = hi2lo_j | lo2hi_j; + const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j]) >> bits); + const uint64_t lo2hi_j = ((w_i & (kHiMasks[j] >> bits)) << bits); + Words[j][i] = w_i = hi2lo_j | lo2hi_j; } } - for (unsigned i = 0; i < kWords; ++i) + for (unsigned i = 0; i < kBigWords; ++i) { - uint64_t w = Words[4][i]; - w |= w >> 32; - w |= w << 32; - Words[5][i] = w; + uint64_t w_i = 0; + uint64_t bit = 1; + const uint64_t* src = &Words[4][i * 64]; + for (unsigned j = 0; j < 64; ++j, bit <<= 1) + { + const uint64_t w = src[j]; + w_i |= (w | (w >> 32) | (w << 32)) & bit; + } + BigWords[0][i] = w_i; + + for (unsigned j = 1, bits = 1; j < kBigMips; ++j, bits <<= 1) + { + const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j - 1]) >> bits); + const uint64_t lo2hi_j = ((w_i & (kHiMasks[j - 1] >> bits)) << bits); + BigWords[j][i] = w_i = hi2lo_j | lo2hi_j; + } } - for (unsigned i = 0; i < kWords; i += 2) - Words[6][i] = Words[6][i + 1] = Words[5][i] | Words[5][i + 1]; + uint64_t w_i = 0; + uint64_t bit = 1; + for (unsigned j = 0; j < kBigWords; ++j, bit <<= 1) + { + const uint64_t w = BigWords[kBigMips - 1][j]; + w_i |= (w | (w >> 32) | (w << 32)) & bit; + } + BiggestWords[0] = w_i; + + for (unsigned j = 1, bits = 1; j < kBiggestMips; ++j, bits <<= 1) + { + const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j - 1]) >> bits); + const uint64_t lo2hi_j = ((w_i & (kHiMasks[j - 1] >> bits)) << bits); + BiggestWords[j] = w_i = hi2lo_j | lo2hi_j; + } } -#endif // LEO_SCHEDULE_OPT +#endif // LEO_ERROR_BITFIELD_OPT //------------------------------------------------------------------------------ @@ -1186,9 +1327,9 @@ void ReedSolomonDecode( { // Fill in error locations -#ifdef LEO_SCHEDULE_OPT +#ifdef LEO_ERROR_BITFIELD_OPT ErrorBitfield ErrorBits; -#endif // LEO_SCHEDULE_OPT +#endif // LEO_ERROR_BITFIELD_OPT ffe_t ErrorLocations[kOrder] = {}; for (unsigned i = 0; i < recovery_count; ++i) @@ -1201,15 +1342,15 @@ void ReedSolomonDecode( if (!original[i]) { ErrorLocations[i + m] = 1; -#ifdef LEO_SCHEDULE_OPT +#ifdef LEO_ERROR_BITFIELD_OPT ErrorBits.Set(i + m); -#endif // LEO_SCHEDULE_OPT +#endif // LEO_ERROR_BITFIELD_OPT } } -#ifdef LEO_SCHEDULE_OPT +#ifdef LEO_ERROR_BITFIELD_OPT ErrorBits.Prepare(); -#endif // LEO_SCHEDULE_OPT +#endif // LEO_ERROR_BITFIELD_OPT // Evaluate error locator polynomial @@ -1291,10 +1432,10 @@ void ReedSolomonDecode( for (unsigned j = 0; j < n; j += range) #endif { -#ifdef LEO_SCHEDULE_OPT +#ifdef LEO_ERROR_BITFIELD_OPT if (!ErrorBits.IsNeeded(mip_level, j)) continue; -#endif // LEO_SCHEDULE_OPT +#endif // LEO_ERROR_BITFIELD_OPT VectorFFTButterfly( buffer_bytes, diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp index b78c1d0..2f3f1a3 100644 --- a/LeopardFF8.cpp +++ b/LeopardFF8.cpp @@ -214,6 +214,23 @@ static ffe_t LogLUT[kOrder]; static ffe_t ExpLUT[kOrder]; +// Returns a * Log(b) +static ffe_t MultiplyLog(ffe_t a, ffe_t log_b) +{ + /* + Note that this operation is not a normal multiplication in a finite + field because the right operand is already a logarithm. This is done + because it moves K table lookups from the Decode() method into the + initialization step that is less performance critical. The LogWalsh[] + table below contains precalculated logarithms so it is easier to do + all the other multiplies in that form as well. + */ + if (a == 0) + return 0; + return ExpLUT[AddMod(LogLUT[a], log_b)]; +} + + // Initialize LogLUT[], ExpLUT[] static void InitializeLogarithmTables() { @@ -257,30 +274,20 @@ static void InitializeLogarithmTables() //------------------------------------------------------------------------------ // Multiplies +/* + The multiplication algorithm used follows the approach outlined in {4}. + Specifically section 6 outlines the algorithm used here for 8-bit fields. +*/ + struct { - LEO_ALIGNED LEO_M128 Value[kBits / 4]; + LEO_ALIGNED LEO_M128 Value[2]; } static Multiply128LUT[kOrder]; #if defined(LEO_TRY_AVX2) struct { - LEO_ALIGNED LEO_M256 Value[kBits / 4]; + LEO_ALIGNED LEO_M256 Value[2]; } static Multiply256LUT[kOrder]; #endif // LEO_TRY_AVX2 -// Returns a * Log(b) -static ffe_t MultiplyLog(ffe_t a, ffe_t log_b) -{ - /* - Note that this operation is not a normal multiplication in a finite - field because the right operand is already a logarithm. This is done - because it moves K table lookups from the Decode() method into the - initialization step that is less performance critical. The LogWalsh[] - table below contains precalculated logarithms so it is easier to do - all the other multiplies in that form as well. - */ - if (a == 0) - return 0; - return ExpLUT[AddMod(LogLUT[a], log_b)]; -} void InitializeMultiplyTables() { @@ -288,11 +295,11 @@ void InitializeMultiplyTables() for (unsigned log_m = 0; log_m < kOrder; ++log_m) { // For each 4 bits of the finite field width in bits: - for (unsigned i = 0, shift = 0; i < kBits / 4; ++i, shift += 4) + for (unsigned i = 0, shift = 0; i < 2; ++i, shift += 4) { // Construct 16 entry LUT for PSHUFB - ffe_t lut[16]; - for (uint8_t x = 0; x < 16; ++x) + uint8_t lut[16]; + for (ffe_t x = 0; x < 16; ++x) lut[x] = MultiplyLog(x << shift, static_cast(log_m)); // Store in 128-bit wide table @@ -1013,7 +1020,7 @@ skip_body: //------------------------------------------------------------------------------ // ErrorBitfield -#ifdef LEO_SCHEDULE_OPT +#ifdef LEO_ERROR_BITFIELD_OPT // Used in decoding to decide which final FFT operations to perform class ErrorBitfield @@ -1050,17 +1057,16 @@ void ErrorBitfield::Prepare() // First mip level is for final layer of FFT: pairs of data for (unsigned i = 0; i < kWords; ++i) { - const uint64_t w0 = Words[0][i]; - const uint64_t hi2lo0 = w0 | ((w0 & kHiMasks[0]) >> 1); - const uint64_t lo2hi0 = ((w0 & (kHiMasks[0] >> 1)) << 1); - Words[0][i] = hi2lo0 | lo2hi0; + uint64_t w_i = Words[0][i]; + const uint64_t hi2lo0 = w_i | ((w_i & kHiMasks[0]) >> 1); + const uint64_t lo2hi0 = ((w_i & (kHiMasks[0] >> 1)) << 1); + Words[0][i] = w_i = hi2lo0 | lo2hi0; for (unsigned j = 1, bits = 2; j < 5; ++j, bits <<= 1) { - const uint64_t w_j = Words[j - 1][i]; - const uint64_t hi2lo_j = w_j | ((w_j & kHiMasks[j]) >> bits); - const uint64_t lo2hi_j = ((w_j & (kHiMasks[j] >> bits)) << bits); - Words[j][i] = hi2lo_j | lo2hi_j; + const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j]) >> bits); + const uint64_t lo2hi_j = ((w_i & (kHiMasks[j] >> bits)) << bits); + Words[j][i] = w_i = hi2lo_j | lo2hi_j; } } @@ -1076,7 +1082,7 @@ void ErrorBitfield::Prepare() Words[6][i] = Words[6][i + 1] = Words[5][i] | Words[5][i + 1]; } -#endif // LEO_SCHEDULE_OPT +#endif // LEO_ERROR_BITFIELD_OPT //------------------------------------------------------------------------------ @@ -1094,9 +1100,9 @@ void ReedSolomonDecode( { // Fill in error locations -#ifdef LEO_SCHEDULE_OPT +#ifdef LEO_ERROR_BITFIELD_OPT ErrorBitfield ErrorBits; -#endif // LEO_SCHEDULE_OPT +#endif // LEO_ERROR_BITFIELD_OPT ffe_t ErrorLocations[kOrder] = {}; for (unsigned i = 0; i < recovery_count; ++i) @@ -1109,15 +1115,15 @@ void ReedSolomonDecode( if (!original[i]) { ErrorLocations[i + m] = 1; -#ifdef LEO_SCHEDULE_OPT +#ifdef LEO_ERROR_BITFIELD_OPT ErrorBits.Set(i + m); -#endif // LEO_SCHEDULE_OPT +#endif // LEO_ERROR_BITFIELD_OPT } } -#ifdef LEO_SCHEDULE_OPT +#ifdef LEO_ERROR_BITFIELD_OPT ErrorBits.Prepare(); -#endif // LEO_SCHEDULE_OPT +#endif // LEO_ERROR_BITFIELD_OPT // Evaluate error locator polynomial @@ -1199,10 +1205,10 @@ void ReedSolomonDecode( for (unsigned j = 0; j < n; j += range) #endif { -#ifdef LEO_SCHEDULE_OPT +#ifdef LEO_ERROR_BITFIELD_OPT if (!ErrorBits.IsNeeded(mip_level, j)) continue; -#endif // LEO_SCHEDULE_OPT +#endif // LEO_ERROR_BITFIELD_OPT VectorFFTButterfly( buffer_bytes, diff --git a/README.md b/README.md index d22352a..c133f1d 100644 --- a/README.md +++ b/README.md @@ -308,7 +308,8 @@ This library implements an MDS erasure code introduced in this paper: ~~~ {1} S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung, - "Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes" + "Novel Polynomial Basis with Fast Fourier Transform + and Its Application to Reed-Solomon Erasure Codes" IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016. ~~~ @@ -318,10 +319,16 @@ This library implements an MDS erasure code introduced in this paper: ~~~ ~~~ - {3} Sian-Jheng Lin, Wei-Ho Chung, “An Efficient (n, k) Information - Dispersal Algorithm for High Code Rate System over Fermat Fields,” + {3} Sian-Jheng Lin, Wei-Ho Chung, "An Efficient (n, k) Information + Dispersal Algorithm for High Code Rate System over Fermat Fields," IEEE Commun. Lett., vol.16, no.12, pp. 2036-2039, Dec. 2012. ~~~ + +~~~ + {4} Plank, J. S., Greenan, K. M., Miller, E. L., "Screaming fast Galois Field + arithmetic using Intel SIMD instructions." In: FAST-2013: 11th Usenix + Conference on File and Storage Technologies, San Jose, 2013 +~~~ Some papers are mirrored in the /docs/ folder. diff --git a/docs/plank-fast13.pdf b/docs/plank-fast13.pdf new file mode 100644 index 0000000..a597e9a Binary files /dev/null and b/docs/plank-fast13.pdf differ diff --git a/leopard.h b/leopard.h index 7722c7f..07529df 100644 --- a/leopard.h +++ b/leopard.h @@ -42,25 +42,29 @@ Bulat Ziganshin : Author of FastECC Yutaka Sawada : Author of MultiPar + References: {1} S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung, - "Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes" + "Novel Polynomial Basis with Fast Fourier Transform + and Its Application to Reed-Solomon Erasure Codes" IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016. - http://ct.ee.ntust.edu.tw/it2016-2.pdf {2} D. G. Cantor, "On arithmetical algorithms over finite fields", Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. - {3} Sian-Jheng Lin, Wei-Ho Chung, An Efficient (n, k) Information - Dispersal Algorithm for High Code Rate System over Fermat Fields, + {3} Sian-Jheng Lin, Wei-Ho Chung, "An Efficient (n, k) Information + Dispersal Algorithm for High Code Rate System over Fermat Fields," IEEE Commun. Lett., vol.16, no.12, pp. 2036-2039, Dec. 2012. + + {4} Plank, J. S., Greenan, K. M., Miller, E. L., "Screaming fast Galois Field + arithmetic using Intel SIMD instructions." In: FAST-2013: 11th Usenix + Conference on File and Storage Technologies, San Jose, 2013 */ /* TODO: - + New 16-bit Muladd inner loops - + Benchmarks for large data! + + Benchmarks for large data! + Add multi-threading to split up long parallelizable calculations + Final benchmarks! + Release version 1 @@ -76,7 +80,7 @@ // Enable 8-bit or 16-bit fields #define LEO_HAS_FF8 -//#define LEO_HAS_FF16 +#define LEO_HAS_FF16 // Tweak if the functions are exported or statically linked //#define LEO_DLL /* Defined when building/linking as DLL */ diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp index 6d408ee..cf00cc9 100644 --- a/tests/benchmark.cpp +++ b/tests/benchmark.cpp @@ -42,15 +42,15 @@ using namespace std; struct TestParameters { #ifdef LEO_HAS_FF16 - unsigned original_count = 1000; // under 65536 - unsigned recovery_count = 100; // under 65536 - original_count + unsigned original_count = 677; // under 65536 + unsigned recovery_count = 487; // under 65536 - original_count #else unsigned original_count = 128; // under 65536 unsigned recovery_count = 128; // under 65536 - original_count #endif - unsigned buffer_bytes = 64000; // multiple of 64 bytes - unsigned loss_count = 128; // some fraction of original_count - unsigned seed = 0; + unsigned buffer_bytes = 640; // multiple of 64 bytes + unsigned loss_count = 2; // some fraction of original_count + unsigned seed = 2; bool multithreaded = true; }; @@ -535,10 +535,12 @@ static bool BasicTest(const TestParameters& params) t_mem_free.EndCall(); } +#if 0 t_mem_alloc.Print(kTrials); t_leo_encode.Print(kTrials); t_leo_decode.Print(kTrials); t_mem_free.Print(kTrials); +#endif float encode_input_MBPS = total_bytes * kTrials / (float)(t_leo_encode.TotalUsec); float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count * kTrials / (float)(t_leo_encode.TotalUsec); @@ -784,6 +786,7 @@ int main(int argc, char **argv) #endif TestParameters params; + PCGRandom prng; if (argc >= 2) params.original_count = atoi(argv[1]); @@ -804,6 +807,20 @@ int main(int argc, char **argv) if (!BasicTest(params)) goto Failed; + prng.Seed(params.seed, 7); + for (;; ++params.seed) + { + params.original_count = prng.Next() % 32768; + params.recovery_count = prng.Next() % params.original_count + 1; + params.loss_count = prng.Next() % params.recovery_count + 1; + + cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl; + + if (!BasicTest(params)) + goto Failed; + } + +#ifdef LEO_BENCH_ALL_256_PARAMS for (unsigned original_count = 1; original_count <= 256; ++original_count) { for (unsigned recovery_count = 1; recovery_count <= original_count; ++recovery_count) @@ -818,6 +835,7 @@ int main(int argc, char **argv) goto Failed; } } +#endif Failed: getchar(); diff --git a/tests/experiments.cpp b/tests/experiments.cpp index 26ce205..da447eb 100644 --- a/tests/experiments.cpp +++ b/tests/experiments.cpp @@ -33,7 +33,7 @@ #include -#define LEO_SHORT_FIELD +//#define LEO_SHORT_FIELD //#define LEO_EXPERIMENT_EXTRA_XOR //#define LEO_EXPERIMENT_EXTRA_MULS #define LEO_EXPERIMENT_CANTOR_BASIS @@ -603,8 +603,8 @@ int main(int argc, char **argv) const unsigned input_count = 32; const unsigned recovery_count = 16; #else // LEO_SHORT_FIELD - const unsigned input_count = 10000; - const unsigned recovery_count = 2000; + const unsigned input_count = 32768; + const unsigned recovery_count = 32768; #endif // LEO_SHORT_FIELD test(input_count, recovery_count, seed);