FF16 is mostly working still some bugs

This commit is contained in:
Christopher Taylor 2017-05-30 01:37:27 -07:00
parent 6e7dfea7c9
commit 2e1007c4aa
8 changed files with 386 additions and 207 deletions

View File

@ -150,7 +150,7 @@
// Constants
// Unroll inner loops 4 times
#define LEO_USE_VECTOR4_OPT
//#define LEO_USE_VECTOR4_OPT
// Define this to enable the optimized version of FWHT()
#define LEO_FWHT_OPT
@ -158,6 +158,9 @@
// Avoid scheduling reduced FFT operations that are unneeded
#define LEO_SCHEDULE_OPT
// Avoid calculating final FFT values in decoder using bitfield
//#define LEO_ERROR_BITFIELD_OPT
// Optimize M=1 case
#define LEO_M1_OPT

View File

@ -75,8 +75,6 @@ static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
//------------------------------------------------------------------------------
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
#if defined(LEO_FWHT_OPT)
// {a, b} = {a + b, a - b} (Mod Q)
static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
{
@ -86,6 +84,8 @@ static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b
b = dif;
}
#if defined(LEO_FWHT_OPT)
static LEO_FORCE_INLINE void FWHT_4(ffe_t* data)
{
ffe_t t0 = data[0];
@ -304,6 +304,23 @@ static ffe_t LogLUT[kOrder];
static ffe_t ExpLUT[kOrder];
// Returns a * Log(b)
static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
{
/*
Note that this operation is not a normal multiplication in a finite
field because the right operand is already a logarithm. This is done
because it moves K table lookups from the Decode() method into the
initialization step that is less performance critical. The LogWalsh[]
table below contains precalculated logarithms so it is easier to do
all the other multiplies in that form as well.
*/
if (a == 0)
return 0;
return ExpLUT[AddMod(LogLUT[a], log_b)];
}
// Initialize LogLUT[], ExpLUT[]
static void InitializeLogarithmTables()
{
@ -344,63 +361,59 @@ static void InitializeLogarithmTables()
//------------------------------------------------------------------------------
// Multiplies
/*
The multiplication algorithm used follows the approach outlined in {4}.
Specifically section 7 outlines the algorithm used here for 16-bit fields.
I use the ALTMAP memory layout since I do not need to convert in/out of it.
*/
struct {
LEO_ALIGNED LEO_M128 Value[kBits / 4];
LEO_ALIGNED LEO_M128 Lo[4];
LEO_ALIGNED LEO_M128 Hi[4];
} static Multiply128LUT[kOrder];
#if defined(LEO_TRY_AVX2)
struct {
LEO_ALIGNED LEO_M256 Value[kBits / 4];
LEO_ALIGNED LEO_M256 Lo[4];
LEO_ALIGNED LEO_M256 Hi[4];
} static Multiply256LUT[kOrder];
#endif // LEO_TRY_AVX2
// Returns a * Log(b)
static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
{
/*
Note that this operation is not a normal multiplication in a finite
field because the right operand is already a logarithm. This is done
because it moves K table lookups from the Decode() method into the
initialization step that is less performance critical. The LogWalsh[]
table below contains precalculated logarithms so it is easier to do
all the other multiplies in that form as well.
*/
if (a == 0)
return 0;
return ExpLUT[AddMod(LogLUT[a], log_b)];
}
void InitializeMultiplyTables()
{
// For each value we could multiply by:
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
{
uint8_t table0[16], table1[16], table2[16], table3[16];
for (uint8_t x = 0; x < 16; ++x)
// For each 4 bits of the finite field width in bits:
for (unsigned i = 0, shift = 0; i < 4; ++i, shift += 4)
{
table0[x] = MultiplyLog(x, static_cast<ffe_t>(log_m));
table1[x] = MultiplyLog(x << 4, static_cast<ffe_t>(log_m));
table2[x] = MultiplyLog(x << 8, static_cast<ffe_t>(log_m));
table3[x] = MultiplyLog(x << 12, static_cast<ffe_t>(log_m));
}
// Construct 16 entry LUT for PSHUFB
uint8_t prod_lo[16], prod_hi[16];
for (ffe_t x = 0; x < 16; ++x)
{
const ffe_t prod = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));
prod_lo[x] = static_cast<uint8_t>(prod);
prod_hi[x] = static_cast<uint8_t>(prod >> 8);
}
const LEO_M128 value0 = _mm_loadu_si128((LEO_M128*)table0);
const LEO_M128 value1 = _mm_loadu_si128((LEO_M128*)table1);
const LEO_M128 value2 = _mm_loadu_si128((LEO_M128*)table2);
const LEO_M128 value3 = _mm_loadu_si128((LEO_M128*)table3);
const LEO_M128 value_lo = _mm_loadu_si128((LEO_M128*)prod_lo);
const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi);
_mm_storeu_si128(&Multiply128LUT[log_m].Value[0], value0);
_mm_storeu_si128(&Multiply128LUT[log_m].Value[1], value1);
// Store in 128-bit wide table
_mm_storeu_si128(&Multiply128LUT[log_m].Lo[i], value_lo);
_mm_storeu_si128(&Multiply128LUT[log_m].Hi[i], value_hi);
// Store in 256-bit wide table
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
_mm256_storeu_si256(&Multiply256LUT[log_m].Value[0],
_mm256_broadcastsi128_si256(table_lo));
_mm256_storeu_si256(&Multiply256LUT[log_m].Value[1],
_mm256_broadcastsi128_si256(table_hi));
}
if (CpuHasAVX2)
{
_mm256_storeu_si256(&Multiply256LUT[log_m].Lo[i],
_mm256_broadcastsi128_si256(value_lo));
_mm256_storeu_si256(&Multiply256LUT[log_m].Hi[i],
_mm256_broadcastsi128_si256(value_hi));
}
#endif // LEO_TRY_AVX2
}
}
}
@ -412,8 +425,17 @@ void mul_mem(
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
#define LEO_MUL_TABLES_256() \
const LEO_M256 T0_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[0]); \
const LEO_M256 T1_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[1]); \
const LEO_M256 T2_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[2]); \
const LEO_M256 T3_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[3]); \
const LEO_M256 T0_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[0]); \
const LEO_M256 T1_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[1]); \
const LEO_M256 T2_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[2]); \
const LEO_M256 T3_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[3]);
LEO_MUL_TABLES_256();
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
@ -423,15 +445,25 @@ void mul_mem(
do
{
#define LEO_MUL_256(x_ptr, y_ptr) { \
LEO_M256 data = _mm256_loadu_si256(y_ptr); \
LEO_M256 lo = _mm256_and_si256(data, clr_mask); \
lo = _mm256_shuffle_epi8(table_lo_y, lo); \
LEO_M256 hi = _mm256_srli_epi64(data, 4); \
hi = _mm256_and_si256(hi, clr_mask); \
hi = _mm256_shuffle_epi8(table_hi_y, hi); \
_mm256_storeu_si256(x_ptr, _mm256_xor_si256(lo, hi)); }
const LEO_M256 A_lo = _mm256_loadu_si256(y_ptr); \
const LEO_M256 A_hi = _mm256_loadu_si256(y_ptr + 1); \
LEO_M256 data_0 = _mm256_and_si256(A_lo, clr_mask); \
LEO_M256 data_1 = _mm256_srli_epi64(A_lo, 4); \
data_1 = _mm256_and_si256(data_1, clr_mask); \
LEO_M256 data_2 = _mm256_and_si256(A_hi, clr_mask); \
LEO_M256 data_3 = _mm256_srli_epi64(A_hi, 4); \
data_3 = _mm256_and_si256(data_3, clr_mask); \
LEO_M256 output_lo = _mm256_shuffle_epi8(T0_lo, data_0); \
output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \
output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T2_lo, data_2)); \
output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T3_lo, data_3)); \
LEO_M256 output_hi = _mm256_shuffle_epi8(T0_hi, data_0); \
output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \
output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T2_hi, data_2)); \
output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T3_hi, data_3)); \
_mm256_storeu_si256(x_ptr, output_lo); \
_mm256_storeu_si256(x_ptr + 1, output_hi); }
LEO_MUL_256(x32 + 1, y32 + 1);
LEO_MUL_256(x32, y32);
y32 += 2, x32 += 2;
@ -442,8 +474,17 @@ void mul_mem(
}
#endif // LEO_TRY_AVX2
const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
#define LEO_MUL_TABLES_128() \
const LEO_M128 T0_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[0]); \
const LEO_M128 T1_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[1]); \
const LEO_M128 T2_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[2]); \
const LEO_M128 T3_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[3]); \
const LEO_M128 T0_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[0]); \
const LEO_M128 T1_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[1]); \
const LEO_M128 T2_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[2]); \
const LEO_M128 T3_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[3]);
LEO_MUL_TABLES_128();
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
@ -453,16 +494,25 @@ void mul_mem(
do
{
#define LEO_MUL_128(x_ptr, y_ptr) { \
LEO_M128 data = _mm_loadu_si128(y_ptr); \
LEO_M128 lo = _mm_and_si128(data, clr_mask); \
lo = _mm_shuffle_epi8(table_lo_y, lo); \
LEO_M128 hi = _mm_srli_epi64(data, 4); \
hi = _mm_and_si128(hi, clr_mask); \
hi = _mm_shuffle_epi8(table_hi_y, hi); \
_mm_storeu_si128(x_ptr, _mm_xor_si128(lo, hi)); }
const LEO_M128 A_lo = _mm_loadu_si128(y_ptr); \
const LEO_M128 A_hi = _mm_loadu_si128(y_ptr + 2); \
LEO_M128 data_0 = _mm_and_si128(A_lo, clr_mask); \
LEO_M128 data_1 = _mm_srli_epi64(A_lo, 4); \
data_1 = _mm_and_si128(data_1, clr_mask); \
LEO_M128 data_2 = _mm_and_si128(A_hi, clr_mask); \
LEO_M128 data_3 = _mm_srli_epi64(A_hi, 4); \
data_3 = _mm_and_si128(data_3, clr_mask); \
LEO_M128 output_lo = _mm_shuffle_epi8(T0_lo, data_0); \
output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T1_lo, data_1)); \
output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T2_lo, data_2)); \
output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T3_lo, data_3)); \
LEO_M128 output_hi = _mm_shuffle_epi8(T0_hi, data_0); \
output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T1_hi, data_1)); \
output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T2_hi, data_2)); \
output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T3_hi, data_3)); \
_mm_storeu_si128(x_ptr, output_lo); \
_mm_storeu_si128(x_ptr + 2, output_hi); }
LEO_MUL_128(x16 + 3, y16 + 3);
LEO_MUL_128(x16 + 2, y16 + 2);
LEO_MUL_128(x16 + 1, y16 + 1);
LEO_MUL_128(x16, y16);
x16 += 4, y16 += 4;
@ -482,8 +532,7 @@ void fft_butterfly(
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
LEO_MUL_TABLES_256();
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
@ -493,19 +542,33 @@ void fft_butterfly(
do
{
#define LEO_FFTB_256(x_ptr, y_ptr) { \
LEO_M256 y_data = _mm256_loadu_si256(y_ptr); \
LEO_M256 lo = _mm256_and_si256(y_data, clr_mask); \
lo = _mm256_shuffle_epi8(table_lo_y, lo); \
LEO_M256 hi = _mm256_srli_epi64(y_data, 4); \
hi = _mm256_and_si256(hi, clr_mask); \
hi = _mm256_shuffle_epi8(table_hi_y, hi); \
LEO_M256 x_data = _mm256_loadu_si256(x_ptr); \
x_data = _mm256_xor_si256(x_data, _mm256_xor_si256(lo, hi)); \
y_data = _mm256_xor_si256(y_data, x_data); \
_mm256_storeu_si256(x_ptr, x_data); \
_mm256_storeu_si256(y_ptr, y_data); }
LEO_M256 y_lo = _mm256_loadu_si256(y_ptr); \
LEO_M256 data_0 = _mm256_and_si256(y_lo, clr_mask); \
LEO_M256 data_1 = _mm256_srli_epi64(y_lo, 4); \
data_1 = _mm256_and_si256(data_1, clr_mask); \
LEO_M256 prod_lo = _mm256_shuffle_epi8(T0_lo, data_0); \
prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \
LEO_M256 prod_hi = _mm256_shuffle_epi8(T0_hi, data_0); \
prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \
LEO_M256 y_hi = _mm256_loadu_si256(y_ptr + 1); \
data_0 = _mm256_and_si256(y_hi, clr_mask); \
data_1 = _mm256_srli_epi64(y_hi, 4); \
data_1 = _mm256_and_si256(data_1, clr_mask); \
prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T2_lo, data_0)); \
prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T3_lo, data_1)); \
prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T2_hi, data_0)); \
prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T3_hi, data_1)); \
LEO_M256 x_lo = _mm256_loadu_si256(x_ptr); \
LEO_M256 x_hi = _mm256_loadu_si256(x_ptr + 1); \
x_lo = _mm256_xor_si256(prod_lo, x_lo); \
_mm256_storeu_si256(x_ptr, x_lo); \
x_hi = _mm256_xor_si256(prod_hi, x_hi); \
_mm256_storeu_si256(x_ptr + 1, x_hi); \
y_lo = _mm256_xor_si256(y_lo, x_lo); \
_mm256_storeu_si256(y_ptr, y_lo); \
y_hi = _mm256_xor_si256(y_hi, x_hi); \
_mm256_storeu_si256(y_ptr + 1, y_hi); }
LEO_FFTB_256(x32 + 1, y32 + 1);
LEO_FFTB_256(x32, y32);
y32 += 2, x32 += 2;
@ -516,8 +579,7 @@ void fft_butterfly(
}
#endif // LEO_TRY_AVX2
const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
LEO_MUL_TABLES_128();
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
@ -527,20 +589,33 @@ void fft_butterfly(
do
{
#define LEO_FFTB_128(x_ptr, y_ptr) { \
LEO_M128 y_data = _mm_loadu_si128(y_ptr); \
LEO_M128 lo = _mm_and_si128(y_data, clr_mask); \
lo = _mm_shuffle_epi8(table_lo_y, lo); \
LEO_M128 hi = _mm_srli_epi64(y_data, 4); \
hi = _mm_and_si128(hi, clr_mask); \
hi = _mm_shuffle_epi8(table_hi_y, hi); \
LEO_M128 x_data = _mm_loadu_si128(x_ptr); \
x_data = _mm_xor_si128(x_data, _mm_xor_si128(lo, hi)); \
y_data = _mm_xor_si128(y_data, x_data); \
_mm_storeu_si128(x_ptr, x_data); \
_mm_storeu_si128(y_ptr, y_data); }
LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
LEO_M128 data_0 = _mm_and_si128(y_lo, clr_mask); \
LEO_M128 data_1 = _mm_srli_epi64(y_lo, 4); \
data_1 = _mm_and_si128(data_1, clr_mask); \
LEO_M128 prod_lo = _mm_shuffle_epi8(T0_lo, data_0); \
prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T1_lo, data_1)); \
LEO_M128 prod_hi = _mm_shuffle_epi8(T0_hi, data_0); \
prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T1_hi, data_1)); \
LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \
data_0 = _mm_and_si128(y_hi, clr_mask); \
data_1 = _mm_srli_epi64(y_hi, 4); \
data_1 = _mm_and_si128(data_1, clr_mask); \
prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T2_lo, data_0)); \
prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T3_lo, data_1)); \
prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T2_hi, data_0)); \
prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T3_hi, data_1)); \
LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
x_lo = _mm_xor_si128(prod_lo, x_lo); \
_mm_storeu_si128(x_ptr, x_lo); \
x_hi = _mm_xor_si128(prod_hi, x_hi); \
_mm_storeu_si128(x_ptr + 2, x_hi); \
y_lo = _mm_xor_si128(y_lo, x_lo); \
_mm_storeu_si128(y_ptr, y_lo); \
y_hi = _mm_xor_si128(y_hi, x_hi); \
_mm_storeu_si128(y_ptr + 2, y_hi); }
LEO_FFTB_128(x16 + 3, y16 + 3);
LEO_FFTB_128(x16 + 2, y16 + 2);
LEO_FFTB_128(x16 + 1, y16 + 1);
LEO_FFTB_128(x16, y16);
x16 += 4, y16 += 4;
@ -600,8 +675,7 @@ void fft_butterfly4(
}
#endif // LEO_TRY_AVX2
const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
LEO_MUL_TABLES_128();
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
@ -657,8 +731,7 @@ void ifft_butterfly(
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
LEO_MUL_TABLES_256();
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
@ -668,19 +741,33 @@ void ifft_butterfly(
do
{
#define LEO_IFFTB_256(x_ptr, y_ptr) { \
LEO_M256 x_data = _mm256_loadu_si256(x_ptr); \
LEO_M256 y_data = _mm256_loadu_si256(y_ptr); \
y_data = _mm256_xor_si256(y_data, x_data); \
_mm256_storeu_si256(y_ptr, y_data); \
LEO_M256 lo = _mm256_and_si256(y_data, clr_mask); \
lo = _mm256_shuffle_epi8(table_lo_y, lo); \
LEO_M256 hi = _mm256_srli_epi64(y_data, 4); \
hi = _mm256_and_si256(hi, clr_mask); \
hi = _mm256_shuffle_epi8(table_hi_y, hi); \
x_data = _mm256_xor_si256(x_data, _mm256_xor_si256(lo, hi)); \
_mm256_storeu_si256(x_ptr, x_data); }
LEO_M256 x_lo = _mm256_loadu_si256(x_ptr); \
LEO_M256 y_lo = _mm256_loadu_si256(y_ptr); \
y_lo = _mm256_xor_si256(y_lo, x_lo); \
_mm256_storeu_si256(y_ptr, y_lo); \
LEO_M256 data_0 = _mm256_and_si256(y_lo, clr_mask); \
LEO_M256 data_1 = _mm256_srli_epi64(y_lo, 4); \
data_1 = _mm256_and_si256(data_1, clr_mask); \
LEO_M256 prod_lo = _mm256_shuffle_epi8(T0_lo, data_0); \
prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \
LEO_M256 prod_hi = _mm256_shuffle_epi8(T0_hi, data_0); \
prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \
LEO_M256 x_hi = _mm256_loadu_si256(x_ptr + 1); \
LEO_M256 y_hi = _mm256_loadu_si256(y_ptr + 1); \
y_hi = _mm256_xor_si256(y_hi, x_hi); \
_mm256_storeu_si256(y_ptr + 1, y_hi); \
data_0 = _mm256_and_si256(y_hi, clr_mask); \
data_1 = _mm256_srli_epi64(y_hi, 4); \
data_1 = _mm256_and_si256(data_1, clr_mask); \
prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T2_lo, data_0)); \
prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T3_lo, data_1)); \
prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T2_hi, data_0)); \
prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T3_hi, data_1)); \
x_lo = _mm256_xor_si256(prod_lo, x_lo); \
_mm256_storeu_si256(x_ptr, x_lo); \
x_hi = _mm256_xor_si256(prod_hi, x_hi); \
_mm256_storeu_si256(x_ptr + 1, x_hi); }
LEO_IFFTB_256(x32 + 1, y32 + 1);
LEO_IFFTB_256(x32, y32);
y32 += 2, x32 += 2;
@ -691,8 +778,7 @@ void ifft_butterfly(
}
#endif // LEO_TRY_AVX2
const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
LEO_MUL_TABLES_128();
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
@ -702,20 +788,33 @@ void ifft_butterfly(
do
{
#define LEO_IFFTB_128(x_ptr, y_ptr) { \
LEO_M128 x_data = _mm_loadu_si128(x_ptr); \
LEO_M128 y_data = _mm_loadu_si128(y_ptr); \
y_data = _mm_xor_si128(y_data, x_data); \
_mm_storeu_si128(y_ptr, y_data); \
LEO_M128 lo = _mm_and_si128(y_data, clr_mask); \
lo = _mm_shuffle_epi8(table_lo_y, lo); \
LEO_M128 hi = _mm_srli_epi64(y_data, 4); \
hi = _mm_and_si128(hi, clr_mask); \
hi = _mm_shuffle_epi8(table_hi_y, hi); \
x_data = _mm_xor_si128(x_data, _mm_xor_si128(lo, hi)); \
_mm_storeu_si128(x_ptr, x_data); }
LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
y_lo = _mm_xor_si128(y_lo, x_lo); \
_mm_storeu_si128(y_ptr, y_lo); \
LEO_M128 data_0 = _mm_and_si128(y_lo, clr_mask); \
LEO_M128 data_1 = _mm_srli_epi64(y_lo, 4); \
data_1 = _mm_and_si128(data_1, clr_mask); \
LEO_M128 prod_lo = _mm_shuffle_epi8(T0_lo, data_0); \
prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T1_lo, data_1)); \
LEO_M128 prod_hi = _mm_shuffle_epi8(T0_hi, data_0); \
prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T1_hi, data_1)); \
LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \
y_hi = _mm_xor_si128(y_hi, x_hi); \
_mm_storeu_si128(y_ptr + 2, y_hi); \
data_0 = _mm_and_si128(y_hi, clr_mask); \
data_1 = _mm_srli_epi64(y_hi, 4); \
data_1 = _mm_and_si128(data_1, clr_mask); \
prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T2_lo, data_0)); \
prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T3_lo, data_1)); \
prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T2_hi, data_0)); \
prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T3_hi, data_1)); \
x_lo = _mm_xor_si128(prod_lo, x_lo); \
_mm_storeu_si128(x_ptr, x_lo); \
x_hi = _mm_xor_si128(prod_hi, x_hi); \
_mm_storeu_si128(x_ptr + 2, x_hi); }
LEO_IFFTB_128(x16 + 3, y16 + 3);
LEO_IFFTB_128(x16 + 2, y16 + 2);
LEO_IFFTB_128(x16 + 1, y16 + 1);
LEO_IFFTB_128(x16, y16);
x16 += 4, y16 += 4;
@ -1105,13 +1204,21 @@ skip_body:
//------------------------------------------------------------------------------
// ErrorBitfield
#ifdef LEO_SCHEDULE_OPT
#ifdef LEO_ERROR_BITFIELD_OPT
// Used in decoding to decide which final FFT operations to perform
class ErrorBitfield
{
static const unsigned kWordMips = 5;
static const unsigned kWords = kOrder / 64;
uint64_t Words[7][kWords] = {};
uint64_t Words[kWordMips][kWords] = {};
static const unsigned kBigMips = 5;
static const unsigned kBigWords = (kWords + 63) / 64;
uint64_t BigWords[kBigMips][kBigWords] = {};
static const unsigned kBiggestMips = 5;
uint64_t BiggestWords[kBiggestMips] = {};
public:
LEO_FORCE_INLINE void Set(unsigned i)
@ -1123,8 +1230,18 @@ public:
LEO_FORCE_INLINE bool IsNeeded(unsigned mip_level, unsigned bit) const
{
if (mip_level >= 8)
if (mip_level >= 16)
return true;
if (mip_level >= 11)
{
bit /= 4096;
return 0 != (BiggestWords[mip_level - 11] & ((uint64_t)1 << bit));
}
if (mip_level >= 6)
{
bit /= 64;
return 0 != (BigWords[mip_level - 6][bit / 64] & ((uint64_t)1 << (bit % 64)));
}
return 0 != (Words[mip_level - 1][bit / 64] & ((uint64_t)1 << (bit % 64)));
}
};
@ -1142,33 +1259,57 @@ void ErrorBitfield::Prepare()
// First mip level is for final layer of FFT: pairs of data
for (unsigned i = 0; i < kWords; ++i)
{
const uint64_t w0 = Words[0][i];
const uint64_t hi2lo0 = w0 | ((w0 & kHiMasks[0]) >> 1);
const uint64_t lo2hi0 = ((w0 & (kHiMasks[0] >> 1)) << 1);
Words[0][i] = hi2lo0 | lo2hi0;
uint64_t w_i = Words[0][i];
const uint64_t hi2lo0 = w_i | ((w_i & kHiMasks[0]) >> 1);
const uint64_t lo2hi0 = ((w_i & (kHiMasks[0] >> 1)) << 1);
Words[0][i] = w_i = hi2lo0 | lo2hi0;
for (unsigned j = 1, bits = 2; j < 5; ++j, bits <<= 1)
for (unsigned j = 1, bits = 2; j < kWordMips; ++j, bits <<= 1)
{
const uint64_t w_j = Words[j - 1][i];
const uint64_t hi2lo_j = w_j | ((w_j & kHiMasks[j]) >> bits);
const uint64_t lo2hi_j = ((w_j & (kHiMasks[j] >> bits)) << bits);
Words[j][i] = hi2lo_j | lo2hi_j;
const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j]) >> bits);
const uint64_t lo2hi_j = ((w_i & (kHiMasks[j] >> bits)) << bits);
Words[j][i] = w_i = hi2lo_j | lo2hi_j;
}
}
for (unsigned i = 0; i < kWords; ++i)
for (unsigned i = 0; i < kBigWords; ++i)
{
uint64_t w = Words[4][i];
w |= w >> 32;
w |= w << 32;
Words[5][i] = w;
uint64_t w_i = 0;
uint64_t bit = 1;
const uint64_t* src = &Words[4][i * 64];
for (unsigned j = 0; j < 64; ++j, bit <<= 1)
{
const uint64_t w = src[j];
w_i |= (w | (w >> 32) | (w << 32)) & bit;
}
BigWords[0][i] = w_i;
for (unsigned j = 1, bits = 1; j < kBigMips; ++j, bits <<= 1)
{
const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j - 1]) >> bits);
const uint64_t lo2hi_j = ((w_i & (kHiMasks[j - 1] >> bits)) << bits);
BigWords[j][i] = w_i = hi2lo_j | lo2hi_j;
}
}
for (unsigned i = 0; i < kWords; i += 2)
Words[6][i] = Words[6][i + 1] = Words[5][i] | Words[5][i + 1];
uint64_t w_i = 0;
uint64_t bit = 1;
for (unsigned j = 0; j < kBigWords; ++j, bit <<= 1)
{
const uint64_t w = BigWords[kBigMips - 1][j];
w_i |= (w | (w >> 32) | (w << 32)) & bit;
}
BiggestWords[0] = w_i;
for (unsigned j = 1, bits = 1; j < kBiggestMips; ++j, bits <<= 1)
{
const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j - 1]) >> bits);
const uint64_t lo2hi_j = ((w_i & (kHiMasks[j - 1] >> bits)) << bits);
BiggestWords[j] = w_i = hi2lo_j | lo2hi_j;
}
}
#endif // LEO_SCHEDULE_OPT
#endif // LEO_ERROR_BITFIELD_OPT
//------------------------------------------------------------------------------
@ -1186,9 +1327,9 @@ void ReedSolomonDecode(
{
// Fill in error locations
#ifdef LEO_SCHEDULE_OPT
#ifdef LEO_ERROR_BITFIELD_OPT
ErrorBitfield ErrorBits;
#endif // LEO_SCHEDULE_OPT
#endif // LEO_ERROR_BITFIELD_OPT
ffe_t ErrorLocations[kOrder] = {};
for (unsigned i = 0; i < recovery_count; ++i)
@ -1201,15 +1342,15 @@ void ReedSolomonDecode(
if (!original[i])
{
ErrorLocations[i + m] = 1;
#ifdef LEO_SCHEDULE_OPT
#ifdef LEO_ERROR_BITFIELD_OPT
ErrorBits.Set(i + m);
#endif // LEO_SCHEDULE_OPT
#endif // LEO_ERROR_BITFIELD_OPT
}
}
#ifdef LEO_SCHEDULE_OPT
#ifdef LEO_ERROR_BITFIELD_OPT
ErrorBits.Prepare();
#endif // LEO_SCHEDULE_OPT
#endif // LEO_ERROR_BITFIELD_OPT
// Evaluate error locator polynomial
@ -1291,10 +1432,10 @@ void ReedSolomonDecode(
for (unsigned j = 0; j < n; j += range)
#endif
{
#ifdef LEO_SCHEDULE_OPT
#ifdef LEO_ERROR_BITFIELD_OPT
if (!ErrorBits.IsNeeded(mip_level, j))
continue;
#endif // LEO_SCHEDULE_OPT
#endif // LEO_ERROR_BITFIELD_OPT
VectorFFTButterfly(
buffer_bytes,

View File

@ -214,6 +214,23 @@ static ffe_t LogLUT[kOrder];
static ffe_t ExpLUT[kOrder];
// Returns a * Log(b)
static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
{
/*
Note that this operation is not a normal multiplication in a finite
field because the right operand is already a logarithm. This is done
because it moves K table lookups from the Decode() method into the
initialization step that is less performance critical. The LogWalsh[]
table below contains precalculated logarithms so it is easier to do
all the other multiplies in that form as well.
*/
if (a == 0)
return 0;
return ExpLUT[AddMod(LogLUT[a], log_b)];
}
// Initialize LogLUT[], ExpLUT[]
static void InitializeLogarithmTables()
{
@ -257,30 +274,20 @@ static void InitializeLogarithmTables()
//------------------------------------------------------------------------------
// Multiplies
/*
The multiplication algorithm used follows the approach outlined in {4}.
Specifically section 6 outlines the algorithm used here for 8-bit fields.
*/
struct {
LEO_ALIGNED LEO_M128 Value[kBits / 4];
LEO_ALIGNED LEO_M128 Value[2];
} static Multiply128LUT[kOrder];
#if defined(LEO_TRY_AVX2)
struct {
LEO_ALIGNED LEO_M256 Value[kBits / 4];
LEO_ALIGNED LEO_M256 Value[2];
} static Multiply256LUT[kOrder];
#endif // LEO_TRY_AVX2
// Returns a * Log(b)
static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
{
/*
Note that this operation is not a normal multiplication in a finite
field because the right operand is already a logarithm. This is done
because it moves K table lookups from the Decode() method into the
initialization step that is less performance critical. The LogWalsh[]
table below contains precalculated logarithms so it is easier to do
all the other multiplies in that form as well.
*/
if (a == 0)
return 0;
return ExpLUT[AddMod(LogLUT[a], log_b)];
}
void InitializeMultiplyTables()
{
@ -288,11 +295,11 @@ void InitializeMultiplyTables()
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
{
// For each 4 bits of the finite field width in bits:
for (unsigned i = 0, shift = 0; i < kBits / 4; ++i, shift += 4)
for (unsigned i = 0, shift = 0; i < 2; ++i, shift += 4)
{
// Construct 16 entry LUT for PSHUFB
ffe_t lut[16];
for (uint8_t x = 0; x < 16; ++x)
uint8_t lut[16];
for (ffe_t x = 0; x < 16; ++x)
lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));
// Store in 128-bit wide table
@ -1013,7 +1020,7 @@ skip_body:
//------------------------------------------------------------------------------
// ErrorBitfield
#ifdef LEO_SCHEDULE_OPT
#ifdef LEO_ERROR_BITFIELD_OPT
// Used in decoding to decide which final FFT operations to perform
class ErrorBitfield
@ -1050,17 +1057,16 @@ void ErrorBitfield::Prepare()
// First mip level is for final layer of FFT: pairs of data
for (unsigned i = 0; i < kWords; ++i)
{
const uint64_t w0 = Words[0][i];
const uint64_t hi2lo0 = w0 | ((w0 & kHiMasks[0]) >> 1);
const uint64_t lo2hi0 = ((w0 & (kHiMasks[0] >> 1)) << 1);
Words[0][i] = hi2lo0 | lo2hi0;
uint64_t w_i = Words[0][i];
const uint64_t hi2lo0 = w_i | ((w_i & kHiMasks[0]) >> 1);
const uint64_t lo2hi0 = ((w_i & (kHiMasks[0] >> 1)) << 1);
Words[0][i] = w_i = hi2lo0 | lo2hi0;
for (unsigned j = 1, bits = 2; j < 5; ++j, bits <<= 1)
{
const uint64_t w_j = Words[j - 1][i];
const uint64_t hi2lo_j = w_j | ((w_j & kHiMasks[j]) >> bits);
const uint64_t lo2hi_j = ((w_j & (kHiMasks[j] >> bits)) << bits);
Words[j][i] = hi2lo_j | lo2hi_j;
const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j]) >> bits);
const uint64_t lo2hi_j = ((w_i & (kHiMasks[j] >> bits)) << bits);
Words[j][i] = w_i = hi2lo_j | lo2hi_j;
}
}
@ -1076,7 +1082,7 @@ void ErrorBitfield::Prepare()
Words[6][i] = Words[6][i + 1] = Words[5][i] | Words[5][i + 1];
}
#endif // LEO_SCHEDULE_OPT
#endif // LEO_ERROR_BITFIELD_OPT
//------------------------------------------------------------------------------
@ -1094,9 +1100,9 @@ void ReedSolomonDecode(
{
// Fill in error locations
#ifdef LEO_SCHEDULE_OPT
#ifdef LEO_ERROR_BITFIELD_OPT
ErrorBitfield ErrorBits;
#endif // LEO_SCHEDULE_OPT
#endif // LEO_ERROR_BITFIELD_OPT
ffe_t ErrorLocations[kOrder] = {};
for (unsigned i = 0; i < recovery_count; ++i)
@ -1109,15 +1115,15 @@ void ReedSolomonDecode(
if (!original[i])
{
ErrorLocations[i + m] = 1;
#ifdef LEO_SCHEDULE_OPT
#ifdef LEO_ERROR_BITFIELD_OPT
ErrorBits.Set(i + m);
#endif // LEO_SCHEDULE_OPT
#endif // LEO_ERROR_BITFIELD_OPT
}
}
#ifdef LEO_SCHEDULE_OPT
#ifdef LEO_ERROR_BITFIELD_OPT
ErrorBits.Prepare();
#endif // LEO_SCHEDULE_OPT
#endif // LEO_ERROR_BITFIELD_OPT
// Evaluate error locator polynomial
@ -1199,10 +1205,10 @@ void ReedSolomonDecode(
for (unsigned j = 0; j < n; j += range)
#endif
{
#ifdef LEO_SCHEDULE_OPT
#ifdef LEO_ERROR_BITFIELD_OPT
if (!ErrorBits.IsNeeded(mip_level, j))
continue;
#endif // LEO_SCHEDULE_OPT
#endif // LEO_ERROR_BITFIELD_OPT
VectorFFTButterfly(
buffer_bytes,

View File

@ -308,7 +308,8 @@ This library implements an MDS erasure code introduced in this paper:
~~~
{1} S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
"Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
"Novel Polynomial Basis with Fast Fourier Transform
and Its Application to Reed-Solomon Erasure Codes"
IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
~~~
@ -318,10 +319,16 @@ This library implements an MDS erasure code introduced in this paper:
~~~
~~~
{3} Sian-Jheng Lin, Wei-Ho Chung, An Efficient (n, k) Information
Dispersal Algorithm for High Code Rate System over Fermat Fields,
{3} Sian-Jheng Lin, Wei-Ho Chung, "An Efficient (n, k) Information
Dispersal Algorithm for High Code Rate System over Fermat Fields,"
IEEE Commun. Lett., vol.16, no.12, pp. 2036-2039, Dec. 2012.
~~~
~~~
{4} Plank, J. S., Greenan, K. M., Miller, E. L., "Screaming fast Galois Field
arithmetic using Intel SIMD instructions." In: FAST-2013: 11th Usenix
Conference on File and Storage Technologies, San Jose, 2013
~~~
Some papers are mirrored in the /docs/ folder.

BIN
docs/plank-fast13.pdf Normal file

Binary file not shown.

View File

@ -42,25 +42,29 @@
Bulat Ziganshin <bulat.ziganshin@gmail.com> : Author of FastECC
Yutaka Sawada <tenfon@outlook.jp> : Author of MultiPar
References:
{1} S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
"Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
"Novel Polynomial Basis with Fast Fourier Transform
and Its Application to Reed-Solomon Erasure Codes"
IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
http://ct.ee.ntust.edu.tw/it2016-2.pdf
{2} D. G. Cantor, "On arithmetical algorithms over finite fields",
Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989.
{3} Sian-Jheng Lin, Wei-Ho Chung, An Efficient (n, k) Information
Dispersal Algorithm for High Code Rate System over Fermat Fields,
{3} Sian-Jheng Lin, Wei-Ho Chung, "An Efficient (n, k) Information
Dispersal Algorithm for High Code Rate System over Fermat Fields,"
IEEE Commun. Lett., vol.16, no.12, pp. 2036-2039, Dec. 2012.
{4} Plank, J. S., Greenan, K. M., Miller, E. L., "Screaming fast Galois Field
arithmetic using Intel SIMD instructions." In: FAST-2013: 11th Usenix
Conference on File and Storage Technologies, San Jose, 2013
*/
/*
TODO:
+ New 16-bit Muladd inner loops
+ Benchmarks for large data!
+ Benchmarks for large data!
+ Add multi-threading to split up long parallelizable calculations
+ Final benchmarks!
+ Release version 1
@ -76,7 +80,7 @@
// Enable 8-bit or 16-bit fields
#define LEO_HAS_FF8
//#define LEO_HAS_FF16
#define LEO_HAS_FF16
// Tweak if the functions are exported or statically linked
//#define LEO_DLL /* Defined when building/linking as DLL */

View File

@ -42,15 +42,15 @@ using namespace std;
struct TestParameters
{
#ifdef LEO_HAS_FF16
unsigned original_count = 1000; // under 65536
unsigned recovery_count = 100; // under 65536 - original_count
unsigned original_count = 677; // under 65536
unsigned recovery_count = 487; // under 65536 - original_count
#else
unsigned original_count = 128; // under 65536
unsigned recovery_count = 128; // under 65536 - original_count
#endif
unsigned buffer_bytes = 64000; // multiple of 64 bytes
unsigned loss_count = 128; // some fraction of original_count
unsigned seed = 0;
unsigned buffer_bytes = 640; // multiple of 64 bytes
unsigned loss_count = 2; // some fraction of original_count
unsigned seed = 2;
bool multithreaded = true;
};
@ -535,10 +535,12 @@ static bool BasicTest(const TestParameters& params)
t_mem_free.EndCall();
}
#if 0
t_mem_alloc.Print(kTrials);
t_leo_encode.Print(kTrials);
t_leo_decode.Print(kTrials);
t_mem_free.Print(kTrials);
#endif
float encode_input_MBPS = total_bytes * kTrials / (float)(t_leo_encode.TotalUsec);
float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count * kTrials / (float)(t_leo_encode.TotalUsec);
@ -784,6 +786,7 @@ int main(int argc, char **argv)
#endif
TestParameters params;
PCGRandom prng;
if (argc >= 2)
params.original_count = atoi(argv[1]);
@ -804,6 +807,20 @@ int main(int argc, char **argv)
if (!BasicTest(params))
goto Failed;
prng.Seed(params.seed, 7);
for (;; ++params.seed)
{
params.original_count = prng.Next() % 32768;
params.recovery_count = prng.Next() % params.original_count + 1;
params.loss_count = prng.Next() % params.recovery_count + 1;
cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;
if (!BasicTest(params))
goto Failed;
}
#ifdef LEO_BENCH_ALL_256_PARAMS
for (unsigned original_count = 1; original_count <= 256; ++original_count)
{
for (unsigned recovery_count = 1; recovery_count <= original_count; ++recovery_count)
@ -818,6 +835,7 @@ int main(int argc, char **argv)
goto Failed;
}
}
#endif
Failed:
getchar();

View File

@ -33,7 +33,7 @@
#include <stdlib.h>
#define LEO_SHORT_FIELD
//#define LEO_SHORT_FIELD
//#define LEO_EXPERIMENT_EXTRA_XOR
//#define LEO_EXPERIMENT_EXTRA_MULS
#define LEO_EXPERIMENT_CANTOR_BASIS
@ -603,8 +603,8 @@ int main(int argc, char **argv)
const unsigned input_count = 32;
const unsigned recovery_count = 16;
#else // LEO_SHORT_FIELD
const unsigned input_count = 10000;
const unsigned recovery_count = 2000;
const unsigned input_count = 32768;
const unsigned recovery_count = 32768;
#endif // LEO_SHORT_FIELD
test(input_count, recovery_count, seed);