FF16 is mostly working still some bugs

2017-05-30 01:37:27 -07:00 · 2017-05-30 01:37:27 -07:00 · 2e1007c4aa
parent 6e7dfea7c9
commit 2e1007c4aa
8 changed files with 386 additions and 207 deletions
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@ -150,7 +150,7 @@
 // Constants
 // Unroll inner loops 4 times
-#define LEO_USE_VECTOR4_OPT
+//#define LEO_USE_VECTOR4_OPT
 // Define this to enable the optimized version of FWHT()
 #define LEO_FWHT_OPT
@ -158,6 +158,9 @@
 // Avoid scheduling reduced FFT operations that are unneeded
 #define LEO_SCHEDULE_OPT
 // Avoid calculating final FFT values in decoder using bitfield
 //#define LEO_ERROR_BITFIELD_OPT
 // Optimize M=1 case
 #define LEO_M1_OPT
--- a/LeopardFF16.cpp
+++ b/LeopardFF16.cpp
@ -75,8 +75,6 @@ static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
 //------------------------------------------------------------------------------
 // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
 #if defined(LEO_FWHT_OPT)
 // {a, b} = {a + b, a - b} (Mod Q)
 static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
 {
@ -86,6 +84,8 @@ static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b
    b = dif;
 }
 #if defined(LEO_FWHT_OPT)
 static LEO_FORCE_INLINE void FWHT_4(ffe_t* data)
 {
    ffe_t t0 = data[0];
@ -304,6 +304,23 @@ static ffe_t LogLUT[kOrder];
 static ffe_t ExpLUT[kOrder];
 // Returns a * Log(b)
 static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
 {
    /*
        Note that this operation is not a normal multiplication in a finite
        field because the right operand is already a logarithm.  This is done
        because it moves K table lookups from the Decode() method into the
        initialization step that is less performance critical.  The LogWalsh[]
        table below contains precalculated logarithms so it is easier to do
        all the other multiplies in that form as well.
    */
    if (a == 0)
        return 0;
    return ExpLUT[AddMod(LogLUT[a], log_b)];
 }
 // Initialize LogLUT[], ExpLUT[]
 static void InitializeLogarithmTables()
 {
@ -344,63 +361,59 @@ static void InitializeLogarithmTables()
 //------------------------------------------------------------------------------
 // Multiplies
 /*
    The multiplication algorithm used follows the approach outlined in {4}.
    Specifically section 7 outlines the algorithm used here for 16-bit fields.
    I use the ALTMAP memory layout since I do not need to convert in/out of it.
 */
 struct {
-    LEO_ALIGNED LEO_M128 Value[kBits / 4];
+    LEO_ALIGNED LEO_M128 Lo[4];
    LEO_ALIGNED LEO_M128 Hi[4];
 } static Multiply128LUT[kOrder];
 #if defined(LEO_TRY_AVX2)
 struct {
-    LEO_ALIGNED LEO_M256 Value[kBits / 4];
+    LEO_ALIGNED LEO_M256 Lo[4];
    LEO_ALIGNED LEO_M256 Hi[4];
 } static Multiply256LUT[kOrder];
 #endif // LEO_TRY_AVX2
 // Returns a * Log(b)
 static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
 {
    /*
        Note that this operation is not a normal multiplication in a finite
        field because the right operand is already a logarithm.  This is done
        because it moves K table lookups from the Decode() method into the
        initialization step that is less performance critical.  The LogWalsh[]
        table below contains precalculated logarithms so it is easier to do
        all the other multiplies in that form as well.
    */
    if (a == 0)
        return 0;
    return ExpLUT[AddMod(LogLUT[a], log_b)];
 }
 void InitializeMultiplyTables()
 {
    // For each value we could multiply by:
    for (unsigned log_m = 0; log_m < kOrder; ++log_m)
    {
-        uint8_t table0[16], table1[16], table2[16], table3[16];
+        // For each 4 bits of the finite field width in bits:
-
+        for (unsigned i = 0, shift = 0; i < 4; ++i, shift += 4)
        for (uint8_t x = 0; x < 16; ++x)
        {
-            table0[x] = MultiplyLog(x, static_cast<ffe_t>(log_m));
+            // Construct 16 entry LUT for PSHUFB
-            table1[x] = MultiplyLog(x << 4, static_cast<ffe_t>(log_m));
+            uint8_t prod_lo[16], prod_hi[16];
-            table2[x] = MultiplyLog(x << 8, static_cast<ffe_t>(log_m));
+            for (ffe_t x = 0; x < 16; ++x)
-            table3[x] = MultiplyLog(x << 12, static_cast<ffe_t>(log_m));
+            {
-        }
+                const ffe_t prod = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));
                prod_lo[x] = static_cast<uint8_t>(prod);
                prod_hi[x] = static_cast<uint8_t>(prod >> 8);
            }
-        const LEO_M128 value0 = _mm_loadu_si128((LEO_M128*)table0);
+            const LEO_M128 value_lo = _mm_loadu_si128((LEO_M128*)prod_lo);
-        const LEO_M128 value1 = _mm_loadu_si128((LEO_M128*)table1);
+            const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi);
        const LEO_M128 value2 = _mm_loadu_si128((LEO_M128*)table2);
        const LEO_M128 value3 = _mm_loadu_si128((LEO_M128*)table3);
-        _mm_storeu_si128(&Multiply128LUT[log_m].Value[0], value0);
+            // Store in 128-bit wide table
-        _mm_storeu_si128(&Multiply128LUT[log_m].Value[1], value1);
+            _mm_storeu_si128(&Multiply128LUT[log_m].Lo[i], value_lo);
            _mm_storeu_si128(&Multiply128LUT[log_m].Hi[i], value_hi);
            // Store in 256-bit wide table
 #if defined(LEO_TRY_AVX2)
-        if (CpuHasAVX2)
+            if (CpuHasAVX2)
-        {
+            {
-            _mm256_storeu_si256(&Multiply256LUT[log_m].Value[0],
+                _mm256_storeu_si256(&Multiply256LUT[log_m].Lo[i],
-                _mm256_broadcastsi128_si256(table_lo));
+                    _mm256_broadcastsi128_si256(value_lo));
-            _mm256_storeu_si256(&Multiply256LUT[log_m].Value[1],
+                _mm256_storeu_si256(&Multiply256LUT[log_m].Hi[i],
-                _mm256_broadcastsi128_si256(table_hi));
+                    _mm256_broadcastsi128_si256(value_hi));
-        }
+            }
 #endif // LEO_TRY_AVX2
        }
    }
 }
@ -412,8 +425,17 @@ void mul_mem(
 #if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
+#define LEO_MUL_TABLES_256() \
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
+        const LEO_M256 T0_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[0]); \
        const LEO_M256 T1_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[1]); \
        const LEO_M256 T2_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[2]); \
        const LEO_M256 T3_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[3]); \
        const LEO_M256 T0_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[0]); \
        const LEO_M256 T1_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[1]); \
        const LEO_M256 T2_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[2]); \
        const LEO_M256 T3_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[3]);
        LEO_MUL_TABLES_256();
        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
@ -423,15 +445,25 @@ void mul_mem(
        do
        {
 #define LEO_MUL_256(x_ptr, y_ptr) { \
-            LEO_M256 data = _mm256_loadu_si256(y_ptr); \
+	        const LEO_M256 A_lo = _mm256_loadu_si256(y_ptr); \
-            LEO_M256 lo = _mm256_and_si256(data, clr_mask); \
+            const LEO_M256 A_hi = _mm256_loadu_si256(y_ptr + 1); \
-            lo = _mm256_shuffle_epi8(table_lo_y, lo); \
+            LEO_M256 data_0 = _mm256_and_si256(A_lo, clr_mask); \
-            LEO_M256 hi = _mm256_srli_epi64(data, 4); \
+            LEO_M256 data_1 = _mm256_srli_epi64(A_lo, 4); \
-            hi = _mm256_and_si256(hi, clr_mask); \
+            data_1 = _mm256_and_si256(data_1, clr_mask); \
-            hi = _mm256_shuffle_epi8(table_hi_y, hi); \
+            LEO_M256 data_2 = _mm256_and_si256(A_hi, clr_mask); \
-            _mm256_storeu_si256(x_ptr, _mm256_xor_si256(lo, hi)); }
+            LEO_M256 data_3 = _mm256_srli_epi64(A_hi, 4); \
            data_3 = _mm256_and_si256(data_3, clr_mask); \
            LEO_M256 output_lo = _mm256_shuffle_epi8(T0_lo, data_0); \
            output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \
            output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T2_lo, data_2)); \
            output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T3_lo, data_3)); \
            LEO_M256 output_hi = _mm256_shuffle_epi8(T0_hi, data_0); \
            output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \
            output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T2_hi, data_2)); \
            output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T3_hi, data_3)); \
            _mm256_storeu_si256(x_ptr, output_lo); \
            _mm256_storeu_si256(x_ptr + 1, output_hi); }
            LEO_MUL_256(x32 + 1, y32 + 1);
            LEO_MUL_256(x32, y32);
            y32 += 2, x32 += 2;
@ -442,8 +474,17 @@ void mul_mem(
    }
 #endif // LEO_TRY_AVX2
-    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
+#define LEO_MUL_TABLES_128() \
-    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
+    const LEO_M128 T0_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[0]); \
    const LEO_M128 T1_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[1]); \
    const LEO_M128 T2_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[2]); \
    const LEO_M128 T3_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[3]); \
    const LEO_M128 T0_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[0]); \
    const LEO_M128 T1_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[1]); \
    const LEO_M128 T2_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[2]); \
    const LEO_M128 T3_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[3]);
    LEO_MUL_TABLES_128();
    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
@ -453,16 +494,25 @@ void mul_mem(
    do
    {
 #define LEO_MUL_128(x_ptr, y_ptr) { \
-        LEO_M128 data = _mm_loadu_si128(y_ptr); \
+	        const LEO_M128 A_lo = _mm_loadu_si128(y_ptr); \
-        LEO_M128 lo = _mm_and_si128(data, clr_mask); \
+            const LEO_M128 A_hi = _mm_loadu_si128(y_ptr + 2); \
-        lo = _mm_shuffle_epi8(table_lo_y, lo); \
+            LEO_M128 data_0 = _mm_and_si128(A_lo, clr_mask); \
-        LEO_M128 hi = _mm_srli_epi64(data, 4); \
+            LEO_M128 data_1 = _mm_srli_epi64(A_lo, 4); \
-        hi = _mm_and_si128(hi, clr_mask); \
+            data_1 = _mm_and_si128(data_1, clr_mask); \
-        hi = _mm_shuffle_epi8(table_hi_y, hi); \
+            LEO_M128 data_2 = _mm_and_si128(A_hi, clr_mask); \
-        _mm_storeu_si128(x_ptr, _mm_xor_si128(lo, hi)); }
+            LEO_M128 data_3 = _mm_srli_epi64(A_hi, 4); \
            data_3 = _mm_and_si128(data_3, clr_mask); \
            LEO_M128 output_lo = _mm_shuffle_epi8(T0_lo, data_0); \
            output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T1_lo, data_1)); \
            output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T2_lo, data_2)); \
            output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T3_lo, data_3)); \
            LEO_M128 output_hi = _mm_shuffle_epi8(T0_hi, data_0); \
            output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T1_hi, data_1)); \
            output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T2_hi, data_2)); \
            output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T3_hi, data_3)); \
            _mm_storeu_si128(x_ptr, output_lo); \
            _mm_storeu_si128(x_ptr + 2, output_hi); }
        LEO_MUL_128(x16 + 3, y16 + 3);
        LEO_MUL_128(x16 + 2, y16 + 2);
        LEO_MUL_128(x16 + 1, y16 + 1);
        LEO_MUL_128(x16, y16);
        x16 += 4, y16 += 4;
@ -482,8 +532,7 @@ void fft_butterfly(
 #if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
+        LEO_MUL_TABLES_256();
        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
@ -493,19 +542,33 @@ void fft_butterfly(
        do
        {
 #define LEO_FFTB_256(x_ptr, y_ptr) { \
-            LEO_M256 y_data = _mm256_loadu_si256(y_ptr); \
+	        LEO_M256 y_lo = _mm256_loadu_si256(y_ptr); \
-            LEO_M256 lo = _mm256_and_si256(y_data, clr_mask); \
+            LEO_M256 data_0 = _mm256_and_si256(y_lo, clr_mask); \
-            lo = _mm256_shuffle_epi8(table_lo_y, lo); \
+            LEO_M256 data_1 = _mm256_srli_epi64(y_lo, 4); \
-            LEO_M256 hi = _mm256_srli_epi64(y_data, 4); \
+            data_1 = _mm256_and_si256(data_1, clr_mask); \
-            hi = _mm256_and_si256(hi, clr_mask); \
+            LEO_M256 prod_lo = _mm256_shuffle_epi8(T0_lo, data_0); \
-            hi = _mm256_shuffle_epi8(table_hi_y, hi); \
+            prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \
-            LEO_M256 x_data = _mm256_loadu_si256(x_ptr); \
+            LEO_M256 prod_hi = _mm256_shuffle_epi8(T0_hi, data_0); \
-            x_data = _mm256_xor_si256(x_data, _mm256_xor_si256(lo, hi)); \
+            prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \
-            y_data = _mm256_xor_si256(y_data, x_data); \
+            LEO_M256 y_hi = _mm256_loadu_si256(y_ptr + 1); \
-            _mm256_storeu_si256(x_ptr, x_data); \
+            data_0 = _mm256_and_si256(y_hi, clr_mask); \
-            _mm256_storeu_si256(y_ptr, y_data); }
+            data_1 = _mm256_srli_epi64(y_hi, 4); \
            data_1 = _mm256_and_si256(data_1, clr_mask); \
            prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T2_lo, data_0)); \
            prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T3_lo, data_1)); \
            prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T2_hi, data_0)); \
            prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T3_hi, data_1)); \
 	        LEO_M256 x_lo = _mm256_loadu_si256(x_ptr); \
            LEO_M256 x_hi = _mm256_loadu_si256(x_ptr + 1); \
            x_lo = _mm256_xor_si256(prod_lo, x_lo); \
            _mm256_storeu_si256(x_ptr, x_lo); \
            x_hi = _mm256_xor_si256(prod_hi, x_hi); \
            _mm256_storeu_si256(x_ptr + 1, x_hi); \
            y_lo = _mm256_xor_si256(y_lo, x_lo); \
            _mm256_storeu_si256(y_ptr, y_lo); \
            y_hi = _mm256_xor_si256(y_hi, x_hi); \
            _mm256_storeu_si256(y_ptr + 1, y_hi); }
            LEO_FFTB_256(x32 + 1, y32 + 1);
            LEO_FFTB_256(x32, y32);
            y32 += 2, x32 += 2;
@ -516,8 +579,7 @@ void fft_butterfly(
    }
 #endif // LEO_TRY_AVX2
-    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
+    LEO_MUL_TABLES_128();
    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
@ -527,20 +589,33 @@ void fft_butterfly(
    do
    {
 #define LEO_FFTB_128(x_ptr, y_ptr) { \
-        LEO_M128 y_data = _mm_loadu_si128(y_ptr); \
+	        LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
-        LEO_M128 lo = _mm_and_si128(y_data, clr_mask); \
+            LEO_M128 data_0 = _mm_and_si128(y_lo, clr_mask); \
-        lo = _mm_shuffle_epi8(table_lo_y, lo); \
+            LEO_M128 data_1 = _mm_srli_epi64(y_lo, 4); \
-        LEO_M128 hi = _mm_srli_epi64(y_data, 4); \
+            data_1 = _mm_and_si128(data_1, clr_mask); \
-        hi = _mm_and_si128(hi, clr_mask); \
+            LEO_M128 prod_lo = _mm_shuffle_epi8(T0_lo, data_0); \
-        hi = _mm_shuffle_epi8(table_hi_y, hi); \
+            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T1_lo, data_1)); \
-        LEO_M128 x_data = _mm_loadu_si128(x_ptr); \
+            LEO_M128 prod_hi = _mm_shuffle_epi8(T0_hi, data_0); \
-        x_data = _mm_xor_si128(x_data, _mm_xor_si128(lo, hi)); \
+            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T1_hi, data_1)); \
-        y_data = _mm_xor_si128(y_data, x_data); \
+            LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \
-        _mm_storeu_si128(x_ptr, x_data); \
+            data_0 = _mm_and_si128(y_hi, clr_mask); \
-        _mm_storeu_si128(y_ptr, y_data); }
+            data_1 = _mm_srli_epi64(y_hi, 4); \
            data_1 = _mm_and_si128(data_1, clr_mask); \
            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T2_lo, data_0)); \
            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T3_lo, data_1)); \
            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T2_hi, data_0)); \
            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T3_hi, data_1)); \
 	        LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
            LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
            x_lo = _mm_xor_si128(prod_lo, x_lo); \
            _mm_storeu_si128(x_ptr, x_lo); \
            x_hi = _mm_xor_si128(prod_hi, x_hi); \
            _mm_storeu_si128(x_ptr + 2, x_hi); \
            y_lo = _mm_xor_si128(y_lo, x_lo); \
            _mm_storeu_si128(y_ptr, y_lo); \
            y_hi = _mm_xor_si128(y_hi, x_hi); \
            _mm_storeu_si128(y_ptr + 2, y_hi); }
        LEO_FFTB_128(x16 + 3, y16 + 3);
        LEO_FFTB_128(x16 + 2, y16 + 2);
        LEO_FFTB_128(x16 + 1, y16 + 1);
        LEO_FFTB_128(x16, y16);
        x16 += 4, y16 += 4;
@ -600,8 +675,7 @@ void fft_butterfly4(
    }
 #endif // LEO_TRY_AVX2
-    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
+    LEO_MUL_TABLES_128();
    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
@ -657,8 +731,7 @@ void ifft_butterfly(
 #if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
+        LEO_MUL_TABLES_256();
        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
@ -668,19 +741,33 @@ void ifft_butterfly(
        do
        {
 #define LEO_IFFTB_256(x_ptr, y_ptr) { \
-            LEO_M256 x_data = _mm256_loadu_si256(x_ptr); \
+	        LEO_M256 x_lo = _mm256_loadu_si256(x_ptr); \
-            LEO_M256 y_data = _mm256_loadu_si256(y_ptr); \
+	        LEO_M256 y_lo = _mm256_loadu_si256(y_ptr); \
-            y_data = _mm256_xor_si256(y_data, x_data); \
+            y_lo = _mm256_xor_si256(y_lo, x_lo); \
-            _mm256_storeu_si256(y_ptr, y_data); \
+            _mm256_storeu_si256(y_ptr, y_lo); \
-            LEO_M256 lo = _mm256_and_si256(y_data, clr_mask); \
+            LEO_M256 data_0 = _mm256_and_si256(y_lo, clr_mask); \
-            lo = _mm256_shuffle_epi8(table_lo_y, lo); \
+            LEO_M256 data_1 = _mm256_srli_epi64(y_lo, 4); \
-            LEO_M256 hi = _mm256_srli_epi64(y_data, 4); \
+            data_1 = _mm256_and_si256(data_1, clr_mask); \
-            hi = _mm256_and_si256(hi, clr_mask); \
+            LEO_M256 prod_lo = _mm256_shuffle_epi8(T0_lo, data_0); \
-            hi = _mm256_shuffle_epi8(table_hi_y, hi); \
+            prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \
-            x_data = _mm256_xor_si256(x_data, _mm256_xor_si256(lo, hi)); \
+            LEO_M256 prod_hi = _mm256_shuffle_epi8(T0_hi, data_0); \
-            _mm256_storeu_si256(x_ptr, x_data); }
+            prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \
            LEO_M256 x_hi = _mm256_loadu_si256(x_ptr + 1); \
            LEO_M256 y_hi = _mm256_loadu_si256(y_ptr + 1); \
            y_hi = _mm256_xor_si256(y_hi, x_hi); \
            _mm256_storeu_si256(y_ptr + 1, y_hi); \
            data_0 = _mm256_and_si256(y_hi, clr_mask); \
            data_1 = _mm256_srli_epi64(y_hi, 4); \
            data_1 = _mm256_and_si256(data_1, clr_mask); \
            prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T2_lo, data_0)); \
            prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T3_lo, data_1)); \
            prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T2_hi, data_0)); \
            prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T3_hi, data_1)); \
            x_lo = _mm256_xor_si256(prod_lo, x_lo); \
            _mm256_storeu_si256(x_ptr, x_lo); \
            x_hi = _mm256_xor_si256(prod_hi, x_hi); \
            _mm256_storeu_si256(x_ptr + 1, x_hi); }
            LEO_IFFTB_256(x32 + 1, y32 + 1);
            LEO_IFFTB_256(x32, y32);
            y32 += 2, x32 += 2;
@ -691,8 +778,7 @@ void ifft_butterfly(
    }
 #endif // LEO_TRY_AVX2
-    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
+    LEO_MUL_TABLES_128();
    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
@ -702,20 +788,33 @@ void ifft_butterfly(
    do
    {
 #define LEO_IFFTB_128(x_ptr, y_ptr) { \
-        LEO_M128 x_data = _mm_loadu_si128(x_ptr); \
+	        LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
-        LEO_M128 y_data = _mm_loadu_si128(y_ptr); \
+	        LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
-        y_data = _mm_xor_si128(y_data, x_data); \
+            y_lo = _mm_xor_si128(y_lo, x_lo); \
-        _mm_storeu_si128(y_ptr, y_data); \
+            _mm_storeu_si128(y_ptr, y_lo); \
-        LEO_M128 lo = _mm_and_si128(y_data, clr_mask); \
+            LEO_M128 data_0 = _mm_and_si128(y_lo, clr_mask); \
-        lo = _mm_shuffle_epi8(table_lo_y, lo); \
+            LEO_M128 data_1 = _mm_srli_epi64(y_lo, 4); \
-        LEO_M128 hi = _mm_srli_epi64(y_data, 4); \
+            data_1 = _mm_and_si128(data_1, clr_mask); \
-        hi = _mm_and_si128(hi, clr_mask); \
+            LEO_M128 prod_lo = _mm_shuffle_epi8(T0_lo, data_0); \
-        hi = _mm_shuffle_epi8(table_hi_y, hi); \
+            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T1_lo, data_1)); \
-        x_data = _mm_xor_si128(x_data, _mm_xor_si128(lo, hi)); \
+            LEO_M128 prod_hi = _mm_shuffle_epi8(T0_hi, data_0); \
-        _mm_storeu_si128(x_ptr, x_data); }
+            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T1_hi, data_1)); \
            LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
            LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \
            y_hi = _mm_xor_si128(y_hi, x_hi); \
            _mm_storeu_si128(y_ptr + 2, y_hi); \
            data_0 = _mm_and_si128(y_hi, clr_mask); \
            data_1 = _mm_srli_epi64(y_hi, 4); \
            data_1 = _mm_and_si128(data_1, clr_mask); \
            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T2_lo, data_0)); \
            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T3_lo, data_1)); \
            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T2_hi, data_0)); \
            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T3_hi, data_1)); \
            x_lo = _mm_xor_si128(prod_lo, x_lo); \
            _mm_storeu_si128(x_ptr, x_lo); \
            x_hi = _mm_xor_si128(prod_hi, x_hi); \
            _mm_storeu_si128(x_ptr + 2, x_hi); }
        LEO_IFFTB_128(x16 + 3, y16 + 3);
        LEO_IFFTB_128(x16 + 2, y16 + 2);
        LEO_IFFTB_128(x16 + 1, y16 + 1);
        LEO_IFFTB_128(x16, y16);
        x16 += 4, y16 += 4;
@ -1105,13 +1204,21 @@ skip_body:
 //------------------------------------------------------------------------------
 // ErrorBitfield
-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
 // Used in decoding to decide which final FFT operations to perform
 class ErrorBitfield
 {
    static const unsigned kWordMips = 5;
    static const unsigned kWords = kOrder / 64;
-    uint64_t Words[7][kWords] = {};
+    uint64_t Words[kWordMips][kWords] = {};
    static const unsigned kBigMips = 5;
    static const unsigned kBigWords = (kWords + 63) / 64;
    uint64_t BigWords[kBigMips][kBigWords] = {};
    static const unsigned kBiggestMips = 5;
    uint64_t BiggestWords[kBiggestMips] = {};
 public:
    LEO_FORCE_INLINE void Set(unsigned i)
@ -1123,8 +1230,18 @@ public:
    LEO_FORCE_INLINE bool IsNeeded(unsigned mip_level, unsigned bit) const
    {
-        if (mip_level >= 8)
+        if (mip_level >= 16)
            return true;
        if (mip_level >= 11)
        {
            bit /= 4096;
            return 0 != (BiggestWords[mip_level - 11] & ((uint64_t)1 << bit));
        }
        if (mip_level >= 6)
        {
            bit /= 64;
            return 0 != (BigWords[mip_level - 6][bit / 64] & ((uint64_t)1 << (bit % 64)));
        }
        return 0 != (Words[mip_level - 1][bit / 64] & ((uint64_t)1 << (bit % 64)));
    }
 };
@ -1142,33 +1259,57 @@ void ErrorBitfield::Prepare()
    // First mip level is for final layer of FFT: pairs of data
    for (unsigned i = 0; i < kWords; ++i)
    {
-        const uint64_t w0 = Words[0][i];
+        uint64_t w_i = Words[0][i];
-        const uint64_t hi2lo0 = w0 | ((w0 & kHiMasks[0]) >> 1);
+        const uint64_t hi2lo0 = w_i | ((w_i & kHiMasks[0]) >> 1);
-        const uint64_t lo2hi0 = ((w0 & (kHiMasks[0] >> 1)) << 1);
+        const uint64_t lo2hi0 = ((w_i & (kHiMasks[0] >> 1)) << 1);
-        Words[0][i] = hi2lo0 | lo2hi0;
+        Words[0][i] = w_i = hi2lo0 | lo2hi0;
-        for (unsigned j = 1, bits = 2; j < 5; ++j, bits <<= 1)
+        for (unsigned j = 1, bits = 2; j < kWordMips; ++j, bits <<= 1)
        {
-            const uint64_t w_j = Words[j - 1][i];
+            const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j]) >> bits);
-            const uint64_t hi2lo_j = w_j | ((w_j & kHiMasks[j]) >> bits);
+            const uint64_t lo2hi_j = ((w_i & (kHiMasks[j] >> bits)) << bits);
-            const uint64_t lo2hi_j = ((w_j & (kHiMasks[j] >> bits)) << bits);
+            Words[j][i] = w_i = hi2lo_j | lo2hi_j;
            Words[j][i] = hi2lo_j | lo2hi_j;
        }
    }
-    for (unsigned i = 0; i < kWords; ++i)
+    for (unsigned i = 0; i < kBigWords; ++i)
    {
-        uint64_t w = Words[4][i];
+        uint64_t w_i = 0;
-        w |= w >> 32;
+        uint64_t bit = 1;
-        w |= w << 32;
+        const uint64_t* src = &Words[4][i * 64];
-        Words[5][i] = w;
+        for (unsigned j = 0; j < 64; ++j, bit <<= 1)
        {
            const uint64_t w = src[j];
            w_i |= (w | (w >> 32) | (w << 32)) & bit;
        }
        BigWords[0][i] = w_i;
        for (unsigned j = 1, bits = 1; j < kBigMips; ++j, bits <<= 1)
        {
            const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j - 1]) >> bits);
            const uint64_t lo2hi_j = ((w_i & (kHiMasks[j - 1] >> bits)) << bits);
            BigWords[j][i] = w_i = hi2lo_j | lo2hi_j;
        }
    }
-    for (unsigned i = 0; i < kWords; i += 2)
+    uint64_t w_i = 0;
-        Words[6][i] = Words[6][i + 1] = Words[5][i] | Words[5][i + 1];
+    uint64_t bit = 1;
    for (unsigned j = 0; j < kBigWords; ++j, bit <<= 1)
    {
        const uint64_t w = BigWords[kBigMips - 1][j];
        w_i |= (w | (w >> 32) | (w << 32)) & bit;
    }
    BiggestWords[0] = w_i;
    for (unsigned j = 1, bits = 1; j < kBiggestMips; ++j, bits <<= 1)
    {
        const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j - 1]) >> bits);
        const uint64_t lo2hi_j = ((w_i & (kHiMasks[j - 1] >> bits)) << bits);
        BiggestWords[j] = w_i = hi2lo_j | lo2hi_j;
    }
 }
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT
 //------------------------------------------------------------------------------
@ -1186,9 +1327,9 @@ void ReedSolomonDecode(
 {
    // Fill in error locations
-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
    ErrorBitfield ErrorBits;
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT
    ffe_t ErrorLocations[kOrder] = {};
    for (unsigned i = 0; i < recovery_count; ++i)
@ -1201,15 +1342,15 @@ void ReedSolomonDecode(
        if (!original[i])
        {
            ErrorLocations[i + m] = 1;
-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
            ErrorBits.Set(i + m);
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT
        }
    }
-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
    ErrorBits.Prepare();
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT
    // Evaluate error locator polynomial
@ -1291,10 +1432,10 @@ void ReedSolomonDecode(
        for (unsigned j = 0; j < n; j += range)
 #endif
        {
-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
            if (!ErrorBits.IsNeeded(mip_level, j))
                continue;
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT
            VectorFFTButterfly(
                buffer_bytes,
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@ -214,6 +214,23 @@ static ffe_t LogLUT[kOrder];
 static ffe_t ExpLUT[kOrder];
 // Returns a * Log(b)
 static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
 {
    /*
        Note that this operation is not a normal multiplication in a finite
        field because the right operand is already a logarithm.  This is done
        because it moves K table lookups from the Decode() method into the
        initialization step that is less performance critical.  The LogWalsh[]
        table below contains precalculated logarithms so it is easier to do
        all the other multiplies in that form as well.
    */
    if (a == 0)
        return 0;
    return ExpLUT[AddMod(LogLUT[a], log_b)];
 }
 // Initialize LogLUT[], ExpLUT[]
 static void InitializeLogarithmTables()
 {
@ -257,30 +274,20 @@ static void InitializeLogarithmTables()
 //------------------------------------------------------------------------------
 // Multiplies
 /*
    The multiplication algorithm used follows the approach outlined in {4}.
    Specifically section 6 outlines the algorithm used here for 8-bit fields.
 */
 struct {
-    LEO_ALIGNED LEO_M128 Value[kBits / 4];
+    LEO_ALIGNED LEO_M128 Value[2];
 } static Multiply128LUT[kOrder];
 #if defined(LEO_TRY_AVX2)
 struct {
-    LEO_ALIGNED LEO_M256 Value[kBits / 4];
+    LEO_ALIGNED LEO_M256 Value[2];
 } static Multiply256LUT[kOrder];
 #endif // LEO_TRY_AVX2
 // Returns a * Log(b)
 static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
 {
    /*
        Note that this operation is not a normal multiplication in a finite
        field because the right operand is already a logarithm.  This is done
        because it moves K table lookups from the Decode() method into the
        initialization step that is less performance critical.  The LogWalsh[]
        table below contains precalculated logarithms so it is easier to do
        all the other multiplies in that form as well.
    */
    if (a == 0)
        return 0;
    return ExpLUT[AddMod(LogLUT[a], log_b)];
 }
 void InitializeMultiplyTables()
 {
@ -288,11 +295,11 @@ void InitializeMultiplyTables()
    for (unsigned log_m = 0; log_m < kOrder; ++log_m)
    {
        // For each 4 bits of the finite field width in bits:
-        for (unsigned i = 0, shift = 0; i < kBits / 4; ++i, shift += 4)
+        for (unsigned i = 0, shift = 0; i < 2; ++i, shift += 4)
        {
            // Construct 16 entry LUT for PSHUFB
-            ffe_t lut[16];
+            uint8_t lut[16];
-            for (uint8_t x = 0; x < 16; ++x)
+            for (ffe_t x = 0; x < 16; ++x)
                lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));
            // Store in 128-bit wide table
@ -1013,7 +1020,7 @@ skip_body:
 //------------------------------------------------------------------------------
 // ErrorBitfield
-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
 // Used in decoding to decide which final FFT operations to perform
 class ErrorBitfield
@ -1050,17 +1057,16 @@ void ErrorBitfield::Prepare()
    // First mip level is for final layer of FFT: pairs of data
    for (unsigned i = 0; i < kWords; ++i)
    {
-        const uint64_t w0 = Words[0][i];
+        uint64_t w_i = Words[0][i];
-        const uint64_t hi2lo0 = w0 | ((w0 & kHiMasks[0]) >> 1);
+        const uint64_t hi2lo0 = w_i | ((w_i & kHiMasks[0]) >> 1);
-        const uint64_t lo2hi0 = ((w0 & (kHiMasks[0] >> 1)) << 1);
+        const uint64_t lo2hi0 = ((w_i & (kHiMasks[0] >> 1)) << 1);
-        Words[0][i] = hi2lo0 | lo2hi0;
+        Words[0][i] = w_i = hi2lo0 | lo2hi0;
        for (unsigned j = 1, bits = 2; j < 5; ++j, bits <<= 1)
        {
-            const uint64_t w_j = Words[j - 1][i];
+            const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j]) >> bits);
-            const uint64_t hi2lo_j = w_j | ((w_j & kHiMasks[j]) >> bits);
+            const uint64_t lo2hi_j = ((w_i & (kHiMasks[j] >> bits)) << bits);
-            const uint64_t lo2hi_j = ((w_j & (kHiMasks[j] >> bits)) << bits);
+            Words[j][i] = w_i = hi2lo_j | lo2hi_j;
            Words[j][i] = hi2lo_j | lo2hi_j;
        }
    }
@ -1076,7 +1082,7 @@ void ErrorBitfield::Prepare()
        Words[6][i] = Words[6][i + 1] = Words[5][i] | Words[5][i + 1];
 }
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT
 //------------------------------------------------------------------------------
@ -1094,9 +1100,9 @@ void ReedSolomonDecode(
 {
    // Fill in error locations
-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
    ErrorBitfield ErrorBits;
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT
    ffe_t ErrorLocations[kOrder] = {};
    for (unsigned i = 0; i < recovery_count; ++i)
@ -1109,15 +1115,15 @@ void ReedSolomonDecode(
        if (!original[i])
        {
            ErrorLocations[i + m] = 1;
-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
            ErrorBits.Set(i + m);
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT
        }
    }
-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
    ErrorBits.Prepare();
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT
    // Evaluate error locator polynomial
@ -1199,10 +1205,10 @@ void ReedSolomonDecode(
        for (unsigned j = 0; j < n; j += range)
 #endif
        {
-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
            if (!ErrorBits.IsNeeded(mip_level, j))
                continue;
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT
            VectorFFTButterfly(
                buffer_bytes,
--- a/README.md
+++ b/README.md
@ -308,7 +308,8 @@ This library implements an MDS erasure code introduced in this paper:
 ~~~
    {1} S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
-    "Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
+    "Novel Polynomial Basis with Fast Fourier Transform
 	and Its Application to Reed-Solomon Erasure Codes"
    IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
 ~~~
@ -318,10 +319,16 @@ This library implements an MDS erasure code introduced in this paper:
 ~~~
 ~~~
-    {3} Sian-Jheng Lin, Wei-Ho Chung, “An Efficient (n, k) Information
+    {3} Sian-Jheng Lin, Wei-Ho Chung, "An Efficient (n, k) Information
-    Dispersal Algorithm for High Code Rate System over Fermat Fields,”
+    Dispersal Algorithm for High Code Rate System over Fermat Fields,"
    IEEE Commun. Lett., vol.16, no.12, pp. 2036-2039, Dec. 2012.
 ~~~
 ~~~
 	{4} Plank, J. S., Greenan, K. M., Miller, E. L., "Screaming fast Galois Field
 	arithmetic using Intel SIMD instructions."  In: FAST-2013: 11th Usenix
 	Conference on File and Storage Technologies, San Jose, 2013
 ~~~
 Some papers are mirrored in the /docs/ folder.
--- a/docs/plank-fast13.pdf
+++ b/docs/plank-fast13.pdf
--- a/leopard.h
+++ b/leopard.h
@ -42,25 +42,29 @@
    Bulat Ziganshin <bulat.ziganshin@gmail.com> : Author of FastECC
    Yutaka Sawada <tenfon@outlook.jp> : Author of MultiPar
    References:
    {1} S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
-    "Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
+    "Novel Polynomial Basis with Fast Fourier Transform
 	and Its Application to Reed-Solomon Erasure Codes"
    IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
    http://ct.ee.ntust.edu.tw/it2016-2.pdf
    {2} D. G. Cantor, "On arithmetical algorithms over finite fields",
    Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989.
-    {3} Sian-Jheng Lin, Wei-Ho Chung, “An Efficient (n, k) Information
+    {3} Sian-Jheng Lin, Wei-Ho Chung, "An Efficient (n, k) Information
-    Dispersal Algorithm for High Code Rate System over Fermat Fields,”
+    Dispersal Algorithm for High Code Rate System over Fermat Fields,"
    IEEE Commun. Lett., vol.16, no.12, pp. 2036-2039, Dec. 2012.
    {4} Plank, J. S., Greenan, K. M., Miller, E. L., "Screaming fast Galois Field
 	arithmetic using Intel SIMD instructions."  In: FAST-2013: 11th Usenix
 	Conference on File and Storage Technologies, San Jose, 2013
 */
 /*
    TODO:
-    + New 16-bit Muladd inner loops
+    + Benchmarks for large data!
        + Benchmarks for large data!
    + Add multi-threading to split up long parallelizable calculations
        + Final benchmarks!
    + Release version 1
@ -76,7 +80,7 @@
 // Enable 8-bit or 16-bit fields
 #define LEO_HAS_FF8
-//#define LEO_HAS_FF16
+#define LEO_HAS_FF16
 // Tweak if the functions are exported or statically linked
 //#define LEO_DLL /* Defined when building/linking as DLL */
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@ -42,15 +42,15 @@ using namespace std;
 struct TestParameters
 {
 #ifdef LEO_HAS_FF16
-    unsigned original_count = 1000; // under 65536
+    unsigned original_count = 677; // under 65536
-    unsigned recovery_count = 100; // under 65536 - original_count
+    unsigned recovery_count = 487; // under 65536 - original_count
 #else
    unsigned original_count = 128; // under 65536
    unsigned recovery_count = 128; // under 65536 - original_count
 #endif
-    unsigned buffer_bytes = 64000; // multiple of 64 bytes
+    unsigned buffer_bytes = 640; // multiple of 64 bytes
-    unsigned loss_count = 128; // some fraction of original_count
+    unsigned loss_count = 2; // some fraction of original_count
-    unsigned seed = 0;
+    unsigned seed = 2;
    bool multithreaded = true;
 };
@ -535,10 +535,12 @@ static bool BasicTest(const TestParameters& params)
        t_mem_free.EndCall();
    }
 #if 0
    t_mem_alloc.Print(kTrials);
    t_leo_encode.Print(kTrials);
    t_leo_decode.Print(kTrials);
    t_mem_free.Print(kTrials);
 #endif
    float encode_input_MBPS = total_bytes * kTrials / (float)(t_leo_encode.TotalUsec);
    float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count * kTrials / (float)(t_leo_encode.TotalUsec);
@ -784,6 +786,7 @@ int main(int argc, char **argv)
 #endif
    TestParameters params;
    PCGRandom prng;
    if (argc >= 2)
        params.original_count = atoi(argv[1]);
@ -804,6 +807,20 @@ int main(int argc, char **argv)
    if (!BasicTest(params))
        goto Failed;
    prng.Seed(params.seed, 7);
    for (;; ++params.seed)
    {
        params.original_count = prng.Next() % 32768;
        params.recovery_count = prng.Next() % params.original_count + 1;
        params.loss_count = prng.Next() % params.recovery_count + 1;
        cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;
        if (!BasicTest(params))
            goto Failed;
    }
 #ifdef LEO_BENCH_ALL_256_PARAMS
    for (unsigned original_count = 1; original_count <= 256; ++original_count)
    {
        for (unsigned recovery_count = 1; recovery_count <= original_count; ++recovery_count)
@ -818,6 +835,7 @@ int main(int argc, char **argv)
                goto Failed;
        }
    }
 #endif
 Failed:
    getchar();
--- a/tests/experiments.cpp
+++ b/tests/experiments.cpp
@ -33,7 +33,7 @@
 #include <stdlib.h>
-#define LEO_SHORT_FIELD
+//#define LEO_SHORT_FIELD
 //#define LEO_EXPERIMENT_EXTRA_XOR
 //#define LEO_EXPERIMENT_EXTRA_MULS
 #define LEO_EXPERIMENT_CANTOR_BASIS
@ -603,8 +603,8 @@ int main(int argc, char **argv)
        const unsigned input_count = 32;
        const unsigned recovery_count = 16;
 #else // LEO_SHORT_FIELD
-        const unsigned input_count = 10000;
+        const unsigned input_count = 32768;
-        const unsigned recovery_count = 2000;
+        const unsigned recovery_count = 32768;
 #endif // LEO_SHORT_FIELD
        test(input_count, recovery_count, seed);