FF16 is mostly working still some bugs

2025-02-19 17:34:19 +00:00 · 2017-05-30 01:37:27 -07:00 · 2017-05-30 01:37:27 -07:00 · 2e1007c4aa
commit 2e1007c4aa
parent 6e7dfea7c9
8 changed files with 386 additions and 207 deletions
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@ -150,7 +150,7 @@
 // Constants

 // Unroll inner loops 4 times
-#define LEO_USE_VECTOR4_OPT
+//#define LEO_USE_VECTOR4_OPT

 // Define this to enable the optimized version of FWHT()
 #define LEO_FWHT_OPT
@ -158,6 +158,9 @@
 // Avoid scheduling reduced FFT operations that are unneeded
 #define LEO_SCHEDULE_OPT

+// Avoid calculating final FFT values in decoder using bitfield
+//#define LEO_ERROR_BITFIELD_OPT
+
 // Optimize M=1 case
 #define LEO_M1_OPT

--- a/LeopardFF16.cpp
+++ b/LeopardFF16.cpp
@ -75,8 +75,6 @@ static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
 //------------------------------------------------------------------------------
 // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)

-#if defined(LEO_FWHT_OPT)
-
 // {a, b} = {a + b, a - b} (Mod Q)
 static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
 {
@ -86,6 +84,8 @@ static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b
    b = dif;
 }

+#if defined(LEO_FWHT_OPT)
+
 static LEO_FORCE_INLINE void FWHT_4(ffe_t* data)
 {
    ffe_t t0 = data[0];
@ -304,6 +304,23 @@ static ffe_t LogLUT[kOrder];
 static ffe_t ExpLUT[kOrder];


+// Returns a * Log(b)
+static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
+{
+    /*
+        Note that this operation is not a normal multiplication in a finite
+        field because the right operand is already a logarithm.  This is done
+        because it moves K table lookups from the Decode() method into the
+        initialization step that is less performance critical.  The LogWalsh[]
+        table below contains precalculated logarithms so it is easier to do
+        all the other multiplies in that form as well.
+    */
+    if (a == 0)
+        return 0;
+    return ExpLUT[AddMod(LogLUT[a], log_b)];
+}
+
+
 // Initialize LogLUT[], ExpLUT[]
 static void InitializeLogarithmTables()
 {
@ -344,63 +361,59 @@ static void InitializeLogarithmTables()
 //------------------------------------------------------------------------------
 // Multiplies

+/*
+    The multiplication algorithm used follows the approach outlined in {4}.
+    Specifically section 7 outlines the algorithm used here for 16-bit fields.
+    I use the ALTMAP memory layout since I do not need to convert in/out of it.
+*/
+
 struct {
-    LEO_ALIGNED LEO_M128 Value[kBits / 4];
+    LEO_ALIGNED LEO_M128 Lo[4];
+    LEO_ALIGNED LEO_M128 Hi[4];
 } static Multiply128LUT[kOrder];
 #if defined(LEO_TRY_AVX2)
 struct {
-    LEO_ALIGNED LEO_M256 Value[kBits / 4];
+    LEO_ALIGNED LEO_M256 Lo[4];
+    LEO_ALIGNED LEO_M256 Hi[4];
 } static Multiply256LUT[kOrder];
 #endif // LEO_TRY_AVX2

-// Returns a * Log(b)
-static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
-{
-    /*
-        Note that this operation is not a normal multiplication in a finite
-        field because the right operand is already a logarithm.  This is done
-        because it moves K table lookups from the Decode() method into the
-        initialization step that is less performance critical.  The LogWalsh[]
-        table below contains precalculated logarithms so it is easier to do
-        all the other multiplies in that form as well.
-    */
-    if (a == 0)
-        return 0;
-    return ExpLUT[AddMod(LogLUT[a], log_b)];
-}
-

 void InitializeMultiplyTables()
 {
+    // For each value we could multiply by:
    for (unsigned log_m = 0; log_m < kOrder; ++log_m)
    {
-        uint8_t table0[16], table1[16], table2[16], table3[16];
-
-        for (uint8_t x = 0; x < 16; ++x)
+        // For each 4 bits of the finite field width in bits:
+        for (unsigned i = 0, shift = 0; i < 4; ++i, shift += 4)
        {
-            table0[x] = MultiplyLog(x, static_cast<ffe_t>(log_m));
-            table1[x] = MultiplyLog(x << 4, static_cast<ffe_t>(log_m));
-            table2[x] = MultiplyLog(x << 8, static_cast<ffe_t>(log_m));
-            table3[x] = MultiplyLog(x << 12, static_cast<ffe_t>(log_m));
-        }
+            // Construct 16 entry LUT for PSHUFB
+            uint8_t prod_lo[16], prod_hi[16];
+            for (ffe_t x = 0; x < 16; ++x)
+            {
+                const ffe_t prod = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));
+                prod_lo[x] = static_cast<uint8_t>(prod);
+                prod_hi[x] = static_cast<uint8_t>(prod >> 8);
+            }

-        const LEO_M128 value0 = _mm_loadu_si128((LEO_M128*)table0);
-        const LEO_M128 value1 = _mm_loadu_si128((LEO_M128*)table1);
-        const LEO_M128 value2 = _mm_loadu_si128((LEO_M128*)table2);
-        const LEO_M128 value3 = _mm_loadu_si128((LEO_M128*)table3);
+            const LEO_M128 value_lo = _mm_loadu_si128((LEO_M128*)prod_lo);
+            const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi);

-        _mm_storeu_si128(&Multiply128LUT[log_m].Value[0], value0);
-        _mm_storeu_si128(&Multiply128LUT[log_m].Value[1], value1);
+            // Store in 128-bit wide table
+            _mm_storeu_si128(&Multiply128LUT[log_m].Lo[i], value_lo);
+            _mm_storeu_si128(&Multiply128LUT[log_m].Hi[i], value_hi);

+            // Store in 256-bit wide table
 #if defined(LEO_TRY_AVX2)
-        if (CpuHasAVX2)
-        {
-            _mm256_storeu_si256(&Multiply256LUT[log_m].Value[0],
-                _mm256_broadcastsi128_si256(table_lo));
-            _mm256_storeu_si256(&Multiply256LUT[log_m].Value[1],
-                _mm256_broadcastsi128_si256(table_hi));
-        }
+            if (CpuHasAVX2)
+            {
+                _mm256_storeu_si256(&Multiply256LUT[log_m].Lo[i],
+                    _mm256_broadcastsi128_si256(value_lo));
+                _mm256_storeu_si256(&Multiply256LUT[log_m].Hi[i],
+                    _mm256_broadcastsi128_si256(value_hi));
+            }
 #endif // LEO_TRY_AVX2
+        }
    }
 }

@ -412,8 +425,17 @@ void mul_mem(
 #if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
+#define LEO_MUL_TABLES_256() \
+        const LEO_M256 T0_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[0]); \
+        const LEO_M256 T1_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[1]); \
+        const LEO_M256 T2_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[2]); \
+        const LEO_M256 T3_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[3]); \
+        const LEO_M256 T0_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[0]); \
+        const LEO_M256 T1_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[1]); \
+        const LEO_M256 T2_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[2]); \
+        const LEO_M256 T3_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[3]);
+
+        LEO_MUL_TABLES_256();

        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);

@ -423,15 +445,25 @@ void mul_mem(
        do
        {
 #define LEO_MUL_256(x_ptr, y_ptr) { \
-            LEO_M256 data = _mm256_loadu_si256(y_ptr); \
-            LEO_M256 lo = _mm256_and_si256(data, clr_mask); \
-            lo = _mm256_shuffle_epi8(table_lo_y, lo); \
-            LEO_M256 hi = _mm256_srli_epi64(data, 4); \
-            hi = _mm256_and_si256(hi, clr_mask); \
-            hi = _mm256_shuffle_epi8(table_hi_y, hi); \
-            _mm256_storeu_si256(x_ptr, _mm256_xor_si256(lo, hi)); }
+	        const LEO_M256 A_lo = _mm256_loadu_si256(y_ptr); \
+            const LEO_M256 A_hi = _mm256_loadu_si256(y_ptr + 1); \
+            LEO_M256 data_0 = _mm256_and_si256(A_lo, clr_mask); \
+            LEO_M256 data_1 = _mm256_srli_epi64(A_lo, 4); \
+            data_1 = _mm256_and_si256(data_1, clr_mask); \
+            LEO_M256 data_2 = _mm256_and_si256(A_hi, clr_mask); \
+            LEO_M256 data_3 = _mm256_srli_epi64(A_hi, 4); \
+            data_3 = _mm256_and_si256(data_3, clr_mask); \
+            LEO_M256 output_lo = _mm256_shuffle_epi8(T0_lo, data_0); \
+            output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \
+            output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T2_lo, data_2)); \
+            output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T3_lo, data_3)); \
+            LEO_M256 output_hi = _mm256_shuffle_epi8(T0_hi, data_0); \
+            output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \
+            output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T2_hi, data_2)); \
+            output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T3_hi, data_3)); \
+            _mm256_storeu_si256(x_ptr, output_lo); \
+            _mm256_storeu_si256(x_ptr + 1, output_hi); }

-            LEO_MUL_256(x32 + 1, y32 + 1);
            LEO_MUL_256(x32, y32);
            y32 += 2, x32 += 2;

@ -442,8 +474,17 @@ void mul_mem(
    }
 #endif // LEO_TRY_AVX2

-    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
+#define LEO_MUL_TABLES_128() \
+    const LEO_M128 T0_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[0]); \
+    const LEO_M128 T1_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[1]); \
+    const LEO_M128 T2_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[2]); \
+    const LEO_M128 T3_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[3]); \
+    const LEO_M128 T0_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[0]); \
+    const LEO_M128 T1_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[1]); \
+    const LEO_M128 T2_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[2]); \
+    const LEO_M128 T3_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[3]);
+
+    LEO_MUL_TABLES_128();

    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);

@ -453,16 +494,25 @@ void mul_mem(
    do
    {
 #define LEO_MUL_128(x_ptr, y_ptr) { \
-        LEO_M128 data = _mm_loadu_si128(y_ptr); \
-        LEO_M128 lo = _mm_and_si128(data, clr_mask); \
-        lo = _mm_shuffle_epi8(table_lo_y, lo); \
-        LEO_M128 hi = _mm_srli_epi64(data, 4); \
-        hi = _mm_and_si128(hi, clr_mask); \
-        hi = _mm_shuffle_epi8(table_hi_y, hi); \
-        _mm_storeu_si128(x_ptr, _mm_xor_si128(lo, hi)); }
+	        const LEO_M128 A_lo = _mm_loadu_si128(y_ptr); \
+            const LEO_M128 A_hi = _mm_loadu_si128(y_ptr + 2); \
+            LEO_M128 data_0 = _mm_and_si128(A_lo, clr_mask); \
+            LEO_M128 data_1 = _mm_srli_epi64(A_lo, 4); \
+            data_1 = _mm_and_si128(data_1, clr_mask); \
+            LEO_M128 data_2 = _mm_and_si128(A_hi, clr_mask); \
+            LEO_M128 data_3 = _mm_srli_epi64(A_hi, 4); \
+            data_3 = _mm_and_si128(data_3, clr_mask); \
+            LEO_M128 output_lo = _mm_shuffle_epi8(T0_lo, data_0); \
+            output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T1_lo, data_1)); \
+            output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T2_lo, data_2)); \
+            output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T3_lo, data_3)); \
+            LEO_M128 output_hi = _mm_shuffle_epi8(T0_hi, data_0); \
+            output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T1_hi, data_1)); \
+            output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T2_hi, data_2)); \
+            output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T3_hi, data_3)); \
+            _mm_storeu_si128(x_ptr, output_lo); \
+            _mm_storeu_si128(x_ptr + 2, output_hi); }

-        LEO_MUL_128(x16 + 3, y16 + 3);
-        LEO_MUL_128(x16 + 2, y16 + 2);
        LEO_MUL_128(x16 + 1, y16 + 1);
        LEO_MUL_128(x16, y16);
        x16 += 4, y16 += 4;
@ -482,8 +532,7 @@ void fft_butterfly(
 #if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
+        LEO_MUL_TABLES_256();

        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);

@ -493,19 +542,33 @@ void fft_butterfly(
        do
        {
 #define LEO_FFTB_256(x_ptr, y_ptr) { \
-            LEO_M256 y_data = _mm256_loadu_si256(y_ptr); \
-            LEO_M256 lo = _mm256_and_si256(y_data, clr_mask); \
-            lo = _mm256_shuffle_epi8(table_lo_y, lo); \
-            LEO_M256 hi = _mm256_srli_epi64(y_data, 4); \
-            hi = _mm256_and_si256(hi, clr_mask); \
-            hi = _mm256_shuffle_epi8(table_hi_y, hi); \
-            LEO_M256 x_data = _mm256_loadu_si256(x_ptr); \
-            x_data = _mm256_xor_si256(x_data, _mm256_xor_si256(lo, hi)); \
-            y_data = _mm256_xor_si256(y_data, x_data); \
-            _mm256_storeu_si256(x_ptr, x_data); \
-            _mm256_storeu_si256(y_ptr, y_data); }
+	        LEO_M256 y_lo = _mm256_loadu_si256(y_ptr); \
+            LEO_M256 data_0 = _mm256_and_si256(y_lo, clr_mask); \
+            LEO_M256 data_1 = _mm256_srli_epi64(y_lo, 4); \
+            data_1 = _mm256_and_si256(data_1, clr_mask); \
+            LEO_M256 prod_lo = _mm256_shuffle_epi8(T0_lo, data_0); \
+            prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \
+            LEO_M256 prod_hi = _mm256_shuffle_epi8(T0_hi, data_0); \
+            prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \
+            LEO_M256 y_hi = _mm256_loadu_si256(y_ptr + 1); \
+            data_0 = _mm256_and_si256(y_hi, clr_mask); \
+            data_1 = _mm256_srli_epi64(y_hi, 4); \
+            data_1 = _mm256_and_si256(data_1, clr_mask); \
+            prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T2_lo, data_0)); \
+            prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T3_lo, data_1)); \
+            prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T2_hi, data_0)); \
+            prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T3_hi, data_1)); \
+	        LEO_M256 x_lo = _mm256_loadu_si256(x_ptr); \
+            LEO_M256 x_hi = _mm256_loadu_si256(x_ptr + 1); \
+            x_lo = _mm256_xor_si256(prod_lo, x_lo); \
+            _mm256_storeu_si256(x_ptr, x_lo); \
+            x_hi = _mm256_xor_si256(prod_hi, x_hi); \
+            _mm256_storeu_si256(x_ptr + 1, x_hi); \
+            y_lo = _mm256_xor_si256(y_lo, x_lo); \
+            _mm256_storeu_si256(y_ptr, y_lo); \
+            y_hi = _mm256_xor_si256(y_hi, x_hi); \
+            _mm256_storeu_si256(y_ptr + 1, y_hi); }

-            LEO_FFTB_256(x32 + 1, y32 + 1);
            LEO_FFTB_256(x32, y32);
            y32 += 2, x32 += 2;

@ -516,8 +579,7 @@ void fft_butterfly(
    }
 #endif // LEO_TRY_AVX2

-    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
+    LEO_MUL_TABLES_128();

    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);

@ -527,20 +589,33 @@ void fft_butterfly(
    do
    {
 #define LEO_FFTB_128(x_ptr, y_ptr) { \
-        LEO_M128 y_data = _mm_loadu_si128(y_ptr); \
-        LEO_M128 lo = _mm_and_si128(y_data, clr_mask); \
-        lo = _mm_shuffle_epi8(table_lo_y, lo); \
-        LEO_M128 hi = _mm_srli_epi64(y_data, 4); \
-        hi = _mm_and_si128(hi, clr_mask); \
-        hi = _mm_shuffle_epi8(table_hi_y, hi); \
-        LEO_M128 x_data = _mm_loadu_si128(x_ptr); \
-        x_data = _mm_xor_si128(x_data, _mm_xor_si128(lo, hi)); \
-        y_data = _mm_xor_si128(y_data, x_data); \
-        _mm_storeu_si128(x_ptr, x_data); \
-        _mm_storeu_si128(y_ptr, y_data); }
+	        LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
+            LEO_M128 data_0 = _mm_and_si128(y_lo, clr_mask); \
+            LEO_M128 data_1 = _mm_srli_epi64(y_lo, 4); \
+            data_1 = _mm_and_si128(data_1, clr_mask); \
+            LEO_M128 prod_lo = _mm_shuffle_epi8(T0_lo, data_0); \
+            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T1_lo, data_1)); \
+            LEO_M128 prod_hi = _mm_shuffle_epi8(T0_hi, data_0); \
+            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T1_hi, data_1)); \
+            LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \
+            data_0 = _mm_and_si128(y_hi, clr_mask); \
+            data_1 = _mm_srli_epi64(y_hi, 4); \
+            data_1 = _mm_and_si128(data_1, clr_mask); \
+            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T2_lo, data_0)); \
+            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T3_lo, data_1)); \
+            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T2_hi, data_0)); \
+            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T3_hi, data_1)); \
+	        LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
+            LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
+            x_lo = _mm_xor_si128(prod_lo, x_lo); \
+            _mm_storeu_si128(x_ptr, x_lo); \
+            x_hi = _mm_xor_si128(prod_hi, x_hi); \
+            _mm_storeu_si128(x_ptr + 2, x_hi); \
+            y_lo = _mm_xor_si128(y_lo, x_lo); \
+            _mm_storeu_si128(y_ptr, y_lo); \
+            y_hi = _mm_xor_si128(y_hi, x_hi); \
+            _mm_storeu_si128(y_ptr + 2, y_hi); }

-        LEO_FFTB_128(x16 + 3, y16 + 3);
-        LEO_FFTB_128(x16 + 2, y16 + 2);
        LEO_FFTB_128(x16 + 1, y16 + 1);
        LEO_FFTB_128(x16, y16);
        x16 += 4, y16 += 4;
@ -600,8 +675,7 @@ void fft_butterfly4(
    }
 #endif // LEO_TRY_AVX2

-    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
+    LEO_MUL_TABLES_128();

    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);

@ -657,8 +731,7 @@ void ifft_butterfly(
 #if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
+        LEO_MUL_TABLES_256();

        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);

@ -668,19 +741,33 @@ void ifft_butterfly(
        do
        {
 #define LEO_IFFTB_256(x_ptr, y_ptr) { \
-            LEO_M256 x_data = _mm256_loadu_si256(x_ptr); \
-            LEO_M256 y_data = _mm256_loadu_si256(y_ptr); \
-            y_data = _mm256_xor_si256(y_data, x_data); \
-            _mm256_storeu_si256(y_ptr, y_data); \
-            LEO_M256 lo = _mm256_and_si256(y_data, clr_mask); \
-            lo = _mm256_shuffle_epi8(table_lo_y, lo); \
-            LEO_M256 hi = _mm256_srli_epi64(y_data, 4); \
-            hi = _mm256_and_si256(hi, clr_mask); \
-            hi = _mm256_shuffle_epi8(table_hi_y, hi); \
-            x_data = _mm256_xor_si256(x_data, _mm256_xor_si256(lo, hi)); \
-            _mm256_storeu_si256(x_ptr, x_data); }
+	        LEO_M256 x_lo = _mm256_loadu_si256(x_ptr); \
+	        LEO_M256 y_lo = _mm256_loadu_si256(y_ptr); \
+            y_lo = _mm256_xor_si256(y_lo, x_lo); \
+            _mm256_storeu_si256(y_ptr, y_lo); \
+            LEO_M256 data_0 = _mm256_and_si256(y_lo, clr_mask); \
+            LEO_M256 data_1 = _mm256_srli_epi64(y_lo, 4); \
+            data_1 = _mm256_and_si256(data_1, clr_mask); \
+            LEO_M256 prod_lo = _mm256_shuffle_epi8(T0_lo, data_0); \
+            prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \
+            LEO_M256 prod_hi = _mm256_shuffle_epi8(T0_hi, data_0); \
+            prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \
+            LEO_M256 x_hi = _mm256_loadu_si256(x_ptr + 1); \
+            LEO_M256 y_hi = _mm256_loadu_si256(y_ptr + 1); \
+            y_hi = _mm256_xor_si256(y_hi, x_hi); \
+            _mm256_storeu_si256(y_ptr + 1, y_hi); \
+            data_0 = _mm256_and_si256(y_hi, clr_mask); \
+            data_1 = _mm256_srli_epi64(y_hi, 4); \
+            data_1 = _mm256_and_si256(data_1, clr_mask); \
+            prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T2_lo, data_0)); \
+            prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T3_lo, data_1)); \
+            prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T2_hi, data_0)); \
+            prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T3_hi, data_1)); \
+            x_lo = _mm256_xor_si256(prod_lo, x_lo); \
+            _mm256_storeu_si256(x_ptr, x_lo); \
+            x_hi = _mm256_xor_si256(prod_hi, x_hi); \
+            _mm256_storeu_si256(x_ptr + 1, x_hi); }

-            LEO_IFFTB_256(x32 + 1, y32 + 1);
            LEO_IFFTB_256(x32, y32);
            y32 += 2, x32 += 2;

@ -691,8 +778,7 @@ void ifft_butterfly(
    }
 #endif // LEO_TRY_AVX2

-    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
+    LEO_MUL_TABLES_128();

    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);

@ -702,20 +788,33 @@ void ifft_butterfly(
    do
    {
 #define LEO_IFFTB_128(x_ptr, y_ptr) { \
-        LEO_M128 x_data = _mm_loadu_si128(x_ptr); \
-        LEO_M128 y_data = _mm_loadu_si128(y_ptr); \
-        y_data = _mm_xor_si128(y_data, x_data); \
-        _mm_storeu_si128(y_ptr, y_data); \
-        LEO_M128 lo = _mm_and_si128(y_data, clr_mask); \
-        lo = _mm_shuffle_epi8(table_lo_y, lo); \
-        LEO_M128 hi = _mm_srli_epi64(y_data, 4); \
-        hi = _mm_and_si128(hi, clr_mask); \
-        hi = _mm_shuffle_epi8(table_hi_y, hi); \
-        x_data = _mm_xor_si128(x_data, _mm_xor_si128(lo, hi)); \
-        _mm_storeu_si128(x_ptr, x_data); }
+	        LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
+	        LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
+            y_lo = _mm_xor_si128(y_lo, x_lo); \
+            _mm_storeu_si128(y_ptr, y_lo); \
+            LEO_M128 data_0 = _mm_and_si128(y_lo, clr_mask); \
+            LEO_M128 data_1 = _mm_srli_epi64(y_lo, 4); \
+            data_1 = _mm_and_si128(data_1, clr_mask); \
+            LEO_M128 prod_lo = _mm_shuffle_epi8(T0_lo, data_0); \
+            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T1_lo, data_1)); \
+            LEO_M128 prod_hi = _mm_shuffle_epi8(T0_hi, data_0); \
+            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T1_hi, data_1)); \
+            LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
+            LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \
+            y_hi = _mm_xor_si128(y_hi, x_hi); \
+            _mm_storeu_si128(y_ptr + 2, y_hi); \
+            data_0 = _mm_and_si128(y_hi, clr_mask); \
+            data_1 = _mm_srli_epi64(y_hi, 4); \
+            data_1 = _mm_and_si128(data_1, clr_mask); \
+            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T2_lo, data_0)); \
+            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T3_lo, data_1)); \
+            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T2_hi, data_0)); \
+            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T3_hi, data_1)); \
+            x_lo = _mm_xor_si128(prod_lo, x_lo); \
+            _mm_storeu_si128(x_ptr, x_lo); \
+            x_hi = _mm_xor_si128(prod_hi, x_hi); \
+            _mm_storeu_si128(x_ptr + 2, x_hi); }

-        LEO_IFFTB_128(x16 + 3, y16 + 3);
-        LEO_IFFTB_128(x16 + 2, y16 + 2);
        LEO_IFFTB_128(x16 + 1, y16 + 1);
        LEO_IFFTB_128(x16, y16);
        x16 += 4, y16 += 4;
@ -1105,13 +1204,21 @@ skip_body:
 //------------------------------------------------------------------------------
 // ErrorBitfield

-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT

 // Used in decoding to decide which final FFT operations to perform
 class ErrorBitfield
 {
+    static const unsigned kWordMips = 5;
    static const unsigned kWords = kOrder / 64;
-    uint64_t Words[7][kWords] = {};
+    uint64_t Words[kWordMips][kWords] = {};
+
+    static const unsigned kBigMips = 5;
+    static const unsigned kBigWords = (kWords + 63) / 64;
+    uint64_t BigWords[kBigMips][kBigWords] = {};
+
+    static const unsigned kBiggestMips = 5;
+    uint64_t BiggestWords[kBiggestMips] = {};

 public:
    LEO_FORCE_INLINE void Set(unsigned i)
@ -1123,8 +1230,18 @@ public:

    LEO_FORCE_INLINE bool IsNeeded(unsigned mip_level, unsigned bit) const
    {
-        if (mip_level >= 8)
+        if (mip_level >= 16)
            return true;
+        if (mip_level >= 11)
+        {
+            bit /= 4096;
+            return 0 != (BiggestWords[mip_level - 11] & ((uint64_t)1 << bit));
+        }
+        if (mip_level >= 6)
+        {
+            bit /= 64;
+            return 0 != (BigWords[mip_level - 6][bit / 64] & ((uint64_t)1 << (bit % 64)));
+        }
        return 0 != (Words[mip_level - 1][bit / 64] & ((uint64_t)1 << (bit % 64)));
    }
 };
@ -1142,33 +1259,57 @@ void ErrorBitfield::Prepare()
    // First mip level is for final layer of FFT: pairs of data
    for (unsigned i = 0; i < kWords; ++i)
    {
-        const uint64_t w0 = Words[0][i];
-        const uint64_t hi2lo0 = w0 | ((w0 & kHiMasks[0]) >> 1);
-        const uint64_t lo2hi0 = ((w0 & (kHiMasks[0] >> 1)) << 1);
-        Words[0][i] = hi2lo0 | lo2hi0;
+        uint64_t w_i = Words[0][i];
+        const uint64_t hi2lo0 = w_i | ((w_i & kHiMasks[0]) >> 1);
+        const uint64_t lo2hi0 = ((w_i & (kHiMasks[0] >> 1)) << 1);
+        Words[0][i] = w_i = hi2lo0 | lo2hi0;

-        for (unsigned j = 1, bits = 2; j < 5; ++j, bits <<= 1)
+        for (unsigned j = 1, bits = 2; j < kWordMips; ++j, bits <<= 1)
        {
-            const uint64_t w_j = Words[j - 1][i];
-            const uint64_t hi2lo_j = w_j | ((w_j & kHiMasks[j]) >> bits);
-            const uint64_t lo2hi_j = ((w_j & (kHiMasks[j] >> bits)) << bits);
-            Words[j][i] = hi2lo_j | lo2hi_j;
+            const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j]) >> bits);
+            const uint64_t lo2hi_j = ((w_i & (kHiMasks[j] >> bits)) << bits);
+            Words[j][i] = w_i = hi2lo_j | lo2hi_j;
        }
    }

-    for (unsigned i = 0; i < kWords; ++i)
+    for (unsigned i = 0; i < kBigWords; ++i)
    {
-        uint64_t w = Words[4][i];
-        w |= w >> 32;
-        w |= w << 32;
-        Words[5][i] = w;
+        uint64_t w_i = 0;
+        uint64_t bit = 1;
+        const uint64_t* src = &Words[4][i * 64];
+        for (unsigned j = 0; j < 64; ++j, bit <<= 1)
+        {
+            const uint64_t w = src[j];
+            w_i |= (w | (w >> 32) | (w << 32)) & bit;
+        }
+        BigWords[0][i] = w_i;
+
+        for (unsigned j = 1, bits = 1; j < kBigMips; ++j, bits <<= 1)
+        {
+            const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j - 1]) >> bits);
+            const uint64_t lo2hi_j = ((w_i & (kHiMasks[j - 1] >> bits)) << bits);
+            BigWords[j][i] = w_i = hi2lo_j | lo2hi_j;
+        }
    }

-    for (unsigned i = 0; i < kWords; i += 2)
-        Words[6][i] = Words[6][i + 1] = Words[5][i] | Words[5][i + 1];
+    uint64_t w_i = 0;
+    uint64_t bit = 1;
+    for (unsigned j = 0; j < kBigWords; ++j, bit <<= 1)
+    {
+        const uint64_t w = BigWords[kBigMips - 1][j];
+        w_i |= (w | (w >> 32) | (w << 32)) & bit;
+    }
+    BiggestWords[0] = w_i;
+
+    for (unsigned j = 1, bits = 1; j < kBiggestMips; ++j, bits <<= 1)
+    {
+        const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j - 1]) >> bits);
+        const uint64_t lo2hi_j = ((w_i & (kHiMasks[j - 1] >> bits)) << bits);
+        BiggestWords[j] = w_i = hi2lo_j | lo2hi_j;
+    }
 }

-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT


 //------------------------------------------------------------------------------
@ -1186,9 +1327,9 @@ void ReedSolomonDecode(
 {
    // Fill in error locations

-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
    ErrorBitfield ErrorBits;
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT

    ffe_t ErrorLocations[kOrder] = {};
    for (unsigned i = 0; i < recovery_count; ++i)
@ -1201,15 +1342,15 @@ void ReedSolomonDecode(
        if (!original[i])
        {
            ErrorLocations[i + m] = 1;
-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
            ErrorBits.Set(i + m);
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT
        }
    }

-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
    ErrorBits.Prepare();
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT

    // Evaluate error locator polynomial

@ -1291,10 +1432,10 @@ void ReedSolomonDecode(
        for (unsigned j = 0; j < n; j += range)
 #endif
        {
-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
            if (!ErrorBits.IsNeeded(mip_level, j))
                continue;
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT

            VectorFFTButterfly(
                buffer_bytes,
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@ -214,6 +214,23 @@ static ffe_t LogLUT[kOrder];
 static ffe_t ExpLUT[kOrder];


+// Returns a * Log(b)
+static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
+{
+    /*
+        Note that this operation is not a normal multiplication in a finite
+        field because the right operand is already a logarithm.  This is done
+        because it moves K table lookups from the Decode() method into the
+        initialization step that is less performance critical.  The LogWalsh[]
+        table below contains precalculated logarithms so it is easier to do
+        all the other multiplies in that form as well.
+    */
+    if (a == 0)
+        return 0;
+    return ExpLUT[AddMod(LogLUT[a], log_b)];
+}
+
+
 // Initialize LogLUT[], ExpLUT[]
 static void InitializeLogarithmTables()
 {
@ -257,30 +274,20 @@ static void InitializeLogarithmTables()
 //------------------------------------------------------------------------------
 // Multiplies

+/*
+    The multiplication algorithm used follows the approach outlined in {4}.
+    Specifically section 6 outlines the algorithm used here for 8-bit fields.
+*/
+
 struct {
-    LEO_ALIGNED LEO_M128 Value[kBits / 4];
+    LEO_ALIGNED LEO_M128 Value[2];
 } static Multiply128LUT[kOrder];
 #if defined(LEO_TRY_AVX2)
 struct {
-    LEO_ALIGNED LEO_M256 Value[kBits / 4];
+    LEO_ALIGNED LEO_M256 Value[2];
 } static Multiply256LUT[kOrder];
 #endif // LEO_TRY_AVX2

-// Returns a * Log(b)
-static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
-{
-    /*
-        Note that this operation is not a normal multiplication in a finite
-        field because the right operand is already a logarithm.  This is done
-        because it moves K table lookups from the Decode() method into the
-        initialization step that is less performance critical.  The LogWalsh[]
-        table below contains precalculated logarithms so it is easier to do
-        all the other multiplies in that form as well.
-    */
-    if (a == 0)
-        return 0;
-    return ExpLUT[AddMod(LogLUT[a], log_b)];
-}

 void InitializeMultiplyTables()
 {
@ -288,11 +295,11 @@ void InitializeMultiplyTables()
    for (unsigned log_m = 0; log_m < kOrder; ++log_m)
    {
        // For each 4 bits of the finite field width in bits:
-        for (unsigned i = 0, shift = 0; i < kBits / 4; ++i, shift += 4)
+        for (unsigned i = 0, shift = 0; i < 2; ++i, shift += 4)
        {
            // Construct 16 entry LUT for PSHUFB
-            ffe_t lut[16];
-            for (uint8_t x = 0; x < 16; ++x)
+            uint8_t lut[16];
+            for (ffe_t x = 0; x < 16; ++x)
                lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));

            // Store in 128-bit wide table
@ -1013,7 +1020,7 @@ skip_body:
 //------------------------------------------------------------------------------
 // ErrorBitfield

-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT

 // Used in decoding to decide which final FFT operations to perform
 class ErrorBitfield
@ -1050,17 +1057,16 @@ void ErrorBitfield::Prepare()
    // First mip level is for final layer of FFT: pairs of data
    for (unsigned i = 0; i < kWords; ++i)
    {
-        const uint64_t w0 = Words[0][i];
-        const uint64_t hi2lo0 = w0 | ((w0 & kHiMasks[0]) >> 1);
-        const uint64_t lo2hi0 = ((w0 & (kHiMasks[0] >> 1)) << 1);
-        Words[0][i] = hi2lo0 | lo2hi0;
+        uint64_t w_i = Words[0][i];
+        const uint64_t hi2lo0 = w_i | ((w_i & kHiMasks[0]) >> 1);
+        const uint64_t lo2hi0 = ((w_i & (kHiMasks[0] >> 1)) << 1);
+        Words[0][i] = w_i = hi2lo0 | lo2hi0;

        for (unsigned j = 1, bits = 2; j < 5; ++j, bits <<= 1)
        {
-            const uint64_t w_j = Words[j - 1][i];
-            const uint64_t hi2lo_j = w_j | ((w_j & kHiMasks[j]) >> bits);
-            const uint64_t lo2hi_j = ((w_j & (kHiMasks[j] >> bits)) << bits);
-            Words[j][i] = hi2lo_j | lo2hi_j;
+            const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j]) >> bits);
+            const uint64_t lo2hi_j = ((w_i & (kHiMasks[j] >> bits)) << bits);
+            Words[j][i] = w_i = hi2lo_j | lo2hi_j;
        }
    }

@ -1076,7 +1082,7 @@ void ErrorBitfield::Prepare()
        Words[6][i] = Words[6][i + 1] = Words[5][i] | Words[5][i + 1];
 }

-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT


 //------------------------------------------------------------------------------
@ -1094,9 +1100,9 @@ void ReedSolomonDecode(
 {
    // Fill in error locations

-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
    ErrorBitfield ErrorBits;
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT

    ffe_t ErrorLocations[kOrder] = {};
    for (unsigned i = 0; i < recovery_count; ++i)
@ -1109,15 +1115,15 @@ void ReedSolomonDecode(
        if (!original[i])
        {
            ErrorLocations[i + m] = 1;
-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
            ErrorBits.Set(i + m);
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT
        }
    }

-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
    ErrorBits.Prepare();
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT

    // Evaluate error locator polynomial

@ -1199,10 +1205,10 @@ void ReedSolomonDecode(
        for (unsigned j = 0; j < n; j += range)
 #endif
        {
-#ifdef LEO_SCHEDULE_OPT
+#ifdef LEO_ERROR_BITFIELD_OPT
            if (!ErrorBits.IsNeeded(mip_level, j))
                continue;
-#endif // LEO_SCHEDULE_OPT
+#endif // LEO_ERROR_BITFIELD_OPT

            VectorFFTButterfly(
                buffer_bytes,
--- a/README.md
+++ b/README.md
@ -308,7 +308,8 @@ This library implements an MDS erasure code introduced in this paper:

 ~~~
    {1} S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
-    "Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
+    "Novel Polynomial Basis with Fast Fourier Transform
+	and Its Application to Reed-Solomon Erasure Codes"
    IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
 ~~~

@ -318,10 +319,16 @@ This library implements an MDS erasure code introduced in this paper:
 ~~~

 ~~~
-    {3} Sian-Jheng Lin, Wei-Ho Chung, “An Efficient (n, k) Information
-    Dispersal Algorithm for High Code Rate System over Fermat Fields,”
+    {3} Sian-Jheng Lin, Wei-Ho Chung, "An Efficient (n, k) Information
+    Dispersal Algorithm for High Code Rate System over Fermat Fields,"
    IEEE Commun. Lett., vol.16, no.12, pp. 2036-2039, Dec. 2012.
 ~~~
+
+~~~
+	{4} Plank, J. S., Greenan, K. M., Miller, E. L., "Screaming fast Galois Field
+	arithmetic using Intel SIMD instructions."  In: FAST-2013: 11th Usenix
+	Conference on File and Storage Technologies, San Jose, 2013
+~~~
 	
 Some papers are mirrored in the /docs/ folder.

--- a/docs/plank-fast13.pdf
+++ b/docs/plank-fast13.pdf
--- a/leopard.h
+++ b/leopard.h
@ -42,25 +42,29 @@
    Bulat Ziganshin <bulat.ziganshin@gmail.com> : Author of FastECC
    Yutaka Sawada <tenfon@outlook.jp> : Author of MultiPar

+
    References:

    {1} S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
-    "Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
+    "Novel Polynomial Basis with Fast Fourier Transform
+	and Its Application to Reed-Solomon Erasure Codes"
    IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
-    http://ct.ee.ntust.edu.tw/it2016-2.pdf

    {2} D. G. Cantor, "On arithmetical algorithms over finite fields",
    Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989.

-    {3} Sian-Jheng Lin, Wei-Ho Chung, “An Efficient (n, k) Information
-    Dispersal Algorithm for High Code Rate System over Fermat Fields,”
+    {3} Sian-Jheng Lin, Wei-Ho Chung, "An Efficient (n, k) Information
+    Dispersal Algorithm for High Code Rate System over Fermat Fields,"
    IEEE Commun. Lett., vol.16, no.12, pp. 2036-2039, Dec. 2012.
+
+    {4} Plank, J. S., Greenan, K. M., Miller, E. L., "Screaming fast Galois Field
+	arithmetic using Intel SIMD instructions."  In: FAST-2013: 11th Usenix
+	Conference on File and Storage Technologies, San Jose, 2013
 */

 /*
    TODO:
-    + New 16-bit Muladd inner loops
-        + Benchmarks for large data!
+    + Benchmarks for large data!
    + Add multi-threading to split up long parallelizable calculations
        + Final benchmarks!
    + Release version 1
@ -76,7 +80,7 @@

 // Enable 8-bit or 16-bit fields
 #define LEO_HAS_FF8
-//#define LEO_HAS_FF16
+#define LEO_HAS_FF16

 // Tweak if the functions are exported or statically linked
 //#define LEO_DLL /* Defined when building/linking as DLL */
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@ -42,15 +42,15 @@ using namespace std;
 struct TestParameters
 {
 #ifdef LEO_HAS_FF16
-    unsigned original_count = 1000; // under 65536
-    unsigned recovery_count = 100; // under 65536 - original_count
+    unsigned original_count = 677; // under 65536
+    unsigned recovery_count = 487; // under 65536 - original_count
 #else
    unsigned original_count = 128; // under 65536
    unsigned recovery_count = 128; // under 65536 - original_count
 #endif
-    unsigned buffer_bytes = 64000; // multiple of 64 bytes
-    unsigned loss_count = 128; // some fraction of original_count
-    unsigned seed = 0;
+    unsigned buffer_bytes = 640; // multiple of 64 bytes
+    unsigned loss_count = 2; // some fraction of original_count
+    unsigned seed = 2;
    bool multithreaded = true;
 };

@ -535,10 +535,12 @@ static bool BasicTest(const TestParameters& params)
        t_mem_free.EndCall();
    }

+#if 0
    t_mem_alloc.Print(kTrials);
    t_leo_encode.Print(kTrials);
    t_leo_decode.Print(kTrials);
    t_mem_free.Print(kTrials);
+#endif

    float encode_input_MBPS = total_bytes * kTrials / (float)(t_leo_encode.TotalUsec);
    float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count * kTrials / (float)(t_leo_encode.TotalUsec);
@ -784,6 +786,7 @@ int main(int argc, char **argv)
 #endif

    TestParameters params;
+    PCGRandom prng;

    if (argc >= 2)
        params.original_count = atoi(argv[1]);
@ -804,6 +807,20 @@ int main(int argc, char **argv)
    if (!BasicTest(params))
        goto Failed;

+    prng.Seed(params.seed, 7);
+    for (;; ++params.seed)
+    {
+        params.original_count = prng.Next() % 32768;
+        params.recovery_count = prng.Next() % params.original_count + 1;
+        params.loss_count = prng.Next() % params.recovery_count + 1;
+
+        cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;
+
+        if (!BasicTest(params))
+            goto Failed;
+    }
+
+#ifdef LEO_BENCH_ALL_256_PARAMS
    for (unsigned original_count = 1; original_count <= 256; ++original_count)
    {
        for (unsigned recovery_count = 1; recovery_count <= original_count; ++recovery_count)
@ -818,6 +835,7 @@ int main(int argc, char **argv)
                goto Failed;
        }
    }
+#endif

 Failed:
    getchar();
--- a/tests/experiments.cpp
+++ b/tests/experiments.cpp
@ -33,7 +33,7 @@
 #include <stdlib.h>


-#define LEO_SHORT_FIELD
+//#define LEO_SHORT_FIELD
 //#define LEO_EXPERIMENT_EXTRA_XOR
 //#define LEO_EXPERIMENT_EXTRA_MULS
 #define LEO_EXPERIMENT_CANTOR_BASIS
@ -603,8 +603,8 @@ int main(int argc, char **argv)
        const unsigned input_count = 32;
        const unsigned recovery_count = 16;
 #else // LEO_SHORT_FIELD
-        const unsigned input_count = 10000;
-        const unsigned recovery_count = 2000;
+        const unsigned input_count = 32768;
+        const unsigned recovery_count = 32768;
 #endif // LEO_SHORT_FIELD

        test(input_count, recovery_count, seed);