Add benchmark tests for 4-way ops

This commit is contained in:
Christopher Taylor 2017-05-27 01:15:24 -07:00
parent f3003488da
commit 5b9cab04b6
6 changed files with 294 additions and 44 deletions

View File

@ -151,7 +151,7 @@ void xor_mem(
{
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(vx);
const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(vy);
do
while (bytes >= 128)
{
const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32));
const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1));
@ -161,8 +161,9 @@ void xor_mem(
_mm256_storeu_si256(x32 + 1, x1);
_mm256_storeu_si256(x32 + 2, x2);
_mm256_storeu_si256(x32 + 3, x3);
bytes -= 128, x32 += 4, y32 += 4;
} while (bytes >= 128);
x32 += 4, y32 += 4;
bytes -= 128;
};
if (bytes > 0)
{
const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32));
@ -185,7 +186,8 @@ void xor_mem(
_mm_storeu_si128(x16 + 1, x1);
_mm_storeu_si128(x16 + 2, x2);
_mm_storeu_si128(x16 + 3, x3);
bytes -= 64, x16 += 4, y16 += 4;
x16 += 4, y16 += 4;
bytes -= 64;
} while (bytes > 0);
}
@ -196,8 +198,6 @@ void xor_mem4(
void * LEO_RESTRICT vx_3, const void * LEO_RESTRICT vy_3,
uint64_t bytes)
{
// FIXME: Add args
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
@ -207,51 +207,66 @@ void xor_mem4(
const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1);
LEO_M256 * LEO_RESTRICT x32_2 = reinterpret_cast<LEO_M256 *> (vx_2);
const LEO_M256 * LEO_RESTRICT y32_2 = reinterpret_cast<const LEO_M256 *>(vy_2);
do
LEO_M256 * LEO_RESTRICT x32_3 = reinterpret_cast<LEO_M256 *> (vx_3);
const LEO_M256 * LEO_RESTRICT y32_3 = reinterpret_cast<const LEO_M256 *>(vy_3);
while (bytes >= 128)
{
const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2));
const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3));
_mm256_storeu_si256(x32_0, x0_0);
_mm256_storeu_si256(x32_0 + 1, x1_0);
_mm256_storeu_si256(x32_0 + 2, x2_0);
_mm256_storeu_si256(x32_0 + 3, x3_0);
x32_0 += 4, y32_0 += 4;
const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2));
const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3));
_mm256_storeu_si256(x32_1, x0_1);
_mm256_storeu_si256(x32_1 + 1, x1_1);
_mm256_storeu_si256(x32_1 + 2, x2_1);
_mm256_storeu_si256(x32_1 + 3, x3_1);
x32_1 += 4, y32_1 += 4;
const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2));
const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1));
const LEO_M256 x2_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 2), _mm256_loadu_si256(y32_2 + 2));
const LEO_M256 x3_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 3), _mm256_loadu_si256(y32_2 + 3));
_mm256_storeu_si256(x32_0, x0_0);
_mm256_storeu_si256(x32_0 + 1, x1_0);
_mm256_storeu_si256(x32_0 + 2, x2_0);
_mm256_storeu_si256(x32_0 + 3, x3_0);
_mm256_storeu_si256(x32_1, x0_1);
_mm256_storeu_si256(x32_1 + 1, x1_1);
_mm256_storeu_si256(x32_1 + 2, x2_1);
_mm256_storeu_si256(x32_1 + 3, x3_1);
_mm256_storeu_si256(x32_2, x0_2);
_mm256_storeu_si256(x32_2, x0_2);
_mm256_storeu_si256(x32_2 + 1, x1_2);
_mm256_storeu_si256(x32_2 + 2, x2_2);
_mm256_storeu_si256(x32_2 + 3, x3_2);
x32_0 += 4, y32_0 += 4;
x32_1 += 4, y32_1 += 4;
x32_2 += 4, y32_2 += 4;
const LEO_M256 x0_3 = _mm256_xor_si256(_mm256_loadu_si256(x32_3), _mm256_loadu_si256(y32_3));
const LEO_M256 x1_3 = _mm256_xor_si256(_mm256_loadu_si256(x32_3 + 1), _mm256_loadu_si256(y32_3 + 1));
const LEO_M256 x2_3 = _mm256_xor_si256(_mm256_loadu_si256(x32_3 + 2), _mm256_loadu_si256(y32_3 + 2));
const LEO_M256 x3_3 = _mm256_xor_si256(_mm256_loadu_si256(x32_3 + 3), _mm256_loadu_si256(y32_3 + 3));
_mm256_storeu_si256(x32_3, x0_3);
_mm256_storeu_si256(x32_3 + 1, x1_3);
_mm256_storeu_si256(x32_3 + 2, x2_3);
_mm256_storeu_si256(x32_3 + 3, x3_3);
x32_3 += 4, y32_3 += 4;
bytes -= 128;
} while (bytes >= 128);
}
if (bytes > 0)
{
const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
_mm256_storeu_si256(x32_0, x0_0);
_mm256_storeu_si256(x32_0 + 1, x1_0);
_mm256_storeu_si256(x32_1, x0_1);
_mm256_storeu_si256(x32_1 + 1, x1_1);
const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2));
const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1));
_mm256_storeu_si256(x32_0, x0_0);
_mm256_storeu_si256(x32_0 + 1, x1_0);
_mm256_storeu_si256(x32_1, x0_1);
_mm256_storeu_si256(x32_1 + 1, x1_1);
const LEO_M256 x0_3 = _mm256_xor_si256(_mm256_loadu_si256(x32_3), _mm256_loadu_si256(y32_3));
const LEO_M256 x1_3 = _mm256_xor_si256(_mm256_loadu_si256(x32_3 + 1), _mm256_loadu_si256(y32_3 + 1));
_mm256_storeu_si256(x32_2, x0_2);
_mm256_storeu_si256(x32_2 + 1, x1_2);
_mm256_storeu_si256(x32_3, x0_3);
_mm256_storeu_si256(x32_3 + 1, x1_3);
}
return;
}
@ -262,35 +277,46 @@ void xor_mem4(
const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1);
LEO_M128 * LEO_RESTRICT x16_2 = reinterpret_cast<LEO_M128 *> (vx_2);
const LEO_M128 * LEO_RESTRICT y16_2 = reinterpret_cast<const LEO_M128 *>(vy_2);
LEO_M128 * LEO_RESTRICT x16_3 = reinterpret_cast<LEO_M128 *> (vx_3);
const LEO_M128 * LEO_RESTRICT y16_3 = reinterpret_cast<const LEO_M128 *>(vy_3);
do
{
const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0));
const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1));
const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2));
const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3));
_mm_storeu_si128(x16_0, x0_0);
_mm_storeu_si128(x16_0 + 1, x1_0);
_mm_storeu_si128(x16_0 + 2, x2_0);
_mm_storeu_si128(x16_0 + 3, x3_0);
x16_0 += 4, y16_0 += 4;
const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1));
const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1));
const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2));
const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3));
_mm_storeu_si128(x16_1, x0_1);
_mm_storeu_si128(x16_1 + 1, x1_1);
_mm_storeu_si128(x16_1 + 2, x2_1);
_mm_storeu_si128(x16_1 + 3, x3_1);
x16_1 += 4, y16_1 += 4;
const LEO_M128 x0_2 = _mm_xor_si128(_mm_loadu_si128(x16_2), _mm_loadu_si128(y16_2));
const LEO_M128 x1_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 1), _mm_loadu_si128(y16_2 + 1));
const LEO_M128 x2_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 2), _mm_loadu_si128(y16_2 + 2));
const LEO_M128 x3_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 3), _mm_loadu_si128(y16_2 + 3));
_mm_storeu_si128(x16_0, x0_0);
_mm_storeu_si128(x16_0 + 1, x1_0);
_mm_storeu_si128(x16_0 + 2, x2_0);
_mm_storeu_si128(x16_0 + 3, x3_0);
_mm_storeu_si128(x16_1, x0_1);
_mm_storeu_si128(x16_1 + 1, x1_1);
_mm_storeu_si128(x16_1 + 2, x2_1);
_mm_storeu_si128(x16_1 + 3, x3_1);
_mm_storeu_si128(x16_2, x0_2);
_mm_storeu_si128(x16_2, x0_2);
_mm_storeu_si128(x16_2 + 1, x1_2);
_mm_storeu_si128(x16_2 + 2, x2_2);
_mm_storeu_si128(x16_2 + 3, x3_2);
x16_0 += 4, y16_0 += 4;
x16_1 += 4, y16_1 += 4;
x16_2 += 4, y16_2 += 4;
const LEO_M128 x0_3 = _mm_xor_si128(_mm_loadu_si128(x16_3), _mm_loadu_si128(y16_3));
const LEO_M128 x1_3 = _mm_xor_si128(_mm_loadu_si128(x16_3 + 1), _mm_loadu_si128(y16_3 + 1));
const LEO_M128 x2_3 = _mm_xor_si128(_mm_loadu_si128(x16_3 + 2), _mm_loadu_si128(y16_3 + 2));
const LEO_M128 x3_3 = _mm_xor_si128(_mm_loadu_si128(x16_3 + 3), _mm_loadu_si128(y16_3 + 3));
_mm_storeu_si128(x16_3, x0_3);
_mm_storeu_si128(x16_3 + 1, x1_3);
_mm_storeu_si128(x16_3 + 2, x2_3);
_mm_storeu_si128(x16_3 + 3, x3_3);
x16_3 += 4, y16_3 += 4;
bytes -= 64;
} while (bytes > 0);
}

View File

@ -55,6 +55,9 @@ static const ffe_t kBasis[kBits] = {
0xFDB8, 0xFB34, 0xFF38, 0x991E
};
// Using the Cantor basis here enables us to avoid a lot of extra calculations
// when applying the formal derivative in decoding.
//------------------------------------------------------------------------------
// Field Operations

View File

@ -50,9 +50,11 @@ static const unsigned kPolynomial = 0x11D;
// Basis used for generating logarithm tables
static const ffe_t kBasis[kBits] = {
1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis
// 1, 2, 4, 8, 16, 32, 64, 128 // Monomial basis
};
// Using the Cantor basis here enables us to avoid a lot of extra calculations
// when applying the formal derivative in decoding.
//------------------------------------------------------------------------------
// Field Operations

View File

@ -48,6 +48,8 @@ LEO_EXPORT int leo_init_(int version)
if (version != LEO_VERSION)
return Leopard_InvalidInput;
leopard::InitializeCPUArch();
#ifdef LEO_HAS_FF8
if (!leopard::ff8::Initialize())
return Leopard_Platform;

View File

@ -27,6 +27,8 @@
*/
#include "../LeopardCommon.h"
#include "../LeopardFF8.h"
#include "../LeopardFF16.h"
#include "../leopard.h"
#include <memory>
@ -238,7 +240,7 @@ public:
}
void Print(unsigned trials)
{
cout << FunctionName << " called " << Invokations / (float)trials << " times per trial (avg). " << TotalUsec / (double)Invokations << " usec avg for all invokations. " << TotalUsec / (float)trials << " usec (avg) of " << trials << " trials" << endl;
cout << FunctionName << " called " << Invokations / (float)trials << " times per trial. " << TotalUsec / (double)Invokations << " usec avg. " << TotalUsec / (float)trials << " usec for each of " << trials << " trials" << endl;
}
uint64_t t0 = 0;
@ -526,6 +528,197 @@ static void BasicTest(const TestParameters& params)
}
//------------------------------------------------------------------------------
// Parallel XOR Benchmark
// Demonstrate about 10% performance boost by doing parallel rows for XORs
// Compares single-row leopard::xor_mem against the 4-row leopard::xor_mem4
// over the same total amount of data, printing per-trial timings for each.
// NOTE(review): buffers are deliberately left uninitialized — XOR throughput
// does not depend on their contents.
void ParallelXORBenchmark()
{
    FunctionTimer t_1("xor_mem");
    FunctionTimer t_4("xor_mem4");

    static const unsigned buffer_bytes = 4096;
    static const unsigned buffer_count = 1024;

    uint8_t* buffers_x[buffer_count] = {};
    uint8_t* buffers_y[buffer_count] = {};

    for (unsigned slot = 0; slot < buffer_count; ++slot)
    {
        buffers_x[slot] = SIMDSafeAllocate(buffer_bytes);
        buffers_y[slot] = SIMDSafeAllocate(buffer_bytes);
    }

    static const unsigned iteration_count = 1000;

    // Single-row path: one xor_mem call per buffer pair.
    for (unsigned trial = 0; trial < iteration_count; ++trial)
    {
        t_1.BeginCall();
        for (unsigned slot = 0; slot < buffer_count; ++slot)
            leopard::xor_mem(
                buffers_x[slot], buffers_y[slot],
                buffer_bytes);
        t_1.EndCall();
    }

    // 4-row path: one xor_mem4 call per group of four buffer pairs.
    for (unsigned trial = 0; trial < iteration_count; ++trial)
    {
        t_4.BeginCall();
        for (unsigned slot = 0; slot < buffer_count; slot += 4)
            leopard::xor_mem4(
                buffers_x[slot], buffers_y[slot],
                buffers_x[slot + 1], buffers_y[slot + 1],
                buffers_x[slot + 2], buffers_y[slot + 2],
                buffers_x[slot + 3], buffers_y[slot + 3],
                buffer_bytes);
        t_4.EndCall();
    }

    for (unsigned slot = 0; slot < buffer_count; ++slot)
    {
        SIMDSafeFree(buffers_x[slot]);
        SIMDSafeFree(buffers_y[slot]);
    }

    t_1.Print(iteration_count);
    t_4.Print(iteration_count);
}
//------------------------------------------------------------------------------
// Parallel Butterfly8 Benchmark
#ifdef LEO_HAS_FF8
// Demonstrate performance boost by doing parallel rows for Butterfly8s
void ParallelButterfly8Benchmark()
{
FunctionTimer t_1("8-bit fft_butterfly");
FunctionTimer t_4("8-bit fft_butterfly4");
static const unsigned buffer_bytes = 4096;
static const unsigned buffer_count = 1024;
uint8_t* buffers_x[buffer_count] = {};
uint8_t* buffers_y[buffer_count] = {};
for (unsigned i = 0; i < buffer_count; ++i)
{
buffers_x[i] = SIMDSafeAllocate(buffer_bytes);
buffers_y[i] = SIMDSafeAllocate(buffer_bytes);
}
static const unsigned iteration_count = 1000;
for (unsigned i = 0; i < iteration_count; ++i)
{
leopard::ff8::ffe_t m = (leopard::ff8::ffe_t)(i + 2);
t_1.BeginCall();
for (unsigned j = 0; j < buffer_count; ++j)
leopard::ff8::fft_butterfly(
buffers_x[j], buffers_y[j],
m,
buffer_bytes);
t_1.EndCall();
}
for (unsigned i = 0; i < iteration_count; ++i)
{
leopard::ff8::ffe_t m = (leopard::ff8::ffe_t)(i + 2);
t_4.BeginCall();
for (unsigned j = 0; j < buffer_count; j += 4)
leopard::ff8::fft_butterfly4(
buffers_x[j], buffers_y[j],
buffers_x[j + 1], buffers_y[j + 1],
buffers_x[j + 2], buffers_y[j + 2],
buffers_x[j + 3], buffers_y[j + 3],
m,
buffer_bytes);
t_4.EndCall();
}
for (unsigned i = 0; i < buffer_count; ++i)
{
SIMDSafeFree(buffers_x[i]);
SIMDSafeFree(buffers_y[i]);
}
t_1.Print(iteration_count);
t_4.Print(iteration_count);
}
#endif // LEO_HAS_FF8
//------------------------------------------------------------------------------
// Parallel Butterfly16 Benchmark
#ifdef LEO_HAS_FF16
// Demonstrate performance boost by doing parallel rows for Butterfly16s
void ParallelButterfly16Benchmark()
{
FunctionTimer t_1("16-bit fft_butterfly");
FunctionTimer t_4("16-bit fft_butterfly4");
static const unsigned buffer_bytes = 4096;
static const unsigned buffer_count = 1024;
uint8_t* buffers_x[buffer_count] = {};
uint8_t* buffers_y[buffer_count] = {};
for (unsigned i = 0; i < buffer_count; ++i)
{
buffers_x[i] = SIMDSafeAllocate(buffer_bytes);
buffers_y[i] = SIMDSafeAllocate(buffer_bytes);
}
static const unsigned iteration_count = 100;
for (unsigned i = 0; i < iteration_count; ++i)
{
leopard::ff16::ffe_t m = (leopard::ff16::ffe_t)(i + 2);
t_1.BeginCall();
for (unsigned j = 0; j < buffer_count; ++j)
leopard::ff16::fft_butterfly(
buffers_x[j], buffers_y[j],
m,
buffer_bytes);
t_1.EndCall();
}
for (unsigned i = 0; i < iteration_count; ++i)
{
leopard::ff16::ffe_t m = (leopard::ff16::ffe_t)(i + 2);
t_4.BeginCall();
for (unsigned j = 0; j < buffer_count; j += 4)
leopard::ff16::fft_butterfly4(
buffers_x[j], buffers_y[j],
buffers_x[j + 1], buffers_y[j + 1],
buffers_x[j + 2], buffers_y[j + 2],
buffers_x[j + 3], buffers_y[j + 3],
m,
buffer_bytes);
t_4.EndCall();
}
for (unsigned i = 0; i < buffer_count; ++i)
{
SIMDSafeFree(buffers_x[i]);
SIMDSafeFree(buffers_y[i]);
}
t_1.Print(iteration_count);
t_4.Print(iteration_count);
}
#endif // LEO_HAS_FF16
//------------------------------------------------------------------------------
// Entrypoint
@ -544,6 +737,14 @@ int main(int argc, char **argv)
t_leo_init.EndCall();
t_leo_init.Print(1);
ParallelXORBenchmark();
#ifdef LEO_HAS_FF8
ParallelButterfly8Benchmark();
#endif // LEO_HAS_FF8
#ifdef LEO_HAS_FF16
ParallelButterfly16Benchmark();
#endif // LEO_HAS_FF16
TestParameters params;
if (argc >= 2)

View File

@ -33,6 +33,11 @@
#include <stdlib.h>
//#define LEO_SHORT_FIELD
//#define LEO_EXPERIMENT_EXTRA_XOR
//#define LEO_EXPERIMENT_EXTRA_MULS
#define LEO_EXPERIMENT_CANTOR_BASIS
//------------------------------------------------------------------------------
// Debug
@ -70,24 +75,33 @@
//------------------------------------------------------------------------------
// Field
//#define LEO_SHORT_FIELD
#ifdef LEO_SHORT_FIELD
typedef uint8_t ffe_t;
static const unsigned kGFBits = 8;
static const unsigned kGFPolynomial = 0x11D;
ffe_t kGFBasis[kGFBits] = {
#ifdef LEO_EXPERIMENT_CANTOR_BASIS
1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis
#else
1, 2, 4, 8, 16, 32, 64, 128 // Monomial basis
#endif
};
#else
typedef uint16_t ffe_t;
static const unsigned kGFBits = 16;
static const unsigned kGFPolynomial = 0x1002D;
ffe_t kGFBasis[kGFBits] = {
#ifdef LEO_EXPERIMENT_CANTOR_BASIS
0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis
0xC582, 0xED2E, 0x914C, 0x4012,
0x6C98, 0x10D8, 0x6A72, 0xB900,
0xFDB8, 0xFB34, 0xFF38, 0x991E
#else
1, 2, 4, 8, // Monomial basis
16, 32, 64, 128,
256, 512, 1024, 2048,
4096, 8192, 16384, 32768
#endif
};
#endif
@ -223,7 +237,7 @@ static void formal_derivative(ffe_t* cos, const unsigned size)
}
// Doesn't seem to be needed
#if 0
#ifdef LEO_EXPERIMENT_EXTRA_XOR
/*
Same here - Zeroes on the right are preserved
*/
@ -305,7 +319,9 @@ static void FLT(ffe_t* data, const unsigned size, const unsigned skewIndex, cons
//------------------------------------------------------------------------------
// FFT Initialization
//static ffe_t B[kFieldSize >> 1]; // factors used in formal derivative
#ifdef LEO_EXPERIMENT_EXTRA_MULS
static ffe_t B[kFieldSize >> 1]; // factors used in formal derivative
#endif
static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial
// Initialize skewVec[], B[], log_walsh[]
@ -339,7 +355,7 @@ static void InitFieldOperations()
for (unsigned i = 0; i < kFieldSize; ++i)
skewVec[i] = GFLog[skewVec[i]];
#if 0
#ifdef LEO_EXPERIMENT_EXTRA_MULS
temp[0] = kModulus - temp[0];
for (unsigned i = 1; i < (kGFBits - 1); ++i)
@ -444,7 +460,7 @@ static void decode(ffe_t* codeword, const unsigned m, const unsigned original_co
IFLT(codeword, n, 0);
// Note: This is not needed to recover successfully...
#if 0
#ifdef LEO_EXPERIMENT_EXTRA_MULS
// formal derivative
// Note: Preserves zeroes on the right
for (unsigned i = 0; i < m + original_count; i += 2)
@ -456,7 +472,7 @@ static void decode(ffe_t* codeword, const unsigned m, const unsigned original_co
formal_derivative(codeword, n);
#if 0
#ifdef LEO_EXPERIMENT_EXTRA_MULS
// Note: Preserves zeroes on the right
for (unsigned i = 0; i < m + original_count; i += 2)
{
@ -598,7 +614,7 @@ int main(int argc, char **argv)
{
#ifdef LEO_SHORT_FIELD
const unsigned input_count = 100;
const unsigned recovery_count = 20;
const unsigned recovery_count = 10;
#else // LEO_SHORT_FIELD
const unsigned input_count = 10000;
const unsigned recovery_count = 2000;