WIP

2025-02-19 17:34:19 +00:00 · 2017-05-26 19:51:30 -07:00 · 2017-05-26 19:51:30 -07:00 · 5cba1989ec
commit 5cba1989ec
parent 49dbcdc8b1
21 changed files with 2458 additions and 8201 deletions
--- a/LeopardCommon.cpp
+++ b/LeopardCommon.cpp
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@ -30,42 +30,20 @@

 /*
    TODO:
-    + Refactor software
-        + I think it should be split up into several C++ modules
-    + Replace GFSymbol with a file data pointer
-    + New 16-bit Muladd inner loops
-        + Class to contain the (large) muladd tables
-    + Preliminary benchmarks for large data!
    + New 8-bit Muladd inner loops
-    + Benchmarks for smaller data!
-    + Write detailed comments for all the routines
-    + Look into getting EncodeL working so we can support smaller data (Ask Lin)
-    + Look into using k instead of k2 to speed up decoder (Ask Lin)
-    + Avoid performing FFT/IFFT intermediate calculations we're not going to use
-    + Benchmarks, fun!
+        + Benchmarks for smaller data!
+    + New 16-bit Muladd inner loops
+        + Benchmarks for large data!
+    + Use parallel row ops
    + Add multi-threading to split up long parallelizable calculations
-    + Final benchmarks!
-    + Finish up documentation
+        + Write detailed comments for all the routines
+        + Final benchmarks!
    + Release version 1
+        + Finish up documentation

-
-    Muladd implementation notes:
-
-    Specialize for 1-3 rows at a time since often times we're multiplying by
-    the same (skew) value repeatedly, as the ISA-L library does here:
-
-    https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258
-
-    Except we should be doing it for 16-bit Galois Field.
-    To implement that use the ALTMAP trick from Jerasure:
-
-    http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140
-
-    Except we should also support AVX2 since that is a 40% perf boost, so put
-    the high and low bytes 32 bytes instead of 16 bytes apart.
-
-    Also I think we should go ahead and precompute the multiply tables since
-    it avoids a bunch of memory lookups for each muladd, and only costs 8 MB.
+    TBD:
+    + Look into getting EncodeL working so we can support smaller data (Ask Lin)
+    + Look into using FFT_m instead of FFT_n for decoder
 */

 #include <stdint.h>
@ -191,4 +169,57 @@ extern bool CpuHasSSSE3;
 #endif // LEO_TARGET_MOBILE


+//------------------------------------------------------------------------------
+// Portable Intrinsics
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+// Returns the index (0..31) of the most significant set bit of x.
+// Precondition: x != 0 (neither code path defines a result for x == 0)
+LEO_FORCE_INLINE unsigned LastNonzeroBit32(unsigned x)
+{
+#ifdef _MSC_VER
+    unsigned long index;
+    // Note: Ignoring result because x != 0
+    _BitScanReverse(&index, (uint32_t)x);
+    return (unsigned)index;
+#else
+    // Use the `unsigned int` builtin here.  __builtin_clzl() operates on
+    // `unsigned long`, which is 64 bits wide on LP64 targets, so `31 - clzl(x)`
+    // would wrap around for every 32-bit input.
+    // Note: clz(0) is undefined, which is acceptable because x != 0
+    return 31 - (unsigned)__builtin_clz(x);
+#endif
+}
+
+// Returns next power of two at or above given value
+// Returns 1 for n <= 1.
+// The result does not fit in `unsigned` when n exceeds the largest
+// representable power of two; callers must keep n at or below that value.
+LEO_FORCE_INLINE unsigned NextPow2(unsigned n)
+{
+    // n - 1 would be 0 for n == 1 (and would wrap for n == 0), which violates
+    // the x != 0 precondition of LastNonzeroBit32(), so handle these up front.
+    if (n <= 1)
+        return 1;
+    return (unsigned)(2UL << LastNonzeroBit32(n - 1));
+}
+
+
+//------------------------------------------------------------------------------
+// XOR Memory
+//
+// This works for both 8-bit and 16-bit finite fields
+
+// x[] ^= y[]
+void xor_mem(
+    void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
+    unsigned bytes);
+
+// For i = {0, 1}: x_i[] ^= y_i[]
+void xor_mem2(
+    void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
+    void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
+    unsigned bytes);
+
+// For i = {0, 1, 2}: x_i[] ^= y_i[]
+void xor_mem3(
+    void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
+    void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
+    void * LEO_RESTRICT x_2, const void * LEO_RESTRICT y_2,
+    unsigned bytes);
+
+
 } // namespace leopard
--- a/LeopardDecoder.cpp
+++ b/LeopardDecoder.cpp
--- a/LeopardDecoder.h
+++ b/LeopardDecoder.h
--- a/LeopardEncoder.cpp
+++ b/LeopardEncoder.cpp
--- a/LeopardEncoder.h
+++ b/LeopardEncoder.h
--- a/LeopardFF16.cpp
+++ b/LeopardFF16.cpp
--- a/LeopardFF16.h
+++ b/LeopardFF16.h
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@ -9,7 +9,7 @@
    * Redistributions in binary form must reproduce the above copyright notice,
      this list of conditions and the following disclaimer in the documentation
      and/or other materials provided with the distribution.
-    * Neither the name of LHC-RS nor the names of its contributors may be
+    * Neither the name of Leopard-RS nor the names of its contributors may be
      used to endorse or promote products derived from this software without
      specific prior written permission.

@ -27,6 +27,10 @@
 */

 #include "LeopardFF8.h"
+#include <string.h>
+
+// Define this to enable the optimized version of FWHT()
+#define LEO_FF8_FWHT_OPTIMIZED

 namespace leopard { namespace ff8 {

@ -34,6 +38,9 @@ namespace leopard { namespace ff8 {
 //------------------------------------------------------------------------------
 // Datatypes and Constants

+// Modulus for field operations
+static const ffe_t kModulus = 255;
+
 // LFSR Polynomial that generates the field elements
 static const unsigned kPolynomial = 0x11D;

@ -47,9 +54,6 @@ static const ffe_t kBasis[kBits] = {
 //------------------------------------------------------------------------------
 // Field Operations

-// Modulus for field operations
-static const ffe_t kModulus = 255;
-
 // z = x + y (mod kModulus)
 static inline ffe_t AddMod(const ffe_t a, const ffe_t b)
 {
@ -69,50 +73,6 @@ static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
 }


-//------------------------------------------------------------------------------
-// Logarithm Tables
-
-static ffe_t LogLUT[kOrder];
-static ffe_t ExpLUT[kOrder];
-
-
-// Initialize LogLUT[], ExpLUT[]
-static void InitializeLogarithmTables()
-{
-    // LFSR table generation:
-
-    unsigned state = 1;
-    for (unsigned i = 0; i < kModulus; ++i)
-    {
-        ExpLUT[state] = static_cast<ffe_t>(i);
-        state <<= 1;
-        if (state >= kOrder)
-            state ^= kPolynomial;
-    }
-    ExpLUT[0] = kModulus;
-
-    // Conversion to chosen basis:
-
-    LogLUT[0] = 0;
-    for (unsigned i = 0; i < kBits; ++i)
-    {
-        const ffe_t basis = kBasis[i];
-        const unsigned width = static_cast<unsigned>(1UL << i);
-
-        for (unsigned j = 0; j < width; ++j)
-            LogLUT[j + width] = LogLUT[j] ^ basis;
-    }
-
-    for (unsigned i = 0; i < kOrder; ++i)
-        LogLUT[i] = ExpLUT[LogLUT[i]];
-
-    for (unsigned i = 0; i < kOrder; ++i)
-        ExpLUT[LogLUT[i]] = i;
-
-    ExpLUT[kModulus] = ExpLUT[0];
-}
-
-
 //------------------------------------------------------------------------------
 // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)

@ -248,234 +208,47 @@ void FWHT(ffe_t data[kOrder])


 //------------------------------------------------------------------------------
-// XOR Memory
+// Logarithm Tables

-void xor_mem(
-    void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy,
-    unsigned bytes)
+static ffe_t LogLUT[kOrder];
+static ffe_t ExpLUT[kOrder];
+
+
+// Initialize LogLUT[], ExpLUT[]
+static void InitializeLogarithmTables()
 {
-#if defined(LEO_TRY_AVX2)
-    if (CpuHasAVX2)
-    {
-        LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(vx);
-        const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(vy);
-        do
-        {
-            const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32));
-            const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1));
-            const LEO_M256 x2 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 2), _mm256_loadu_si256(y32 + 2));
-            const LEO_M256 x3 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 3), _mm256_loadu_si256(y32 + 3));
-            _mm256_storeu_si256(x32, x0);
-            _mm256_storeu_si256(x32 + 1, x1);
-            _mm256_storeu_si256(x32 + 2, x2);
-            _mm256_storeu_si256(x32 + 3, x3);
-            bytes -= 128, x32 += 4, y32 += 4;
-        } while (bytes >= 128);
-        if (bytes > 0)
-        {
-            const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32));
-            const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1));
-            _mm256_storeu_si256(x32, x0);
-            _mm256_storeu_si256(x32 + 1, x1);
-        }
-        return;
-    }
-#endif // LEO_TRY_AVX2
-    LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
-    const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
-    do
-    {
-        const LEO_M128 x0 = _mm_xor_si128(_mm_loadu_si128(x16), _mm_loadu_si128(y16));
-        const LEO_M128 x1 = _mm_xor_si128(_mm_loadu_si128(x16 + 1), _mm_loadu_si128(y16 + 1));
-        const LEO_M128 x2 = _mm_xor_si128(_mm_loadu_si128(x16 + 2), _mm_loadu_si128(y16 + 2));
-        const LEO_M128 x3 = _mm_xor_si128(_mm_loadu_si128(x16 + 3), _mm_loadu_si128(y16 + 3));
-        _mm_storeu_si128(x16, x0);
-        _mm_storeu_si128(x16 + 1, x1);
-        _mm_storeu_si128(x16 + 2, x2);
-        _mm_storeu_si128(x16 + 3, x3);
-        bytes -= 64, x16 += 4, y16 += 4;
-    } while (bytes > 0);
-}
+    // LFSR table generation:

-void xor_mem2(
-    void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
-    void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
-    unsigned bytes)
-{
-#if defined(LEO_TRY_AVX2)
-    if (CpuHasAVX2)
+    unsigned state = 1;
+    for (unsigned i = 0; i < kModulus; ++i)
    {
-        LEO_M256 * LEO_RESTRICT       x32_0 = reinterpret_cast<LEO_M256 *>      (vx_0);
-        const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<const LEO_M256 *>(vy_0);
-        LEO_M256 * LEO_RESTRICT       x32_1 = reinterpret_cast<LEO_M256 *>      (vx_1);
-        const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1);
-        do
-        {
-            const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0),     _mm256_loadu_si256(y32_0));
-            const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
-            const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2));
-            const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3));
-            const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1),     _mm256_loadu_si256(y32_1));
-            const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
-            const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2));
-            const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3));
-            _mm256_storeu_si256(x32_0,     x0_0);
-            _mm256_storeu_si256(x32_0 + 1, x1_0);
-            _mm256_storeu_si256(x32_0 + 2, x2_0);
-            _mm256_storeu_si256(x32_0 + 3, x3_0);
-            _mm256_storeu_si256(x32_1,     x0_1);
-            _mm256_storeu_si256(x32_1 + 1, x1_1);
-            _mm256_storeu_si256(x32_1 + 2, x2_1);
-            _mm256_storeu_si256(x32_1 + 3, x3_1);
-            x32_0 += 4, y32_0 += 4;
-            x32_1 += 4, y32_1 += 4;
-            bytes -= 128;
-        } while (bytes >= 128);
-        if (bytes > 0)
-        {
-            const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0),     _mm256_loadu_si256(y32_0));
-            const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
-            const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1),     _mm256_loadu_si256(y32_1));
-            const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
-            _mm256_storeu_si256(x32_0,     x0_0);
-            _mm256_storeu_si256(x32_0 + 1, x1_0);
-            _mm256_storeu_si256(x32_1,     x0_1);
-            _mm256_storeu_si256(x32_1 + 1, x1_1);
-        }
-        return;
+        ExpLUT[state] = static_cast<ffe_t>(i);
+        state <<= 1;
+        if (state >= kOrder)
+            state ^= kPolynomial;
    }
-#endif // LEO_TRY_AVX2
-    LEO_M128 * LEO_RESTRICT       x16_0 = reinterpret_cast<LEO_M128 *>      (vx_0);
-    const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<const LEO_M128 *>(vy_0);
-    LEO_M128 * LEO_RESTRICT       x16_1 = reinterpret_cast<LEO_M128 *>      (vx_1);
-    const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1);
-    do
-    {
-        const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0),     _mm_loadu_si128(y16_0));
-        const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1));
-        const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2));
-        const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3));
-        const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1),     _mm_loadu_si128(y16_1));
-        const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1));
-        const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2));
-        const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3));
-        _mm_storeu_si128(x16_0,     x0_0);
-        _mm_storeu_si128(x16_0 + 1, x1_0);
-        _mm_storeu_si128(x16_0 + 2, x2_0);
-        _mm_storeu_si128(x16_0 + 3, x3_0);
-        _mm_storeu_si128(x16_1,     x0_1);
-        _mm_storeu_si128(x16_1 + 1, x1_1);
-        _mm_storeu_si128(x16_1 + 2, x2_1);
-        _mm_storeu_si128(x16_1 + 3, x3_1);
-        x16_0 += 4, y16_0 += 4;
-        x16_1 += 4, y16_1 += 4;
-        bytes -= 64;
-    } while (bytes > 0);
-}
+    ExpLUT[0] = kModulus;

-void xor_mem3(
-    void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
-    void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
-    void * LEO_RESTRICT vx_2, const void * LEO_RESTRICT vy_2,
-    unsigned bytes)
-{
-#if defined(LEO_TRY_AVX2)
-    if (CpuHasAVX2)
+    // Conversion to chosen basis:
+
+    LogLUT[0] = 0;
+    for (unsigned i = 0; i < kBits; ++i)
    {
-        LEO_M256 * LEO_RESTRICT       x32_0 = reinterpret_cast<LEO_M256 *>      (vx_0);
-        const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<const LEO_M256 *>(vy_0);
-        LEO_M256 * LEO_RESTRICT       x32_1 = reinterpret_cast<LEO_M256 *>      (vx_1);
-        const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1);
-        LEO_M256 * LEO_RESTRICT       x32_2 = reinterpret_cast<LEO_M256 *>      (vx_2);
-        const LEO_M256 * LEO_RESTRICT y32_2 = reinterpret_cast<const LEO_M256 *>(vy_2);
-        do
-        {
-            const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0),     _mm256_loadu_si256(y32_0));
-            const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
-            const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2));
-            const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3));
-            const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1),     _mm256_loadu_si256(y32_1));
-            const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
-            const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2));
-            const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3));
-            const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2),     _mm256_loadu_si256(y32_2));
-            const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1));
-            const LEO_M256 x2_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 2), _mm256_loadu_si256(y32_2 + 2));
-            const LEO_M256 x3_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 3), _mm256_loadu_si256(y32_2 + 3));
-            _mm256_storeu_si256(x32_0,     x0_0);
-            _mm256_storeu_si256(x32_0 + 1, x1_0);
-            _mm256_storeu_si256(x32_0 + 2, x2_0);
-            _mm256_storeu_si256(x32_0 + 3, x3_0);
-            _mm256_storeu_si256(x32_1,     x0_1);
-            _mm256_storeu_si256(x32_1 + 1, x1_1);
-            _mm256_storeu_si256(x32_1 + 2, x2_1);
-            _mm256_storeu_si256(x32_1 + 3, x3_1);
-            _mm256_storeu_si256(x32_2,     x0_2);
-            _mm256_storeu_si256(x32_2 + 1, x1_2);
-            _mm256_storeu_si256(x32_2 + 2, x2_2);
-            _mm256_storeu_si256(x32_2 + 3, x3_2);
-            x32_0 += 4, y32_0 += 4;
-            x32_1 += 4, y32_1 += 4;
-            x32_2 += 4, y32_2 += 4;
-            bytes -= 128;
-        } while (bytes >= 128);
-        if (bytes > 0)
-        {
-            const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0),     _mm256_loadu_si256(y32_0));
-            const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
-            const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1),     _mm256_loadu_si256(y32_1));
-            const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
-            const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2),     _mm256_loadu_si256(y32_2));
-            const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1));
-            _mm256_storeu_si256(x32_0,     x0_0);
-            _mm256_storeu_si256(x32_0 + 1, x1_0);
-            _mm256_storeu_si256(x32_1,     x0_1);
-            _mm256_storeu_si256(x32_1 + 1, x1_1);
-            _mm256_storeu_si256(x32_2,     x0_2);
-            _mm256_storeu_si256(x32_2 + 1, x1_2);
-        }
-        return;
+        const ffe_t basis = kBasis[i];
+        const unsigned width = static_cast<unsigned>(1UL << i);
+
+        for (unsigned j = 0; j < width; ++j)
+            LogLUT[j + width] = LogLUT[j] ^ basis;
    }
-#endif // LEO_TRY_AVX2
-    LEO_M128 * LEO_RESTRICT       x16_0 = reinterpret_cast<LEO_M128 *>      (vx_0);
-    const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<const LEO_M128 *>(vy_0);
-    LEO_M128 * LEO_RESTRICT       x16_1 = reinterpret_cast<LEO_M128 *>      (vx_1);
-    const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1);
-    LEO_M128 * LEO_RESTRICT       x16_2 = reinterpret_cast<LEO_M128 *>      (vx_2);
-    const LEO_M128 * LEO_RESTRICT y16_2 = reinterpret_cast<const LEO_M128 *>(vy_2);
-    do
-    {
-        const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0),     _mm_loadu_si128(y16_0));
-        const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1));
-        const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2));
-        const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3));
-        const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1),     _mm_loadu_si128(y16_1));
-        const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1));
-        const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2));
-        const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3));
-        const LEO_M128 x0_2 = _mm_xor_si128(_mm_loadu_si128(x16_2),     _mm_loadu_si128(y16_2));
-        const LEO_M128 x1_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 1), _mm_loadu_si128(y16_2 + 1));
-        const LEO_M128 x2_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 2), _mm_loadu_si128(y16_2 + 2));
-        const LEO_M128 x3_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 3), _mm_loadu_si128(y16_2 + 3));
-        _mm_storeu_si128(x16_0,     x0_0);
-        _mm_storeu_si128(x16_0 + 1, x1_0);
-        _mm_storeu_si128(x16_0 + 2, x2_0);
-        _mm_storeu_si128(x16_0 + 3, x3_0);
-        _mm_storeu_si128(x16_1,     x0_1);
-        _mm_storeu_si128(x16_1 + 1, x1_1);
-        _mm_storeu_si128(x16_1 + 2, x2_1);
-        _mm_storeu_si128(x16_1 + 3, x3_1);
-        _mm_storeu_si128(x16_2,     x0_2);
-        _mm_storeu_si128(x16_2 + 1, x1_2);
-        _mm_storeu_si128(x16_2 + 2, x2_2);
-        _mm_storeu_si128(x16_2 + 3, x3_2);
-        x16_0 += 4, y16_0 += 4;
-        x16_1 += 4, y16_1 += 4;
-        x16_2 += 4, y16_2 += 4;
-        bytes -= 64;
-    } while (bytes > 0);
-}

+    for (unsigned i = 0; i < kOrder; ++i)
+        LogLUT[i] = ExpLUT[LogLUT[i]];
+
+    for (unsigned i = 0; i < kOrder; ++i)
+        ExpLUT[LogLUT[i]] = i;
+
+    ExpLUT[kModulus] = ExpLUT[0];
+}

 //------------------------------------------------------------------------------
 // Multiplies
@ -485,12 +258,12 @@ void xor_mem3(
 struct {
    LEO_ALIGNED LEO_M128 Lo[256];
    LEO_ALIGNED LEO_M128 Hi[256];
-} Multiply128LUT;
+} static Multiply128LUT;
 #if defined(LEO_TRY_AVX2)
 struct {
    LEO_ALIGNED LEO_M256 Lo[256];
    LEO_ALIGNED LEO_M256 Hi[256];
-} Multiply256LUT;
+} static Multiply256LUT;
 #endif // LEO_TRY_AVX2

 // Returns a * b
@ -501,14 +274,19 @@ static ffe_t FFEMultiply(ffe_t a, ffe_t b)
    return ExpLUT[AddMod(LogLUT[a], LogLUT[b])];
 }

+// Returns a * b, where the right operand is supplied as log_b = Log(b).
+// This skips the LogLUT lookup for b that FFEMultiply() performs.
+// Only a == 0 is special-cased; log_b is used as given.
+static ffe_t FFEMultiplyLog(ffe_t a, ffe_t log_b)
+{
+    if (a == 0)
+        return 0;
+    // Add the logarithms (mod kModulus) and map back through the Exp table
+    return ExpLUT[AddMod(LogLUT[a], log_b)];
+}
+
 bool InitializeMultiplyTables()
 {
-    // Reuse aligned self test buffers to load table data
-    uint8_t* lo = m_SelfTestBuffers.A;
-    uint8_t* hi = m_SelfTestBuffers.B;
-
    for (int y = 0; y < 256; ++y)
    {
+        uint8_t lo[16], hi[16];
        for (unsigned char x = 0; x < 16; ++x)
        {
            lo[x] = FFEMultiply(x,      static_cast<uint8_t>(y));
@ -517,15 +295,17 @@ bool InitializeMultiplyTables()

        const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo);
        const LEO_M128 table_hi = _mm_loadu_si128((LEO_M128*)hi);
+
        _mm_storeu_si128(Multiply128LUT.Lo + y, table_lo);
        _mm_storeu_si128(Multiply128LUT.Hi + y, table_hi);
+
 #if defined(LEO_TRY_AVX2)
        if (CpuHasAVX2)
        {
-            const LEO_M256 table_lo2 = _mm256_broadcastsi128_si256(table_lo);
-            const LEO_M256 table_hi2 = _mm256_broadcastsi128_si256(table_hi);
-            _mm256_storeu_si256(Multiply256LUT.Lo + y, table_lo2);
-            _mm256_storeu_si256(Multiply256LUT.Hi + y, table_hi2);
+            _mm256_storeu_si256(Multiply256LUT.Lo + y,
+                _mm256_broadcastsi128_si256(table_lo));
+            _mm256_storeu_si256(Multiply256LUT.Hi + y,
+                _mm256_broadcastsi128_si256(table_hi));
        }
 #endif // LEO_TRY_AVX2
    }
@ -536,7 +316,7 @@ bool InitializeMultiplyTables()
 // vx[] = vy[] * m
 void mul_mem_set(
    void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy,
-    ffe_t m, unsigned bytes)
+    ffe_t m, uint64_t bytes)
 {
    if (m <= 1)
    {
@ -633,7 +413,7 @@ void mul_mem_set(
 void mul_mem2_inplace(
    void * LEO_RESTRICT vx_0,
    void * LEO_RESTRICT vx_1,
-    ffe_t m, unsigned bytes)
+    ffe_t m, uint64_t bytes)
 {
    if (m <= 1)
    {
@ -759,28 +539,28 @@ void mul_mem2_inplace(
 // FFT Operations

 // x[] ^= y[] * m, y[] ^= x[]
-void mul_fft(
+void fft_butterfly(
    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
-    ffe_t m, unsigned bytes)
+    ffe_t m, uint64_t bytes)
 {

 }

 // For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
-void mul_fft2(
+void fft_butterfly2(
    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    ffe_t m, unsigned bytes)
+    ffe_t m, uint64_t bytes)
 {

 }

 // For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
-void mul_fft3(
+void fft_butterfly3(
    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
    void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
-    ffe_t m, unsigned bytes)
+    ffe_t m, uint64_t bytes)
 {

 }
@ -790,33 +570,348 @@ void mul_fft3(
 // IFFT Operations

 // y[] ^= x[], x[] ^= y[] * m
-void mul_ifft(
+void ifft_butterfly(
    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
-    ffe_t m, unsigned bytes)
+    ffe_t m, uint64_t bytes)
 {

 }

 // For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
-void mul_ifft2(
+void ifft_butterfly2(
    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    ffe_t m, unsigned bytes)
+    ffe_t m, uint64_t bytes)
 {

 }

 // For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
-void mul_ifft3(
+void ifft_butterfly3(
    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
    void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
-    ffe_t m, unsigned bytes)
+    ffe_t m, uint64_t bytes)
 {

 }


+//------------------------------------------------------------------------------
+// FFT
+
+static ffe_t FFTSkew[kFieldModulus]; // twisted factors used in FFT
+static ffe_t LogWalsh[kOrder]; // factors used in the evaluation of the error locator polynomial
+
+// Fills FFTSkew[] and LogWalsh[].
+// Reads LogLUT[] and calls FFEMultiply()/FFEMultiplyLog(), so the logarithm
+// tables must already be initialized when this runs.
+void FFTInitialize()
+{
+    ffe_t temp[kBits - 1];
+
+    // temp[i - 1] starts out as the single-bit value (1 << i)
+    for (unsigned i = 1; i < kBits; ++i)
+        temp[i - 1] = (ffe_t)((unsigned)1 << i);
+
+    for (unsigned m = 0; m < (kBits - 1); ++m)
+    {
+        const unsigned step = (unsigned)1 << (m + 1);
+
+        FFTSkew[((unsigned)1 << m) - 1] = 0;
+
+        // Extend the entries for this m by XORing in temp[i] at stride `step`
+        for (unsigned i = m; i < (kBits - 1); ++i)
+        {
+            const unsigned s = ((unsigned)1 << (i + 1));
+
+            for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step)
+                FFTSkew[j + s] = FFTSkew[j] ^ temp[i];
+        }
+
+        // TBD: This can be cleaned up
+        temp[m] = kFieldModulus - LogLUT[FFEMultiply(temp[m], temp[m] ^ 1)];
+
+        for (unsigned i = m + 1; i < (kBits - 1); ++i)
+            temp[i] = FFEMultiplyLog(temp[i], (LogLUT[temp[i] ^ 1] + temp[m]) % kFieldModulus);
+    }
+
+    // Convert the skew factors to logarithms.
+    // FFTSkew[] is declared with kFieldModulus entries, so the loop is bounded
+    // by kFieldModulus; iterating to kOrder could write past the end of the array.
+    for (unsigned i = 0; i < kFieldModulus; ++i)
+        FFTSkew[i] = LogLUT[FFTSkew[i]];
+
+    // Precalculate FWHT(Log[i]):
+
+    for (unsigned i = 0; i < kOrder; ++i)
+        LogWalsh[i] = LogLUT[i];
+    LogWalsh[0] = 0;
+    FWHT(LogWalsh, kBits);
+}
+
+
+//------------------------------------------------------------------------------
+// Encode
+
+// Processes the original data in chunks of m buffers: each chunk is run through
+// an IFFT, the chunk results are XORed together into work[0..m-1], and a final
+// FFT is applied to work[0..m-1].
+//
+//  buffer_bytes   : size in bytes of every buffer in data[] and work[]
+//  original_count : number of entries in data[]
+//  recovery_count : not referenced by this function body
+//  m              : number of buffers per transform
+//                   (presumably a power of two, as in Decode -- TODO confirm)
+//  data           : input buffers; only read by this function
+//  work           : scratch/output buffers; work[0..2*m-1] are written, since
+//                   work[m..2*m-1] serve as the per-chunk temporary
+//
+// NOTE(review): the first copy reads data[0..m-1], so this appears to assume
+// original_count >= m -- confirm against the caller.
+// NOTE(review): xor_mem() is declared with `unsigned bytes`, so buffer_bytes is
+// narrowed when passed to it.
+void Encode(
+    uint64_t buffer_bytes,
+    unsigned original_count,
+    unsigned recovery_count,
+    unsigned m,
+    void* const * const data,
+    void** work)
+{
+    // work <- data
+
+    // FIXME: Unroll first loop to eliminate this
+    for (unsigned i = 0; i < m; ++i)
+        memcpy(work[i], data[i], buffer_bytes);
+
+    // work <- IFFT(data, m, m)
+
+    for (unsigned width = 1; width < m; width <<= 1)
+    {
+        for (unsigned j = width; j < m; j += (width << 1))
+        {
+            const ffe_t skew = FFTSkew[j + m - 1];
+
+            // A skew equal to kFieldModulus takes the plain XOR path
+            if (skew != kFieldModulus)
+            {
+                for (unsigned i = j - width; i < j; ++i)
+                    ifft_butterfly(work[i], work[i + width], skew, buffer_bytes);
+            }
+            else
+            {
+                for (unsigned i = j - width; i < j; ++i)
+                    xor_mem(work[i + width], work[i], buffer_bytes);
+            }
+        }
+    }
+
+    // Remaining full chunks of m original buffers
+    for (unsigned i = m; i + m <= original_count; i += m)
+    {
+        // temp <- data + i
+
+        void** temp = work + m;
+
+        // FIXME: Unroll first loop to eliminate this
+        // Read from the i-th chunk; indexing data[j] alone would re-read the
+        // first chunk on every iteration.
+        for (unsigned j = 0; j < m; ++j)
+            memcpy(temp[j], data[i + j], buffer_bytes);
+
+        // temp <- IFFT(temp, m, m + i)
+
+        for (unsigned width = 1; width < m; width <<= 1)
+        {
+            for (unsigned j = width; j < m; j += (width << 1))
+            {
+                const ffe_t skew = FFTSkew[j + m + i - 1];
+
+                if (skew != kFieldModulus)
+                {
+                    for (unsigned k = j - width; k < j; ++k)
+                        ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes);
+                }
+                else
+                {
+                    for (unsigned k = j - width; k < j; ++k)
+                        xor_mem(temp[k + width], temp[k], buffer_bytes);
+                }
+            }
+        }
+
+        // work <- work XOR temp
+
+        // FIXME: Unroll last loop to eliminate this
+        for (unsigned j = 0; j < m; ++j)
+            xor_mem(work[j], temp[j], buffer_bytes);
+    }
+
+    // Trailing partial chunk (fewer than m original buffers)
+    const unsigned last_count = original_count % m;
+    if (last_count != 0)
+    {
+        const unsigned i = original_count - last_count;
+
+        // temp <- data + i
+
+        void** temp = work + m;
+
+        // Copy the remaining originals from offset i and zero-pad up to m
+        for (unsigned j = 0; j < last_count; ++j)
+            memcpy(temp[j], data[i + j], buffer_bytes);
+        for (unsigned j = last_count; j < m; ++j)
+            memset(temp[j], 0, buffer_bytes);
+
+        // temp <- IFFT(temp, m, m + i)
+
+        for (unsigned width = 1, shift = 1; width < m; width <<= 1, ++shift)
+        {
+            // Calculate stop considering that the right is all zeroes
+            const unsigned stop = ((last_count + width - 1) >> shift) << shift;
+
+            for (unsigned j = width; j < stop; j += (width << 1))
+            {
+                const ffe_t skew = FFTSkew[j + m + i - 1];
+
+                if (skew != kFieldModulus)
+                {
+                    for (unsigned k = j - width; k < j; ++k)
+                        ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes);
+                }
+                else
+                {
+                    for (unsigned k = j - width; k < j; ++k)
+                        xor_mem(temp[k + width], temp[k], buffer_bytes);
+                }
+            }
+        }
+
+        // work <- work XOR temp
+
+        // FIXME: Unroll last loop to eliminate this
+        for (unsigned j = 0; j < m; ++j)
+            xor_mem(work[j], temp[j], buffer_bytes);
+    }
+
+    // work <- FFT(work, m, 0)
+
+    for (unsigned width = (m >> 1); width > 0; width >>= 1)
+    {
+        const ffe_t* skewLUT = FFTSkew + width - 1;
+        const unsigned range = width << 1;
+
+        for (unsigned j = 0; j < m; j += range)
+        {
+            const ffe_t skew = skewLUT[j];
+
+            if (skew != kFieldModulus)
+            {
+                // Operate on work[], like the XOR branch below; running the
+                // butterfly on data[] would modify the caller's input buffers.
+                for (unsigned k = j, count = j + width; k < count; ++k)
+                    fft_butterfly(work[k], work[k + width], skew, buffer_bytes);
+            }
+            else
+            {
+                for (unsigned k = j, count = j + width; k < count; ++k)
+                    xor_mem(work[k + width], work[k], buffer_bytes);
+            }
+        }
+    }
+}
+
+
+//------------------------------------------------------------------------------
+// Decode
+
+// Recovers missing original buffers into work[].
+//
+// Layout used throughout: positions [0, m) hold recovery data (zero-padded
+// past recovery_count), positions [m, m + original_count) hold original data,
+// and positions up to n are zero padding.
+//
+//  buffer_bytes   : size in bytes of every buffer
+//  original_count : number of entries in original[]
+//  recovery_count : number of entries in recovery[]
+//  m, n           : transform sizes, see the parameter comments below
+//  original       : original buffers; a null entry marks a missing buffer
+//  recovery       : recovery buffers; a null entry marks a missing buffer
+//  work           : n scratch buffers; for each missing original i the final
+//                   step writes its result into work[i]
+//
+// NOTE(review): xor_mem() is declared with `unsigned bytes`, so buffer_bytes is
+// narrowed when passed to it.
+void Decode(
+    uint64_t buffer_bytes,
+    unsigned original_count,
+    unsigned recovery_count,
+    unsigned m, // NextPow2(recovery_count)
+    unsigned n, // NextPow2(m + original_count) = work_count
+    void* const * const original, // original_count entries
+    void* const * const recovery, // recovery_count entries
+    void** work) // n entries
+{
+    // Fill in error locations
+
+    ffe_t ErrorLocations[kOrder];
+    for (unsigned i = 0; i < recovery_count; ++i)
+        ErrorLocations[i] = recovery[i] ? 0 : 1;
+    for (unsigned i = recovery_count; i < m; ++i)
+        ErrorLocations[i] = 1;
+    for (unsigned i = 0; i < original_count; ++i)
+        ErrorLocations[i + m] = original[i] ? 0 : 1;
+    // Clear the rest of the array, not just up to n: the FWHT below runs over
+    // all kOrder entries, so stopping at n would leave it reading
+    // uninitialized values whenever n < kOrder.
+    memset(ErrorLocations + m + original_count, 0, (kOrder - original_count - m) * sizeof(ffe_t));
+
+    // Evaluate error locator polynomial
+
+    FWHT(ErrorLocations, kBits);
+
+    for (unsigned i = 0; i < kOrder; ++i)
+        ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kFieldModulus;
+
+    FWHT(ErrorLocations, kBits);
+
+    // work <- recovery data
+
+    for (unsigned i = 0; i < recovery_count; ++i)
+    {
+        if (recovery[i])
+            mul_mem_set(work[i], recovery[i], ErrorLocations[i], buffer_bytes);
+        else
+            memset(work[i], 0, buffer_bytes);
+    }
+    for (unsigned i = recovery_count; i < m; ++i)
+        memset(work[i], 0, buffer_bytes);
+
+    // work <- original data
+
+    for (unsigned i = 0; i < original_count; ++i)
+    {
+        if (original[i])
+            mul_mem_set(work[m + i], original[i], ErrorLocations[m + i], buffer_bytes);
+        else
+            memset(work[m + i], 0, buffer_bytes);
+    }
+    for (unsigned i = m + original_count; i < n; ++i)
+        memset(work[i], 0, buffer_bytes);
+
+    // work <- IFFT(work, n, 0)
+
+    for (unsigned width = 1; width < n; width <<= 1)
+    {
+        for (unsigned j = width; j < n; j += (width << 1))
+        {
+            const ffe_t skew = FFTSkew[j - 1];
+
+            // A skew equal to kFieldModulus takes the plain XOR path
+            if (skew != kFieldModulus)
+            {
+                for (unsigned i = j - width; i < j; ++i)
+                    ifft_butterfly(work[i], work[i + width], skew, buffer_bytes);
+            }
+            else
+            {
+                for (unsigned i = j - width; i < j; ++i)
+                    xor_mem(work[i + width], work[i], buffer_bytes);
+            }
+        }
+    }
+
+    // work <- FormalDerivative(work, n)
+
+    for (unsigned i = 1; i < n; ++i)
+    {
+        // width is the lowest set bit of i
+        const unsigned width = ((i ^ (i - 1)) + 1) >> 1;
+
+        // If a large number of values are being XORed:
+        for (unsigned j = i - width; j < i; ++j)
+            xor_mem(work[j], work[j + width], buffer_bytes);
+    }
+
+    // work <- FFT(work, n, 0) truncated to m + original_count
+
+    const unsigned output_count = m + original_count;
+    for (unsigned width = (n >> 1); width > 0; width >>= 1)
+    {
+        const ffe_t* skewLUT = FFTSkew + width - 1;
+        const unsigned range = width << 1;
+
+        for (unsigned j = (m < range) ? 0 : m; j < output_count; j += range)
+        {
+            const ffe_t skew = skewLUT[j];
+
+            if (skew != kFieldModulus)
+            {
+                for (unsigned i = j; i < j + width; ++i)
+                    fft_butterfly(work[i], work[i + width], skew, buffer_bytes);
+            }
+            else
+            {
+                for (unsigned i = j; i < j + width; ++i)
+                    xor_mem(work[i + width], work[i], buffer_bytes);
+            }
+        }
+    }
+
+    // Reveal erasures
+
+    // Original i lives at position i + m (see the "work <- original data" step
+    // above), so its locator value is ErrorLocations[i + m], matching the
+    // work[i + m] buffer being scaled.
+    for (unsigned i = 0; i < original_count; ++i)
+        if (!original[i])
+            mul_mem_set(work[i], work[i + m], kFieldModulus - ErrorLocations[i + m], buffer_bytes);
+}
+
+
 //------------------------------------------------------------------------------
 // API

@ -831,6 +926,7 @@ bool Initialize()
        return false;

    InitializeLogarithmTables();
+    FFTInitialize();

    IsInitialized = true;
    return true;
--- a/LeopardFF8.h
+++ b/LeopardFF8.h
@ -56,9 +56,6 @@ static const unsigned kOrder = 256;
 //------------------------------------------------------------------------------
 // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)

-// Define this to enable the optimized version of FWHT()
-#define LEO_FF8_FWHT_OPTIMIZED
-
 // Transform for a variable number of bits (up to kOrder)
 void FWHT(ffe_t* data, const unsigned bits);

@ -66,85 +63,89 @@ void FWHT(ffe_t* data, const unsigned bits);
 void FWHT(ffe_t data[kOrder]);


-//------------------------------------------------------------------------------
-// XOR Memory
-
-// x[] ^= y[]
-void xor_mem(
-    void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
-    unsigned bytes);
-
-// For i = {0, 1}: x_i[] ^= x_i[]
-void xor_mem2(
-    void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
-    unsigned bytes);
-
-// For i = {0, 1, 2}: x_i[] ^= x_i[]
-void xor_mem3(
-    void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
-    void * LEO_RESTRICT x_2, const void * LEO_RESTRICT y_2,
-    unsigned bytes);
-
-
 //------------------------------------------------------------------------------
 // Multiplies

 // x[] = y[] * m
 void mul_mem_set(
    void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
-    ffe_t m, unsigned bytes);
+    ffe_t m, uint64_t bytes);

 // For i = {0, 1}: x_i[] *= m
 void mul_mem2_inplace(
    void * LEO_RESTRICT x_0,
    void * LEO_RESTRICT x_1,
-    ffe_t m, unsigned bytes);
+    ffe_t m, uint64_t bytes);


 //------------------------------------------------------------------------------
 // FFT Operations

 // x[] ^= y[] * m, y[] ^= x[]
-void mul_fft(
+void fft_butterfly(
    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
-    ffe_t m, unsigned bytes);
+    ffe_t m, uint64_t bytes);

 // For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
-void mul_fft2(
+void fft_butterfly2(
    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    ffe_t m, unsigned bytes);
+    ffe_t m, uint64_t bytes);

 // For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
-void mul_fft3(
+void fft_butterfly3(
    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
    void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
-    ffe_t m, unsigned bytes);
+    ffe_t m, uint64_t bytes);


 //------------------------------------------------------------------------------
 // IFFT Operations

 // y[] ^= x[], x[] ^= y[] * m
-void mul_ifft(
+void ifft_butterfly(
    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
-    ffe_t m, unsigned bytes);
+    ffe_t m, uint64_t bytes);

 // For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
-void mul_ifft2(
+void ifft_butterfly2(
    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    ffe_t m, unsigned bytes);
+    ffe_t m, uint64_t bytes);

 // For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
-void mul_ifft3(
+void ifft_butterfly3(
    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
    void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
-    ffe_t m, unsigned bytes);
+    ffe_t m, uint64_t bytes);
+
+
+//------------------------------------------------------------------------------
+// Encode
+
+void Encode(
+    uint64_t buffer_bytes,
+    unsigned original_count,
+    unsigned recovery_count,
+    unsigned m, // = NextPow2(recovery_count); work_count = m * 2
+    void* const * const data,
+    void** work); // Size of GetEncodeWorkCount()
+
+
+//------------------------------------------------------------------------------
+// Decode
+
+void Decode(
+    uint64_t buffer_bytes,
+    unsigned original_count,
+    unsigned recovery_count,
+    unsigned m, // = NextPow2(recovery_count)
+    unsigned n, // = NextPow2(m + original_count) = work_count
+    void* const * const original, // original_count entries
+    void* const * const recovery, // recovery_count entries
+    void** work); // n entries


 //------------------------------------------------------------------------------
--- a/docs/HighRateDecoder.pdf
+++ b/docs/HighRateDecoder.pdf
--- a/docs/LowRateDecoder.pdf
+++ b/docs/LowRateDecoder.pdf
--- a/leopard.cpp
+++ b/leopard.cpp
@ -27,8 +27,8 @@
 */

 #include "leopard.h"
-#include "FecalEncoder.h"
-#include "FecalDecoder.h"
+#include "LeopardFF8.h"
+#include "LeopardFF16.h"

 extern "C" {

@ -38,134 +38,152 @@ extern "C" {

 static bool m_Initialized = false;

-FECAL_EXPORT int fecal_init_(int version)
+LEO_EXPORT int leo_init_(int version)
 {
-    if (version != FECAL_VERSION)
-        return Fecal_InvalidInput;
+    if (version != LEO_VERSION)
+        return Leopard_InvalidInput;

-    if (0 != gf256_init())
-        return Fecal_Platform;
+    if (!leopard::ff8::Initialize())
+        return Leopard_Platform;
+
+    if (!leopard::ff16::Initialize())
+        return Leopard_Platform;

    m_Initialized = true;
-    return Fecal_Success;
+    return Leopard_Success;
 }


 //------------------------------------------------------------------------------
 // Encoder API

-FECAL_EXPORT FecalEncoder fecal_encoder_create(unsigned input_count, void* const * const input_data, uint64_t total_bytes)
+LEO_EXPORT unsigned leo_encode_work_count(
+    unsigned original_count,
+    unsigned recovery_count)
 {
-    if (input_count <= 0 || !input_data || total_bytes < input_count)
-    {
-        FECAL_DEBUG_BREAK; // Invalid input
-        return nullptr;
-    }
-
-    FECAL_DEBUG_ASSERT(m_Initialized); // Must call fecal_init() first
-    if (!m_Initialized)
-        return nullptr;
-
-    fecal::Encoder* encoder = new(std::nothrow) fecal::Encoder;
-    if (!encoder)
-    {
-        FECAL_DEBUG_BREAK; // Out of memory
-        return nullptr;
-    }
-
-    if (Fecal_Success != encoder->Initialize(input_count, input_data, total_bytes))
-    {
-        delete encoder;
-        return nullptr;
-    }
-
-    return reinterpret_cast<FecalEncoder>( encoder );
+    return leopard::NextPow2(recovery_count) * 2;
 }

-FECAL_EXPORT int fecal_encode(FecalEncoder encoder_v, FecalSymbol* symbol)
+LEO_EXPORT LeopardResult leo_encode(
+    uint64_t buffer_bytes,              // Number of bytes in each data buffer
+    unsigned original_count,            // Number of original_data[] buffer pointers
+    unsigned recovery_count,            // Number of recovery_data[] buffer pointers
+    unsigned work_count,                // Number of work_data[] buffer pointers, from leo_encode_work_count()
+    void* const * const original_data,  // Array of pointers to original data buffers
+    void** work_data,                   // Array of work buffers
+    unsigned flags)                     // Operation flags
 {
-    fecal::Encoder* encoder = reinterpret_cast<fecal::Encoder*>( encoder_v );
-    if (!encoder || !symbol)
-        return Fecal_InvalidInput;
+    if (buffer_bytes <= 0 || buffer_bytes % 64 != 0)
+        return Leopard_InvalidSize;

-    return encoder->Encode(*symbol);
-}
+    if (recovery_count <= 0 || recovery_count > original_count)
+        return Leopard_InvalidCounts;

-FECAL_EXPORT void fecal_free(void* codec_v)
-{
-    if (codec_v)
+    if (!original_data || !work_data)
+        return Leopard_InvalidInput;
+
+    const unsigned m = leopard::NextPow2(recovery_count);
+    const unsigned n = leopard::NextPow2(m + original_count);
+
+    if (work_count != m * 2)
+        return Leopard_InvalidCounts;
+
+    const bool mt = (flags & LeopardFlags_Multithreaded) != 0;
+
+    if (n <= leopard::ff8::kOrder)
    {
-        fecal::ICodec* icodec = reinterpret_cast<fecal::ICodec*>( codec_v );
-        delete icodec;
+        leopard::ff8::Encode(
+            buffer_bytes,
+            original_count,
+            recovery_count,
+            m,
+            original_data,
+            work_data);
    }
+    else if (n <= leopard::ff16::kOrder)
+    {
+        leopard::ff16::Encode(
+            buffer_bytes,
+            original_count,
+            recovery_count,
+            m,
+            original_data,
+            work_data);
+    }
+    else
+        return Leopard_TooMuchData;
+
+    return Leopard_Success;
 }


 //------------------------------------------------------------------------------
 // Decoder API

-FECAL_EXPORT FecalDecoder fecal_decoder_create(unsigned input_count, uint64_t total_bytes)
+LEO_EXPORT unsigned leo_decode_work_count(
+    unsigned original_count,
+    unsigned recovery_count)
 {
-    if (input_count <= 0 || total_bytes < input_count)
+    const unsigned m = leopard::NextPow2(recovery_count);
+    const unsigned n = leopard::NextPow2(m + original_count);
+    return n;
+}
+
+LEO_EXPORT LeopardResult leo_decode(
+    uint64_t buffer_bytes,              // Number of bytes in each data buffer
+    unsigned original_count,            // Number of original_data[] buffer pointers
+    unsigned recovery_count,            // Number of recovery_data[] buffer pointers
+    unsigned work_count,                // Number of buffer pointers in work_data[]
+    void* const * const original_data,  // Array of original data buffers
+    void* const * const recovery_data,  // Array of recovery data buffers
+    void** work_data,                   // Array of work data buffers
+    unsigned flags)                     // Operation flags
+{
+    if (buffer_bytes <= 0 || buffer_bytes % 64 != 0)
+        return Leopard_InvalidSize;
+
+    if (recovery_count <= 0 || recovery_count > original_count)
+        return Leopard_InvalidCounts;
+
+    if (!original_data || !recovery_data || !work_data)
+        return Leopard_InvalidInput;
+
+    const unsigned m = leopard::NextPow2(recovery_count);
+    const unsigned n = leopard::NextPow2(m + original_count);
+
+    if (work_count != n)
+        return Leopard_InvalidCounts;
+
+    const bool mt = (flags & LeopardFlags_Multithreaded) != 0;
+
+    if (n <= leopard::ff8::kOrder)
    {
-        FECAL_DEBUG_BREAK; // Invalid input
-        return nullptr;
+        leopard::ff8::Decode(
+            buffer_bytes,
+            original_count,
+            recovery_count,
+            m,
+            n,
+            original_data,
+            recovery_data,
+            work_data);
    }
-
-    FECAL_DEBUG_ASSERT(m_Initialized); // Must call fecal_init() first
-    if (!m_Initialized)
-        return nullptr;
-
-    fecal::Decoder* decoder = new(std::nothrow) fecal::Decoder;
-    if (!decoder)
+    else if (n <= leopard::ff16::kOrder)
    {
-        FECAL_DEBUG_BREAK; // Out of memory
-        return nullptr;
+        leopard::ff16::Decode(
+            buffer_bytes,
+            original_count,
+            recovery_count,
+            m,
+            n,
+            original_data,
+            recovery_data,
+            work_data);
    }
+    else
+        return Leopard_TooMuchData;

-    if (Fecal_Success != decoder->Initialize(input_count, total_bytes))
-    {
-        delete decoder;
-        return nullptr;
-    }
-
-    return reinterpret_cast<FecalDecoder>( decoder );
-}
-
-FECAL_EXPORT int fecal_decoder_add_original(FecalDecoder decoder_v, const FecalSymbol* symbol)
-{
-    fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
-    if (!decoder || !symbol)
-        return Fecal_InvalidInput;
-
-    return decoder->AddOriginal(*symbol);
-}
-
-FECAL_EXPORT int fecal_decoder_add_recovery(FecalDecoder decoder_v, const FecalSymbol* symbol)
-{
-    fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
-    if (!decoder || !symbol)
-        return Fecal_InvalidInput;
-
-    return decoder->AddRecovery(*symbol);
-}
-
-FECAL_EXPORT int fecal_decode(FecalDecoder decoder_v, RecoveredSymbols* symbols)
-{
-    fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
-    if (!decoder || !symbols)
-        return Fecal_InvalidInput;
-
-    return decoder->Decode(*symbols);
-}
-
-FECAL_EXPORT int fecal_decoder_get(FecalDecoder decoder_v, unsigned input_index, FecalSymbol* symbol)
-{
-    fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
-    if (!decoder || !symbol)
-        return Fecal_InvalidInput;
-
-    return decoder->GetOriginal(input_index, *symbol);
+    return Leopard_Success;
 }


--- a/leopard.h
+++ b/leopard.h
@ -59,6 +59,7 @@
 # endif
 #endif

+#include <stdint.h>

 #ifdef __cplusplus
 extern "C" {
@ -90,14 +91,13 @@ typedef enum LeopardResultT
    Leopard_Success           =  0, // Operation succeeded

    Leopard_TooMuchData       = -1, // Buffer counts are too high
-    Leopard_InvalidBlockSize  = -2, // Buffer size must be a multiple of 64 bytes
-    Leopard_InvalidInput      = -3, // A function parameter was invalid
-    Leopard_Platform          = -4, // Platform is unsupported
-    Leopard_OutOfMemory       = -5, // Out of memory error occurred
-    Leopard_Unexpected        = -6, // Unexpected error - Software bug?
+    Leopard_InvalidSize       = -2, // Buffer size must be a multiple of 64 bytes
+    Leopard_InvalidCounts     = -3, // Invalid counts provided
+    Leopard_InvalidInput      = -4, // A function parameter was invalid
+    Leopard_Platform          = -5, // Platform is unsupported
 } LeopardResult;

-// Results
+// Flags
 typedef enum LeopardFlagsT
 {
    LeopardFlags_Defaults      = 0, // Default settings
@ -119,7 +119,6 @@ typedef enum LeopardFlagsT
 	Returns the work_count value to pass into leo_encode().
    Returns 0 on invalid input.
 */
-
 LEO_EXPORT unsigned leo_encode_work_count(
    unsigned original_count,
    unsigned recovery_count);
@ -138,6 +137,8 @@ LEO_EXPORT unsigned leo_encode_work_count(
    flags:          Flags for encoding e.g. LeopardFlag_Multithreaded

    The sum of original_count + recovery_count must not exceed 65536.
+    The recovery_count must not exceed original_count.
+
    The buffer_bytes must be a multiple of 64.
    Each buffer should have the same number of bytes.
    Even the last piece must be rounded up to the block size.
@ -153,15 +154,11 @@ LEO_EXPORT unsigned leo_encode_work_count(
            ((uint64_t)total_bytes + original_count - 1) / original_count);

    Returns Leopard_Success on success.
-    The first set of recovery_count buffers in work_data will be the result.
-
-    Returns Leopard_TooMuchData if the data is too large.
-    Returns Leopard_InvalidBlockSize if the data is the wrong size.
-    Returns Leopard_InvalidInput on invalid input.
+    * The first set of recovery_count buffers in work_data will be the result.
    Returns other values on errors.
 */
 LEO_EXPORT LeopardResult leo_encode(
-    unsigned buffer_bytes,              // Number of bytes in each data buffer
+    uint64_t buffer_bytes,              // Number of bytes in each data buffer
    unsigned original_count,            // Number of original_data[] buffer pointers
    unsigned recovery_count,            // Number of recovery_data[] buffer pointers
    unsigned work_count,                // Number of work_data[] buffer pointers, from leo_encode_work_count()
@ -183,7 +180,6 @@ LEO_EXPORT LeopardResult leo_encode(
 	Returns the work_count value to pass into leo_encode().
    Returns 0 on invalid input.
 */
-
 LEO_EXPORT unsigned leo_decode_work_count(
    unsigned original_count,
    unsigned recovery_count);
@ -211,7 +207,7 @@ LEO_EXPORT unsigned leo_decode_work_count(
    Returns other values on errors.
 */
 LEO_EXPORT LeopardResult leo_decode(
-    unsigned buffer_bytes,              // Number of bytes in each data buffer
+    uint64_t buffer_bytes,              // Number of bytes in each data buffer
    unsigned original_count,            // Number of original_data[] buffer pointers
    unsigned recovery_count,            // Number of recovery_data[] buffer pointers
    unsigned work_count,                // Number of buffer pointers in work_data[]
--- a/proj/Leopard.sln
+++ b/proj/Leopard.sln
@ -1,12 +1,14 @@

 Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 15
-VisualStudioVersion = 15.0.26127.3
+# Visual Studio 14
+VisualStudioVersion = 14.0.25420.1
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Leopard", "Leopard.vcxproj", "{32176592-2F30-4BD5-B645-EB11C8D3453E}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LeopardBenchmark", "..\tests\proj\Benchmark.vcxproj", "{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LeopardExperiments", "..\tests\proj\Experiments.vcxproj", "{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Win32 = Debug|Win32
@ -31,6 +33,14 @@ Global
 		{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|Win32.Build.0 = Release|Win32
 		{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|x64.ActiveCfg = Release|x64
 		{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|x64.Build.0 = Release|x64
+		{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Debug|Win32.ActiveCfg = Debug|Win32
+		{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Debug|Win32.Build.0 = Debug|Win32
+		{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Debug|x64.ActiveCfg = Debug|x64
+		{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Debug|x64.Build.0 = Debug|x64
+		{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Release|Win32.ActiveCfg = Release|Win32
+		{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Release|Win32.Build.0 = Release|Win32
+		{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Release|x64.ActiveCfg = Release|x64
+		{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/proj/Leopard.vcxproj
+++ b/proj/Leopard.vcxproj
@ -21,16 +21,12 @@
  <ItemGroup>
    <ClInclude Include="..\leopard.h" />
    <ClInclude Include="..\LeopardCommon.h" />
-    <ClInclude Include="..\LeopardDecoder.h" />
-    <ClInclude Include="..\LeopardEncoder.h" />
    <ClInclude Include="..\LeopardFF8.h" />
    <ClInclude Include="..\LeopardFF16.h" />
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="..\leopard.cpp" />
    <ClCompile Include="..\LeopardCommon.cpp" />
-    <ClCompile Include="..\LeopardDecoder.cpp" />
-    <ClCompile Include="..\LeopardEncoder.cpp" />
    <ClCompile Include="..\LeopardFF8.cpp" />
    <ClCompile Include="..\LeopardFF16.cpp" />
  </ItemGroup>
@ -38,34 +34,33 @@
    <ProjectGuid>{32176592-2F30-4BD5-B645-EB11C8D3453E}</ProjectGuid>
    <RootNamespace>GF65536</RootNamespace>
    <ProjectName>Leopard</ProjectName>
-    <WindowsTargetPlatformVersion>10.0.14393.0</WindowsTargetPlatformVersion>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
    <ConfigurationType>StaticLibrary</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>StaticLibrary</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>StaticLibrary</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>StaticLibrary</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
--- a/proj/Leopard.vcxproj.filters
+++ b/proj/Leopard.vcxproj.filters
@ -21,12 +21,6 @@
    <ClInclude Include="..\LeopardCommon.h">
      <Filter>Source Files</Filter>
    </ClInclude>
-    <ClInclude Include="..\LeopardDecoder.h">
-      <Filter>Source Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\LeopardEncoder.h">
-      <Filter>Source Files</Filter>
-    </ClInclude>
    <ClInclude Include="..\LeopardFF16.h">
      <Filter>Source Files</Filter>
    </ClInclude>
@ -35,12 +29,6 @@
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
-    <ClCompile Include="..\LeopardDecoder.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
-    <ClCompile Include="..\LeopardEncoder.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
    <ClCompile Include="..\leopard.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
--- a/tests/experiments.cpp
+++ b/tests/experiments.cpp
@ -0,0 +1,615 @@
+/*
+    Copyright (c) 2017 Christopher A. Taylor.  All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+    * Neither the name of LHC-RS nor the names of its contributors may be
+      used to endorse or promote products derived from this software without
+      specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <string.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+
+//------------------------------------------------------------------------------
+// Debug
+
+// Some bugs only repro in release mode, so this can be helpful
+//#define LEO_DEBUG_IN_RELEASE
+
+#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE)
+    #define LEO_DEBUG
+    #ifdef _WIN32
+        #define LEO_DEBUG_BREAK __debugbreak()
+    #else
+        #define LEO_DEBUG_BREAK __builtin_trap()
+    #endif
+    #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } }
+#else
+    #define LEO_DEBUG_BREAK ;
+    #define LEO_DEBUG_ASSERT(cond) ;
+#endif
+
+
+//------------------------------------------------------------------------------
+// Platform/Architecture
+
+// Compiler-specific restrict keyword (__restrict is a vendor extension; standard C++ has no restrict)
+#define LEO_RESTRICT __restrict
+
+// Compiler-specific force inline keyword
+#ifdef _MSC_VER
+    #define LEO_FORCE_INLINE inline __forceinline
+#else
+    #define LEO_FORCE_INLINE inline __attribute__((always_inline))
+#endif
+
+
+
+
+//------------------------------------------------------------------------------
+// Field
+
+//#define LEO_SHORT_FIELD
+
+// Field selection: LEO_SHORT_FIELD chooses 8-bit elements, otherwise 16-bit elements are used.
+// kGFPolynomial is the reduction polynomial applied by InitField() while generating the field,
+// and kGFBasis[] is the basis InitField() uses when converting GFLog[] to the chosen basis.
+#ifdef LEO_SHORT_FIELD
+typedef uint8_t ffe_t;
+static const unsigned kGFBits = 8;
+static const unsigned kGFPolynomial = 0x11D;
+ffe_t kGFBasis[kGFBits] = {
+    1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis
+};
+#else
+typedef uint16_t ffe_t;
+static const unsigned kGFBits = 16;
+static const unsigned kGFPolynomial = 0x1002D;
+ffe_t kGFBasis[kGFBits] = {
+    0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis
+    0xC582, 0xED2E, 0x914C, 0x4012,
+    0x6C98, 0x10D8, 0x6A72, 0xB900,
+    0xFDB8, 0xFB34, 0xFF38, 0x991E
+};
+#endif
+
+/*
+    Cantor Basis introduced by:
+    D. G. Cantor, "On arithmetical algorithms over finite fields",
+    Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989.
+*/
+
+static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size
+static const unsigned kFieldModulus = kFieldSize - 1;
+
+static ffe_t GFLog[kFieldSize];
+static ffe_t GFExp[kFieldSize];
+
+// Build the GFLog[] and GFExp[] lookup tables for the selected field.
+// GFExp[] doubles as scratch storage (indexed by field element) until the
+// final pass rewrites it as the inverse mapping of GFLog[].
+static void InitField()
+{
+    // Step through successive doublings reduced by kGFPolynomial, recording
+    // the step number at which each element value is reached.
+    unsigned lfsr = 1;
+    for (unsigned exponent = 0; exponent < kFieldModulus; ++exponent)
+    {
+        GFExp[lfsr] = static_cast<ffe_t>(exponent);
+
+        lfsr <<= 1;
+        if (lfsr >= kFieldSize)
+            lfsr ^= kGFPolynomial;
+    }
+    GFExp[0] = kFieldModulus;
+
+    // Conversion to chosen basis:
+    // GFLog[x] temporarily holds the XOR of the kGFBasis[] entries selected by the bits of x.
+    GFLog[0] = 0;
+    for (unsigned bit = 0, half = 1; bit < kGFBits; ++bit, half <<= 1)
+    {
+        const ffe_t basis = kGFBasis[bit];
+
+        for (unsigned low = 0; low < half; ++low)
+            GFLog[half + low] = GFLog[low] ^ basis;
+    }
+
+    // Replace each combined basis value with the step number stored in the scratch table
+    for (unsigned x = 0; x < kFieldSize; ++x)
+        GFLog[x] = GFExp[GFLog[x]];
+
+    // Invert GFLog[] into GFExp[]
+    for (unsigned x = 0; x < kFieldSize; ++x)
+        GFExp[GFLog[x]] = x;
+
+    // Make exponent kFieldModulus yield the same element as exponent 0
+    GFExp[kFieldModulus] = GFExp[0];
+}
+
+
+//------------------------------------------------------------------------------
+// Mod Q Field Operations
+//
+// Q is the maximum symbol value, e.g. 255 or 65535.
+
+// Returns a + b (mod Q), where Q = kFieldModulus.
+// The reduction is partial: Q itself may be returned.
+static inline ffe_t AddModQ(ffe_t a, ffe_t b)
+{
+    const unsigned total = static_cast<unsigned>(a) + b;
+    const unsigned carry = total >> kGFBits;
+
+    // Fold the carry out of the top bit back into the low bits
+    return static_cast<ffe_t>(total + carry);
+}
+
+// Returns a - b (mod Q), where Q = kFieldModulus.
+// The reduction is partial: Q itself may be returned.
+static inline ffe_t SubModQ(ffe_t a, ffe_t b)
+{
+    const unsigned delta = static_cast<unsigned>(a) - b;
+    const unsigned borrow = delta >> kGFBits;
+
+    // When a < b the unsigned difference wraps; adding the shifted-down high bits
+    // corrects the value that survives truncation to ffe_t
+    return static_cast<ffe_t>(delta + borrow);
+}
+
+// Returns a * GFExp[b] over GF(2^r): a is a field element, b is a logarithm
+static ffe_t mulE(ffe_t a, ffe_t b)
+{
+    // A zero operand always gives a zero product; skip the table lookups
+    if (a == 0)
+        return 0;
+
+    return GFExp[AddModQ(GFLog[a], b)];
+}
+
+
+//------------------------------------------------------------------------------
+// Fast Walsh-Hadamard Transform (FWHT) Mod Q
+//
+// Q is the maximum symbol value, e.g. 255 or 65535.
+
+// Define this to enable the optimized version of FWHT()
+#define LEO_FWHT_OPTIMIZED
+
+typedef ffe_t fwht_t;
+
+// Two-point butterfly: {a, b} <- {a + b, a - b} (Mod Q)
+static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b)
+{
+    // Snapshot both inputs so each output is computed from the original pair
+    const fwht_t in_a = a;
+    const fwht_t in_b = b;
+
+    a = AddModQ(in_a, in_b);
+    b = SubModQ(in_a, in_b);
+}
+
+// Reference implementation: in-place transform over the first 2^bits entries of data[]
+static void FWHT(fwht_t* data, const unsigned bits)
+{
+    const unsigned count = (unsigned)(1UL << bits);
+
+    for (unsigned half = 1; half < count; half <<= 1)
+    {
+        const unsigned span = half << 1;
+
+        // Each span-sized group pairs element k with element k + half
+        for (unsigned base = 0; base < count; base += span)
+        {
+            const unsigned stop = base + half;
+
+            for (unsigned k = base; k < stop; ++k)
+                FWHT_2(data[k], data[k + half]);
+        }
+    }
+}
+
+
+//------------------------------------------------------------------------------
+// Formal Derivative
+
+// In-place formal derivative of the polynomial cos[0..size) expressed in the new basis
+static void formal_derivative(ffe_t* cos, const unsigned size)
+{
+    // Sweep left to right, XORing data ahead into data behind.
+    // If the data ends in all zeroes, this can simply stop.
+    for (unsigned pos = 1; pos < size; ++pos)
+    {
+        // Value of the lowest set bit of pos
+        const unsigned run = ((pos ^ (pos - 1)) + 1) >> 1;
+        const unsigned first = pos - run;
+
+        // XOR the run of values starting at pos into the run just before it
+        for (unsigned k = first; k < pos; ++k)
+            cos[k] ^= cos[k + run];
+    }
+
+    // Doesn't seem to be needed
+#if 0
+    /*
+        Same here - Zeroes on the right are preserved
+    */
+    for (unsigned i = size; i < kFieldSize; i <<= 1)
+    {
+        for (unsigned j = 0; j < size; ++j)
+            cos[j] ^= cos[j + i];
+    }
+#endif
+}
+
+
+//------------------------------------------------------------------------------
+// Fast Fourier Transform
+
+static ffe_t skewVec[kFieldModulus]; // twisted factors used in FFT
+
+// IFFT butterfly: b <- a ^ b, then a <- a ^ (b * GFExp[skew]) using the updated b
+static LEO_FORCE_INLINE void ifft_butterfly(ffe_t& a, ffe_t& b, ffe_t skew)
+{
+    const ffe_t merged = static_cast<ffe_t>(a ^ b);
+
+    b = merged;
+    a ^= mulE(merged, skew);
+}
+
+// IFFT in the proposed basis, in place over data[0..size).
+// Skew factors are read from skewVec[] at positions offset by index.
+static void IFLT(ffe_t* data, const unsigned size, const unsigned index)
+{
+    for (unsigned half = 1; half < size; half <<= 1)
+    {
+        const unsigned span = half << 1;
+
+        for (unsigned mid = half; mid < size; mid += span)
+        {
+            const unsigned lo = mid - half;
+            const ffe_t skew = skewVec[mid + index - 1];
+
+            if (skew == kFieldModulus)
+            {
+                // Sentinel skew: the multiply step is skipped and only the XOR is applied
+                for (unsigned k = lo; k < mid; ++k)
+                    data[k + half] ^= data[k];
+                continue;
+            }
+
+            for (unsigned k = lo; k < mid; ++k)
+                ifft_butterfly(data[k], data[k + half], skew);
+        }
+    }
+}
+
+// FFT butterfly: a <- a ^ (b * GFExp[skew]), then b <- b ^ a using the updated a
+static LEO_FORCE_INLINE void fft_butterfly(ffe_t& a, ffe_t& b, ffe_t skew)
+{
+    const ffe_t updated = static_cast<ffe_t>(a ^ mulE(b, skew));
+
+    a = updated;
+    b ^= updated;
+}
+
+// FFT in the proposed basis, in place on data[].
+// Skew factors are read from skewVec[] offset by skewIndex; groups that start
+// at or beyond output_elements are not processed.
+static void FLT(ffe_t* data, const unsigned size, const unsigned skewIndex, const unsigned output_elements)
+{
+    for (unsigned half = (size >> 1); half > 0; half >>= 1)
+    {
+        const unsigned span = half << 1;
+        const ffe_t* skews = skewVec + half + skewIndex - 1;
+
+        for (unsigned base = 0; base < output_elements; base += span)
+        {
+            const unsigned stop = base + half;
+            const ffe_t skew = skews[base];
+
+            if (skew == kFieldModulus)
+            {
+                // Sentinel skew: the multiply step is skipped and only the XOR is applied
+                for (unsigned k = base; k < stop; ++k)
+                    data[k + half] ^= data[k];
+                continue;
+            }
+
+            for (unsigned k = base; k < stop; ++k)
+                fft_butterfly(data[k], data[k + half], skew);
+        }
+    }
+}
+
+
+//------------------------------------------------------------------------------
+// FFT Initialization
+
+//static ffe_t B[kFieldSize >> 1];     // factors used in formal derivative
+static fwht_t log_walsh[kFieldSize];  // factors used in the evaluation of the error locator polynomial
+
+// Initialize skewVec[] and log_walsh[] (the B[] table is currently compiled out).
+// Reads GFLog[] and calls mulE(), so InitField() is expected to have run first.
+static void InitFieldOperations()
+{
+    ffe_t temp[kGFBits - 1];
+
+    for (unsigned i = 1; i < kGFBits; ++i)
+        temp[i - 1] = (ffe_t)((unsigned)1 << i);
+
+    for (unsigned m = 0; m < (kGFBits - 1); ++m)
+    {
+        const unsigned step = (unsigned)1 << (m + 1);
+
+        skewVec[((unsigned)1 << m) - 1] = 0;
+
+        for (unsigned i = m; i < (kGFBits - 1); ++i)
+        {
+            const unsigned s = ((unsigned)1 << (i + 1));
+
+            for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step)
+                skewVec[j + s] = skewVec[j] ^ temp[i];
+        }
+
+        temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])];
+
+        for (unsigned i = m + 1; i < (kGFBits - 1); ++i)
+            temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus);
+    }
+
+    // Convert the skew factors to logarithms.
+    // skewVec[] holds kFieldModulus entries, so the loop must stop there:
+    // iterating up to kFieldSize would read and write one element past the array.
+    for (unsigned i = 0; i < kFieldModulus; ++i)
+        skewVec[i] = GFLog[skewVec[i]];
+
+#if 0
+    temp[0] = kFieldModulus - temp[0];
+
+    for (unsigned i = 1; i < (kGFBits - 1); ++i)
+        temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus;
+
+    B[0] = 0;
+    for (unsigned i = 0; i < (kGFBits - 1); ++i)
+    {
+        const unsigned depart = ((unsigned)1 << i);
+
+        for (unsigned j = 0; j < depart; ++j)
+            B[j + depart] = (B[j] + temp[i]) % kFieldModulus;
+    }
+#endif
+
+    // log_walsh[] = FWHT of GFLog[], with entry 0 forced to 0 before the transform
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        log_walsh[i] = GFLog[i];
+
+    log_walsh[0] = 0;
+
+    FWHT(log_walsh, kGFBits);
+}
+
+
+//------------------------------------------------------------------------------
+// Encoder
+
+// Low-rate encoder (k/n < 0.5); the message length k is a power of two.
+// Fills codeword[0 .. kFieldSize): the first k entries end up as a copy of data,
+// every following k-sized slice is derived from the transformed message.
+static void encodeL(ffe_t* data, const unsigned k, ffe_t* codeword)
+{
+    const size_t messageBytes = sizeof(ffe_t) * k;
+
+    // Transform the message once, using the first slice as workspace
+    memcpy(codeword, data, messageBytes);
+    IFLT(codeword, k, 0);
+
+    // Each later slice starts as the transformed message and is run through FLT at its own offset
+    unsigned offset = k;
+    while (offset < kFieldSize)
+    {
+        ffe_t* slice = codeword + offset;
+
+        memcpy(slice, codeword, messageBytes);
+        FLT(slice, k, offset, k);
+
+        offset += k;
+    }
+
+    // Put the untransformed message back into the first slice
+    memcpy(codeword, data, messageBytes);
+}
+
+// High-rate encoder (k/n > 0.5); the parity length m is a power of two.
+//   data:   message symbols, read in chunks of m starting at index 0
+//   parity: receives the m parity symbols
+//   mem:    scratch buffer holding at least m symbols
+static void encodeH(const ffe_t* data, const unsigned m, const unsigned original_count, ffe_t* parity, ffe_t* mem)
+{
+    // Note: Assumes data is padded with zeroes out to the next multiple of m
+
+    const size_t chunkBytes = m * sizeof(ffe_t);
+
+    // The first chunk is transformed directly inside the parity buffer
+    memcpy(parity, data, chunkBytes);
+    IFLT(parity, m, m);
+
+    // Every further chunk is transformed in the scratch buffer and XORed into parity
+    unsigned offset = m;
+    while (offset < original_count)
+    {
+        memcpy(mem, data + offset, chunkBytes);
+        IFLT(mem, m, m + offset);
+
+        for (unsigned j = 0; j != m; ++j)
+            parity[j] ^= mem[j];
+
+        offset += m;
+    }
+
+    FLT(parity, m, 0, m);
+}
+
+
+//------------------------------------------------------------------------------
+// Decoder
+
+// Erasure decoder, operating on codeword[] in place.
+//   codeword: symbols at positions [0, m + original_count); erased positions and
+//             everything in [m + original_count, n) are zeroed before transforming
+//   n:        transform length handed to IFLT / formal_derivative / FLT
+//   erasure:  per-position flags, read at every index below kFieldSize
+// On return the positions flagged in erasure[] hold the values computed for them.
+static void decode(ffe_t* codeword, const unsigned m, const unsigned original_count, const unsigned n, const bool* erasure)
+{
+    const unsigned used_count = m + original_count;
+    fwht_t locator[kFieldSize];
+
+    // Compute the evaluations of the error locator polynomial:
+    // FWHT of the 0/1 erasure flags, pointwise product with log_walsh[], FWHT again
+    for (unsigned i = 0; i < kFieldSize; ++i)
+    {
+        if (erasure[i])
+            locator[i] = 1;
+        else
+            locator[i] = 0;
+    }
+
+    FWHT(locator, kGFBits);
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        locator[i] = ((unsigned)locator[i] * (unsigned)log_walsh[i]) % kFieldModulus;
+
+    FWHT(locator, kGFBits);
+
+    // k2 can be replaced with k
+    //const unsigned k2 = kFieldSize;
+    //const unsigned k2 = k; // cannot actually be replaced with k.  maybe for encodeL() only?
+
+    // Erased positions become zero, received positions are scaled by the locator value
+    for (unsigned i = 0; i < used_count; ++i)
+    {
+        if (erasure[i])
+        {
+            codeword[i] = 0;
+            continue;
+        }
+
+        codeword[i] = mulE(codeword[i], locator[i]);
+    }
+
+    // Clear the tail up to the transform length
+    for (unsigned i = used_count; i < n; ++i)
+        codeword[i] = 0;
+
+    IFLT(codeword, n, 0);
+
+    // Note: This is not needed to recover successfully...
+#if 0
+    // formal derivative
+    // Note: Preserves zeroes on the right
+    for (unsigned i = 0; i < m + original_count; i += 2)
+    {
+        codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]);
+        codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]);
+    }
+#endif
+
+    formal_derivative(codeword, n);
+
+#if 0
+    // Note: Preserves zeroes on the right
+    for (unsigned i = 0; i < m + original_count; i += 2)
+    {
+        codeword[i] = mulE(codeword[i], B[i >> 1]);
+        codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]);
+    }
+#endif
+
+    FLT(codeword, n, 0, used_count);
+
+    // Only the erased positions get the final scaling
+    for (unsigned i = 0; i < kFieldSize; ++i)
+    {
+        if (!erasure[i])
+            continue;
+
+        codeword[i] = mulE(codeword[i], kFieldModulus - locator[i]);
+    }
+}
+
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+// Returns the index (0..63) of the most significant set bit of x
+// Precondition: x != 0
+LEO_FORCE_INLINE unsigned LastNonzeroBit64(uint64_t x)
+{
+#ifdef _MSC_VER
+    unsigned long index;
+#ifdef _WIN64
+    // Note: Ignoring result because x != 0
+    _BitScanReverse64(&index, x);
+    return (unsigned)index;
+#else
+    // 32-bit build: scan the upper half first, because any bit set there
+    // outranks every bit of the lower half
+    if (0 != _BitScanReverse(&index, (uint32_t)(x >> 32)))
+        return (unsigned)index + 32;
+    // Note: Ignoring result because x != 0, so the lower half is non-zero here
+    _BitScanReverse(&index, (uint32_t)x);
+    return (unsigned)index;
+#endif
+#else
+    // Note: Ignoring return value of 0 because x != 0
+    return 63 - (unsigned)__builtin_clzll(x);
+#endif
+}
+
+
+//------------------------------------------------------------------------------
+// Test Application
+
+// Runs one encode / erase / decode round trip and checks the erased symbols.
+//   original_count: number of message symbols
+//   recovery_count: number of recovery symbols, and also how many positions get erased;
+//                   must be at least 2, since LastNonzeroBit64() needs a non-zero argument
+//   seed:           passed to srand() so that a failing run can be reproduced
+// Prints ":D " on success; on a mismatch prints the seed, hits LEO_DEBUG_BREAK and returns.
+void test(unsigned original_count, unsigned recovery_count, unsigned seed)
+{
+    // m: recovery_count rounded up to a power of two
+    // n: (m + original_count) rounded up to a power of two
+    unsigned m = 2UL << LastNonzeroBit64(recovery_count - 1);
+    unsigned n = 2UL << LastNonzeroBit64(m + original_count - 1);
+
+    srand(seed);
+
+    //-----------Generating message----------
+
+    // Message array: [0, m) is reserved for parity, the message follows at [m, m + original_count)
+    ffe_t data[kFieldSize] = {0};
+
+    // Filled with random numbers
+    for (unsigned i = m; i < m + original_count; ++i)
+        data[i] = (ffe_t)rand();
+
+
+    //---------encoding----------
+
+    ffe_t codeword[kFieldSize] = {};
+    // First m codewords are for the parity data
+    // Note: codeword[] only serves as the scratch buffer of encodeH() here
+    encodeH(data + m, m, original_count, data, codeword);
+    //encodeL(data, k, codeword); // does not seem to work with any input?  what else needs to change?
+
+    memcpy(codeword, data, sizeof(ffe_t) * kFieldSize);
+
+
+    //--------erasure simulation---------
+
+    // Array indicating erasures
+    bool erasure[kFieldSize] = {
+        false
+    };
+
+    // Tag the first "recovery_count" message positions as erasures
+    for (unsigned i = m; i < m + recovery_count; ++i)
+        erasure[i] = true;
+
+    // permuting the erasure array over [0, m + original_count), so parity positions can be erased too
+    for (unsigned i = m + original_count - 1; i > 0; --i)
+    {
+        unsigned pos = rand() % (i + 1);
+
+        if (i != pos)
+        {
+            bool tmp = erasure[i];
+            erasure[i] = erasure[pos];
+            erasure[pos] = tmp;
+        }
+    }
+
+
+    //---------main processing----------
+    decode(codeword, m, original_count, n, erasure);
+
+    // Check the correctness of the result
+    for (unsigned i = 0; i < kFieldSize; ++i)
+    {
+        if (erasure[i])
+        {
+            if (data[i] != codeword[i])
+            {
+                // seed is unsigned, so it has to be printed with %u
+                printf("Decoding Error with seed = %u!\n", seed);
+                LEO_DEBUG_BREAK;
+                return;
+            }
+        }
+    }
+
+    printf(":D ");
+}
+
+
+//------------------------------------------------------------------------------
+// Entrypoint
+
+// Sets up the field tables, then runs test() forever with a seed that grows by one per round.
+int main(int argc, char **argv)
+{
+#ifdef LEO_SHORT_FIELD
+    const unsigned input_count = 100;
+    const unsigned recovery_count = 20;
+#else // LEO_SHORT_FIELD
+    const unsigned input_count = 10000;
+    const unsigned recovery_count = 2000;
+#endif // LEO_SHORT_FIELD
+
+    // Fill GFLog table and GFExp table
+    InitField();
+
+    // Compute factors used in erasure decoder
+    InitFieldOperations();
+
+    // The first seed comes from the clock; the loop has no exit condition
+    for (unsigned seed = (unsigned)time(NULL);; ++seed)
+        test(input_count, recovery_count, seed);
+
+    return 0;
+}
--- a/tests/proj/Benchmark.vcxproj
+++ b/tests/proj/Benchmark.vcxproj
@ -20,36 +20,35 @@
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <ProjectGuid>{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}</ProjectGuid>
-    <RootNamespace>Fecal</RootNamespace>
+    <RootNamespace>Leopard</RootNamespace>
    <ProjectName>LeopardBenchmark</ProjectName>
-    <WindowsTargetPlatformVersion>10.0.14393.0</WindowsTargetPlatformVersion>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
--- a/tests/proj/Experiments.filters
+++ b/tests/proj/Experiments.filters
@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+      <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\benchmark.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
+</Project>
--- a/tests/proj/Experiments.vcxproj
+++ b/tests/proj/Experiments.vcxproj
@ -0,0 +1,181 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}</ProjectGuid>
+    <RootNamespace>Leopard</RootNamespace>
+    <ProjectName>LeopardExperiments</ProjectName>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v140</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v140</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v140</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v140</PlatformToolset>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
+    <IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
+    <IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
+    <IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
+    <IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <SDLCheck>true</SDLCheck>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+      <PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalLibraryDirectories>
+      </AdditionalLibraryDirectories>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <SDLCheck>true</SDLCheck>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+      <PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalLibraryDirectories>
+      </AdditionalLibraryDirectories>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <OmitFramePointers>false</OmitFramePointers>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <BufferSecurityCheck>true</BufferSecurityCheck>
+      <PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <AdditionalLibraryDirectories>
+      </AdditionalLibraryDirectories>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <OmitFramePointers>false</OmitFramePointers>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <BufferSecurityCheck>true</BufferSecurityCheck>
+      <PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <AdditionalLibraryDirectories>
+      </AdditionalLibraryDirectories>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\experiments.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>