diff --git a/LeopardCommon.cpp b/LeopardCommon.cpp index 82bdbcf..55850bc 100644 --- a/LeopardCommon.cpp +++ b/LeopardCommon.cpp @@ -139,818 +139,233 @@ void InitializeCPUArch() } - -// vx[] += vy[] * z -static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) -{ - for (unsigned i = 0; i < symbolCount; ++i) - { - const GFSymbol a = vy[i]; - if (a == 0) - continue; - - GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); - GFSymbol value1 = GFExp[sum1]; - if ((a & 0x0f) == 0) - { - value1 = 0; - } - GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); - GFSymbol value2 = GFExp[sum2]; - if ((a & 0xf0) == 0) - { - value2 = 0; - } - GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); - GFSymbol value3 = GFExp[sum3]; - if ((a & 0x0f00) == 0) - { - value3 = 0; - } - GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); - GFSymbol value4 = GFExp[sum4]; - if ((a & 0xf000) == 0) - { - value4 = 0; - } - - vx[i] ^= value1; - vx[i] ^= value2; - vx[i] ^= value3; - vx[i] ^= value4; - } -} - -// return a*GFExp[b] over GF(2^r) -static GFSymbol mulE(GFSymbol a, GFSymbol b) -{ - if (a == 0) - return 0; - - const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); - return GFExp[sum]; -} - - //------------------------------------------------------------------------------ -// Fast Walsh-Hadamard Transform (FWHT) Mod Q -// -// Q is the maximum symbol value, e.g. 255 or 65535. +// XOR Memory -// Define this to enable the optimized version of FWHT() -#define LEO_FWHT_OPTIMIZED - -typedef GFSymbol fwht_t; - -// {a, b} = {a + b, a - b} (Mod Q) -static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) +void xor_mem( + void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, + unsigned bytes) { - const fwht_t sum = AddModQ(a, b); - const fwht_t dif = SubModQ(a, b); - a = sum; - b = dif; -} - -/* - FWHT is a minor slice of the runtime and does not grow with data size, - but I did attempt a few additional optimizations that failed: - - I've attempted to vectorize (with partial reductions) FWHT_4(data, s), - which is 70% of the algorithm, but it was slower. Left in _attic_. - - I've attempted to avoid reductions in all or parts of the FWHT. - The final modular reduction ends up being slower than the savings. - Specifically I tried doing it for the whole FWHT and also I tried - doing it just for the FWHT_2 loop in the main routine, but both - approaches are slower than partial reductions. - - Replacing word reads with wider reads does speed up the operation, but - at too high a complexity cost relative to minor perf improvement. 
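A property this transform relies on but never states: with Q = 2^16 - 1 we have 2^16 == 1 (mod Q), so applying the unnormalized FWHT twice over all kGFBits levels multiplies every entry by 2^16 == 1 — the full-field transform is its own inverse, which is why the decoder can call FWHT back-to-back with no rescaling. A standalone sanity-check sketch (ours, not part of this patch; the name FWHT_Ref is hypothetical), assembled from the reference transform and the mod-Q helpers that appear elsewhere in this diff:

    // Verifies: FWHT(FWHT(x)) == x (mod Q) for Q = 2^16 - 1
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    typedef uint16_t fwht_t;
    static const unsigned kBits = 16;
    static const unsigned kQ = 65535;

    static fwht_t AddModQ(fwht_t a, fwht_t b)
    {
        const unsigned sum = (unsigned)a + b;
        return (fwht_t)(sum + (sum >> kBits)); // partial reduction: may return Q for 0
    }

    static fwht_t SubModQ(fwht_t a, fwht_t b)
    {
        const unsigned dif = (unsigned)a - b;
        return (fwht_t)(dif + (dif >> kBits));
    }

    // Reference transform, as in this diff
    static void FWHT_Ref(fwht_t* data, unsigned bits)
    {
        const unsigned size = 1U << bits;
        for (unsigned width = 1; width < size; width <<= 1)
            for (unsigned i = 0; i < size; i += (width << 1))
                for (unsigned j = i; j < (width + i); ++j)
                {
                    const fwht_t sum = AddModQ(data[j], data[j + width]);
                    const fwht_t dif = SubModQ(data[j], data[j + width]);
                    data[j] = sum;
                    data[j + width] = dif;
                }
    }

    int main()
    {
        static fwht_t data[1 << kBits], orig[1 << kBits];
        for (unsigned i = 0; i < (1U << kBits); ++i)
            orig[i] = data[i] = (fwht_t)(rand() % kQ);
        FWHT_Ref(data, kBits);
        FWHT_Ref(data, kBits); // H*H = 2^16 * I == I (mod Q)
        for (unsigned i = 0; i < (1U << kBits); ++i)
            if (data[i] % kQ != orig[i] % kQ) // % kQ folds the alias Q back to 0
            {
                printf("involution failed at %u\n", i);
                return 1;
            }
        printf("FWHT mod-Q involution holds\n");
        return 0;
    }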
-*/ - -#ifndef LEO_FWHT_OPTIMIZED - -// Reference implementation -static void FWHT(fwht_t* data, const unsigned bits) -{ - const unsigned size = (unsigned)(1UL << bits); - for (unsigned width = 1; width < size; width <<= 1) - for (unsigned i = 0; i < size; i += (width << 1)) - for (unsigned j = i; j < (width + i); ++j) - FWHT_2(data[j], data[j + width]); -} - -#else - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; -} - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) -{ - unsigned x = 0; - fwht_t t0 = data[x]; x += s; - fwht_t t1 = data[x]; x += s; - fwht_t t2 = data[x]; x += s; - fwht_t t3 = data[x]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - unsigned y = 0; - data[y] = t0; y += s; - data[y] = t1; y += s; - data[y] = t2; y += s; - data[y] = t3; -} - -static inline void FWHT_8(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; -} - -static inline void FWHT_16(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - fwht_t t8 = data[8]; - fwht_t t9 = data[9]; - fwht_t t10 = data[10]; - fwht_t t11 = data[11]; - fwht_t t12 = data[12]; - fwht_t t13 = data[13]; - fwht_t t14 = data[14]; - fwht_t t15 = data[15]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t8, t9); - FWHT_2(t10, t11); - FWHT_2(t12, t13); - FWHT_2(t14, t15); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t8, t10); - FWHT_2(t9, t11); - FWHT_2(t12, t14); - FWHT_2(t13, t15); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - FWHT_2(t8, t12); - FWHT_2(t9, t13); - FWHT_2(t10, t14); - FWHT_2(t11, t15); - FWHT_2(t0, t8); - FWHT_2(t1, t9); - FWHT_2(t2, t10); - FWHT_2(t3, t11); - FWHT_2(t4, t12); - FWHT_2(t5, t13); - FWHT_2(t6, t14); - FWHT_2(t7, t15); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; - data[8] = t8; - data[9] = t9; - data[10] = t10; - data[11] = t11; - data[12] = t12; - data[13] = t13; - data[14] = t14; - data[15] = t15; -} - -static void FWHT_SmallData(fwht_t* data, unsigned ldn) -{ - const unsigned n = (1UL << ldn); - - if (n <= 2) - { - if (n == 2) - FWHT_2(data[0], data[1]); - return; - } - - for (unsigned ldm = ldn; ldm > 3; ldm -= 2) - { - unsigned m = (1UL << ldm); - unsigned m4 = (m >> 2); - for (unsigned r = 0; r < n; r += m) - for (unsigned j = 0; j < m4; j++) - FWHT_4(data + j + r, m4); - } - - if (ldn & 1) - { - for (unsigned i0 = 0; i0 < n; i0 += 8) - FWHT_8(data + i0); - } - else - { - for (unsigned i0 = 0; i0 < n; i0 += 4) - FWHT_4(data + i0); - } -} - -// Decimation in time (DIT) version -static void FWHT(fwht_t* data, const unsigned 
ldn) -{ - if (ldn <= 13) - { - FWHT_SmallData(data, ldn); - return; - } - - FWHT_2(data[2], data[3]); - FWHT_4(data + 4); - FWHT_8(data + 8); - FWHT_16(data + 16); - for (unsigned ldm = 5; ldm < ldn; ++ldm) - FWHT(data + (unsigned)(1UL << ldm), ldm); - - for (unsigned ldm = 0; ldm < ldn; ++ldm) - { - const unsigned mh = (1UL << ldm); - for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) - FWHT_2(data[t1], data[t2]); - } -} - -#endif - - -//------------------------------------------------------------------------------ -// Memory Buffer XOR - -static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) -{ - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); - -#if defined(LEO_TARGET_MOBILE) -# if defined(LEO_TRY_NEON) - // Handle multiples of 64 bytes - if (CpuHasNeon) - { - while (bytes >= 64) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 x1 = vld1q_u8(x16 + 1); - LEO_M128 x2 = vld1q_u8(x16 + 2); - LEO_M128 x3 = vld1q_u8(x16 + 3); - LEO_M128 y0 = vld1q_u8(y16); - LEO_M128 y1 = vld1q_u8(y16 + 1); - LEO_M128 y2 = vld1q_u8(y16 + 2); - LEO_M128 y3 = vld1q_u8(y16 + 3); - - vst1q_u8(x16, veorq_u8(x0, y0)); - vst1q_u8(x16 + 1, veorq_u8(x1, y1)); - vst1q_u8(x16 + 2, veorq_u8(x2, y2)); - vst1q_u8(x16 + 3, veorq_u8(x3, y3)); - - bytes -= 64, x16 += 4, y16 += 4; - } - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 y0 = vld1q_u8(y16); - - vst1q_u8(x16, veorq_u8(x0, y0)); - - bytes -= 16, ++x16, ++y16; - } - } - else -# endif // LEO_TRY_NEON - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); - - const unsigned count = (unsigned)bytes / 8; - for (unsigned ii = 0; ii < count; ++ii) - x8[ii] ^= y8[ii]; - - x16 = reinterpret_cast(x8 + count); - y16 = reinterpret_cast(y8 + count); - } -#else // LEO_TARGET_MOBILE -# if defined(LEO_TRY_AVX2) +#if defined(LEO_TRY_AVX2) if (CpuHasAVX2) { - LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); - const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); - - while (bytes >= 128) + LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(vx); + const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(vy); + do { - LEO_M256 x0 = _mm256_loadu_si256(x32); - LEO_M256 y0 = _mm256_loadu_si256(y32); - x0 = _mm256_xor_si256(x0, y0); - LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); - LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); - x1 = _mm256_xor_si256(x1, y1); - LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); - LEO_M256 y2 = _mm256_loadu_si256(y32 + 2); - x2 = _mm256_xor_si256(x2, y2); - LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); - LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); - x3 = _mm256_xor_si256(x3, y3); - + const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32)); + const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1)); + const LEO_M256 x2 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 2), _mm256_loadu_si256(y32 + 2)); + const LEO_M256 x3 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 3), _mm256_loadu_si256(y32 + 3)); _mm256_storeu_si256(x32, x0); _mm256_storeu_si256(x32 + 1, x1); _mm256_storeu_si256(x32 + 2, x2); _mm256_storeu_si256(x32 + 3, x3); - bytes -= 128, x32 += 4, y32 += 4; - } - - // Handle multiples of 32 bytes - while (bytes >= 32) + } while (bytes >= 128); + if (bytes > 0) { - // x[i] = x[i] xor y[i] - _mm256_storeu_si256(x32, - _mm256_xor_si256( - _mm256_loadu_si256(x32), - _mm256_loadu_si256(y32))); - - bytes 
-= 32, ++x32, ++y32; + const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32)); + const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1)); + _mm256_storeu_si256(x32, x0); + _mm256_storeu_si256(x32 + 1, x1); } - - x16 = reinterpret_cast<LEO_M128 *>(x32); - y16 = reinterpret_cast<const LEO_M128 *>(y32); + return; } - else -# endif // LEO_TRY_AVX2 +#endif // LEO_TRY_AVX2 + LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx); + const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy); + do { - while (bytes >= 64) - { - LEO_M128 x0 = _mm_loadu_si128(x16); - LEO_M128 y0 = _mm_loadu_si128(y16); - x0 = _mm_xor_si128(x0, y0); - LEO_M128 x1 = _mm_loadu_si128(x16 + 1); - LEO_M128 y1 = _mm_loadu_si128(y16 + 1); - x1 = _mm_xor_si128(x1, y1); - LEO_M128 x2 = _mm_loadu_si128(x16 + 2); - LEO_M128 y2 = _mm_loadu_si128(y16 + 2); - x2 = _mm_xor_si128(x2, y2); - LEO_M128 x3 = _mm_loadu_si128(x16 + 3); - LEO_M128 y3 = _mm_loadu_si128(y16 + 3); - x3 = _mm_xor_si128(x3, y3); - - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); - - bytes -= 64, x16 += 4, y16 += 4; - } - } -#endif // LEO_TARGET_MOBILE - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - // x[i] = x[i] xor y[i] - _mm_storeu_si128(x16, - _mm_xor_si128( - _mm_loadu_si128(x16), - _mm_loadu_si128(y16))); - - bytes -= 16, ++x16, ++y16; - } - - uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16); - const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16); - - // Handle a block of 8 bytes - const unsigned eight = bytes & 8; - if (eight) - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1); - *x8 ^= *y8; - } - - // Handle a block of 4 bytes - const unsigned four = bytes & 4; - if (four) - { - uint32_t * LEO_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight); - const uint32_t * LEO_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight); - *x4 ^= *y4; - } - - // Handle final bytes - const unsigned offset = eight + four; - switch (bytes & 3) - { - case 3: x1[offset + 2] ^= y1[offset + 2]; - case 2: x1[offset + 1] ^= y1[offset + 1]; - case 1: x1[offset] ^= y1[offset]; - default: - break; - } + const LEO_M128 x0 = _mm_xor_si128(_mm_loadu_si128(x16), _mm_loadu_si128(y16)); + const LEO_M128 x1 = _mm_xor_si128(_mm_loadu_si128(x16 + 1), _mm_loadu_si128(y16 + 1)); + const LEO_M128 x2 = _mm_xor_si128(_mm_loadu_si128(x16 + 2), _mm_loadu_si128(y16 + 2)); + const LEO_M128 x3 = _mm_xor_si128(_mm_loadu_si128(x16 + 3), _mm_loadu_si128(y16 + 3)); + _mm_storeu_si128(x16, x0); + _mm_storeu_si128(x16 + 1, x1); + _mm_storeu_si128(x16 + 2, x2); + _mm_storeu_si128(x16 + 3, x3); + bytes -= 64, x16 += 4, y16 += 4; + } while (bytes > 0); } - -//------------------------------------------------------------------------------ -// Formal Derivative - -// Formal derivative of polynomial in the new basis -static void formal_derivative(GFSymbol* cos, const unsigned size) +void xor_mem2( + void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0, + void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1, + unsigned bytes) { - for (unsigned i = 1; i < size; ++i) +#if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) { - const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; - - // If a large number of values are being XORed: - if (leng >= 8) - xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); - else - for (unsigned j = i - leng; j < i; j++) - cos[j] ^= cos[j + leng]; + LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(vx_0); + const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<const LEO_M256 *>(vy_0); + LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(vx_1); + const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1); + do + { + const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); + const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); + const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2)); + const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3)); + const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); + const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); + const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2)); + const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3)); + _mm256_storeu_si256(x32_0, x0_0); + _mm256_storeu_si256(x32_0 + 1, x1_0); + _mm256_storeu_si256(x32_0 + 2, x2_0); + _mm256_storeu_si256(x32_0 + 3, x3_0); + _mm256_storeu_si256(x32_1, x0_1); + _mm256_storeu_si256(x32_1 + 1, x1_1); + _mm256_storeu_si256(x32_1 + 2, x2_1); + _mm256_storeu_si256(x32_1 + 3, x3_1); + x32_0 += 4, y32_0 += 4; + x32_1 += 4, y32_1 += 4; + bytes -= 128; + } while (bytes >= 128); + if (bytes > 0) + { + const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); + const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); + const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); + const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); + _mm256_storeu_si256(x32_0, x0_0); + _mm256_storeu_si256(x32_0 + 1, x1_0); + _mm256_storeu_si256(x32_1, x0_1); + _mm256_storeu_si256(x32_1 + 1, x1_1); + } + return; } - - for (unsigned i = size; i < kFieldSize; i <<= 1) - xor_mem(cos, cos + i, size * sizeof(GFSymbol)); +#endif // LEO_TRY_AVX2 + LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(vx_0); + const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<const LEO_M128 *>(vy_0); + LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(vx_1); + const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1); + do + { + const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0)); + const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1)); + const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2)); + const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3)); + const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1)); + const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1)); + const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2)); + const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3)); + _mm_storeu_si128(x16_0, x0_0); + _mm_storeu_si128(x16_0 + 1, x1_0); + _mm_storeu_si128(x16_0 + 2, x2_0); + _mm_storeu_si128(x16_0 + 3, x3_0); + _mm_storeu_si128(x16_1, x0_1); + _mm_storeu_si128(x16_1 + 1, x1_1); + _mm_storeu_si128(x16_1 + 2, x2_1); + _mm_storeu_si128(x16_1 + 3, x3_1); + x16_0 += 4, y16_0 += 4; + x16_1 += 4, y16_1 += 4; + bytes -= 64; + } while (bytes > 0); } -
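Note the contract change in these rewritten kernels: the old xor_mem handled arbitrary byte counts with 8/4/1-byte tails, while the new do/while loops consume 64 bytes (SSE) or 128 bytes (AVX2) per pass with a single 64-byte tail in the AVX2 path. In other words, bytes is now assumed to be a nonzero multiple of 64, and the AVX2 path additionally appears to need at least 128 bytes on its first iteration — consistent with the "Limit input to multiples of 64 bytes" item on the TODO list elsewhere in this diff. A scalar reference (ours; the name xor_mem_ref is hypothetical) is handy for cross-checking the SIMD paths on such buffers:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Byte-at-a-time reference for x[] ^= y[]
    static void xor_mem_ref(void* vx, const void* vy, unsigned bytes)
    {
        uint8_t* x = reinterpret_cast<uint8_t*>(vx);
        const uint8_t* y = reinterpret_cast<const uint8_t*>(vy);
        for (unsigned i = 0; i < bytes; ++i)
            x[i] ^= y[i];
    }

    int main()
    {
        uint8_t x[256], y[256], saved[256];
        for (unsigned i = 0; i < 256; ++i)
        {
            x[i] = (uint8_t)i;
            y[i] = (uint8_t)(i * 31 + 7);
        }
        memcpy(saved, x, 256);

        // XOR is an involution: applying it twice must restore x.
        // Swap xor_mem_ref for leopard's xor_mem (bytes = 256 satisfies
        // the multiple-of-64, >= 128 assumption) to validate the kernels.
        xor_mem_ref(x, y, 256);
        xor_mem_ref(x, y, 256);
        printf(memcmp(saved, x, 256) == 0 ? "ok\n" : "mismatch\n");
        return 0;
    }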
-//------------------------------------------------------------------------------ -// Fast Fourier Transform - -static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT - -// IFFT in the proposed basis -static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) +void xor_mem3( + void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0, + void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1, + void * LEO_RESTRICT vx_2, const void * LEO_RESTRICT vy_2, + unsigned bytes) { - for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) +#if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) + LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(vx_0); + const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<const LEO_M256 *>(vy_0); + LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(vx_1); + const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1); + LEO_M256 * LEO_RESTRICT x32_2 = reinterpret_cast<LEO_M256 *>(vx_2); + const LEO_M256 * LEO_RESTRICT y32_2 = reinterpret_cast<const LEO_M256 *>(vy_2); + do { - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - } - } -} - -// FFT in the proposed basis -static void FLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) + const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); + const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); + const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2)); + const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3)); + const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); + const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); + const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2)); + const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3)); + const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2)); + const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1)); + const LEO_M256 x2_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 2), _mm256_loadu_si256(y32_2 + 2)); + const LEO_M256 x3_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 3), _mm256_loadu_si256(y32_2 + 3)); + _mm256_storeu_si256(x32_0, x0_0); + _mm256_storeu_si256(x32_0 + 1, x1_0); + _mm256_storeu_si256(x32_0 + 2, x2_0); + _mm256_storeu_si256(x32_0 + 3, x3_0); + _mm256_storeu_si256(x32_1, x0_1); + _mm256_storeu_si256(x32_1 + 1, x1_1); + _mm256_storeu_si256(x32_1 + 2, x2_1); + _mm256_storeu_si256(x32_1 + 3, x3_1); + _mm256_storeu_si256(x32_2, x0_2); + _mm256_storeu_si256(x32_2 + 1, x1_2); + _mm256_storeu_si256(x32_2 + 2, x2_2); + _mm256_storeu_si256(x32_2 + 3, x3_2); + x32_0 += 4, y32_0 += 4; + x32_1 += 4, y32_1 += 4; + x32_2 += 4, y32_2 += 4; + bytes -= 128; + } while (bytes >= 128); + if (bytes > 0) { - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; + const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); + const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); + const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); + const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); + const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2)); + const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1)); + _mm256_storeu_si256(x32_0, x0_0); + _mm256_storeu_si256(x32_0 + 1, x1_0); + _mm256_storeu_si256(x32_1, x0_1); + _mm256_storeu_si256(x32_1 + 1, x1_1); + _mm256_storeu_si256(x32_2, x0_2); + _mm256_storeu_si256(x32_2 + 1, x1_2); + } + return; } -} - - -//------------------------------------------------------------------------------ -// FFT Initialization - -static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative -static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial - -// Initialize skewVec[], B[], log_walsh[] -static void InitFieldOperations() -{ - GFSymbol temp[kGFBits - 1]; - - for (unsigned i = 1; i < kGFBits; ++i) - temp[i - 1] = (GFSymbol)((unsigned)1 << i); - - for (unsigned m = 0; m < (kGFBits - 1); ++m) +#endif // LEO_TRY_AVX2 + LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(vx_0); + const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<const LEO_M128 *>(vy_0); + LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(vx_1); + const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1); + LEO_M128 * LEO_RESTRICT x16_2 = reinterpret_cast<LEO_M128 *>(vx_2); + const LEO_M128 * LEO_RESTRICT y16_2 = reinterpret_cast<const LEO_M128 *>(vy_2); + do { - const unsigned step = (unsigned)1 << (m + 1); - - skewVec[((unsigned)1 << m) - 1] = 0; - - for (unsigned i = m; i < (kGFBits - 1); ++i) - { - const unsigned s = ((unsigned)1 << (i + 1)); - - for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) - skewVec[j + s] = skewVec[j] ^ temp[i]; - } - - temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; - - for (unsigned i = m + 1; i < (kGFBits - 1); ++i) - temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); - } - - for (unsigned i = 0; i < kFieldSize; ++i) - skewVec[i] = GFLog[skewVec[i]]; - - temp[0] = kFieldModulus - temp[0]; - - for (unsigned i = 1; i < (kGFBits - 1); ++i) - temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; - - B[0] = 0; - for (unsigned i = 0; i < (kGFBits - 1); ++i) - { - const unsigned depart = ((unsigned)1 << i); - - for (unsigned j = 0; j < depart; ++j) - B[j + depart] = (B[j] + temp[i]) % kFieldModulus; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh[i] = GFLog[i]; - - log_walsh[0] = 0; - - FWHT(log_walsh, kGFBits); -} - - -//------------------------------------------------------------------------------ -// Encoder - -// Encoding alg for k/n<0.5: message is a power of two -static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) -{ - memcpy(codeword, data, sizeof(GFSymbol) * k); - - IFLT(codeword, k, 0); - - for (unsigned i = k; i <
kFieldSize; i += k) - { - memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); - - FLT(&codeword[i], k, i); - } - - memcpy(codeword, data, sizeof(GFSymbol) * k); -} - -// Encoding alg for k/n>0.5: parity is a power of two. -// data: message array. parity: parity array. mem: buffer(size>= n-k) -static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) -{ - const unsigned t = kFieldSize - k; - - memset(parity, 0, sizeof(GFSymbol) * t); - - for (unsigned i = t; i < kFieldSize; i += t) - { - memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); - - IFLT(mem, t, i); - - xor_mem(parity, mem, t * sizeof(GFSymbol)); - } - - FLT(parity, t, 0); -} - - -//------------------------------------------------------------------------------ -// Decoder - -static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) -{ - fwht_t log_walsh2[kFieldSize]; - - // Compute the evaluations of the error locator polynomial - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = erasure[i] ? 1 : 0; - - FWHT(log_walsh2, kGFBits); - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; - - FWHT(log_walsh2, kGFBits); - - // k2 can be replaced with k - const unsigned k2 = kFieldSize; - //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? - - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) - { - codeword[i] = 0; - } - else - { - codeword[i] = mulE(codeword[i], log_walsh2[i]); - } - } - - IFLT(codeword, kFieldSize, 0); - - // formal derivative - for (unsigned i = 0; i < kFieldSize; i += 2) - { - codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); - } - - formal_derivative(codeword, k2); - - for (unsigned i = 0; i < k2; i += 2) - { - codeword[i] = mulE(codeword[i], B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); - } - - FLT(codeword, k2, 0); - - for (unsigned i = 0; i < k2; ++i) - { - if (erasure[i]) - { - codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); - } - } -} - - -//------------------------------------------------------------------------------ -// Test Application - -void test(unsigned k, unsigned seed) -{ - srand(seed); - - //-----------Generating message---------- - - // Message array - GFSymbol data[kFieldSize] = {0}; - - // Filled with random numbers - for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) - data[i] = (GFSymbol)rand(); - - - //---------encoding---------- - - GFSymbol codeword[kFieldSize]; - encodeH(&data[kFieldSize - k], k, data, codeword); - //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? 
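The decode() routine above never spells out what its FWHT pair computes, so for the record: transforming the erasure indicator, multiplying pointwise by log_walsh (the transformed GFLog table), and transforming back yields — by the convolution theorem for the XOR group — log_walsh2[x] = sum over erased positions e of GFLog[x ^ e] (mod Q), i.e. the logarithm of the error-locator product over all erasures evaluated at every field point at once; the factor 2^16 from the unnormalized double transform vanishes because 2^16 == 1 (mod Q). A standalone sketch (ours, not patch code) checking that Walsh-domain pointwise products really are dyadic convolutions mod Q, at size 2^8 where the direct O(N^2) sum is cheap (there the scale factor 2^8 must be applied explicitly):

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    typedef uint16_t fwht_t;
    static const unsigned kQ = 65535; // field modulus, 2^16 - 1

    static fwht_t AddModQ(fwht_t a, fwht_t b)
    { const unsigned s = (unsigned)a + b; return (fwht_t)(s + (s >> 16)); }
    static fwht_t SubModQ(fwht_t a, fwht_t b)
    { const unsigned d = (unsigned)a - b; return (fwht_t)(d + (d >> 16)); }

    static void FWHT_Q(fwht_t* data, unsigned bits)
    {
        const unsigned size = 1U << bits;
        for (unsigned width = 1; width < size; width <<= 1)
            for (unsigned i = 0; i < size; i += (width << 1))
                for (unsigned j = i; j < (width + i); ++j)
                {
                    const fwht_t sum = AddModQ(data[j], data[j + width]);
                    const fwht_t dif = SubModQ(data[j], data[j + width]);
                    data[j] = sum; data[j + width] = dif;
                }
    }

    int main()
    {
        const unsigned bits = 8, n = 1U << bits;
        fwht_t f[256], g[256], fw[256], gw[256];
        for (unsigned i = 0; i < n; ++i)
        {
            f[i] = fw[i] = (fwht_t)(rand() % kQ);
            g[i] = gw[i] = (fwht_t)(rand() % kQ);
        }
        FWHT_Q(fw, bits);
        FWHT_Q(gw, bits);
        for (unsigned i = 0; i < n; ++i)
            fw[i] = (fwht_t)(((unsigned)fw[i] * gw[i]) % kQ);
        FWHT_Q(fw, bits); // now fw == 2^bits * (f conv g) mod Q

        for (unsigned x = 0; x < n; ++x)
        {
            unsigned direct = 0;
            for (unsigned y = 0; y < n; ++y)
                direct = (direct + (unsigned)f[y] * g[x ^ y]) % kQ;
            direct = (direct << bits) % kQ; // apply the 2^bits scale
            if (fw[x] % kQ != direct)
            { printf("convolution mismatch at %u\n", x); return 1; }
        }
        printf("Walsh-domain product == dyadic convolution (mod %u)\n", kQ);
        return 0;
    }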
- - memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); - - - //--------erasure simulation--------- - - // Array indicating erasures - bool erasure[kFieldSize] = { - false - }; - - for (unsigned i = k; i < kFieldSize; ++i) - erasure[i] = true; - - // permuting the erasure array - for (unsigned i = kFieldSize - 1; i > 0; --i) - { - unsigned pos = rand() % (i + 1); - - if (i != pos) - { - bool tmp = erasure[i]; - erasure[i] = erasure[pos]; - erasure[pos] = tmp; - } - } - - // erasure codeword symbols - for (unsigned i = 0; i < kFieldSize; ++i) - if (erasure[i]) - codeword[i] = 0; - - - //---------main processing---------- - decode(codeword, k, erasure); - - // Check the correctness of the result - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i] == 1) - { - if (data[i] != codeword[i]) - { - printf("Decoding Error with seed = %d!\n", seed); - LEO_DEBUG_BREAK; - return; - } - } - } - - //printf("Decoding is successful!\n"); -} - - -//------------------------------------------------------------------------------ -// Entrypoint - -int main(int argc, char **argv) -{ - // Initialize architecture-specific code - leo_architecture_init(); - - // Fill GFLog table and GFExp table - InitField(); - - // Compute factors used in erasure decoder - InitFieldOperations(); - - unsigned seed = (unsigned)time(NULL); - for (;;) - { - // test(int k), k: message size - /* - EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, - s.t. the number of recovery pieces is a power of two - */ - test(kFieldSize / 2, seed); - - ++seed; - } - - return 0; + const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0)); + const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1)); + const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2)); + const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3)); + const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1)); + const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1)); + const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2)); + const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3)); + const LEO_M128 x0_2 = _mm_xor_si128(_mm_loadu_si128(x16_2), _mm_loadu_si128(y16_2)); + const LEO_M128 x1_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 1), _mm_loadu_si128(y16_2 + 1)); + const LEO_M128 x2_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 2), _mm_loadu_si128(y16_2 + 2)); + const LEO_M128 x3_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 3), _mm_loadu_si128(y16_2 + 3)); + _mm_storeu_si128(x16_0, x0_0); + _mm_storeu_si128(x16_0 + 1, x1_0); + _mm_storeu_si128(x16_0 + 2, x2_0); + _mm_storeu_si128(x16_0 + 3, x3_0); + _mm_storeu_si128(x16_1, x0_1); + _mm_storeu_si128(x16_1 + 1, x1_1); + _mm_storeu_si128(x16_1 + 2, x2_1); + _mm_storeu_si128(x16_1 + 3, x3_1); + _mm_storeu_si128(x16_2, x0_2); + _mm_storeu_si128(x16_2 + 1, x1_2); + _mm_storeu_si128(x16_2 + 2, x2_2); + _mm_storeu_si128(x16_2 + 3, x3_2); + x16_0 += 4, y16_0 += 4; + x16_1 += 4, y16_1 += 4; + x16_2 += 4, y16_2 += 4; + bytes -= 64; + } while (bytes > 0); } diff --git a/LeopardCommon.h b/LeopardCommon.h index 17425c0..a737304 100644 --- a/LeopardCommon.h +++ b/LeopardCommon.h @@ -30,42 +30,20 @@ /* TODO: - + Refactor software - + I think it should be split up into several C++ modules - + Replace GFSymbol with a file data pointer - + New 16-bit Muladd inner loops - + 
Class to contain the (large) muladd tables - + Preliminary benchmarks for large data! + New 8-bit Muladd inner loops - + Benchmarks for smaller data! - + Write detailed comments for all the routines - + Look into getting EncodeL working so we can support smaller data (Ask Lin) - + Look into using k instead of k2 to speed up decoder (Ask Lin) - + Avoid performing FFT/IFFT intermediate calculations we're not going to use - + Benchmarks, fun! + + Benchmarks for smaller data! + + New 16-bit Muladd inner loops + + Benchmarks for large data! + + Use parallel row ops + Add multi-threading to split up long parallelizable calculations - + Final benchmarks! - + Finish up documentation + + Write detailed comments for all the routines + + Final benchmarks! + Release version 1 + + Finish up documentation - - Muladd implementation notes: - - Specialize for 1-3 rows at a time since often times we're multiplying by - the same (skew) value repeatedly, as the ISA-L library does here: - - https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 - - Except we should be doing it for 16-bit Galois Field. - To implement that use the ALTMAP trick from Jerasure: - - http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 - - Except we should also support AVX2 since that is a 40% perf boost, so put - the high and low bytes 32 bytes instead of 16 bytes apart. - - Also I think we should go ahead and precompute the multiply tables since - it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. + TBD: + + Look into getting EncodeL working so we can support smaller data (Ask Lin) + + Look into using FFT_m instead of FFT_n for decoder */ #include @@ -191,4 +169,57 @@ extern bool CpuHasSSSE3; #endif // LEO_TARGET_MOBILE +//------------------------------------------------------------------------------ +// Portable Intrinsics + +#ifdef _MSC_VER +#include <intrin.h> +#endif + +// Returns highest bit index 0..31 where the first non-zero bit is found +// Precondition: x != 0 +LEO_FORCE_INLINE unsigned LastNonzeroBit32(unsigned x) +{ +#ifdef _MSC_VER + unsigned long index; + // Note: Ignoring result because x != 0 + _BitScanReverse(&index, (uint32_t)x); + return (unsigned)index; +#else + // Note: Ignoring return value of 0 because x != 0 + return 31 - (unsigned)__builtin_clz(x); +#endif +} + +// Returns next power of two at or above given value +LEO_FORCE_INLINE unsigned NextPow2(unsigned n) +{ + return 2UL << LastNonzeroBit32(n - 1); +} + + +//------------------------------------------------------------------------------ +// XOR Memory +// +// This works for both 8-bit and 16-bit finite fields + +// x[] ^= y[] +void xor_mem( + void * LEO_RESTRICT x, const void * LEO_RESTRICT y, + unsigned bytes); + +// For i = {0, 1}: x_i[] ^= y_i[] +void xor_mem2( + void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1, + unsigned bytes); + +// For i = {0, 1, 2}: x_i[] ^= y_i[] +void xor_mem3( + void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1, + void * LEO_RESTRICT x_2, const void * LEO_RESTRICT y_2, + unsigned bytes); + + } // namespace leopard diff --git a/LeopardDecoder.cpp b/LeopardDecoder.cpp deleted file mode 100644 index 71d22e2..0000000 --- a/LeopardDecoder.cpp +++ /dev/null @@ -1,1220 +0,0 @@ -/* - Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of LHC-RS nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. -*/ - -#include -#include -#include -#include -#include - - -/* - TODO: - + Write C API and unit tester - + Limit input to multiples of 64 bytes - + Replace GFSymbol with a file data pointer - + New 16-bit Muladd inner loops - + Class to contain the (large) muladd tables - + Preliminary benchmarks for large data! - + New 8-bit Muladd inner loops - + Benchmarks for smaller data! - + Refactor software - + Pick a name for the software better than LEO_RS - + I think it should be split up into several C++ modules - + Write detailed comments for all the routines - + Look into getting EncodeL working so we can support smaller data (Ask Lin) - + Look into using k instead of k2 to speed up decoder (Ask Lin) - + Avoid performing FFT/IFFT intermediate calculations we're not going to use - + Benchmarks, fun! - + Add multi-threading to split up long parallelizable calculations - + Final benchmarks! - + Finish up documentation - + Release version 1 - - - Muladd implementation notes: - - Specialize for 1-3 rows at a time since often times we're multiplying by - the same (skew) value repeatedly, as the ISA-L library does here: - - https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 - - Except we should be doing it for 16-bit Galois Field. - To implement that use the ALTMAP trick from Jerasure: - - http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 - - Except we should also support AVX2 since that is a 40% perf boost, so put - the high and low bytes 32 bytes instead of 16 bytes apart. - - Also I think we should go ahead and precompute the multiply tables since - it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. 
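To make the table scheme sketched in those notes concrete: because GF(2^16) multiplication distributes over XOR, a 16-bit symbol a = a0 ^ a1 ^ a2 ^ a3 (its four 4-bit slices) satisfies a*y = a0*y ^ a1*y ^ a2*y ^ a3*y, so a fixed multiplier y needs only four 16-entry tables — exactly the shape a PSHUFB lookup (or its 32-byte AVX2 form) consumes per lane. A scalar demonstration (ours, not patch code; GFMul is a plain polynomial-basis multiply modulo the kGFPolynomial 0x1002D used in this codebase, standing in for the Cantor-basis log/exp multiply):

    #include <cstdint>
    #include <cstdio>

    // Carryless multiply reduced mod x^16 + x^5 + x^3 + x^2 + 1 (0x1002D)
    static uint16_t GFMul(uint16_t a, uint16_t b)
    {
        uint32_t prod = 0, aa = a;
        for (unsigned i = 0; i < 16; ++i, aa <<= 1)
            if (b & (1u << i))
                prod ^= aa;
        for (int bit = 30; bit >= 16; --bit)
            if (prod & (1u << bit))
                prod ^= (uint32_t)0x1002D << (bit - 16);
        return (uint16_t)prod;
    }

    int main()
    {
        const uint16_t y = 0x1234; // fixed (skew) multiplier
        uint16_t lut[4][16];
        for (unsigned slot = 0; slot < 4; ++slot)
            for (unsigned nib = 0; nib < 16; ++nib)
                lut[slot][nib] = GFMul((uint16_t)(nib << (4 * slot)), y);

        for (unsigned a16 = 0; a16 < 65536; ++a16) // exhaustive check
        {
            const uint16_t a = (uint16_t)a16;
            const uint16_t viaTables = lut[0][a & 15] ^ lut[1][(a >> 4) & 15]
                                     ^ lut[2][(a >> 8) & 15] ^ lut[3][a >> 12];
            if (viaTables != GFMul(a, y))
            { printf("mismatch at %u\n", a16); return 1; }
        }
        printf("4-bit slice tables agree with direct GF(2^16) multiply\n");
        return 0;
    }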
-*/ - - -//------------------------------------------------------------------------------ -// Debug - -// Some bugs only repro in release mode, so this can be helpful -//#define LEO_DEBUG_IN_RELEASE - -#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) - #define LEO_DEBUG - #ifdef _WIN32 - #define LEO_DEBUG_BREAK __debugbreak() - #else - #define LEO_DEBUG_BREAK __builtin_trap() - #endif - #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } -#else - #define LEO_DEBUG_BREAK ; - #define LEO_DEBUG_ASSERT(cond) ; -#endif - - -//------------------------------------------------------------------------------ -// Platform/Architecture - -#if defined(ANDROID) || defined(IOS) - #define LEO_TARGET_MOBILE -#endif // ANDROID - -#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) - #define LEO_TRY_AVX2 /* 256-bit */ - #include - #define LEO_ALIGN_BYTES 32 -#else // __AVX2__ - #define LEO_ALIGN_BYTES 16 -#endif // __AVX2__ - -#if !defined(LEO_TARGET_MOBILE) - // Note: MSVC currently only supports SSSE3 but not AVX2 - #include // SSSE3: _mm_shuffle_epi8 - #include // SSE2 -#endif // LEO_TARGET_MOBILE - -#if defined(HAVE_ARM_NEON_H) - #include -#endif // HAVE_ARM_NEON_H - -#if defined(LEO_TARGET_MOBILE) - - #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ - -# if defined(HAVE_ARM_NEON_H) - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 uint8x16_t - #define LEO_TRY_NEON -#else - #define LEO_M128 uint64_t -# endif - -#else // LEO_TARGET_MOBILE - - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 __m128i - -#endif // LEO_TARGET_MOBILE - -#ifdef LEO_TRY_AVX2 - // Compiler-specific 256-bit SIMD register keyword - #define LEO_M256 __m256i -#endif - -// Compiler-specific C++11 restrict keyword -#define LEO_RESTRICT __restrict - -// Compiler-specific force inline keyword -#ifdef _MSC_VER - #define LEO_FORCE_INLINE inline __forceinline -#else - #define LEO_FORCE_INLINE inline __attribute__((always_inline)) -#endif - -// Compiler-specific alignment keyword -// Note: Alignment only matters for ARM NEON where it should be 16 -#ifdef _MSC_VER - #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) -#else // _MSC_VER - #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) -#endif // _MSC_VER - - -//------------------------------------------------------------------------------ -// Runtime CPU Architecture Check -// -// Feature checks stolen shamelessly from -// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c - -#if defined(HAVE_ANDROID_GETCPUFEATURES) - #include -#endif - -#if defined(LEO_TRY_NEON) -# if defined(IOS) && defined(__ARM_NEON__) - // Requires iPhone 5S or newer - static const bool CpuHasNeon = true; - static const bool CpuHasNeon64 = true; -# else - // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures - static bool CpuHasNeon = false; // V6 / V7 - static bool CpuHasNeon64 = false; // 64-bit -# endif -#endif - - -#if !defined(LEO_TARGET_MOBILE) - -#ifdef _MSC_VER - #include // __cpuid - #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX -#endif - -#ifdef LEO_TRY_AVX2 -static bool CpuHasAVX2 = false; -#endif -static bool CpuHasSSSE3 = false; - -#define CPUID_EBX_AVX2 0x00000020 -#define CPUID_ECX_SSSE3 0x00000200 - -static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) -{ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) - __cpuid((int 
*) cpu_info, cpu_info_type); -#else //if defined(HAVE_CPUID) - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -# ifdef __i386__ - __asm__ __volatile__ ("pushfl; pushfl; " - "popl %0; " - "movl %0, %1; xorl %2, %0; " - "pushl %0; " - "popfl; pushfl; popl %0; popfl" : - "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : - "i" (0x200000)); - if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { - return; /* LCOV_EXCL_LINE */ - } -# endif -# ifdef __i386__ - __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# elif defined(__x86_64__) - __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# else - __asm__ __volatile__ ("cpuid" : - "=a" (cpu_info[0]), "=b" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# endif -#endif -} - -#endif // defined(LEO_TARGET_MOBILE) - - -static void leo_architecture_init() -{ -#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) - AndroidCpuFamily family = android_getCpuFamily(); - if (family == ANDROID_CPU_FAMILY_ARM) - { - if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) - CpuHasNeon = true; - } - else if (family == ANDROID_CPU_FAMILY_ARM64) - { - CpuHasNeon = true; - if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) - CpuHasNeon64 = true; - } -#endif - -#if !defined(LEO_TARGET_MOBILE) - unsigned int cpu_info[4]; - - _cpuid(cpu_info, 1); - CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); - -#if defined(LEO_TRY_AVX2) - _cpuid(cpu_info, 7); - CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); -#endif // LEO_TRY_AVX2 - -#endif // LEO_TARGET_MOBILE -} - - -//------------------------------------------------------------------------------ -// SIMD-Safe Aligned Memory Allocations - -static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; - -LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) -{ - return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); -} - -static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) -{ - uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); - if (!data) - return nullptr; - unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); - data += kAlignmentBytes - offset; - data[-1] = (uint8_t)offset; - return data; -} - -static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) -{ - if (!ptr) - return; - uint8_t* data = (uint8_t*)ptr; - unsigned offset = data[-1]; - if (offset >= kAlignmentBytes) - { - LEO_DEBUG_BREAK; // Should never happen - return; - } - data -= kAlignmentBytes - offset; - free(data); -} - - -//------------------------------------------------------------------------------ -// Field - -//#define LEO_SHORT_FIELD - -#ifdef LEO_SHORT_FIELD -typedef uint8_t GFSymbol; -static const unsigned kGFBits = 8; -static const unsigned kGFPolynomial = 0x11D; -GFSymbol kGFBasis[kGFBits] = { - 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis -}; -#else -typedef uint16_t GFSymbol; -static const unsigned kGFBits = 16; -static const unsigned kGFPolynomial = 0x1002D; -GFSymbol kGFBasis[kGFBits] = { - 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis - 0xC582, 0xED2E, 0x914C, 0x4012, - 0x6C98, 0x10D8, 0x6A72, 0xB900, - 0xFDB8, 0xFB34, 0xFF38, 0x991E -}; -#endif - -/* - Cantor Basis introduced by: - D. G. 
Cantor, "On arithmetical algorithms over finite fields", - Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. -*/ - -static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size -static const unsigned kFieldModulus = kFieldSize - 1; - -static GFSymbol GFLog[kFieldSize]; -static GFSymbol GFExp[kFieldSize]; - -// Initialize GFLog[], GFExp[] -static void InitField() -{ - unsigned state = 1; - for (unsigned i = 0; i < kFieldModulus; ++i) - { - GFExp[state] = static_cast(i); - state <<= 1; - if (state >= kFieldSize) - state ^= kGFPolynomial; - } - GFExp[0] = kFieldModulus; - - // Conversion to chosen basis: - - GFLog[0] = 0; - for (unsigned i = 0; i < kGFBits; ++i) - { - const GFSymbol basis = kGFBasis[i]; - const unsigned width = (unsigned)(1UL << i); - - for (unsigned j = 0; j < width; ++j) - GFLog[j + width] = GFLog[j] ^ basis; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - GFLog[i] = GFExp[GFLog[i]]; - - for (unsigned i = 0; i < kFieldSize; ++i) - GFExp[GFLog[i]] = i; - - GFExp[kFieldModulus] = GFExp[0]; -} - - -//------------------------------------------------------------------------------ -// Mod Q Field Operations -// -// Q is the maximum symbol value, e.g. 255 or 65535. - -// z = x + y (mod Q) -static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b) -{ - const unsigned sum = (unsigned)a + b; - - // Partial reduction step, allowing for Q to be returned - return static_cast(sum + (sum >> kGFBits)); -} - -// z = x - y (mod Q) -static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) -{ - const unsigned dif = (unsigned)a - b; - - // Partial reduction step, allowing for Q to be returned - return static_cast(dif + (dif >> kGFBits)); -} - -// vx[] += vy[] * z -static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) -{ - for (unsigned i = 0; i < symbolCount; ++i) - { - const GFSymbol a = vy[i]; - if (a == 0) - continue; - - GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); - GFSymbol value1 = GFExp[sum1]; - if ((a & 0x0f) == 0) - { - value1 = 0; - } - GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); - GFSymbol value2 = GFExp[sum2]; - if ((a & 0xf0) == 0) - { - value2 = 0; - } - GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); - GFSymbol value3 = GFExp[sum3]; - if ((a & 0x0f00) == 0) - { - value3 = 0; - } - GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); - GFSymbol value4 = GFExp[sum4]; - if ((a & 0xf000) == 0) - { - value4 = 0; - } - - vx[i] ^= value1; - vx[i] ^= value2; - vx[i] ^= value3; - vx[i] ^= value4; - } -} - -// return a*GFExp[b] over GF(2^r) -static GFSymbol mulE(GFSymbol a, GFSymbol b) -{ - if (a == 0) - return 0; - - const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); - return GFExp[sum]; -} - - -//------------------------------------------------------------------------------ -// Fast Walsh-Hadamard Transform (FWHT) Mod Q -// -// Q is the maximum symbol value, e.g. 255 or 65535. 
- -// Define this to enable the optimized version of FWHT() -#define LEO_FWHT_OPTIMIZED - -typedef GFSymbol fwht_t; - -// {a, b} = {a + b, a - b} (Mod Q) -static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) -{ - const fwht_t sum = AddModQ(a, b); - const fwht_t dif = SubModQ(a, b); - a = sum; - b = dif; -} - -/* - FWHT is a minor slice of the runtime and does not grow with data size, - but I did attempt a few additional optimizations that failed: - - I've attempted to vectorize (with partial reductions) FWHT_4(data, s), - which is 70% of the algorithm, but it was slower. Left in _attic_. - - I've attempted to avoid reductions in all or parts of the FWHT. - The final modular reduction ends up being slower than the savings. - Specifically I tried doing it for the whole FWHT and also I tried - doing it just for the FWHT_2 loop in the main routine, but both - approaches are slower than partial reductions. - - Replacing word reads with wider reads does speed up the operation, but - at too high a complexity cost relative to minor perf improvement. -*/ - -#ifndef LEO_FWHT_OPTIMIZED - -// Reference implementation -static void FWHT(fwht_t* data, const unsigned bits) -{ - const unsigned size = (unsigned)(1UL << bits); - for (unsigned width = 1; width < size; width <<= 1) - for (unsigned i = 0; i < size; i += (width << 1)) - for (unsigned j = i; j < (width + i); ++j) - FWHT_2(data[j], data[j + width]); -} - -#else - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; -} - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) -{ - unsigned x = 0; - fwht_t t0 = data[x]; x += s; - fwht_t t1 = data[x]; x += s; - fwht_t t2 = data[x]; x += s; - fwht_t t3 = data[x]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - unsigned y = 0; - data[y] = t0; y += s; - data[y] = t1; y += s; - data[y] = t2; y += s; - data[y] = t3; -} - -static inline void FWHT_8(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; -} - -static inline void FWHT_16(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - fwht_t t8 = data[8]; - fwht_t t9 = data[9]; - fwht_t t10 = data[10]; - fwht_t t11 = data[11]; - fwht_t t12 = data[12]; - fwht_t t13 = data[13]; - fwht_t t14 = data[14]; - fwht_t t15 = data[15]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t8, t9); - FWHT_2(t10, t11); - FWHT_2(t12, t13); - FWHT_2(t14, t15); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t8, t10); - FWHT_2(t9, t11); - FWHT_2(t12, t14); - FWHT_2(t13, t15); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - FWHT_2(t8, t12); - FWHT_2(t9, t13); - 
FWHT_2(t10, t14); - FWHT_2(t11, t15); - FWHT_2(t0, t8); - FWHT_2(t1, t9); - FWHT_2(t2, t10); - FWHT_2(t3, t11); - FWHT_2(t4, t12); - FWHT_2(t5, t13); - FWHT_2(t6, t14); - FWHT_2(t7, t15); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; - data[8] = t8; - data[9] = t9; - data[10] = t10; - data[11] = t11; - data[12] = t12; - data[13] = t13; - data[14] = t14; - data[15] = t15; -} - -static void FWHT_SmallData(fwht_t* data, unsigned ldn) -{ - const unsigned n = (1UL << ldn); - - if (n <= 2) - { - if (n == 2) - FWHT_2(data[0], data[1]); - return; - } - - for (unsigned ldm = ldn; ldm > 3; ldm -= 2) - { - unsigned m = (1UL << ldm); - unsigned m4 = (m >> 2); - for (unsigned r = 0; r < n; r += m) - for (unsigned j = 0; j < m4; j++) - FWHT_4(data + j + r, m4); - } - - if (ldn & 1) - { - for (unsigned i0 = 0; i0 < n; i0 += 8) - FWHT_8(data + i0); - } - else - { - for (unsigned i0 = 0; i0 < n; i0 += 4) - FWHT_4(data + i0); - } -} - -// Decimation in time (DIT) version -static void FWHT(fwht_t* data, const unsigned ldn) -{ - if (ldn <= 13) - { - FWHT_SmallData(data, ldn); - return; - } - - FWHT_2(data[2], data[3]); - FWHT_4(data + 4); - FWHT_8(data + 8); - FWHT_16(data + 16); - for (unsigned ldm = 5; ldm < ldn; ++ldm) - FWHT(data + (unsigned)(1UL << ldm), ldm); - - for (unsigned ldm = 0; ldm < ldn; ++ldm) - { - const unsigned mh = (1UL << ldm); - for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) - FWHT_2(data[t1], data[t2]); - } -} - -#endif - - -//------------------------------------------------------------------------------ -// Memory Buffer XOR - -static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) -{ - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); - -#if defined(LEO_TARGET_MOBILE) -# if defined(LEO_TRY_NEON) - // Handle multiples of 64 bytes - if (CpuHasNeon) - { - while (bytes >= 64) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 x1 = vld1q_u8(x16 + 1); - LEO_M128 x2 = vld1q_u8(x16 + 2); - LEO_M128 x3 = vld1q_u8(x16 + 3); - LEO_M128 y0 = vld1q_u8(y16); - LEO_M128 y1 = vld1q_u8(y16 + 1); - LEO_M128 y2 = vld1q_u8(y16 + 2); - LEO_M128 y3 = vld1q_u8(y16 + 3); - - vst1q_u8(x16, veorq_u8(x0, y0)); - vst1q_u8(x16 + 1, veorq_u8(x1, y1)); - vst1q_u8(x16 + 2, veorq_u8(x2, y2)); - vst1q_u8(x16 + 3, veorq_u8(x3, y3)); - - bytes -= 64, x16 += 4, y16 += 4; - } - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 y0 = vld1q_u8(y16); - - vst1q_u8(x16, veorq_u8(x0, y0)); - - bytes -= 16, ++x16, ++y16; - } - } - else -# endif // LEO_TRY_NEON - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); - - const unsigned count = (unsigned)bytes / 8; - for (unsigned ii = 0; ii < count; ++ii) - x8[ii] ^= y8[ii]; - - x16 = reinterpret_cast(x8 + count); - y16 = reinterpret_cast(y8 + count); - } -#else // LEO_TARGET_MOBILE -# if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) - { - LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); - const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); - - while (bytes >= 128) - { - LEO_M256 x0 = _mm256_loadu_si256(x32); - LEO_M256 y0 = _mm256_loadu_si256(y32); - x0 = _mm256_xor_si256(x0, y0); - LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); - LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); - x1 = _mm256_xor_si256(x1, y1); - LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); - LEO_M256 y2 = _mm256_loadu_si256(y32 + 
2); - x2 = _mm256_xor_si256(x2, y2); - LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); - LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); - x3 = _mm256_xor_si256(x3, y3); - - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - _mm256_storeu_si256(x32 + 2, x2); - _mm256_storeu_si256(x32 + 3, x3); - - bytes -= 128, x32 += 4, y32 += 4; - } - - // Handle multiples of 32 bytes - while (bytes >= 32) - { - // x[i] = x[i] xor y[i] - _mm256_storeu_si256(x32, - _mm256_xor_si256( - _mm256_loadu_si256(x32), - _mm256_loadu_si256(y32))); - - bytes -= 32, ++x32, ++y32; - } - - x16 = reinterpret_cast(x32); - y16 = reinterpret_cast(y32); - } - else -# endif // LEO_TRY_AVX2 - { - while (bytes >= 64) - { - LEO_M128 x0 = _mm_loadu_si128(x16); - LEO_M128 y0 = _mm_loadu_si128(y16); - x0 = _mm_xor_si128(x0, y0); - LEO_M128 x1 = _mm_loadu_si128(x16 + 1); - LEO_M128 y1 = _mm_loadu_si128(y16 + 1); - x1 = _mm_xor_si128(x1, y1); - LEO_M128 x2 = _mm_loadu_si128(x16 + 2); - LEO_M128 y2 = _mm_loadu_si128(y16 + 2); - x2 = _mm_xor_si128(x2, y2); - LEO_M128 x3 = _mm_loadu_si128(x16 + 3); - LEO_M128 y3 = _mm_loadu_si128(y16 + 3); - x3 = _mm_xor_si128(x3, y3); - - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); - - bytes -= 64, x16 += 4, y16 += 4; - } - } -#endif // LEO_TARGET_MOBILE - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - // x[i] = x[i] xor y[i] - _mm_storeu_si128(x16, - _mm_xor_si128( - _mm_loadu_si128(x16), - _mm_loadu_si128(y16))); - - bytes -= 16, ++x16, ++y16; - } - - uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); - const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); - - // Handle a block of 8 bytes - const unsigned eight = bytes & 8; - if (eight) - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); - *x8 ^= *y8; - } - - // Handle a block of 4 bytes - const unsigned four = bytes & 4; - if (four) - { - uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); - const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); - *x4 ^= *y4; - } - - // Handle final bytes - const unsigned offset = eight + four; - switch (bytes & 3) - { - case 3: x1[offset + 2] ^= y1[offset + 2]; - case 2: x1[offset + 1] ^= y1[offset + 1]; - case 1: x1[offset] ^= y1[offset]; - default: - break; - } -} - - -//------------------------------------------------------------------------------ -// Formal Derivative - -// Formal derivative of polynomial in the new basis -static void formal_derivative(GFSymbol* cos, const unsigned size) -{ - for (unsigned i = 1; i < size; ++i) - { - const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; - - // If a large number of values are being XORed: - if (leng >= 8) - xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); - else - for (unsigned j = i - leng; j < i; j++) - cos[j] ^= cos[j + leng]; - } - - for (unsigned i = size; i < kFieldSize; i <<= 1) - xor_mem(cos, cos + i, size * sizeof(GFSymbol)); -} - - -//------------------------------------------------------------------------------ -// Fast Fourier Transform - -static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT - -// IFFT in the proposed basis -static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - 
depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - } - } -} - -// FFT in the proposed basis -static void FLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - } - } -} - - -//------------------------------------------------------------------------------ -// FFT Initialization - -static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative -static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial - -// Initialize skewVec[], B[], log_walsh[] -static void InitFieldOperations() -{ - GFSymbol temp[kGFBits - 1]; - - for (unsigned i = 1; i < kGFBits; ++i) - temp[i - 1] = (GFSymbol)((unsigned)1 << i); - - for (unsigned m = 0; m < (kGFBits - 1); ++m) - { - const unsigned step = (unsigned)1 << (m + 1); - - skewVec[((unsigned)1 << m) - 1] = 0; - - for (unsigned i = m; i < (kGFBits - 1); ++i) - { - const unsigned s = ((unsigned)1 << (i + 1)); - - for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) - skewVec[j + s] = skewVec[j] ^ temp[i]; - } - - temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; - - for (unsigned i = m + 1; i < (kGFBits - 1); ++i) - temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); - } - - for (unsigned i = 0; i < kFieldSize; ++i) - skewVec[i] = GFLog[skewVec[i]]; - - temp[0] = kFieldModulus - temp[0]; - - for (unsigned i = 1; i < (kGFBits - 1); ++i) - temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; - - B[0] = 0; - for (unsigned i = 0; i < (kGFBits - 1); ++i) - { - const unsigned depart = ((unsigned)1 << i); - - for (unsigned j = 0; j < depart; ++j) - B[j + depart] = (B[j] + temp[i]) % kFieldModulus; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh[i] = GFLog[i]; - - log_walsh[0] = 0; - - FWHT(log_walsh, kGFBits); -} - - -//------------------------------------------------------------------------------ -// Encoder - -// Encoding alg for k/n<0.5: message is a power of two -static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) -{ - memcpy(codeword, data, sizeof(GFSymbol) * k); - - IFLT(codeword, k, 0); - - for (unsigned i = k; i < kFieldSize; i += k) - { - memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); - - FLT(&codeword[i], k, i); - } - - memcpy(codeword, data, sizeof(GFSymbol) * k); -} - -// Encoding alg for k/n>0.5: parity is a power of two. -// data: message array. parity: parity array. 
mem: scratch buffer (size >= n - k) -static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) -{ - const unsigned t = kFieldSize - k; - - memset(parity, 0, sizeof(GFSymbol) * t); - - for (unsigned i = t; i < kFieldSize; i += t) - { - memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); - - IFLT(mem, t, i); - - xor_mem(parity, mem, t * sizeof(GFSymbol)); - } - - FLT(parity, t, 0); -} - - -//------------------------------------------------------------------------------ -// Decoder - -static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) -{ - fwht_t log_walsh2[kFieldSize]; // Note: 128 KB on the stack for the 16-bit field - - // Compute the evaluations of the error locator polynomial - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = erasure[i] ? 1 : 0; - - FWHT(log_walsh2, kGFBits); - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; - - FWHT(log_walsh2, kGFBits); - - // Ideally k2 would be k, but the transforms below still assume the full field size - const unsigned k2 = kFieldSize; - //const unsigned k2 = k; // does not work yet; unclear what else needs to change - - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) - { - codeword[i] = 0; - } - else - { - codeword[i] = mulE(codeword[i], log_walsh2[i]); - } - } - - IFLT(codeword, kFieldSize, 0); - - // Formal derivative - for (unsigned i = 0; i < kFieldSize; i += 2) - { - codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); - } - - formal_derivative(codeword, k2); - - for (unsigned i = 0; i < k2; i += 2) - { - codeword[i] = mulE(codeword[i], B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); - } - - FLT(codeword, k2, 0); - - for (unsigned i = 0; i < k2; ++i) - { - if (erasure[i]) - { - codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); - } - } -} - - -//------------------------------------------------------------------------------ -// Test Application - -void test(unsigned k, unsigned seed) -{ - srand(seed); - - //-----------Generating message---------- - - // Message array - GFSymbol data[kFieldSize] = {0}; - - // Fill with random numbers - for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) - data[i] = (GFSymbol)rand(); - - - //---------encoding---------- - - GFSymbol codeword[kFieldSize]; - encodeH(&data[kFieldSize - k], k, data, codeword); - //encodeL(data, k, codeword); // does not seem to work with any input; unclear what else needs to change
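One constraint at this call site is easy to miss: encodeH() only works when the parity count t = kFieldSize - k is a nonzero power of two, since its i += t walk must tile the field exactly and IFLT() only handles power-of-two block sizes (the comment in main() says the same). A minimal guard, sketched here with a hypothetical helper name that is not part of this code, could make the contract explicit:

    // Hypothetical guard (not in the original source): verify that the
    // number of recovery symbols t = kFieldSize - k is a nonzero power
    // of two before calling encodeH(), which silently assumes it.
    static bool IsEncodeHSupported(unsigned k)
    {
        const unsigned t = kFieldSize - k;
        return t != 0 && (t & (t - 1)) == 0;
    }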
- - memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); - - - //--------erasure simulation--------- - - // Array indicating erasures - bool erasure[kFieldSize] = { - false - }; - - for (unsigned i = k; i < kFieldSize; ++i) - erasure[i] = true; - - // Permute the erasure array with a Fisher-Yates shuffle - for (unsigned i = kFieldSize - 1; i > 0; --i) - { - unsigned pos = rand() % (i + 1); - - if (i != pos) - { - bool tmp = erasure[i]; - erasure[i] = erasure[pos]; - erasure[pos] = tmp; - } - } - - // Zero the erased codeword symbols - for (unsigned i = 0; i < kFieldSize; ++i) - if (erasure[i]) - codeword[i] = 0; - - - //---------main processing---------- - decode(codeword, k, erasure); - - // Check the correctness of the result - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) - { - if (data[i] != codeword[i]) - { - printf("Decoding Error with seed = %u!\n", seed); - LEO_DEBUG_BREAK; - return; - } - } - } - - //printf("Decoding is successful!\n"); -} - - -//------------------------------------------------------------------------------ -// Entrypoint - -int main(int argc, char **argv) -{ - // Initialize architecture-specific code - leo_architecture_init(); - - // Fill GFLog table and GFExp table - InitField(); - - // Compute factors used in erasure decoder - InitFieldOperations(); - - unsigned seed = (unsigned)time(NULL); - for (;;) - { - // test(k, seed); k is the message size - /* - EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, - s.t. the number of recovery pieces is a power of two - */ - test(kFieldSize / 2, seed); - - ++seed; - } - - return 0; -} diff --git a/LeopardDecoder.h b/LeopardDecoder.h deleted file mode 100644 index 71d22e2..0000000 --- a/LeopardDecoder.h +++ /dev/null @@ -1,1220 +0,0 @@ -/* - Copyright (c) 2017 Christopher A. Taylor. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of LHC-RS nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE.
-*/ - -#include <stdint.h> // uint8_t, uint16_t -#include <stdio.h> // printf -#include <stdlib.h> // rand, srand -#include <string.h> // memcpy, memset -#include <time.h> // time - - -/* - TODO: - + Write C API and unit tester - + Limit input to multiples of 64 bytes - + Replace GFSymbol with a file data pointer - + New 16-bit Muladd inner loops - + Class to contain the (large) muladd tables - + Preliminary benchmarks for large data! - + New 8-bit Muladd inner loops - + Benchmarks for smaller data! - + Refactor software - + Pick a name for the software better than LEO_RS - + I think it should be split up into several C++ modules - + Write detailed comments for all the routines - + Look into getting EncodeL working so we can support smaller data (Ask Lin) - + Look into using k instead of k2 to speed up decoder (Ask Lin) - + Avoid performing FFT/IFFT intermediate calculations we're not going to use - + Benchmarks, fun! - + Add multi-threading to split up long parallelizable calculations - + Final benchmarks! - + Finish up documentation - + Release version 1 - - - Muladd implementation notes: - - Specialize for 1-3 rows at a time since oftentimes we're multiplying by - the same (skew) value repeatedly, as the ISA-L library does here: - - https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 - - Except we should be doing it for a 16-bit Galois field. - To implement that, use the ALTMAP trick from Jerasure: - - http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 - - Except we should also support AVX2 since that is a 40% perf boost, so put - the high and low bytes 32 bytes instead of 16 bytes apart. - - Also I think we should go ahead and precompute the multiply tables since - it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. -*/ - - -//------------------------------------------------------------------------------ -// Debug - -// Some bugs only repro in release mode, so this can be helpful -//#define LEO_DEBUG_IN_RELEASE - -#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) - #define LEO_DEBUG - #ifdef _WIN32 - #define LEO_DEBUG_BREAK __debugbreak() - #else - #define LEO_DEBUG_BREAK __builtin_trap() - #endif - #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } -#else - #define LEO_DEBUG_BREAK ; - #define LEO_DEBUG_ASSERT(cond) ; -#endif - - -//------------------------------------------------------------------------------ -// Platform/Architecture - -#if defined(ANDROID) || defined(IOS) - #define LEO_TARGET_MOBILE -#endif // ANDROID - -#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) - #define LEO_TRY_AVX2 /* 256-bit */ - #include <immintrin.h> // AVX2 - #define LEO_ALIGN_BYTES 32 -#else // __AVX2__ - #define LEO_ALIGN_BYTES 16 -#endif // __AVX2__ - -#if !defined(LEO_TARGET_MOBILE) - // Note: MSVC currently only supports SSSE3 but not AVX2 - #include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8 - #include <emmintrin.h> // SSE2 -#endif // LEO_TARGET_MOBILE - -#if defined(HAVE_ARM_NEON_H) - #include <arm_neon.h> -#endif // HAVE_ARM_NEON_H - -#if defined(LEO_TARGET_MOBILE) - - #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ - -# if defined(HAVE_ARM_NEON_H) - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 uint8x16_t - #define LEO_TRY_NEON -#else - #define LEO_M128 uint64_t -# endif - -#else // LEO_TARGET_MOBILE - - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 __m128i - -#endif // LEO_TARGET_MOBILE - -#ifdef LEO_TRY_AVX2 - // Compiler-specific 256-bit SIMD register keyword - #define LEO_M256 __m256i -#endif - -// Compiler-specific restrict keyword (a vendor extension, not standard C++) -#define LEO_RESTRICT __restrict
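A slightly more defensive definition of this macro could compile the hint away on unknown toolchains. This is a sketch under the assumption that MSVC, GCC and Clang are the compilers that matter here; all three accept the __restrict spelling:

    // Sketch: keep the aliasing hint where it is known to work,
    // and degrade to a no-op macro elsewhere.
    #if defined(_MSC_VER) || defined(__GNUC__) || defined(__clang__)
        #define LEO_RESTRICT __restrict
    #else
        #define LEO_RESTRICT
    #endif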
- -// Compiler-specific force inline keyword -#ifdef _MSC_VER - #define LEO_FORCE_INLINE inline __forceinline -#else - #define LEO_FORCE_INLINE inline __attribute__((always_inline)) -#endif - -// Compiler-specific alignment keyword -// Note: Alignment only matters for ARM NEON where it should be 16 -#ifdef _MSC_VER - #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) -#else // _MSC_VER - #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) -#endif // _MSC_VER - - -//------------------------------------------------------------------------------ -// Runtime CPU Architecture Check -// -// Feature checks stolen shamelessly from -// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c - -#if defined(HAVE_ANDROID_GETCPUFEATURES) - #include -#endif - -#if defined(LEO_TRY_NEON) -# if defined(IOS) && defined(__ARM_NEON__) - // Requires iPhone 5S or newer - static const bool CpuHasNeon = true; - static const bool CpuHasNeon64 = true; -# else - // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures - static bool CpuHasNeon = false; // V6 / V7 - static bool CpuHasNeon64 = false; // 64-bit -# endif -#endif - - -#if !defined(LEO_TARGET_MOBILE) - -#ifdef _MSC_VER - #include // __cpuid - #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX -#endif - -#ifdef LEO_TRY_AVX2 -static bool CpuHasAVX2 = false; -#endif -static bool CpuHasSSSE3 = false; - -#define CPUID_EBX_AVX2 0x00000020 -#define CPUID_ECX_SSSE3 0x00000200 - -static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) -{ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) - __cpuid((int *) cpu_info, cpu_info_type); -#else //if defined(HAVE_CPUID) - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -# ifdef __i386__ - __asm__ __volatile__ ("pushfl; pushfl; " - "popl %0; " - "movl %0, %1; xorl %2, %0; " - "pushl %0; " - "popfl; pushfl; popl %0; popfl" : - "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : - "i" (0x200000)); - if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { - return; /* LCOV_EXCL_LINE */ - } -# endif -# ifdef __i386__ - __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# elif defined(__x86_64__) - __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# else - __asm__ __volatile__ ("cpuid" : - "=a" (cpu_info[0]), "=b" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# endif -#endif -} - -#endif // defined(LEO_TARGET_MOBILE) - - -static void leo_architecture_init() -{ -#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) - AndroidCpuFamily family = android_getCpuFamily(); - if (family == ANDROID_CPU_FAMILY_ARM) - { - if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) - CpuHasNeon = true; - } - else if (family == ANDROID_CPU_FAMILY_ARM64) - { - CpuHasNeon = true; - if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) - CpuHasNeon64 = true; - } -#endif - -#if !defined(LEO_TARGET_MOBILE) - unsigned int cpu_info[4]; - - _cpuid(cpu_info, 1); - CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); - -#if defined(LEO_TRY_AVX2) - _cpuid(cpu_info, 7); - CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); -#endif // LEO_TRY_AVX2 - -#endif // 
LEO_TARGET_MOBILE -} - - -//------------------------------------------------------------------------------ -// SIMD-Safe Aligned Memory Allocations - -static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; - -LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) -{ - return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); -} - -static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) -{ - uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); - if (!data) - return nullptr; - unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); - data += kAlignmentBytes - offset; - // Record the shift in the byte just before the returned pointer so SIMDSafeFree() can undo it - data[-1] = (uint8_t)offset; - return data; -} - -static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) -{ - if (!ptr) - return; - uint8_t* data = (uint8_t*)ptr; - unsigned offset = data[-1]; - if (offset >= kAlignmentBytes) - { - LEO_DEBUG_BREAK; // Should never happen - return; - } - data -= kAlignmentBytes - offset; - free(data); -} - - -//------------------------------------------------------------------------------ -// Field - -//#define LEO_SHORT_FIELD - -#ifdef LEO_SHORT_FIELD -typedef uint8_t GFSymbol; -static const unsigned kGFBits = 8; -static const unsigned kGFPolynomial = 0x11D; -GFSymbol kGFBasis[kGFBits] = { - 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis -}; -#else -typedef uint16_t GFSymbol; -static const unsigned kGFBits = 16; -static const unsigned kGFPolynomial = 0x1002D; -GFSymbol kGFBasis[kGFBits] = { - 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis - 0xC582, 0xED2E, 0x914C, 0x4012, - 0x6C98, 0x10D8, 0x6A72, 0xB900, - 0xFDB8, 0xFB34, 0xFF38, 0x991E -}; -#endif - -/* - Cantor Basis introduced by: - D. G. Cantor, "On arithmetical algorithms over finite fields", - Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. -*/ - -static const unsigned kFieldSize = (unsigned)1 << kGFBits; // Field size -static const unsigned kFieldModulus = kFieldSize - 1; - -static GFSymbol GFLog[kFieldSize]; -static GFSymbol GFExp[kFieldSize]; - -// Initialize GFLog[], GFExp[] -static void InitField() -{ - unsigned state = 1; - for (unsigned i = 0; i < kFieldModulus; ++i) - { - GFExp[state] = static_cast<GFSymbol>(i); - state <<= 1; - if (state >= kFieldSize) - state ^= kGFPolynomial; - } - GFExp[0] = kFieldModulus; - - // Conversion to the chosen basis: - - GFLog[0] = 0; - for (unsigned i = 0; i < kGFBits; ++i) - { - const GFSymbol basis = kGFBasis[i]; - const unsigned width = (unsigned)(1UL << i); - - for (unsigned j = 0; j < width; ++j) - GFLog[j + width] = GFLog[j] ^ basis; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - GFLog[i] = GFExp[GFLog[i]]; - - for (unsigned i = 0; i < kFieldSize; ++i) - GFExp[GFLog[i]] = i; - - GFExp[kFieldModulus] = GFExp[0]; -} - - -//------------------------------------------------------------------------------ -// Mod Q Field Operations -// -// Q is the maximum symbol value, e.g. 255 or 65535.
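The helpers below reduce modulo Q = 2^kGFBits - 1 with a shift rather than a divide: since 2^kGFBits is congruent to 1 mod Q, the overflow bit of a (kGFBits+1)-bit intermediate can simply be folded back in as +1. The result can come out as exactly Q instead of 0, which is why the comments call it a partial reduction; the log/exp tables are laid out to tolerate an index of Q. A worked example of the addition path, with illustrative values in the 16-bit field:

    // AddModQ(60000, 30000) with kGFBits = 16, Q = 65535:
    //   sum       = 90000 (needs 17 bits)
    //   sum >> 16 = 1, and 2^16 mod 65535 = 1, so the carry folds in as +1
    //   sum + 1   = 90001; truncating to 16 bits subtracts 65536, giving 24465
    //   check: 90000 mod 65535 = 24465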
- -// Returns a + b (mod Q) -static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b) -{ - const unsigned sum = (unsigned)a + b; - - // Partial reduction step, allowing for Q to be returned - return static_cast<GFSymbol>(sum + (sum >> kGFBits)); -} - -// Returns a - b (mod Q) -static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) -{ - const unsigned dif = (unsigned)a - b; - - // Partial reduction step, allowing for Q to be returned - return static_cast<GFSymbol>(dif + (dif >> kGFBits)); -} - -// vx[] += vy[] * GFExp[z] (the multiplier is passed in log form) -static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) -{ - for (unsigned i = 0; i < symbolCount; ++i) - { - const GFSymbol a = vy[i]; - if (a == 0) - continue; - - // Split a into four nibble components; multiplication by a fixed element - // is GF(2)-linear, so the partial products XOR together to a * GFExp[z] - GFSymbol sum1 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f], z)); - GFSymbol value1 = GFExp[sum1]; - if ((a & 0x0f) == 0) - { - value1 = 0; - } - GFSymbol sum2 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf0], z)); - GFSymbol value2 = GFExp[sum2]; - if ((a & 0xf0) == 0) - { - value2 = 0; - } - GFSymbol sum3 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f00], z)); - GFSymbol value3 = GFExp[sum3]; - if ((a & 0x0f00) == 0) - { - value3 = 0; - } - GFSymbol sum4 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf000], z)); - GFSymbol value4 = GFExp[sum4]; - if ((a & 0xf000) == 0) - { - value4 = 0; - } - - vx[i] ^= value1; - vx[i] ^= value2; - vx[i] ^= value3; - vx[i] ^= value4; - } -} - -// Returns a * GFExp[b] over GF(2^r) -static GFSymbol mulE(GFSymbol a, GFSymbol b) -{ - if (a == 0) - return 0; - - const GFSymbol sum = static_cast<GFSymbol>(AddModQ(GFLog[a], b)); - return GFExp[sum]; -} - - -//------------------------------------------------------------------------------ -// Fast Walsh-Hadamard Transform (FWHT) Mod Q -// -// Q is the maximum symbol value, e.g. 255 or 65535. - -// Define this to enable the optimized version of FWHT() -#define LEO_FWHT_OPTIMIZED - -typedef GFSymbol fwht_t; - -// {a, b} = {a + b, a - b} (Mod Q) -static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) -{ - const fwht_t sum = AddModQ(a, b); - const fwht_t dif = SubModQ(a, b); - a = sum; - b = dif; -} - -/* - FWHT is a minor slice of the runtime and does not grow with data size, - but I did attempt a few additional optimizations that failed: - - I've attempted to vectorize (with partial reductions) FWHT_4(data, s), - which is 70% of the algorithm, but it was slower. Left in _attic_. - - I've attempted to avoid reductions in all or parts of the FWHT. - The final modular reduction ends up being slower than the savings. - Specifically I tried doing it for the whole FWHT and also I tried - doing it just for the FWHT_2 loop in the main routine, but both - approaches are slower than partial reductions. - - Replacing word reads with wider reads does speed up the operation, but - at too high a complexity cost relative to minor perf improvement.
-*/ - -#ifndef LEO_FWHT_OPTIMIZED - -// Reference implementation -static void FWHT(fwht_t* data, const unsigned bits) -{ - const unsigned size = (unsigned)(1UL << bits); - for (unsigned width = 1; width < size; width <<= 1) - for (unsigned i = 0; i < size; i += (width << 1)) - for (unsigned j = i; j < (width + i); ++j) - FWHT_2(data[j], data[j + width]); -} - -#else - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; -} - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) -{ - unsigned x = 0; - fwht_t t0 = data[x]; x += s; - fwht_t t1 = data[x]; x += s; - fwht_t t2 = data[x]; x += s; - fwht_t t3 = data[x]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - unsigned y = 0; - data[y] = t0; y += s; - data[y] = t1; y += s; - data[y] = t2; y += s; - data[y] = t3; -} - -static inline void FWHT_8(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; -} - -static inline void FWHT_16(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - fwht_t t8 = data[8]; - fwht_t t9 = data[9]; - fwht_t t10 = data[10]; - fwht_t t11 = data[11]; - fwht_t t12 = data[12]; - fwht_t t13 = data[13]; - fwht_t t14 = data[14]; - fwht_t t15 = data[15]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t8, t9); - FWHT_2(t10, t11); - FWHT_2(t12, t13); - FWHT_2(t14, t15); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t8, t10); - FWHT_2(t9, t11); - FWHT_2(t12, t14); - FWHT_2(t13, t15); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - FWHT_2(t8, t12); - FWHT_2(t9, t13); - FWHT_2(t10, t14); - FWHT_2(t11, t15); - FWHT_2(t0, t8); - FWHT_2(t1, t9); - FWHT_2(t2, t10); - FWHT_2(t3, t11); - FWHT_2(t4, t12); - FWHT_2(t5, t13); - FWHT_2(t6, t14); - FWHT_2(t7, t15); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; - data[8] = t8; - data[9] = t9; - data[10] = t10; - data[11] = t11; - data[12] = t12; - data[13] = t13; - data[14] = t14; - data[15] = t15; -} - -static void FWHT_SmallData(fwht_t* data, unsigned ldn) -{ - const unsigned n = (1UL << ldn); - - if (n <= 2) - { - if (n == 2) - FWHT_2(data[0], data[1]); - return; - } - - for (unsigned ldm = ldn; ldm > 3; ldm -= 2) - { - unsigned m = (1UL << ldm); - unsigned m4 = (m >> 2); - for (unsigned r = 0; r < n; r += m) - for (unsigned j = 0; j < m4; j++) - FWHT_4(data + j + r, m4); - } - - if (ldn & 1) - { - for (unsigned i0 = 0; i0 < n; i0 += 8) - FWHT_8(data + i0); - } - else - { - for (unsigned i0 = 0; i0 < n; i0 += 4) - FWHT_4(data + i0); - } -} - -// Decimation in time (DIT) version -static void FWHT(fwht_t* data, const unsigned 
ldn) -{ - if (ldn <= 13) - { - FWHT_SmallData(data, ldn); - return; - } - - FWHT_2(data[2], data[3]); - FWHT_4(data + 4); - FWHT_8(data + 8); - FWHT_16(data + 16); - for (unsigned ldm = 5; ldm < ldn; ++ldm) - FWHT(data + (unsigned)(1UL << ldm), ldm); - - for (unsigned ldm = 0; ldm < ldn; ++ldm) - { - const unsigned mh = (1UL << ldm); - for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) - FWHT_2(data[t1], data[t2]); - } -} - -#endif - - -//------------------------------------------------------------------------------ -// Memory Buffer XOR - -static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) -{ - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); - -#if defined(LEO_TARGET_MOBILE) -# if defined(LEO_TRY_NEON) - // Handle multiples of 64 bytes - if (CpuHasNeon) - { - while (bytes >= 64) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 x1 = vld1q_u8(x16 + 1); - LEO_M128 x2 = vld1q_u8(x16 + 2); - LEO_M128 x3 = vld1q_u8(x16 + 3); - LEO_M128 y0 = vld1q_u8(y16); - LEO_M128 y1 = vld1q_u8(y16 + 1); - LEO_M128 y2 = vld1q_u8(y16 + 2); - LEO_M128 y3 = vld1q_u8(y16 + 3); - - vst1q_u8(x16, veorq_u8(x0, y0)); - vst1q_u8(x16 + 1, veorq_u8(x1, y1)); - vst1q_u8(x16 + 2, veorq_u8(x2, y2)); - vst1q_u8(x16 + 3, veorq_u8(x3, y3)); - - bytes -= 64, x16 += 4, y16 += 4; - } - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 y0 = vld1q_u8(y16); - - vst1q_u8(x16, veorq_u8(x0, y0)); - - bytes -= 16, ++x16, ++y16; - } - } - else -# endif // LEO_TRY_NEON - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); - - const unsigned count = (unsigned)bytes / 8; - for (unsigned ii = 0; ii < count; ++ii) - x8[ii] ^= y8[ii]; - - x16 = reinterpret_cast(x8 + count); - y16 = reinterpret_cast(y8 + count); - } -#else // LEO_TARGET_MOBILE -# if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) - { - LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); - const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); - - while (bytes >= 128) - { - LEO_M256 x0 = _mm256_loadu_si256(x32); - LEO_M256 y0 = _mm256_loadu_si256(y32); - x0 = _mm256_xor_si256(x0, y0); - LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); - LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); - x1 = _mm256_xor_si256(x1, y1); - LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); - LEO_M256 y2 = _mm256_loadu_si256(y32 + 2); - x2 = _mm256_xor_si256(x2, y2); - LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); - LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); - x3 = _mm256_xor_si256(x3, y3); - - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - _mm256_storeu_si256(x32 + 2, x2); - _mm256_storeu_si256(x32 + 3, x3); - - bytes -= 128, x32 += 4, y32 += 4; - } - - // Handle multiples of 32 bytes - while (bytes >= 32) - { - // x[i] = x[i] xor y[i] - _mm256_storeu_si256(x32, - _mm256_xor_si256( - _mm256_loadu_si256(x32), - _mm256_loadu_si256(y32))); - - bytes -= 32, ++x32, ++y32; - } - - x16 = reinterpret_cast(x32); - y16 = reinterpret_cast(y32); - } - else -# endif // LEO_TRY_AVX2 - { - while (bytes >= 64) - { - LEO_M128 x0 = _mm_loadu_si128(x16); - LEO_M128 y0 = _mm_loadu_si128(y16); - x0 = _mm_xor_si128(x0, y0); - LEO_M128 x1 = _mm_loadu_si128(x16 + 1); - LEO_M128 y1 = _mm_loadu_si128(y16 + 1); - x1 = _mm_xor_si128(x1, y1); - LEO_M128 x2 = _mm_loadu_si128(x16 + 2); - LEO_M128 y2 = _mm_loadu_si128(y16 + 2); - x2 = _mm_xor_si128(x2, y2); - LEO_M128 x3 = _mm_loadu_si128(x16 + 3); - LEO_M128 y3 = 
_mm_loadu_si128(y16 + 3); - x3 = _mm_xor_si128(x3, y3); - - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); - - bytes -= 64, x16 += 4, y16 += 4; - } - } -#endif // LEO_TARGET_MOBILE - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - // x[i] = x[i] xor y[i] - _mm_storeu_si128(x16, - _mm_xor_si128( - _mm_loadu_si128(x16), - _mm_loadu_si128(y16))); - - bytes -= 16, ++x16, ++y16; - } - - uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); - const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); - - // Handle a block of 8 bytes - const unsigned eight = bytes & 8; - if (eight) - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); - *x8 ^= *y8; - } - - // Handle a block of 4 bytes - const unsigned four = bytes & 4; - if (four) - { - uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); - const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); - *x4 ^= *y4; - } - - // Handle final bytes - const unsigned offset = eight + four; - switch (bytes & 3) - { - case 3: x1[offset + 2] ^= y1[offset + 2]; - case 2: x1[offset + 1] ^= y1[offset + 1]; - case 1: x1[offset] ^= y1[offset]; - default: - break; - } -} - - -//------------------------------------------------------------------------------ -// Formal Derivative - -// Formal derivative of polynomial in the new basis -static void formal_derivative(GFSymbol* cos, const unsigned size) -{ - for (unsigned i = 1; i < size; ++i) - { - const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; - - // If a large number of values are being XORed: - if (leng >= 8) - xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); - else - for (unsigned j = i - leng; j < i; j++) - cos[j] ^= cos[j + leng]; - } - - for (unsigned i = size; i < kFieldSize; i <<= 1) - xor_mem(cos, cos + i, size * sizeof(GFSymbol)); -} - - -//------------------------------------------------------------------------------ -// Fast Fourier Transform - -static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT - -// IFFT in the proposed basis -static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - } - } -} - -// FFT in the proposed basis -static void FLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - } - } -} - - -//------------------------------------------------------------------------------ -// FFT Initialization - -static GFSymbol B[kFieldSize >> 1]; // factors used in 
formal derivative -static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial - -// Initialize skewVec[], B[], log_walsh[] -static void InitFieldOperations() -{ - GFSymbol temp[kGFBits - 1]; - - for (unsigned i = 1; i < kGFBits; ++i) - temp[i - 1] = (GFSymbol)((unsigned)1 << i); - - for (unsigned m = 0; m < (kGFBits - 1); ++m) - { - const unsigned step = (unsigned)1 << (m + 1); - - skewVec[((unsigned)1 << m) - 1] = 0; - - for (unsigned i = m; i < (kGFBits - 1); ++i) - { - const unsigned s = ((unsigned)1 << (i + 1)); - - for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) - skewVec[j + s] = skewVec[j] ^ temp[i]; - } - - temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; - - for (unsigned i = m + 1; i < (kGFBits - 1); ++i) - temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); - } - - for (unsigned i = 0; i < kFieldSize; ++i) - skewVec[i] = GFLog[skewVec[i]]; - - temp[0] = kFieldModulus - temp[0]; - - for (unsigned i = 1; i < (kGFBits - 1); ++i) - temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; - - B[0] = 0; - for (unsigned i = 0; i < (kGFBits - 1); ++i) - { - const unsigned depart = ((unsigned)1 << i); - - for (unsigned j = 0; j < depart; ++j) - B[j + depart] = (B[j] + temp[i]) % kFieldModulus; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh[i] = GFLog[i]; - - log_walsh[0] = 0; - - FWHT(log_walsh, kGFBits); -} - - -//------------------------------------------------------------------------------ -// Encoder - -// Encoding alg for k/n<0.5: message is a power of two -static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) -{ - memcpy(codeword, data, sizeof(GFSymbol) * k); - - IFLT(codeword, k, 0); - - for (unsigned i = k; i < kFieldSize; i += k) - { - memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); - - FLT(&codeword[i], k, i); - } - - memcpy(codeword, data, sizeof(GFSymbol) * k); -} - -// Encoding alg for k/n>0.5: parity is a power of two. -// data: message array. parity: parity array. mem: buffer(size>= n-k) -static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) -{ - const unsigned t = kFieldSize - k; - - memset(parity, 0, sizeof(GFSymbol) * t); - - for (unsigned i = t; i < kFieldSize; i += t) - { - memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); - - IFLT(mem, t, i); - - xor_mem(parity, mem, t * sizeof(GFSymbol)); - } - - FLT(parity, t, 0); -} - - -//------------------------------------------------------------------------------ -// Decoder - -static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) -{ - fwht_t log_walsh2[kFieldSize]; - - // Compute the evaluations of the error locator polynomial - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = erasure[i] ? 1 : 0; - - FWHT(log_walsh2, kGFBits); - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; - - FWHT(log_walsh2, kGFBits); - - // k2 can be replaced with k - const unsigned k2 = kFieldSize; - //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? 
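A note on the transform pair above, reasoning from the code rather than anything stated in it: the FWHT maps dyadic (XOR-index) convolution to pointwise multiplication, and applying the FWHT twice scales a length-2^kGFBits vector by 2^kGFBits. Because Q = 2^kGFBits - 1 makes that scale factor congruent to 1 mod Q, transforming the erasure indicator, multiplying by the pre-transformed log_walsh[], and transforming back yields the convolution directly, with no normalization pass:

    // After the second FWHT (mod Q arithmetic throughout):
    //   log_walsh2[i] = sum over erased positions e of GFLog[i ^ e] (mod Q)
    // i.e. the discrete log of the error-locator-style product that the
    // surrounding code applies with mulE(); the usual 1/2^n normalization
    // vanishes because 2^kGFBits mod kFieldModulus = 1.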
- - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) - { - codeword[i] = 0; - } - else - { - codeword[i] = mulE(codeword[i], log_walsh2[i]); - } - } - - IFLT(codeword, kFieldSize, 0); - - // formal derivative - for (unsigned i = 0; i < kFieldSize; i += 2) - { - codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); - } - - formal_derivative(codeword, k2); - - for (unsigned i = 0; i < k2; i += 2) - { - codeword[i] = mulE(codeword[i], B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); - } - - FLT(codeword, k2, 0); - - for (unsigned i = 0; i < k2; ++i) - { - if (erasure[i]) - { - codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); - } - } -} - - -//------------------------------------------------------------------------------ -// Test Application - -void test(unsigned k, unsigned seed) -{ - srand(seed); - - //-----------Generating message---------- - - // Message array - GFSymbol data[kFieldSize] = {0}; - - // Filled with random numbers - for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) - data[i] = (GFSymbol)rand(); - - - //---------encoding---------- - - GFSymbol codeword[kFieldSize]; - encodeH(&data[kFieldSize - k], k, data, codeword); - //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? - - memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); - - - //--------erasure simulation--------- - - // Array indicating erasures - bool erasure[kFieldSize] = { - false - }; - - for (unsigned i = k; i < kFieldSize; ++i) - erasure[i] = true; - - // permuting the erasure array - for (unsigned i = kFieldSize - 1; i > 0; --i) - { - unsigned pos = rand() % (i + 1); - - if (i != pos) - { - bool tmp = erasure[i]; - erasure[i] = erasure[pos]; - erasure[pos] = tmp; - } - } - - // erasure codeword symbols - for (unsigned i = 0; i < kFieldSize; ++i) - if (erasure[i]) - codeword[i] = 0; - - - //---------main processing---------- - decode(codeword, k, erasure); - - // Check the correctness of the result - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i] == 1) - { - if (data[i] != codeword[i]) - { - printf("Decoding Error with seed = %d!\n", seed); - LEO_DEBUG_BREAK; - return; - } - } - } - - //printf("Decoding is successful!\n"); -} - - -//------------------------------------------------------------------------------ -// Entrypoint - -int main(int argc, char **argv) -{ - // Initialize architecture-specific code - leo_architecture_init(); - - // Fill GFLog table and GFExp table - InitField(); - - // Compute factors used in erasure decoder - InitFieldOperations(); - - unsigned seed = (unsigned)time(NULL); - for (;;) - { - // test(int k), k: message size - /* - EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, - s.t. the number of recovery pieces is a power of two - */ - test(kFieldSize / 2, seed); - - ++seed; - } - - return 0; -} diff --git a/LeopardEncoder.cpp b/LeopardEncoder.cpp deleted file mode 100644 index 71d22e2..0000000 --- a/LeopardEncoder.cpp +++ /dev/null @@ -1,1220 +0,0 @@ -/* - Copyright (c) 2017 Christopher A. Taylor. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of LHC-RS nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. -*/ - -#include -#include -#include -#include -#include - - -/* - TODO: - + Write C API and unit tester - + Limit input to multiples of 64 bytes - + Replace GFSymbol with a file data pointer - + New 16-bit Muladd inner loops - + Class to contain the (large) muladd tables - + Preliminary benchmarks for large data! - + New 8-bit Muladd inner loops - + Benchmarks for smaller data! - + Refactor software - + Pick a name for the software better than LEO_RS - + I think it should be split up into several C++ modules - + Write detailed comments for all the routines - + Look into getting EncodeL working so we can support smaller data (Ask Lin) - + Look into using k instead of k2 to speed up decoder (Ask Lin) - + Avoid performing FFT/IFFT intermediate calculations we're not going to use - + Benchmarks, fun! - + Add multi-threading to split up long parallelizable calculations - + Final benchmarks! - + Finish up documentation - + Release version 1 - - - Muladd implementation notes: - - Specialize for 1-3 rows at a time since often times we're multiplying by - the same (skew) value repeatedly, as the ISA-L library does here: - - https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 - - Except we should be doing it for 16-bit Galois Field. - To implement that use the ALTMAP trick from Jerasure: - - http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 - - Except we should also support AVX2 since that is a 40% perf boost, so put - the high and low bytes 32 bytes instead of 16 bytes apart. - - Also I think we should go ahead and precompute the multiply tables since - it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. 
-*/ - - -//------------------------------------------------------------------------------ -// Debug - -// Some bugs only repro in release mode, so this can be helpful -//#define LEO_DEBUG_IN_RELEASE - -#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) - #define LEO_DEBUG - #ifdef _WIN32 - #define LEO_DEBUG_BREAK __debugbreak() - #else - #define LEO_DEBUG_BREAK __builtin_trap() - #endif - #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } -#else - #define LEO_DEBUG_BREAK ; - #define LEO_DEBUG_ASSERT(cond) ; -#endif - - -//------------------------------------------------------------------------------ -// Platform/Architecture - -#if defined(ANDROID) || defined(IOS) - #define LEO_TARGET_MOBILE -#endif // ANDROID - -#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) - #define LEO_TRY_AVX2 /* 256-bit */ - #include - #define LEO_ALIGN_BYTES 32 -#else // __AVX2__ - #define LEO_ALIGN_BYTES 16 -#endif // __AVX2__ - -#if !defined(LEO_TARGET_MOBILE) - // Note: MSVC currently only supports SSSE3 but not AVX2 - #include // SSSE3: _mm_shuffle_epi8 - #include // SSE2 -#endif // LEO_TARGET_MOBILE - -#if defined(HAVE_ARM_NEON_H) - #include -#endif // HAVE_ARM_NEON_H - -#if defined(LEO_TARGET_MOBILE) - - #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ - -# if defined(HAVE_ARM_NEON_H) - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 uint8x16_t - #define LEO_TRY_NEON -#else - #define LEO_M128 uint64_t -# endif - -#else // LEO_TARGET_MOBILE - - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 __m128i - -#endif // LEO_TARGET_MOBILE - -#ifdef LEO_TRY_AVX2 - // Compiler-specific 256-bit SIMD register keyword - #define LEO_M256 __m256i -#endif - -// Compiler-specific C++11 restrict keyword -#define LEO_RESTRICT __restrict - -// Compiler-specific force inline keyword -#ifdef _MSC_VER - #define LEO_FORCE_INLINE inline __forceinline -#else - #define LEO_FORCE_INLINE inline __attribute__((always_inline)) -#endif - -// Compiler-specific alignment keyword -// Note: Alignment only matters for ARM NEON where it should be 16 -#ifdef _MSC_VER - #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) -#else // _MSC_VER - #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) -#endif // _MSC_VER - - -//------------------------------------------------------------------------------ -// Runtime CPU Architecture Check -// -// Feature checks stolen shamelessly from -// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c - -#if defined(HAVE_ANDROID_GETCPUFEATURES) - #include -#endif - -#if defined(LEO_TRY_NEON) -# if defined(IOS) && defined(__ARM_NEON__) - // Requires iPhone 5S or newer - static const bool CpuHasNeon = true; - static const bool CpuHasNeon64 = true; -# else - // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures - static bool CpuHasNeon = false; // V6 / V7 - static bool CpuHasNeon64 = false; // 64-bit -# endif -#endif - - -#if !defined(LEO_TARGET_MOBILE) - -#ifdef _MSC_VER - #include // __cpuid - #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX -#endif - -#ifdef LEO_TRY_AVX2 -static bool CpuHasAVX2 = false; -#endif -static bool CpuHasSSSE3 = false; - -#define CPUID_EBX_AVX2 0x00000020 -#define CPUID_ECX_SSSE3 0x00000200 - -static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) -{ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) - __cpuid((int 
*) cpu_info, cpu_info_type); -#else //if defined(HAVE_CPUID) - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -# ifdef __i386__ - __asm__ __volatile__ ("pushfl; pushfl; " - "popl %0; " - "movl %0, %1; xorl %2, %0; " - "pushl %0; " - "popfl; pushfl; popl %0; popfl" : - "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : - "i" (0x200000)); - if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { - return; /* LCOV_EXCL_LINE */ - } -# endif -# ifdef __i386__ - __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# elif defined(__x86_64__) - __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# else - __asm__ __volatile__ ("cpuid" : - "=a" (cpu_info[0]), "=b" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# endif -#endif -} - -#endif // defined(LEO_TARGET_MOBILE) - - -static void leo_architecture_init() -{ -#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) - AndroidCpuFamily family = android_getCpuFamily(); - if (family == ANDROID_CPU_FAMILY_ARM) - { - if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) - CpuHasNeon = true; - } - else if (family == ANDROID_CPU_FAMILY_ARM64) - { - CpuHasNeon = true; - if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) - CpuHasNeon64 = true; - } -#endif - -#if !defined(LEO_TARGET_MOBILE) - unsigned int cpu_info[4]; - - _cpuid(cpu_info, 1); - CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); - -#if defined(LEO_TRY_AVX2) - _cpuid(cpu_info, 7); - CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); -#endif // LEO_TRY_AVX2 - -#endif // LEO_TARGET_MOBILE -} - - -//------------------------------------------------------------------------------ -// SIMD-Safe Aligned Memory Allocations - -static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; - -LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) -{ - return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); -} - -static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) -{ - uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); - if (!data) - return nullptr; - unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); - data += kAlignmentBytes - offset; - data[-1] = (uint8_t)offset; - return data; -} - -static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) -{ - if (!ptr) - return; - uint8_t* data = (uint8_t*)ptr; - unsigned offset = data[-1]; - if (offset >= kAlignmentBytes) - { - LEO_DEBUG_BREAK; // Should never happen - return; - } - data -= kAlignmentBytes - offset; - free(data); -} - - -//------------------------------------------------------------------------------ -// Field - -//#define LEO_SHORT_FIELD - -#ifdef LEO_SHORT_FIELD -typedef uint8_t GFSymbol; -static const unsigned kGFBits = 8; -static const unsigned kGFPolynomial = 0x11D; -GFSymbol kGFBasis[kGFBits] = { - 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis -}; -#else -typedef uint16_t GFSymbol; -static const unsigned kGFBits = 16; -static const unsigned kGFPolynomial = 0x1002D; -GFSymbol kGFBasis[kGFBits] = { - 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis - 0xC582, 0xED2E, 0x914C, 0x4012, - 0x6C98, 0x10D8, 0x6A72, 0xB900, - 0xFDB8, 0xFB34, 0xFF38, 0x991E -}; -#endif - -/* - Cantor Basis introduced by: - D. G. 
Cantor, "On arithmetical algorithms over finite fields", - Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. -*/ - -static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size -static const unsigned kFieldModulus = kFieldSize - 1; - -static GFSymbol GFLog[kFieldSize]; -static GFSymbol GFExp[kFieldSize]; - -// Initialize GFLog[], GFExp[] -static void InitField() -{ - unsigned state = 1; - for (unsigned i = 0; i < kFieldModulus; ++i) - { - GFExp[state] = static_cast(i); - state <<= 1; - if (state >= kFieldSize) - state ^= kGFPolynomial; - } - GFExp[0] = kFieldModulus; - - // Conversion to chosen basis: - - GFLog[0] = 0; - for (unsigned i = 0; i < kGFBits; ++i) - { - const GFSymbol basis = kGFBasis[i]; - const unsigned width = (unsigned)(1UL << i); - - for (unsigned j = 0; j < width; ++j) - GFLog[j + width] = GFLog[j] ^ basis; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - GFLog[i] = GFExp[GFLog[i]]; - - for (unsigned i = 0; i < kFieldSize; ++i) - GFExp[GFLog[i]] = i; - - GFExp[kFieldModulus] = GFExp[0]; -} - - -//------------------------------------------------------------------------------ -// Mod Q Field Operations -// -// Q is the maximum symbol value, e.g. 255 or 65535. - -// z = x + y (mod Q) -static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b) -{ - const unsigned sum = (unsigned)a + b; - - // Partial reduction step, allowing for Q to be returned - return static_cast(sum + (sum >> kGFBits)); -} - -// z = x - y (mod Q) -static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) -{ - const unsigned dif = (unsigned)a - b; - - // Partial reduction step, allowing for Q to be returned - return static_cast(dif + (dif >> kGFBits)); -} - -// vx[] += vy[] * z -static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) -{ - for (unsigned i = 0; i < symbolCount; ++i) - { - const GFSymbol a = vy[i]; - if (a == 0) - continue; - - GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); - GFSymbol value1 = GFExp[sum1]; - if ((a & 0x0f) == 0) - { - value1 = 0; - } - GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); - GFSymbol value2 = GFExp[sum2]; - if ((a & 0xf0) == 0) - { - value2 = 0; - } - GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); - GFSymbol value3 = GFExp[sum3]; - if ((a & 0x0f00) == 0) - { - value3 = 0; - } - GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); - GFSymbol value4 = GFExp[sum4]; - if ((a & 0xf000) == 0) - { - value4 = 0; - } - - vx[i] ^= value1; - vx[i] ^= value2; - vx[i] ^= value3; - vx[i] ^= value4; - } -} - -// return a*GFExp[b] over GF(2^r) -static GFSymbol mulE(GFSymbol a, GFSymbol b) -{ - if (a == 0) - return 0; - - const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); - return GFExp[sum]; -} - - -//------------------------------------------------------------------------------ -// Fast Walsh-Hadamard Transform (FWHT) Mod Q -// -// Q is the maximum symbol value, e.g. 255 or 65535. 
- -// Define this to enable the optimized version of FWHT() -#define LEO_FWHT_OPTIMIZED - -typedef GFSymbol fwht_t; - -// {a, b} = {a + b, a - b} (Mod Q) -static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) -{ - const fwht_t sum = AddModQ(a, b); - const fwht_t dif = SubModQ(a, b); - a = sum; - b = dif; -} - -/* - FWHT is a minor slice of the runtime and does not grow with data size, - but I did attempt a few additional optimizations that failed: - - I've attempted to vectorize (with partial reductions) FWHT_4(data, s), - which is 70% of the algorithm, but it was slower. Left in _attic_. - - I've attempted to avoid reductions in all or parts of the FWHT. - The final modular reduction ends up being slower than the savings. - Specifically I tried doing it for the whole FWHT and also I tried - doing it just for the FWHT_2 loop in the main routine, but both - approaches are slower than partial reductions. - - Replacing word reads with wider reads does speed up the operation, but - at too high a complexity cost relative to minor perf improvement. -*/ - -#ifndef LEO_FWHT_OPTIMIZED - -// Reference implementation -static void FWHT(fwht_t* data, const unsigned bits) -{ - const unsigned size = (unsigned)(1UL << bits); - for (unsigned width = 1; width < size; width <<= 1) - for (unsigned i = 0; i < size; i += (width << 1)) - for (unsigned j = i; j < (width + i); ++j) - FWHT_2(data[j], data[j + width]); -} - -#else - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; -} - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) -{ - unsigned x = 0; - fwht_t t0 = data[x]; x += s; - fwht_t t1 = data[x]; x += s; - fwht_t t2 = data[x]; x += s; - fwht_t t3 = data[x]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - unsigned y = 0; - data[y] = t0; y += s; - data[y] = t1; y += s; - data[y] = t2; y += s; - data[y] = t3; -} - -static inline void FWHT_8(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; -} - -static inline void FWHT_16(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - fwht_t t8 = data[8]; - fwht_t t9 = data[9]; - fwht_t t10 = data[10]; - fwht_t t11 = data[11]; - fwht_t t12 = data[12]; - fwht_t t13 = data[13]; - fwht_t t14 = data[14]; - fwht_t t15 = data[15]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t8, t9); - FWHT_2(t10, t11); - FWHT_2(t12, t13); - FWHT_2(t14, t15); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t8, t10); - FWHT_2(t9, t11); - FWHT_2(t12, t14); - FWHT_2(t13, t15); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - FWHT_2(t8, t12); - FWHT_2(t9, t13); - 
FWHT_2(t10, t14); - FWHT_2(t11, t15); - FWHT_2(t0, t8); - FWHT_2(t1, t9); - FWHT_2(t2, t10); - FWHT_2(t3, t11); - FWHT_2(t4, t12); - FWHT_2(t5, t13); - FWHT_2(t6, t14); - FWHT_2(t7, t15); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; - data[8] = t8; - data[9] = t9; - data[10] = t10; - data[11] = t11; - data[12] = t12; - data[13] = t13; - data[14] = t14; - data[15] = t15; -} - -static void FWHT_SmallData(fwht_t* data, unsigned ldn) -{ - const unsigned n = (1UL << ldn); - - if (n <= 2) - { - if (n == 2) - FWHT_2(data[0], data[1]); - return; - } - - for (unsigned ldm = ldn; ldm > 3; ldm -= 2) - { - unsigned m = (1UL << ldm); - unsigned m4 = (m >> 2); - for (unsigned r = 0; r < n; r += m) - for (unsigned j = 0; j < m4; j++) - FWHT_4(data + j + r, m4); - } - - if (ldn & 1) - { - for (unsigned i0 = 0; i0 < n; i0 += 8) - FWHT_8(data + i0); - } - else - { - for (unsigned i0 = 0; i0 < n; i0 += 4) - FWHT_4(data + i0); - } -} - -// Decimation in time (DIT) version -static void FWHT(fwht_t* data, const unsigned ldn) -{ - if (ldn <= 13) - { - FWHT_SmallData(data, ldn); - return; - } - - FWHT_2(data[2], data[3]); - FWHT_4(data + 4); - FWHT_8(data + 8); - FWHT_16(data + 16); - for (unsigned ldm = 5; ldm < ldn; ++ldm) - FWHT(data + (unsigned)(1UL << ldm), ldm); - - for (unsigned ldm = 0; ldm < ldn; ++ldm) - { - const unsigned mh = (1UL << ldm); - for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) - FWHT_2(data[t1], data[t2]); - } -} - -#endif - - -//------------------------------------------------------------------------------ -// Memory Buffer XOR - -static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) -{ - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); - -#if defined(LEO_TARGET_MOBILE) -# if defined(LEO_TRY_NEON) - // Handle multiples of 64 bytes - if (CpuHasNeon) - { - while (bytes >= 64) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 x1 = vld1q_u8(x16 + 1); - LEO_M128 x2 = vld1q_u8(x16 + 2); - LEO_M128 x3 = vld1q_u8(x16 + 3); - LEO_M128 y0 = vld1q_u8(y16); - LEO_M128 y1 = vld1q_u8(y16 + 1); - LEO_M128 y2 = vld1q_u8(y16 + 2); - LEO_M128 y3 = vld1q_u8(y16 + 3); - - vst1q_u8(x16, veorq_u8(x0, y0)); - vst1q_u8(x16 + 1, veorq_u8(x1, y1)); - vst1q_u8(x16 + 2, veorq_u8(x2, y2)); - vst1q_u8(x16 + 3, veorq_u8(x3, y3)); - - bytes -= 64, x16 += 4, y16 += 4; - } - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 y0 = vld1q_u8(y16); - - vst1q_u8(x16, veorq_u8(x0, y0)); - - bytes -= 16, ++x16, ++y16; - } - } - else -# endif // LEO_TRY_NEON - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); - - const unsigned count = (unsigned)bytes / 8; - for (unsigned ii = 0; ii < count; ++ii) - x8[ii] ^= y8[ii]; - - x16 = reinterpret_cast(x8 + count); - y16 = reinterpret_cast(y8 + count); - } -#else // LEO_TARGET_MOBILE -# if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) - { - LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); - const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); - - while (bytes >= 128) - { - LEO_M256 x0 = _mm256_loadu_si256(x32); - LEO_M256 y0 = _mm256_loadu_si256(y32); - x0 = _mm256_xor_si256(x0, y0); - LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); - LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); - x1 = _mm256_xor_si256(x1, y1); - LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); - LEO_M256 y2 = _mm256_loadu_si256(y32 + 
2); - x2 = _mm256_xor_si256(x2, y2); - LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); - LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); - x3 = _mm256_xor_si256(x3, y3); - - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - _mm256_storeu_si256(x32 + 2, x2); - _mm256_storeu_si256(x32 + 3, x3); - - bytes -= 128, x32 += 4, y32 += 4; - } - - // Handle multiples of 32 bytes - while (bytes >= 32) - { - // x[i] = x[i] xor y[i] - _mm256_storeu_si256(x32, - _mm256_xor_si256( - _mm256_loadu_si256(x32), - _mm256_loadu_si256(y32))); - - bytes -= 32, ++x32, ++y32; - } - - x16 = reinterpret_cast(x32); - y16 = reinterpret_cast(y32); - } - else -# endif // LEO_TRY_AVX2 - { - while (bytes >= 64) - { - LEO_M128 x0 = _mm_loadu_si128(x16); - LEO_M128 y0 = _mm_loadu_si128(y16); - x0 = _mm_xor_si128(x0, y0); - LEO_M128 x1 = _mm_loadu_si128(x16 + 1); - LEO_M128 y1 = _mm_loadu_si128(y16 + 1); - x1 = _mm_xor_si128(x1, y1); - LEO_M128 x2 = _mm_loadu_si128(x16 + 2); - LEO_M128 y2 = _mm_loadu_si128(y16 + 2); - x2 = _mm_xor_si128(x2, y2); - LEO_M128 x3 = _mm_loadu_si128(x16 + 3); - LEO_M128 y3 = _mm_loadu_si128(y16 + 3); - x3 = _mm_xor_si128(x3, y3); - - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); - - bytes -= 64, x16 += 4, y16 += 4; - } - } -#endif // LEO_TARGET_MOBILE - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - // x[i] = x[i] xor y[i] - _mm_storeu_si128(x16, - _mm_xor_si128( - _mm_loadu_si128(x16), - _mm_loadu_si128(y16))); - - bytes -= 16, ++x16, ++y16; - } - - uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); - const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); - - // Handle a block of 8 bytes - const unsigned eight = bytes & 8; - if (eight) - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); - *x8 ^= *y8; - } - - // Handle a block of 4 bytes - const unsigned four = bytes & 4; - if (four) - { - uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); - const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); - *x4 ^= *y4; - } - - // Handle final bytes - const unsigned offset = eight + four; - switch (bytes & 3) - { - case 3: x1[offset + 2] ^= y1[offset + 2]; - case 2: x1[offset + 1] ^= y1[offset + 1]; - case 1: x1[offset] ^= y1[offset]; - default: - break; - } -} - - -//------------------------------------------------------------------------------ -// Formal Derivative - -// Formal derivative of polynomial in the new basis -static void formal_derivative(GFSymbol* cos, const unsigned size) -{ - for (unsigned i = 1; i < size; ++i) - { - const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; - - // If a large number of values are being XORed: - if (leng >= 8) - xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); - else - for (unsigned j = i - leng; j < i; j++) - cos[j] ^= cos[j + leng]; - } - - for (unsigned i = size; i < kFieldSize; i <<= 1) - xor_mem(cos, cos + i, size * sizeof(GFSymbol)); -} - - -//------------------------------------------------------------------------------ -// Fast Fourier Transform - -static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT - -// IFFT in the proposed basis -static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - 
depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - } - } -} - -// FFT in the proposed basis -static void FLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - } - } -} - - -//------------------------------------------------------------------------------ -// FFT Initialization - -static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative -static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial - -// Initialize skewVec[], B[], log_walsh[] -static void InitFieldOperations() -{ - GFSymbol temp[kGFBits - 1]; - - for (unsigned i = 1; i < kGFBits; ++i) - temp[i - 1] = (GFSymbol)((unsigned)1 << i); - - for (unsigned m = 0; m < (kGFBits - 1); ++m) - { - const unsigned step = (unsigned)1 << (m + 1); - - skewVec[((unsigned)1 << m) - 1] = 0; - - for (unsigned i = m; i < (kGFBits - 1); ++i) - { - const unsigned s = ((unsigned)1 << (i + 1)); - - for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) - skewVec[j + s] = skewVec[j] ^ temp[i]; - } - - temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; - - for (unsigned i = m + 1; i < (kGFBits - 1); ++i) - temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); - } - - for (unsigned i = 0; i < kFieldSize; ++i) - skewVec[i] = GFLog[skewVec[i]]; - - temp[0] = kFieldModulus - temp[0]; - - for (unsigned i = 1; i < (kGFBits - 1); ++i) - temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; - - B[0] = 0; - for (unsigned i = 0; i < (kGFBits - 1); ++i) - { - const unsigned depart = ((unsigned)1 << i); - - for (unsigned j = 0; j < depart; ++j) - B[j + depart] = (B[j] + temp[i]) % kFieldModulus; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh[i] = GFLog[i]; - - log_walsh[0] = 0; - - FWHT(log_walsh, kGFBits); -} - - -//------------------------------------------------------------------------------ -// Encoder - -// Encoding alg for k/n<0.5: message is a power of two -static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) -{ - memcpy(codeword, data, sizeof(GFSymbol) * k); - - IFLT(codeword, k, 0); - - for (unsigned i = k; i < kFieldSize; i += k) - { - memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); - - FLT(&codeword[i], k, i); - } - - memcpy(codeword, data, sizeof(GFSymbol) * k); -} - -// Encoding alg for k/n>0.5: parity is a power of two. -// data: message array. parity: parity array. 
mem: buffer(size>= n-k) -static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) -{ - const unsigned t = kFieldSize - k; - - memset(parity, 0, sizeof(GFSymbol) * t); - - for (unsigned i = t; i < kFieldSize; i += t) - { - memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); - - IFLT(mem, t, i); - - xor_mem(parity, mem, t * sizeof(GFSymbol)); - } - - FLT(parity, t, 0); -} - - -//------------------------------------------------------------------------------ -// Decoder - -static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) -{ - fwht_t log_walsh2[kFieldSize]; - - // Compute the evaluations of the error locator polynomial - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = erasure[i] ? 1 : 0; - - FWHT(log_walsh2, kGFBits); - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; - - FWHT(log_walsh2, kGFBits); - - // k2 can be replaced with k - const unsigned k2 = kFieldSize; - //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? - - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) - { - codeword[i] = 0; - } - else - { - codeword[i] = mulE(codeword[i], log_walsh2[i]); - } - } - - IFLT(codeword, kFieldSize, 0); - - // formal derivative - for (unsigned i = 0; i < kFieldSize; i += 2) - { - codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); - } - - formal_derivative(codeword, k2); - - for (unsigned i = 0; i < k2; i += 2) - { - codeword[i] = mulE(codeword[i], B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); - } - - FLT(codeword, k2, 0); - - for (unsigned i = 0; i < k2; ++i) - { - if (erasure[i]) - { - codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); - } - } -} - - -//------------------------------------------------------------------------------ -// Test Application - -void test(unsigned k, unsigned seed) -{ - srand(seed); - - //-----------Generating message---------- - - // Message array - GFSymbol data[kFieldSize] = {0}; - - // Filled with random numbers - for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) - data[i] = (GFSymbol)rand(); - - - //---------encoding---------- - - GFSymbol codeword[kFieldSize]; - encodeH(&data[kFieldSize - k], k, data, codeword); - //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? 
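To make the encodeH() call above easier to trace, here is a map of the buffer roles it assumes, read directly off the encodeH() definition earlier in this file (with t = kFieldSize - k):

// encodeH(&data[kFieldSize - k], k, data, codeword):
//   &data[kFieldSize - k]  -- the k message symbols (input)
//   data                   -- the first t symbols receive the parity (output)
//   codeword               -- borrowed as the scratch "mem" buffer (size >= t)
// The memcpy() just below then copies parity + message out of data[] into
// codeword[] to assemble the complete codeword before erasures are simulated.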
- - memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); - - - //--------erasure simulation--------- - - // Array indicating erasures - bool erasure[kFieldSize] = { - false - }; - - for (unsigned i = k; i < kFieldSize; ++i) - erasure[i] = true; - - // permuting the erasure array - for (unsigned i = kFieldSize - 1; i > 0; --i) - { - unsigned pos = rand() % (i + 1); - - if (i != pos) - { - bool tmp = erasure[i]; - erasure[i] = erasure[pos]; - erasure[pos] = tmp; - } - } - - // erasure codeword symbols - for (unsigned i = 0; i < kFieldSize; ++i) - if (erasure[i]) - codeword[i] = 0; - - - //---------main processing---------- - decode(codeword, k, erasure); - - // Check the correctness of the result - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i] == 1) - { - if (data[i] != codeword[i]) - { - printf("Decoding Error with seed = %d!\n", seed); - LEO_DEBUG_BREAK; - return; - } - } - } - - //printf("Decoding is successful!\n"); -} - - -//------------------------------------------------------------------------------ -// Entrypoint - -int main(int argc, char **argv) -{ - // Initialize architecture-specific code - leo_architecture_init(); - - // Fill GFLog table and GFExp table - InitField(); - - // Compute factors used in erasure decoder - InitFieldOperations(); - - unsigned seed = (unsigned)time(NULL); - for (;;) - { - // test(int k), k: message size - /* - EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, - s.t. the number of recovery pieces is a power of two - */ - test(kFieldSize / 2, seed); - - ++seed; - } - - return 0; -} diff --git a/LeopardEncoder.h b/LeopardEncoder.h deleted file mode 100644 index 71d22e2..0000000 --- a/LeopardEncoder.h +++ /dev/null @@ -1,1220 +0,0 @@ -/* - Copyright (c) 2017 Christopher A. Taylor. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of LHC-RS nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. 
-*/ - -#include -#include -#include -#include -#include - - -/* - TODO: - + Write C API and unit tester - + Limit input to multiples of 64 bytes - + Replace GFSymbol with a file data pointer - + New 16-bit Muladd inner loops - + Class to contain the (large) muladd tables - + Preliminary benchmarks for large data! - + New 8-bit Muladd inner loops - + Benchmarks for smaller data! - + Refactor software - + Pick a name for the software better than LEO_RS - + I think it should be split up into several C++ modules - + Write detailed comments for all the routines - + Look into getting EncodeL working so we can support smaller data (Ask Lin) - + Look into using k instead of k2 to speed up decoder (Ask Lin) - + Avoid performing FFT/IFFT intermediate calculations we're not going to use - + Benchmarks, fun! - + Add multi-threading to split up long parallelizable calculations - + Final benchmarks! - + Finish up documentation - + Release version 1 - - - Muladd implementation notes: - - Specialize for 1-3 rows at a time since often times we're multiplying by - the same (skew) value repeatedly, as the ISA-L library does here: - - https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 - - Except we should be doing it for 16-bit Galois Field. - To implement that use the ALTMAP trick from Jerasure: - - http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 - - Except we should also support AVX2 since that is a 40% perf boost, so put - the high and low bytes 32 bytes instead of 16 bytes apart. - - Also I think we should go ahead and precompute the multiply tables since - it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. -*/ - - -//------------------------------------------------------------------------------ -// Debug - -// Some bugs only repro in release mode, so this can be helpful -//#define LEO_DEBUG_IN_RELEASE - -#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) - #define LEO_DEBUG - #ifdef _WIN32 - #define LEO_DEBUG_BREAK __debugbreak() - #else - #define LEO_DEBUG_BREAK __builtin_trap() - #endif - #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } -#else - #define LEO_DEBUG_BREAK ; - #define LEO_DEBUG_ASSERT(cond) ; -#endif - - -//------------------------------------------------------------------------------ -// Platform/Architecture - -#if defined(ANDROID) || defined(IOS) - #define LEO_TARGET_MOBILE -#endif // ANDROID - -#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) - #define LEO_TRY_AVX2 /* 256-bit */ - #include - #define LEO_ALIGN_BYTES 32 -#else // __AVX2__ - #define LEO_ALIGN_BYTES 16 -#endif // __AVX2__ - -#if !defined(LEO_TARGET_MOBILE) - // Note: MSVC currently only supports SSSE3 but not AVX2 - #include // SSSE3: _mm_shuffle_epi8 - #include // SSE2 -#endif // LEO_TARGET_MOBILE - -#if defined(HAVE_ARM_NEON_H) - #include -#endif // HAVE_ARM_NEON_H - -#if defined(LEO_TARGET_MOBILE) - - #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ - -# if defined(HAVE_ARM_NEON_H) - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 uint8x16_t - #define LEO_TRY_NEON -#else - #define LEO_M128 uint64_t -# endif - -#else // LEO_TARGET_MOBILE - - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 __m128i - -#endif // LEO_TARGET_MOBILE - -#ifdef LEO_TRY_AVX2 - // Compiler-specific 256-bit SIMD register keyword - #define LEO_M256 __m256i -#endif - -// Compiler-specific C++11 restrict keyword -#define LEO_RESTRICT __restrict 
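Since the no-aliasing promise behind the macro above is what lets the bulk memory loops in this code vectorize, a minimal illustration may help; this sketch is for exposition and is not part of the library:

// With __restrict the compiler may assume x[] and y[] never overlap, so it
// can keep values in registers and vectorize freely; without it, every store
// to x[i] forces a conservative reload of y[i].
static void xor_words(uint64_t * LEO_RESTRICT x,
                      const uint64_t * LEO_RESTRICT y,
                      unsigned count)
{
    for (unsigned i = 0; i < count; ++i)
        x[i] ^= y[i];
}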
- -// Compiler-specific force inline keyword -#ifdef _MSC_VER - #define LEO_FORCE_INLINE inline __forceinline -#else - #define LEO_FORCE_INLINE inline __attribute__((always_inline)) -#endif - -// Compiler-specific alignment keyword -// Note: Alignment only matters for ARM NEON where it should be 16 -#ifdef _MSC_VER - #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) -#else // _MSC_VER - #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) -#endif // _MSC_VER - - -//------------------------------------------------------------------------------ -// Runtime CPU Architecture Check -// -// Feature checks stolen shamelessly from -// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c - -#if defined(HAVE_ANDROID_GETCPUFEATURES) - #include -#endif - -#if defined(LEO_TRY_NEON) -# if defined(IOS) && defined(__ARM_NEON__) - // Requires iPhone 5S or newer - static const bool CpuHasNeon = true; - static const bool CpuHasNeon64 = true; -# else - // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures - static bool CpuHasNeon = false; // V6 / V7 - static bool CpuHasNeon64 = false; // 64-bit -# endif -#endif - - -#if !defined(LEO_TARGET_MOBILE) - -#ifdef _MSC_VER - #include // __cpuid - #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX -#endif - -#ifdef LEO_TRY_AVX2 -static bool CpuHasAVX2 = false; -#endif -static bool CpuHasSSSE3 = false; - -#define CPUID_EBX_AVX2 0x00000020 -#define CPUID_ECX_SSSE3 0x00000200 - -static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) -{ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) - __cpuid((int *) cpu_info, cpu_info_type); -#else //if defined(HAVE_CPUID) - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -# ifdef __i386__ - __asm__ __volatile__ ("pushfl; pushfl; " - "popl %0; " - "movl %0, %1; xorl %2, %0; " - "pushl %0; " - "popfl; pushfl; popl %0; popfl" : - "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : - "i" (0x200000)); - if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { - return; /* LCOV_EXCL_LINE */ - } -# endif -# ifdef __i386__ - __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# elif defined(__x86_64__) - __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# else - __asm__ __volatile__ ("cpuid" : - "=a" (cpu_info[0]), "=b" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# endif -#endif -} - -#endif // defined(LEO_TARGET_MOBILE) - - -static void leo_architecture_init() -{ -#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) - AndroidCpuFamily family = android_getCpuFamily(); - if (family == ANDROID_CPU_FAMILY_ARM) - { - if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) - CpuHasNeon = true; - } - else if (family == ANDROID_CPU_FAMILY_ARM64) - { - CpuHasNeon = true; - if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) - CpuHasNeon64 = true; - } -#endif - -#if !defined(LEO_TARGET_MOBILE) - unsigned int cpu_info[4]; - - _cpuid(cpu_info, 1); - CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); - -#if defined(LEO_TRY_AVX2) - _cpuid(cpu_info, 7); - CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); -#endif // LEO_TRY_AVX2 - -#endif // 
LEO_TARGET_MOBILE -} - - -//------------------------------------------------------------------------------ -// SIMD-Safe Aligned Memory Allocations - -static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; - -LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) -{ - return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); -} - -static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) -{ - uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); - if (!data) - return nullptr; - unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); - data += kAlignmentBytes - offset; - data[-1] = (uint8_t)offset; - return data; -} - -static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) -{ - if (!ptr) - return; - uint8_t* data = (uint8_t*)ptr; - unsigned offset = data[-1]; - if (offset >= kAlignmentBytes) - { - LEO_DEBUG_BREAK; // Should never happen - return; - } - data -= kAlignmentBytes - offset; - free(data); -} - - -//------------------------------------------------------------------------------ -// Field - -//#define LEO_SHORT_FIELD - -#ifdef LEO_SHORT_FIELD -typedef uint8_t GFSymbol; -static const unsigned kGFBits = 8; -static const unsigned kGFPolynomial = 0x11D; -GFSymbol kGFBasis[kGFBits] = { - 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis -}; -#else -typedef uint16_t GFSymbol; -static const unsigned kGFBits = 16; -static const unsigned kGFPolynomial = 0x1002D; -GFSymbol kGFBasis[kGFBits] = { - 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis - 0xC582, 0xED2E, 0x914C, 0x4012, - 0x6C98, 0x10D8, 0x6A72, 0xB900, - 0xFDB8, 0xFB34, 0xFF38, 0x991E -}; -#endif - -/* - Cantor Basis introduced by: - D. G. Cantor, "On arithmetical algorithms over finite fields", - Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. -*/ - -static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size -static const unsigned kFieldModulus = kFieldSize - 1; - -static GFSymbol GFLog[kFieldSize]; -static GFSymbol GFExp[kFieldSize]; - -// Initialize GFLog[], GFExp[] -static void InitField() -{ - unsigned state = 1; - for (unsigned i = 0; i < kFieldModulus; ++i) - { - GFExp[state] = static_cast(i); - state <<= 1; - if (state >= kFieldSize) - state ^= kGFPolynomial; - } - GFExp[0] = kFieldModulus; - - // Conversion to chosen basis: - - GFLog[0] = 0; - for (unsigned i = 0; i < kGFBits; ++i) - { - const GFSymbol basis = kGFBasis[i]; - const unsigned width = (unsigned)(1UL << i); - - for (unsigned j = 0; j < width; ++j) - GFLog[j + width] = GFLog[j] ^ basis; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - GFLog[i] = GFExp[GFLog[i]]; - - for (unsigned i = 0; i < kFieldSize; ++i) - GFExp[GFLog[i]] = i; - - GFExp[kFieldModulus] = GFExp[0]; -} - - -//------------------------------------------------------------------------------ -// Mod Q Field Operations -// -// Q is the maximum symbol value, e.g. 255 or 65535. 
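The AddModQ()/SubModQ() helpers that follow reduce modulo Q = 2^kGFBits - 1 with an end-around carry: 2^kGFBits is congruent to 1 (mod Q), so the bit that overflows the top of the sum is simply added back in. A self-contained sketch for the 8-bit case (Q = 255); the function name here is hypothetical:

#include <cassert>
#include <cstdint>

// End-around-carry addition modulo Q = 2^8 - 1 = 255, mirroring AddModQ().
static uint8_t AddMod255(uint8_t a, uint8_t b)
{
    const unsigned sum = (unsigned)a + b;
    // 2^8 == Q + 1, so the carry bit is worth exactly 1 (mod Q).
    return (uint8_t)(sum + (sum >> 8));
}

int main()
{
    assert(AddMod255(200, 100) == 45); // (200 + 100) % 255 == 45
    assert(AddMod255(255, 0) == 255);  // "partial": Q itself may be returned
    return 0;
}

Subtraction reduces the same way through the borrow bit, which is why SubModQ() can reuse the identical fold.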
- -// z = x + y (mod Q) -static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b) -{ - const unsigned sum = (unsigned)a + b; - - // Partial reduction step, allowing for Q to be returned - return static_cast(sum + (sum >> kGFBits)); -} - -// z = x - y (mod Q) -static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) -{ - const unsigned dif = (unsigned)a - b; - - // Partial reduction step, allowing for Q to be returned - return static_cast(dif + (dif >> kGFBits)); -} - -// vx[] += vy[] * z -static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) -{ - for (unsigned i = 0; i < symbolCount; ++i) - { - const GFSymbol a = vy[i]; - if (a == 0) - continue; - - GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); - GFSymbol value1 = GFExp[sum1]; - if ((a & 0x0f) == 0) - { - value1 = 0; - } - GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); - GFSymbol value2 = GFExp[sum2]; - if ((a & 0xf0) == 0) - { - value2 = 0; - } - GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); - GFSymbol value3 = GFExp[sum3]; - if ((a & 0x0f00) == 0) - { - value3 = 0; - } - GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); - GFSymbol value4 = GFExp[sum4]; - if ((a & 0xf000) == 0) - { - value4 = 0; - } - - vx[i] ^= value1; - vx[i] ^= value2; - vx[i] ^= value3; - vx[i] ^= value4; - } -} - -// return a*GFExp[b] over GF(2^r) -static GFSymbol mulE(GFSymbol a, GFSymbol b) -{ - if (a == 0) - return 0; - - const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); - return GFExp[sum]; -} - - -//------------------------------------------------------------------------------ -// Fast Walsh-Hadamard Transform (FWHT) Mod Q -// -// Q is the maximum symbol value, e.g. 255 or 65535. - -// Define this to enable the optimized version of FWHT() -#define LEO_FWHT_OPTIMIZED - -typedef GFSymbol fwht_t; - -// {a, b} = {a + b, a - b} (Mod Q) -static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) -{ - const fwht_t sum = AddModQ(a, b); - const fwht_t dif = SubModQ(a, b); - a = sum; - b = dif; -} - -/* - FWHT is a minor slice of the runtime and does not grow with data size, - but I did attempt a few additional optimizations that failed: - - I've attempted to vectorize (with partial reductions) FWHT_4(data, s), - which is 70% of the algorithm, but it was slower. Left in _attic_. - - I've attempted to avoid reductions in all or parts of the FWHT. - The final modular reduction ends up being slower than the savings. - Specifically I tried doing it for the whole FWHT and also I tried - doing it just for the FWHT_2 loop in the main routine, but both - approaches are slower than partial reductions. - - Replacing word reads with wider reads does speed up the operation, but - at too high a complexity cost relative to minor perf improvement. 
-*/ - -#ifndef LEO_FWHT_OPTIMIZED - -// Reference implementation -static void FWHT(fwht_t* data, const unsigned bits) -{ - const unsigned size = (unsigned)(1UL << bits); - for (unsigned width = 1; width < size; width <<= 1) - for (unsigned i = 0; i < size; i += (width << 1)) - for (unsigned j = i; j < (width + i); ++j) - FWHT_2(data[j], data[j + width]); -} - -#else - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; -} - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) -{ - unsigned x = 0; - fwht_t t0 = data[x]; x += s; - fwht_t t1 = data[x]; x += s; - fwht_t t2 = data[x]; x += s; - fwht_t t3 = data[x]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - unsigned y = 0; - data[y] = t0; y += s; - data[y] = t1; y += s; - data[y] = t2; y += s; - data[y] = t3; -} - -static inline void FWHT_8(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; -} - -static inline void FWHT_16(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - fwht_t t8 = data[8]; - fwht_t t9 = data[9]; - fwht_t t10 = data[10]; - fwht_t t11 = data[11]; - fwht_t t12 = data[12]; - fwht_t t13 = data[13]; - fwht_t t14 = data[14]; - fwht_t t15 = data[15]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t8, t9); - FWHT_2(t10, t11); - FWHT_2(t12, t13); - FWHT_2(t14, t15); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t8, t10); - FWHT_2(t9, t11); - FWHT_2(t12, t14); - FWHT_2(t13, t15); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - FWHT_2(t8, t12); - FWHT_2(t9, t13); - FWHT_2(t10, t14); - FWHT_2(t11, t15); - FWHT_2(t0, t8); - FWHT_2(t1, t9); - FWHT_2(t2, t10); - FWHT_2(t3, t11); - FWHT_2(t4, t12); - FWHT_2(t5, t13); - FWHT_2(t6, t14); - FWHT_2(t7, t15); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; - data[8] = t8; - data[9] = t9; - data[10] = t10; - data[11] = t11; - data[12] = t12; - data[13] = t13; - data[14] = t14; - data[15] = t15; -} - -static void FWHT_SmallData(fwht_t* data, unsigned ldn) -{ - const unsigned n = (1UL << ldn); - - if (n <= 2) - { - if (n == 2) - FWHT_2(data[0], data[1]); - return; - } - - for (unsigned ldm = ldn; ldm > 3; ldm -= 2) - { - unsigned m = (1UL << ldm); - unsigned m4 = (m >> 2); - for (unsigned r = 0; r < n; r += m) - for (unsigned j = 0; j < m4; j++) - FWHT_4(data + j + r, m4); - } - - if (ldn & 1) - { - for (unsigned i0 = 0; i0 < n; i0 += 8) - FWHT_8(data + i0); - } - else - { - for (unsigned i0 = 0; i0 < n; i0 += 4) - FWHT_4(data + i0); - } -} - -// Decimation in time (DIT) version -static void FWHT(fwht_t* data, const unsigned 
ldn) -{ - if (ldn <= 13) - { - FWHT_SmallData(data, ldn); - return; - } - - FWHT_2(data[2], data[3]); - FWHT_4(data + 4); - FWHT_8(data + 8); - FWHT_16(data + 16); - for (unsigned ldm = 5; ldm < ldn; ++ldm) - FWHT(data + (unsigned)(1UL << ldm), ldm); - - for (unsigned ldm = 0; ldm < ldn; ++ldm) - { - const unsigned mh = (1UL << ldm); - for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) - FWHT_2(data[t1], data[t2]); - } -} - -#endif - - -//------------------------------------------------------------------------------ -// Memory Buffer XOR - -static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) -{ - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); - -#if defined(LEO_TARGET_MOBILE) -# if defined(LEO_TRY_NEON) - // Handle multiples of 64 bytes - if (CpuHasNeon) - { - while (bytes >= 64) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 x1 = vld1q_u8(x16 + 1); - LEO_M128 x2 = vld1q_u8(x16 + 2); - LEO_M128 x3 = vld1q_u8(x16 + 3); - LEO_M128 y0 = vld1q_u8(y16); - LEO_M128 y1 = vld1q_u8(y16 + 1); - LEO_M128 y2 = vld1q_u8(y16 + 2); - LEO_M128 y3 = vld1q_u8(y16 + 3); - - vst1q_u8(x16, veorq_u8(x0, y0)); - vst1q_u8(x16 + 1, veorq_u8(x1, y1)); - vst1q_u8(x16 + 2, veorq_u8(x2, y2)); - vst1q_u8(x16 + 3, veorq_u8(x3, y3)); - - bytes -= 64, x16 += 4, y16 += 4; - } - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 y0 = vld1q_u8(y16); - - vst1q_u8(x16, veorq_u8(x0, y0)); - - bytes -= 16, ++x16, ++y16; - } - } - else -# endif // LEO_TRY_NEON - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); - - const unsigned count = (unsigned)bytes / 8; - for (unsigned ii = 0; ii < count; ++ii) - x8[ii] ^= y8[ii]; - - x16 = reinterpret_cast(x8 + count); - y16 = reinterpret_cast(y8 + count); - } -#else // LEO_TARGET_MOBILE -# if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) - { - LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); - const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); - - while (bytes >= 128) - { - LEO_M256 x0 = _mm256_loadu_si256(x32); - LEO_M256 y0 = _mm256_loadu_si256(y32); - x0 = _mm256_xor_si256(x0, y0); - LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); - LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); - x1 = _mm256_xor_si256(x1, y1); - LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); - LEO_M256 y2 = _mm256_loadu_si256(y32 + 2); - x2 = _mm256_xor_si256(x2, y2); - LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); - LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); - x3 = _mm256_xor_si256(x3, y3); - - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - _mm256_storeu_si256(x32 + 2, x2); - _mm256_storeu_si256(x32 + 3, x3); - - bytes -= 128, x32 += 4, y32 += 4; - } - - // Handle multiples of 32 bytes - while (bytes >= 32) - { - // x[i] = x[i] xor y[i] - _mm256_storeu_si256(x32, - _mm256_xor_si256( - _mm256_loadu_si256(x32), - _mm256_loadu_si256(y32))); - - bytes -= 32, ++x32, ++y32; - } - - x16 = reinterpret_cast(x32); - y16 = reinterpret_cast(y32); - } - else -# endif // LEO_TRY_AVX2 - { - while (bytes >= 64) - { - LEO_M128 x0 = _mm_loadu_si128(x16); - LEO_M128 y0 = _mm_loadu_si128(y16); - x0 = _mm_xor_si128(x0, y0); - LEO_M128 x1 = _mm_loadu_si128(x16 + 1); - LEO_M128 y1 = _mm_loadu_si128(y16 + 1); - x1 = _mm_xor_si128(x1, y1); - LEO_M128 x2 = _mm_loadu_si128(x16 + 2); - LEO_M128 y2 = _mm_loadu_si128(y16 + 2); - x2 = _mm_xor_si128(x2, y2); - LEO_M128 x3 = _mm_loadu_si128(x16 + 3); - LEO_M128 y3 = 
_mm_loadu_si128(y16 + 3); - x3 = _mm_xor_si128(x3, y3); - - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); - - bytes -= 64, x16 += 4, y16 += 4; - } - } -#endif // LEO_TARGET_MOBILE - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - // x[i] = x[i] xor y[i] - _mm_storeu_si128(x16, - _mm_xor_si128( - _mm_loadu_si128(x16), - _mm_loadu_si128(y16))); - - bytes -= 16, ++x16, ++y16; - } - - uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); - const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); - - // Handle a block of 8 bytes - const unsigned eight = bytes & 8; - if (eight) - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); - *x8 ^= *y8; - } - - // Handle a block of 4 bytes - const unsigned four = bytes & 4; - if (four) - { - uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); - const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); - *x4 ^= *y4; - } - - // Handle final bytes - const unsigned offset = eight + four; - switch (bytes & 3) - { - case 3: x1[offset + 2] ^= y1[offset + 2]; - case 2: x1[offset + 1] ^= y1[offset + 1]; - case 1: x1[offset] ^= y1[offset]; - default: - break; - } -} - - -//------------------------------------------------------------------------------ -// Formal Derivative - -// Formal derivative of polynomial in the new basis -static void formal_derivative(GFSymbol* cos, const unsigned size) -{ - for (unsigned i = 1; i < size; ++i) - { - const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; - - // If a large number of values are being XORed: - if (leng >= 8) - xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); - else - for (unsigned j = i - leng; j < i; j++) - cos[j] ^= cos[j + leng]; - } - - for (unsigned i = size; i < kFieldSize; i <<= 1) - xor_mem(cos, cos + i, size * sizeof(GFSymbol)); -} - - -//------------------------------------------------------------------------------ -// Fast Fourier Transform - -static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT - -// IFFT in the proposed basis -static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - } - } -} - -// FFT in the proposed basis -static void FLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - } - } -} - - -//------------------------------------------------------------------------------ -// FFT Initialization - -static GFSymbol B[kFieldSize >> 1]; // factors used in 
formal derivative -static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial - -// Initialize skewVec[], B[], log_walsh[] -static void InitFieldOperations() -{ - GFSymbol temp[kGFBits - 1]; - - for (unsigned i = 1; i < kGFBits; ++i) - temp[i - 1] = (GFSymbol)((unsigned)1 << i); - - for (unsigned m = 0; m < (kGFBits - 1); ++m) - { - const unsigned step = (unsigned)1 << (m + 1); - - skewVec[((unsigned)1 << m) - 1] = 0; - - for (unsigned i = m; i < (kGFBits - 1); ++i) - { - const unsigned s = ((unsigned)1 << (i + 1)); - - for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) - skewVec[j + s] = skewVec[j] ^ temp[i]; - } - - temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; - - for (unsigned i = m + 1; i < (kGFBits - 1); ++i) - temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); - } - - for (unsigned i = 0; i < kFieldSize; ++i) - skewVec[i] = GFLog[skewVec[i]]; - - temp[0] = kFieldModulus - temp[0]; - - for (unsigned i = 1; i < (kGFBits - 1); ++i) - temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; - - B[0] = 0; - for (unsigned i = 0; i < (kGFBits - 1); ++i) - { - const unsigned depart = ((unsigned)1 << i); - - for (unsigned j = 0; j < depart; ++j) - B[j + depart] = (B[j] + temp[i]) % kFieldModulus; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh[i] = GFLog[i]; - - log_walsh[0] = 0; - - FWHT(log_walsh, kGFBits); -} - - -//------------------------------------------------------------------------------ -// Encoder - -// Encoding alg for k/n<0.5: message is a power of two -static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) -{ - memcpy(codeword, data, sizeof(GFSymbol) * k); - - IFLT(codeword, k, 0); - - for (unsigned i = k; i < kFieldSize; i += k) - { - memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); - - FLT(&codeword[i], k, i); - } - - memcpy(codeword, data, sizeof(GFSymbol) * k); -} - -// Encoding alg for k/n>0.5: parity is a power of two. -// data: message array. parity: parity array. mem: buffer(size>= n-k) -static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) -{ - const unsigned t = kFieldSize - k; - - memset(parity, 0, sizeof(GFSymbol) * t); - - for (unsigned i = t; i < kFieldSize; i += t) - { - memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); - - IFLT(mem, t, i); - - xor_mem(parity, mem, t * sizeof(GFSymbol)); - } - - FLT(parity, t, 0); -} - - -//------------------------------------------------------------------------------ -// Decoder - -static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) -{ - fwht_t log_walsh2[kFieldSize]; - - // Compute the evaluations of the error locator polynomial - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = erasure[i] ? 1 : 0; - - FWHT(log_walsh2, kGFBits); - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; - - FWHT(log_walsh2, kGFBits); - - // k2 can be replaced with k - const unsigned k2 = kFieldSize; - //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? 
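For orientation, a summary of what decode() is computing around this point; this is a reading of the surrounding code rather than a change to it:

// 1. log_walsh2[] starts as the 0/1 indicator of the erased positions.
// 2. One FWHT, a pointwise multiply by log_walsh[] = FWHT(GFLog) (mod Q), and
//    a second FWHT leave log_walsh2[i] holding, in effect, the discrete log
//    of the error locator polynomial evaluated at field point i.
// 3. The loops below scale each surviving symbol by that evaluation via
//    mulE(), zero the erased positions, and then IFLT + formal_derivative +
//    FLT reconstruct the missing values.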
- - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) - { - codeword[i] = 0; - } - else - { - codeword[i] = mulE(codeword[i], log_walsh2[i]); - } - } - - IFLT(codeword, kFieldSize, 0); - - // formal derivative - for (unsigned i = 0; i < kFieldSize; i += 2) - { - codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); - } - - formal_derivative(codeword, k2); - - for (unsigned i = 0; i < k2; i += 2) - { - codeword[i] = mulE(codeword[i], B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); - } - - FLT(codeword, k2, 0); - - for (unsigned i = 0; i < k2; ++i) - { - if (erasure[i]) - { - codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); - } - } -} - - -//------------------------------------------------------------------------------ -// Test Application - -void test(unsigned k, unsigned seed) -{ - srand(seed); - - //-----------Generating message---------- - - // Message array - GFSymbol data[kFieldSize] = {0}; - - // Filled with random numbers - for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) - data[i] = (GFSymbol)rand(); - - - //---------encoding---------- - - GFSymbol codeword[kFieldSize]; - encodeH(&data[kFieldSize - k], k, data, codeword); - //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? - - memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); - - - //--------erasure simulation--------- - - // Array indicating erasures - bool erasure[kFieldSize] = { - false - }; - - for (unsigned i = k; i < kFieldSize; ++i) - erasure[i] = true; - - // permuting the erasure array - for (unsigned i = kFieldSize - 1; i > 0; --i) - { - unsigned pos = rand() % (i + 1); - - if (i != pos) - { - bool tmp = erasure[i]; - erasure[i] = erasure[pos]; - erasure[pos] = tmp; - } - } - - // erasure codeword symbols - for (unsigned i = 0; i < kFieldSize; ++i) - if (erasure[i]) - codeword[i] = 0; - - - //---------main processing---------- - decode(codeword, k, erasure); - - // Check the correctness of the result - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i] == 1) - { - if (data[i] != codeword[i]) - { - printf("Decoding Error with seed = %d!\n", seed); - LEO_DEBUG_BREAK; - return; - } - } - } - - //printf("Decoding is successful!\n"); -} - - -//------------------------------------------------------------------------------ -// Entrypoint - -int main(int argc, char **argv) -{ - // Initialize architecture-specific code - leo_architecture_init(); - - // Fill GFLog table and GFExp table - InitField(); - - // Compute factors used in erasure decoder - InitFieldOperations(); - - unsigned seed = (unsigned)time(NULL); - for (;;) - { - // test(int k), k: message size - /* - EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, - s.t. the number of recovery pieces is a power of two - */ - test(kFieldSize / 2, seed); - - ++seed; - } - - return 0; -} diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp index 71d22e2..bd5c1cb 100644 --- a/LeopardFF16.cpp +++ b/LeopardFF16.cpp @@ -9,7 +9,7 @@ * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of LHC-RS nor the names of its contributors may be + * Neither the name of Leopard-RS nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
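One identity is worth keeping in mind before the large refactor hunk below: the old mulE() and the new FFEMultiplyLog() are the same log-domain multiply, with one operand kept pre-converted to its logarithm so that each FFT butterfly pays for a single extra table lookup. A minimal sketch using the names introduced in the next hunk:

// a * Exp(log_b) == Exp(Log(a) + log_b) for a != 0; zero is special-cased
// because it has no discrete logarithm.
static ffe_t mul_log(ffe_t a, ffe_t log_b)
{
    if (a == 0)
        return 0;
    return ExpLUT[AddMod(LogLUT[a], log_b)];
}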
@@ -26,494 +26,75 @@ POSSIBILITY OF SUCH DAMAGE. */ +#include "LeopardFF16.h" #include -#include -#include -#include -#include +// Define this to enable the optimized version of FWHT() +#define LEO_FF16_FWHT_OPTIMIZED -/* - TODO: - + Write C API and unit tester - + Limit input to multiples of 64 bytes - + Replace GFSymbol with a file data pointer - + New 16-bit Muladd inner loops - + Class to contain the (large) muladd tables - + Preliminary benchmarks for large data! - + New 8-bit Muladd inner loops - + Benchmarks for smaller data! - + Refactor software - + Pick a name for the software better than LEO_RS - + I think it should be split up into several C++ modules - + Write detailed comments for all the routines - + Look into getting EncodeL working so we can support smaller data (Ask Lin) - + Look into using k instead of k2 to speed up decoder (Ask Lin) - + Avoid performing FFT/IFFT intermediate calculations we're not going to use - + Benchmarks, fun! - + Add multi-threading to split up long parallelizable calculations - + Final benchmarks! - + Finish up documentation - + Release version 1 - - - Muladd implementation notes: - - Specialize for 1-3 rows at a time since often times we're multiplying by - the same (skew) value repeatedly, as the ISA-L library does here: - - https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 - - Except we should be doing it for 16-bit Galois Field. - To implement that use the ALTMAP trick from Jerasure: - - http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 - - Except we should also support AVX2 since that is a 40% perf boost, so put - the high and low bytes 32 bytes instead of 16 bytes apart. - - Also I think we should go ahead and precompute the multiply tables since - it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. 
-*/ +namespace leopard { namespace ff16 { //------------------------------------------------------------------------------ -// Debug +// Datatypes and Constants -// Some bugs only repro in release mode, so this can be helpful -//#define LEO_DEBUG_IN_RELEASE +// Modulus for field operations +static const ffe_t kModulus = 65535; -#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) - #define LEO_DEBUG - #ifdef _WIN32 - #define LEO_DEBUG_BREAK __debugbreak() - #else - #define LEO_DEBUG_BREAK __builtin_trap() - #endif - #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } -#else - #define LEO_DEBUG_BREAK ; - #define LEO_DEBUG_ASSERT(cond) ; -#endif +// LFSR Polynomial that generates the field elements +static const unsigned kPolynomial = 0x1002D; - -//------------------------------------------------------------------------------ -// Platform/Architecture - -#if defined(ANDROID) || defined(IOS) - #define LEO_TARGET_MOBILE -#endif // ANDROID - -#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) - #define LEO_TRY_AVX2 /* 256-bit */ - #include - #define LEO_ALIGN_BYTES 32 -#else // __AVX2__ - #define LEO_ALIGN_BYTES 16 -#endif // __AVX2__ - -#if !defined(LEO_TARGET_MOBILE) - // Note: MSVC currently only supports SSSE3 but not AVX2 - #include // SSSE3: _mm_shuffle_epi8 - #include // SSE2 -#endif // LEO_TARGET_MOBILE - -#if defined(HAVE_ARM_NEON_H) - #include -#endif // HAVE_ARM_NEON_H - -#if defined(LEO_TARGET_MOBILE) - - #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ - -# if defined(HAVE_ARM_NEON_H) - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 uint8x16_t - #define LEO_TRY_NEON -#else - #define LEO_M128 uint64_t -# endif - -#else // LEO_TARGET_MOBILE - - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 __m128i - -#endif // LEO_TARGET_MOBILE - -#ifdef LEO_TRY_AVX2 - // Compiler-specific 256-bit SIMD register keyword - #define LEO_M256 __m256i -#endif - -// Compiler-specific C++11 restrict keyword -#define LEO_RESTRICT __restrict - -// Compiler-specific force inline keyword -#ifdef _MSC_VER - #define LEO_FORCE_INLINE inline __forceinline -#else - #define LEO_FORCE_INLINE inline __attribute__((always_inline)) -#endif - -// Compiler-specific alignment keyword -// Note: Alignment only matters for ARM NEON where it should be 16 -#ifdef _MSC_VER - #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) -#else // _MSC_VER - #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) -#endif // _MSC_VER - - -//------------------------------------------------------------------------------ -// Runtime CPU Architecture Check -// -// Feature checks stolen shamelessly from -// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c - -#if defined(HAVE_ANDROID_GETCPUFEATURES) - #include -#endif - -#if defined(LEO_TRY_NEON) -# if defined(IOS) && defined(__ARM_NEON__) - // Requires iPhone 5S or newer - static const bool CpuHasNeon = true; - static const bool CpuHasNeon64 = true; -# else - // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures - static bool CpuHasNeon = false; // V6 / V7 - static bool CpuHasNeon64 = false; // 64-bit -# endif -#endif - - -#if !defined(LEO_TARGET_MOBILE) - -#ifdef _MSC_VER - #include // __cpuid - #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX -#endif - -#ifdef LEO_TRY_AVX2 -static bool CpuHasAVX2 = false; -#endif -static bool CpuHasSSSE3 = false; - -#define CPUID_EBX_AVX2 
0x00000020 -#define CPUID_ECX_SSSE3 0x00000200 - -static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) -{ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) - __cpuid((int *) cpu_info, cpu_info_type); -#else //if defined(HAVE_CPUID) - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -# ifdef __i386__ - __asm__ __volatile__ ("pushfl; pushfl; " - "popl %0; " - "movl %0, %1; xorl %2, %0; " - "pushl %0; " - "popfl; pushfl; popl %0; popfl" : - "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : - "i" (0x200000)); - if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { - return; /* LCOV_EXCL_LINE */ - } -# endif -# ifdef __i386__ - __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# elif defined(__x86_64__) - __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# else - __asm__ __volatile__ ("cpuid" : - "=a" (cpu_info[0]), "=b" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# endif -#endif -} - -#endif // defined(LEO_TARGET_MOBILE) - - -static void leo_architecture_init() -{ -#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) - AndroidCpuFamily family = android_getCpuFamily(); - if (family == ANDROID_CPU_FAMILY_ARM) - { - if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) - CpuHasNeon = true; - } - else if (family == ANDROID_CPU_FAMILY_ARM64) - { - CpuHasNeon = true; - if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) - CpuHasNeon64 = true; - } -#endif - -#if !defined(LEO_TARGET_MOBILE) - unsigned int cpu_info[4]; - - _cpuid(cpu_info, 1); - CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); - -#if defined(LEO_TRY_AVX2) - _cpuid(cpu_info, 7); - CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); -#endif // LEO_TRY_AVX2 - -#endif // LEO_TARGET_MOBILE -} - - -//------------------------------------------------------------------------------ -// SIMD-Safe Aligned Memory Allocations - -static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; - -LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) -{ - return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); -} - -static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) -{ - uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); - if (!data) - return nullptr; - unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); - data += kAlignmentBytes - offset; - data[-1] = (uint8_t)offset; - return data; -} - -static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) -{ - if (!ptr) - return; - uint8_t* data = (uint8_t*)ptr; - unsigned offset = data[-1]; - if (offset >= kAlignmentBytes) - { - LEO_DEBUG_BREAK; // Should never happen - return; - } - data -= kAlignmentBytes - offset; - free(data); -} - - -//------------------------------------------------------------------------------ -// Field - -//#define LEO_SHORT_FIELD - -#ifdef LEO_SHORT_FIELD -typedef uint8_t GFSymbol; -static const unsigned kGFBits = 8; -static const unsigned kGFPolynomial = 0x11D; -GFSymbol kGFBasis[kGFBits] = { - 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis -}; -#else -typedef uint16_t GFSymbol; -static const unsigned kGFBits = 16; -static const unsigned kGFPolynomial = 0x1002D; -GFSymbol kGFBasis[kGFBits] = { +// Basis used for 
generating logarithm tables +static const ffe_t kBasis[kBits] = { 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis 0xC582, 0xED2E, 0x914C, 0x4012, 0x6C98, 0x10D8, 0x6A72, 0xB900, 0xFDB8, 0xFB34, 0xFF38, 0x991E }; -#endif - -/* - Cantor Basis introduced by: - D. G. Cantor, "On arithmetical algorithms over finite fields", - Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. -*/ - -static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size -static const unsigned kFieldModulus = kFieldSize - 1; - -static GFSymbol GFLog[kFieldSize]; -static GFSymbol GFExp[kFieldSize]; - -// Initialize GFLog[], GFExp[] -static void InitField() -{ - unsigned state = 1; - for (unsigned i = 0; i < kFieldModulus; ++i) - { - GFExp[state] = static_cast(i); - state <<= 1; - if (state >= kFieldSize) - state ^= kGFPolynomial; - } - GFExp[0] = kFieldModulus; - - // Conversion to chosen basis: - - GFLog[0] = 0; - for (unsigned i = 0; i < kGFBits; ++i) - { - const GFSymbol basis = kGFBasis[i]; - const unsigned width = (unsigned)(1UL << i); - - for (unsigned j = 0; j < width; ++j) - GFLog[j + width] = GFLog[j] ^ basis; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - GFLog[i] = GFExp[GFLog[i]]; - - for (unsigned i = 0; i < kFieldSize; ++i) - GFExp[GFLog[i]] = i; - - GFExp[kFieldModulus] = GFExp[0]; -} //------------------------------------------------------------------------------ -// Mod Q Field Operations -// -// Q is the maximum symbol value, e.g. 255 or 65535. +// Field Operations -// z = x + y (mod Q) -static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b) +// z = x + y (mod kModulus) +static inline ffe_t AddMod(const ffe_t a, const ffe_t b) { const unsigned sum = (unsigned)a + b; - // Partial reduction step, allowing for Q to be returned - return static_cast(sum + (sum >> kGFBits)); + // Partial reduction step, allowing for kModulus to be returned + return static_cast(sum + (sum >> kBits)); } -// z = x - y (mod Q) -static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) +// z = x - y (mod kModulus) +static inline ffe_t SubMod(const ffe_t a, const ffe_t b) { const unsigned dif = (unsigned)a - b; - // Partial reduction step, allowing for Q to be returned - return static_cast(dif + (dif >> kGFBits)); -} - -// vx[] += vy[] * z -static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) -{ - for (unsigned i = 0; i < symbolCount; ++i) - { - const GFSymbol a = vy[i]; - if (a == 0) - continue; - - GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); - GFSymbol value1 = GFExp[sum1]; - if ((a & 0x0f) == 0) - { - value1 = 0; - } - GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); - GFSymbol value2 = GFExp[sum2]; - if ((a & 0xf0) == 0) - { - value2 = 0; - } - GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); - GFSymbol value3 = GFExp[sum3]; - if ((a & 0x0f00) == 0) - { - value3 = 0; - } - GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); - GFSymbol value4 = GFExp[sum4]; - if ((a & 0xf000) == 0) - { - value4 = 0; - } - - vx[i] ^= value1; - vx[i] ^= value2; - vx[i] ^= value3; - vx[i] ^= value4; - } -} - -// return a*GFExp[b] over GF(2^r) -static GFSymbol mulE(GFSymbol a, GFSymbol b) -{ - if (a == 0) - return 0; - - const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); - return GFExp[sum]; + // Partial reduction step, allowing for kModulus to be returned + return static_cast(dif + (dif >> kBits)); } //------------------------------------------------------------------------------ -// Fast 
Walsh-Hadamard Transform (FWHT) Mod Q -// -// Q is the maximum symbol value, e.g. 255 or 65535. +// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus) -// Define this to enable the optimized version of FWHT() -#define LEO_FWHT_OPTIMIZED - -typedef GFSymbol fwht_t; +#if defined(LEO_FF16_FWHT_OPTIMIZED) // {a, b} = {a + b, a - b} (Mod Q) -static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) +static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b) { - const fwht_t sum = AddModQ(a, b); - const fwht_t dif = SubModQ(a, b); + const ffe_t sum = AddMod(a, b); + const ffe_t dif = SubMod(a, b); a = sum; b = dif; } -/* - FWHT is a minor slice of the runtime and does not grow with data size, - but I did attempt a few additional optimizations that failed: - - I've attempted to vectorize (with partial reductions) FWHT_4(data, s), - which is 70% of the algorithm, but it was slower. Left in _attic_. - - I've attempted to avoid reductions in all or parts of the FWHT. - The final modular reduction ends up being slower than the savings. - Specifically I tried doing it for the whole FWHT and also I tried - doing it just for the FWHT_2 loop in the main routine, but both - approaches are slower than partial reductions. - - Replacing word reads with wider reads does speed up the operation, but - at too high a complexity cost relative to minor perf improvement. -*/ - -#ifndef LEO_FWHT_OPTIMIZED - -// Reference implementation -static void FWHT(fwht_t* data, const unsigned bits) +static LEO_FORCE_INLINE void FWHT_4(ffe_t* data) { - const unsigned size = (unsigned)(1UL << bits); - for (unsigned width = 1; width < size; width <<= 1) - for (unsigned i = 0; i < size; i += (width << 1)) - for (unsigned j = i; j < (width + i); ++j) - FWHT_2(data[j], data[j + width]); -} - -#else - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; + ffe_t t0 = data[0]; + ffe_t t1 = data[1]; + ffe_t t2 = data[2]; + ffe_t t3 = data[3]; FWHT_2(t0, t1); FWHT_2(t2, t3); FWHT_2(t0, t2); @@ -524,13 +105,13 @@ static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) data[3] = t3; } -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) +static LEO_FORCE_INLINE void FWHT_4(ffe_t* data, unsigned s) { unsigned x = 0; - fwht_t t0 = data[x]; x += s; - fwht_t t1 = data[x]; x += s; - fwht_t t2 = data[x]; x += s; - fwht_t t3 = data[x]; + ffe_t t0 = data[x]; x += s; + ffe_t t1 = data[x]; x += s; + ffe_t t2 = data[x]; x += s; + ffe_t t3 = data[x]; FWHT_2(t0, t1); FWHT_2(t2, t3); FWHT_2(t0, t2); @@ -542,16 +123,16 @@ static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) data[y] = t3; } -static inline void FWHT_8(fwht_t* data) +static inline void FWHT_8(ffe_t* data) { - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; + ffe_t t0 = data[0]; + ffe_t t1 = data[1]; + ffe_t t2 = data[2]; + ffe_t t3 = data[3]; + ffe_t t4 = data[4]; + ffe_t t5 = data[5]; + ffe_t t6 = data[6]; + ffe_t t7 = data[7]; FWHT_2(t0, t1); FWHT_2(t2, t3); FWHT_2(t4, t5); @@ -574,24 +155,24 @@ static inline void FWHT_8(fwht_t* data) data[7] = t7; } -static inline void FWHT_16(fwht_t* data) +static inline void FWHT_16(ffe_t* data) { - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = 
data[7]; - fwht_t t8 = data[8]; - fwht_t t9 = data[9]; - fwht_t t10 = data[10]; - fwht_t t11 = data[11]; - fwht_t t12 = data[12]; - fwht_t t13 = data[13]; - fwht_t t14 = data[14]; - fwht_t t15 = data[15]; + ffe_t t0 = data[0]; + ffe_t t1 = data[1]; + ffe_t t2 = data[2]; + ffe_t t3 = data[3]; + ffe_t t4 = data[4]; + ffe_t t5 = data[5]; + ffe_t t6 = data[6]; + ffe_t t7 = data[7]; + ffe_t t8 = data[8]; + ffe_t t9 = data[9]; + ffe_t t10 = data[10]; + ffe_t t11 = data[11]; + ffe_t t12 = data[12]; + ffe_t t13 = data[13]; + ffe_t t14 = data[14]; + ffe_t t15 = data[15]; FWHT_2(t0, t1); FWHT_2(t2, t3); FWHT_2(t4, t5); @@ -642,7 +223,7 @@ static inline void FWHT_16(fwht_t* data) data[15] = t15; } -static void FWHT_SmallData(fwht_t* data, unsigned ldn) +static void FWHT_SmallData(ffe_t* data, unsigned ldn) { const unsigned n = (1UL << ldn); @@ -675,7 +256,7 @@ static void FWHT_SmallData(fwht_t* data, unsigned ldn) } // Decimation in time (DIT) version -static void FWHT(fwht_t* data, const unsigned ldn) +static void FWHT(ffe_t* data, const unsigned ldn) { if (ldn <= 13) { @@ -698,523 +279,774 @@ static void FWHT(fwht_t* data, const unsigned ldn) } } -#endif +#else // LEO_FF16_FWHT_OPTIMIZED + +// Reference implementation +void FWHT(ffe_t* data, const unsigned bits) +{ + const unsigned size = (unsigned)(1UL << bits); + for (unsigned width = 1; width < size; width <<= 1) + for (unsigned i = 0; i < size; i += (width << 1)) + for (unsigned j = i; j < (width + i); ++j) + FWHT_2(data[j], data[j + width]); +} + +#endif // LEO_FF16_FWHT_OPTIMIZED + +// Transform specialized for the finite field order +void FWHT(ffe_t data[kOrder]) +{ + FWHT(data, kBits); +} //------------------------------------------------------------------------------ -// Memory Buffer XOR +// Logarithm Tables -static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) +static ffe_t LogLUT[kOrder]; +static ffe_t ExpLUT[kOrder]; + + +// Initialize LogLUT[], ExpLUT[] +static void InitializeLogarithmTables() { - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); + // LFSR table generation: -#if defined(LEO_TARGET_MOBILE) -# if defined(LEO_TRY_NEON) - // Handle multiples of 64 bytes - if (CpuHasNeon) + unsigned state = 1; + for (unsigned i = 0; i < kModulus; ++i) { - while (bytes >= 64) + ExpLUT[state] = static_cast(i); + state <<= 1; + if (state >= kOrder) + state ^= kPolynomial; + } + ExpLUT[0] = kModulus; + + // Conversion to chosen basis: + + LogLUT[0] = 0; + for (unsigned i = 0; i < kBits; ++i) + { + const ffe_t basis = kBasis[i]; + const unsigned width = static_cast(1UL << i); + + for (unsigned j = 0; j < width; ++j) + LogLUT[j + width] = LogLUT[j] ^ basis; + } + + for (unsigned i = 0; i < kOrder; ++i) + LogLUT[i] = ExpLUT[LogLUT[i]]; + + for (unsigned i = 0; i < kOrder; ++i) + ExpLUT[LogLUT[i]] = i; + + ExpLUT[kModulus] = ExpLUT[0]; +} + +//------------------------------------------------------------------------------ +// Multiplies + +/* + Muladd implementation notes: + + Specialize for 1-3 rows at a time since often times we're multiplying by + the same (skew) value repeatedly, as the ISA-L library does here: + + https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 + + Except we should be doing it for 16-bit Galois Field. 
+    To implement that use the ALTMAP trick from Jerasure:
+
+    http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140
+
+    Except we should also support AVX2 since that is a 40% perf boost, so put
+    the high and low bytes 32 bytes instead of 16 bytes apart.
+
+    Also I think we should go ahead and precompute the multiply tables since
+    it avoids a bunch of memory lookups for each muladd, and only costs 8 MB.
+*/
+
+// We require memory to be aligned since the SIMD instructions benefit from
+// or require aligned accesses to the table data.
+struct {
+    LEO_ALIGNED LEO_M128 Lo[65536];
+    LEO_ALIGNED LEO_M128 Hi[65536];
+} static Multiply128LUT;
+#if defined(LEO_TRY_AVX2)
+struct {
+    LEO_ALIGNED LEO_M256 Lo[65536];
+    LEO_ALIGNED LEO_M256 Hi[65536];
+} static Multiply256LUT;
+#endif // LEO_TRY_AVX2
+
+// Returns a * b
+static ffe_t FFEMultiply(ffe_t a, ffe_t b)
+{
+    if (a == 0 || b == 0)
+        return 0;
+    return ExpLUT[AddMod(LogLUT[a], LogLUT[b])];
+}
+
+// Returns a * Exp(log_b)
+static ffe_t FFEMultiplyLog(ffe_t a, ffe_t log_b)
+{
+    if (a == 0)
+        return 0;
+    return ExpLUT[AddMod(LogLUT[a], log_b)];
+}
+
+bool InitializeMultiplyTables()
+{
+    for (int y = 0; y < 256; ++y)
+    {
+        // FIXME: Byte tables cannot hold 16-bit products; this still needs
+        // the ALTMAP layout described in the notes above.
+        uint8_t lo[16], hi[16];
+        for (unsigned char x = 0; x < 16; ++x)
         {
-            LEO_M128 x0 = vld1q_u8(x16);
-            LEO_M128 x1 = vld1q_u8(x16 + 1);
-            LEO_M128 x2 = vld1q_u8(x16 + 2);
-            LEO_M128 x3 = vld1q_u8(x16 + 3);
-            LEO_M128 y0 = vld1q_u8(y16);
-            LEO_M128 y1 = vld1q_u8(y16 + 1);
-            LEO_M128 y2 = vld1q_u8(y16 + 2);
-            LEO_M128 y3 = vld1q_u8(y16 + 3);
-
-            vst1q_u8(x16, veorq_u8(x0, y0));
-            vst1q_u8(x16 + 1, veorq_u8(x1, y1));
-            vst1q_u8(x16 + 2, veorq_u8(x2, y2));
-            vst1q_u8(x16 + 3, veorq_u8(x3, y3));
-
-            bytes -= 64, x16 += 4, y16 += 4;
+            lo[x] = FFEMultiply(x, static_cast<ffe_t>(y));
+            hi[x] = FFEMultiply(x << 4, static_cast<ffe_t>(y));
         }
 
-        // Handle multiples of 16 bytes
-        while (bytes >= 16)
+        const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo);
+        const LEO_M128 table_hi = _mm_loadu_si128((LEO_M128*)hi);
+
+        _mm_storeu_si128(Multiply128LUT.Lo + y, table_lo);
+        _mm_storeu_si128(Multiply128LUT.Hi + y, table_hi);
+
+#if defined(LEO_TRY_AVX2)
+        if (CpuHasAVX2)
         {
-            LEO_M128 x0 = vld1q_u8(x16);
-            LEO_M128 y0 = vld1q_u8(y16);
-
-            vst1q_u8(x16, veorq_u8(x0, y0));
-
-            bytes -= 16, ++x16, ++y16;
+            _mm256_storeu_si256(Multiply256LUT.Lo + y,
+                _mm256_broadcastsi128_si256(table_lo));
+            _mm256_storeu_si256(Multiply256LUT.Hi + y,
+                _mm256_broadcastsi128_si256(table_hi));
         }
+#endif // LEO_TRY_AVX2
     }
-    else
-# endif // LEO_TRY_NEON
+
+    return true;
+}
+
+// vx[] = vy[] * m
+void mul_mem_set(
+    void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy,
+    ffe_t m, uint64_t bytes)
+{
+    if (m <= 1)
     {
-        uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
-        const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
-
-        const unsigned count = (unsigned)bytes / 8;
-        for (unsigned ii = 0; ii < count; ++ii)
-            x8[ii] ^= y8[ii];
-
-        x16 = reinterpret_cast<LEO_M128 *>(x8 + count);
-        y16 = reinterpret_cast<const LEO_M128 *>(y8 + count);
+        if (m == 1)
+            memcpy(vx, vy, bytes);
+        else
+            memset(vx, 0, bytes);
+        return;
     }
-#else // LEO_TARGET_MOBILE
-# if defined(LEO_TRY_AVX2)
+
+#if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
-        LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(x16);
-        const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(y16);
+        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m);
+        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m);
 
-        while (bytes >= 128)
+        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
+
+        LEO_M256 * LEO_RESTRICT z32 = reinterpret_cast<LEO_M256 *>(vx);
+        const LEO_M256 * LEO_RESTRICT x32 =
reinterpret_cast(vy); + + const unsigned count = bytes / 64; + for (unsigned i = 0; i < count; ++i) { - LEO_M256 x0 = _mm256_loadu_si256(x32); - LEO_M256 y0 = _mm256_loadu_si256(y32); - x0 = _mm256_xor_si256(x0, y0); - LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); - LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); - x1 = _mm256_xor_si256(x1, y1); - LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); - LEO_M256 y2 = _mm256_loadu_si256(y32 + 2); - x2 = _mm256_xor_si256(x2, y2); - LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); - LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); - x3 = _mm256_xor_si256(x3, y3); + LEO_M256 x0 = _mm256_loadu_si256(x32 + i * 2); + LEO_M256 l0 = _mm256_and_si256(x0, clr_mask); + x0 = _mm256_srli_epi64(x0, 4); + LEO_M256 h0 = _mm256_and_si256(x0, clr_mask); + l0 = _mm256_shuffle_epi8(table_lo_y, l0); + h0 = _mm256_shuffle_epi8(table_hi_y, h0); + _mm256_storeu_si256(z32 + i * 2, _mm256_xor_si256(l0, h0)); - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - _mm256_storeu_si256(x32 + 2, x2); - _mm256_storeu_si256(x32 + 3, x3); - - bytes -= 128, x32 += 4, y32 += 4; + LEO_M256 x1 = _mm256_loadu_si256(x32 + i * 2 + 1); + LEO_M256 l1 = _mm256_and_si256(x1, clr_mask); + x1 = _mm256_srli_epi64(x1, 4); + LEO_M256 h1 = _mm256_and_si256(x1, clr_mask); + l1 = _mm256_shuffle_epi8(table_lo_y, l1); + h1 = _mm256_shuffle_epi8(table_hi_y, h1); + _mm256_storeu_si256(z32 + i * 2 + 1, _mm256_xor_si256(l1, h1)); } + return; + } +#endif // LEO_TRY_AVX2 - // Handle multiples of 32 bytes - while (bytes >= 32) + const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m); + const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m); + + const LEO_M128 clr_mask = _mm_set1_epi8(0x0f); + + LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast (vx); + const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); + + do + { + LEO_M128 x3 = _mm_loadu_si128(y16 + 3); + LEO_M128 l3 = _mm_and_si128(x3, clr_mask); + x3 = _mm_srli_epi64(x3, 4); + LEO_M128 h3 = _mm_and_si128(x3, clr_mask); + l3 = _mm_shuffle_epi8(table_lo_y, l3); + h3 = _mm_shuffle_epi8(table_hi_y, h3); + + LEO_M128 x2 = _mm_loadu_si128(y16 + 2); + LEO_M128 l2 = _mm_and_si128(x2, clr_mask); + x2 = _mm_srli_epi64(x2, 4); + LEO_M128 h2 = _mm_and_si128(x2, clr_mask); + l2 = _mm_shuffle_epi8(table_lo_y, l2); + h2 = _mm_shuffle_epi8(table_hi_y, h2); + + LEO_M128 x1 = _mm_loadu_si128(y16 + 1); + LEO_M128 l1 = _mm_and_si128(x1, clr_mask); + x1 = _mm_srli_epi64(x1, 4); + LEO_M128 h1 = _mm_and_si128(x1, clr_mask); + l1 = _mm_shuffle_epi8(table_lo_y, l1); + h1 = _mm_shuffle_epi8(table_hi_y, h1); + + LEO_M128 x0 = _mm_loadu_si128(y16); + LEO_M128 l0 = _mm_and_si128(x0, clr_mask); + x0 = _mm_srli_epi64(x0, 4); + LEO_M128 h0 = _mm_and_si128(x0, clr_mask); + l0 = _mm_shuffle_epi8(table_lo_y, l0); + h0 = _mm_shuffle_epi8(table_hi_y, h0); + + _mm_storeu_si128(x16 + 3, _mm_xor_si128(l3, h3)); + _mm_storeu_si128(x16 + 2, _mm_xor_si128(l2, h2)); + _mm_storeu_si128(x16 + 1, _mm_xor_si128(l1, h1)); + _mm_storeu_si128(x16, _mm_xor_si128(l0, h0)); + + x16 += 4, y16 += 4; + bytes -= 64; + } while (bytes > 0); +} + +// vx0[] *= m, vx1[] *= m +void mul_mem2_inplace( + void * LEO_RESTRICT vx_0, + void * LEO_RESTRICT vx_1, + ffe_t m, uint64_t bytes) +{ + if (m <= 1) + { + if (m == 0) { - // x[i] = x[i] xor y[i] - _mm256_storeu_si256(x32, - _mm256_xor_si256( - _mm256_loadu_si256(x32), - _mm256_loadu_si256(y32))); - - bytes -= 32, ++x32, ++y32; + memset(vx_0, 0, bytes); + memset(vx_1, 0, bytes); } - - x16 = reinterpret_cast(x32); - y16 = reinterpret_cast(y32); + return; } 
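// A minimal scalar sketch (not part of this patch) of the nibble-table
// technique the SSSE3/AVX2 loops in mul_mem_set() implement with
// _mm_shuffle_epi8. Shown over the 8-bit field (kGFPolynomial = 0x11D,
// the LEO_SHORT_FIELD configuration) for clarity; the 16-bit variant
// needs the ALTMAP byte layout described in the muladd notes. All names
// below (InitTables8, Multiply8, mul_mem_set_ref) are hypothetical.

#include <cstdint>

static uint8_t ExpTab8[512], LogTab8[256];

static void InitTables8()
{
    unsigned state = 1;
    for (unsigned i = 0; i < 255; ++i)
    {
        ExpTab8[i] = static_cast<uint8_t>(state);
        LogTab8[state] = static_cast<uint8_t>(i);
        state <<= 1;
        if (state >= 256)
            state ^= 0x11D; // kGFPolynomial for the 8-bit field
    }
    for (unsigned i = 255; i < 512; ++i)
        ExpTab8[i] = ExpTab8[i - 255]; // wrap so Multiply8 needs no modulo
}

static uint8_t Multiply8(uint8_t a, uint8_t b)
{
    if (a == 0 || b == 0)
        return 0;
    return ExpTab8[LogTab8[a] + LogTab8[b]];
}

// out[i] = in[i] * m: the scalar equivalent of one shuffle-based lane.
static void mul_mem_set_ref(uint8_t* out, const uint8_t* in, uint8_t m, unsigned bytes)
{
    // The same two 16-entry tables the SIMD code keeps in registers:
    uint8_t lo[16], hi[16];
    for (unsigned x = 0; x < 16; ++x)
    {
        lo[x] = Multiply8(static_cast<uint8_t>(x), m);      // low-nibble products
        hi[x] = Multiply8(static_cast<uint8_t>(x << 4), m); // high-nibble products
    }

    // Multiplication distributes over XOR in GF(2^q), so
    // in[i] * m == (in[i] & 0x0f) * m ^ (in[i] & 0xf0) * m.
    for (unsigned i = 0; i < bytes; ++i)
        out[i] = lo[in[i] & 0x0f] ^ hi[in[i] >> 4];
}

// Two 16-entry tables fit in a single SIMD register each, which is what
// makes the shuffle-based lookup possible: mul_mem_set_ref() agrees with
// Multiply8(in[i], m) applied byte-by-byte.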
- else -# endif // LEO_TRY_AVX2 + +#if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) { - while (bytes >= 64) + const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m); + const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m); + + const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f); + + LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast(vx_0); + LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast(vx_1); + + do { - LEO_M128 x0 = _mm_loadu_si128(x16); - LEO_M128 y0 = _mm_loadu_si128(y16); - x0 = _mm_xor_si128(x0, y0); - LEO_M128 x1 = _mm_loadu_si128(x16 + 1); - LEO_M128 y1 = _mm_loadu_si128(y16 + 1); - x1 = _mm_xor_si128(x1, y1); - LEO_M128 x2 = _mm_loadu_si128(x16 + 2); - LEO_M128 y2 = _mm_loadu_si128(y16 + 2); - x2 = _mm_xor_si128(x2, y2); - LEO_M128 x3 = _mm_loadu_si128(x16 + 3); - LEO_M128 y3 = _mm_loadu_si128(y16 + 3); - x3 = _mm_xor_si128(x3, y3); + LEO_M256 x0_0 = _mm256_loadu_si256(x32_0 + 1); + LEO_M256 l0_0 = _mm256_and_si256(x0_0, clr_mask); + x0_0 = _mm256_srli_epi64(x0_0, 4); + LEO_M256 h0_0 = _mm256_and_si256(x0_0, clr_mask); + l0_0 = _mm256_shuffle_epi8(table_lo_y, l0_0); + h0_0 = _mm256_shuffle_epi8(table_hi_y, h0_0); + l0_0 = _mm256_xor_si256(l0_0, h0_0); - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); + LEO_M256 x1_0 = _mm256_loadu_si256(x32_0); + LEO_M256 l1_0 = _mm256_and_si256(x1_0, clr_mask); + x1_0 = _mm256_srli_epi64(x1_0, 4); + LEO_M256 h1_0 = _mm256_and_si256(x1_0, clr_mask); + l1_0 = _mm256_shuffle_epi8(table_lo_y, l1_0); + h1_0 = _mm256_shuffle_epi8(table_hi_y, h1_0); + l1_0 = _mm256_xor_si256(l1_0, h1_0); - bytes -= 64, x16 += 4, y16 += 4; - } + LEO_M256 x0_1 = _mm256_loadu_si256(x32_1 + 1); + LEO_M256 l0_1 = _mm256_and_si256(x0_1, clr_mask); + x0_1 = _mm256_srli_epi64(x0_1, 4); + LEO_M256 h0_1 = _mm256_and_si256(x0_1, clr_mask); + l0_1 = _mm256_shuffle_epi8(table_lo_y, l0_1); + h0_1 = _mm256_shuffle_epi8(table_hi_y, h0_1); + l0_1 = _mm256_xor_si256(l0_1, h0_1); + + LEO_M256 x1_1 = _mm256_loadu_si256(x32_1); + LEO_M256 l1_1 = _mm256_and_si256(x1_1, clr_mask); + x1_1 = _mm256_srli_epi64(x1_1, 4); + LEO_M256 h1_1 = _mm256_and_si256(x1_1, clr_mask); + l1_1 = _mm256_shuffle_epi8(table_lo_y, l1_1); + h1_1 = _mm256_shuffle_epi8(table_hi_y, h1_1); + l1_1 = _mm256_xor_si256(l1_1, h1_1); + + _mm256_storeu_si256(x32_0 + 1, l0_0); + _mm256_storeu_si256(x32_0, l1_0); + _mm256_storeu_si256(x32_1 + 1, l0_1); + _mm256_storeu_si256(x32_1, l1_1); + + x32_0 += 2; + x32_1 += 2; + bytes -= 64; + } while (bytes > 0); + return; } -#endif // LEO_TARGET_MOBILE +#endif // LEO_TRY_AVX2 - // Handle multiples of 16 bytes - while (bytes >= 16) + const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m); + const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m); + + const LEO_M128 clr_mask = _mm_set1_epi8(0x0f); + + LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast(vx_0); + LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast(vx_1); + + do { - // x[i] = x[i] xor y[i] - _mm_storeu_si128(x16, - _mm_xor_si128( - _mm_loadu_si128(x16), - _mm_loadu_si128(y16))); + LEO_M128 x3 = _mm_loadu_si128(x16_0 + 3); + LEO_M128 l3 = _mm_and_si128(x3, clr_mask); + x3 = _mm_srli_epi64(x3, 4); + LEO_M128 h3 = _mm_and_si128(x3, clr_mask); + l3 = _mm_shuffle_epi8(table_lo_y, l3); + h3 = _mm_shuffle_epi8(table_hi_y, h3); - bytes -= 16, ++x16, ++y16; - } + LEO_M128 x2 = _mm_loadu_si128(x16_0 + 2); + LEO_M128 l2 = _mm_and_si128(x2, clr_mask); + x2 = _mm_srli_epi64(x2, 4); + LEO_M128 h2 = _mm_and_si128(x2, 
clr_mask); + l2 = _mm_shuffle_epi8(table_lo_y, l2); + h2 = _mm_shuffle_epi8(table_hi_y, h2); - uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); - const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); + LEO_M128 x1 = _mm_loadu_si128(x16_0 + 1); + LEO_M128 l1 = _mm_and_si128(x1, clr_mask); + x1 = _mm_srli_epi64(x1, 4); + LEO_M128 h1 = _mm_and_si128(x1, clr_mask); + l1 = _mm_shuffle_epi8(table_lo_y, l1); + h1 = _mm_shuffle_epi8(table_hi_y, h1); - // Handle a block of 8 bytes - const unsigned eight = bytes & 8; - if (eight) - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); - *x8 ^= *y8; - } + LEO_M128 x0 = _mm_loadu_si128(x16_0); + LEO_M128 l0 = _mm_and_si128(x0, clr_mask); + x0 = _mm_srli_epi64(x0, 4); + LEO_M128 h0 = _mm_and_si128(x0, clr_mask); + l0 = _mm_shuffle_epi8(table_lo_y, l0); + h0 = _mm_shuffle_epi8(table_hi_y, h0); - // Handle a block of 4 bytes - const unsigned four = bytes & 4; - if (four) - { - uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); - const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); - *x4 ^= *y4; - } + _mm_storeu_si128(x16_0 + 3, _mm_xor_si128(l3, h3)); + _mm_storeu_si128(x16_0 + 2, _mm_xor_si128(l2, h2)); + _mm_storeu_si128(x16_0 + 1, _mm_xor_si128(l1, h1)); + _mm_storeu_si128(x16_0, _mm_xor_si128(l0, h0)); - // Handle final bytes - const unsigned offset = eight + four; - switch (bytes & 3) - { - case 3: x1[offset + 2] ^= y1[offset + 2]; - case 2: x1[offset + 1] ^= y1[offset + 1]; - case 1: x1[offset] ^= y1[offset]; - default: - break; - } + // FIXME: Add second one here + + x16_0 += 4; + x16_1 += 4; + bytes -= 64; + } while (bytes > 0); } //------------------------------------------------------------------------------ -// Formal Derivative +// FFT Operations -// Formal derivative of polynomial in the new basis -static void formal_derivative(GFSymbol* cos, const unsigned size) +// x[] ^= y[] * m, y[] ^= x[] +void fft_butterfly( + void * LEO_RESTRICT x, void * LEO_RESTRICT y, + ffe_t m, uint64_t bytes) { - for (unsigned i = 1; i < size; ++i) - { - const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; - // If a large number of values are being XORed: - if (leng >= 8) - xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); - else - for (unsigned j = i - leng; j < i; j++) - cos[j] ^= cos[j + leng]; - } +} + +// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] +void fft_butterfly2( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + ffe_t m, uint64_t bytes) +{ + +} + +// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] +void fft_butterfly3( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, + ffe_t m, uint64_t bytes) +{ - for (unsigned i = size; i < kFieldSize; i <<= 1) - xor_mem(cos, cos + i, size * sizeof(GFSymbol)); } //------------------------------------------------------------------------------ -// Fast Fourier Transform +// IFFT Operations -static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT - -// IFFT in the proposed basis -static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) +// y[] ^= x[], x[] ^= y[] * m +void ifft_butterfly( + void * LEO_RESTRICT x, void * LEO_RESTRICT y, + ffe_t m, uint64_t bytes) { - for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - // If a large number of 
values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - } - } } -// FFT in the proposed basis -static void FLT(GFSymbol* data, const unsigned size, const unsigned index) +// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m +void ifft_butterfly2( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + ffe_t m, uint64_t bytes) { - for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - const GFSymbol skew = skewVec[j + index - 1]; - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); +} + +// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m +void ifft_butterfly3( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, + ffe_t m, uint64_t bytes) +{ - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - } - } } //------------------------------------------------------------------------------ -// FFT Initialization +// FFT -static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative -static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial +static ffe_t FFTSkew[kFieldModulus]; // twisted factors used in FFT +static ffe_t LogWalsh[kOrder]; // factors used in the evaluation of the error locator polynomial -// Initialize skewVec[], B[], log_walsh[] -static void InitFieldOperations() +void FFTInitialize() { - GFSymbol temp[kGFBits - 1]; + ffe_t temp[kBits - 1]; - for (unsigned i = 1; i < kGFBits; ++i) - temp[i - 1] = (GFSymbol)((unsigned)1 << i); + for (unsigned i = 1; i < kBits; ++i) + temp[i - 1] = (ffe_t)((unsigned)1 << i); - for (unsigned m = 0; m < (kGFBits - 1); ++m) + for (unsigned m = 0; m < (kBits - 1); ++m) { const unsigned step = (unsigned)1 << (m + 1); - skewVec[((unsigned)1 << m) - 1] = 0; + FFTSkew[((unsigned)1 << m) - 1] = 0; - for (unsigned i = m; i < (kGFBits - 1); ++i) + for (unsigned i = m; i < (kBits - 1); ++i) { const unsigned s = ((unsigned)1 << (i + 1)); for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) - skewVec[j + s] = skewVec[j] ^ temp[i]; + FFTSkew[j + s] = FFTSkew[j] ^ temp[i]; } - temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; + // TBD: This can be cleaned up + temp[m] = kFieldModulus - LogLUT[FFEMultiply(temp[m], temp[m] ^ 1)]; - for (unsigned i = m + 1; i < (kGFBits - 1); ++i) - temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); + for (unsigned i = m + 1; i < (kBits - 1); ++i) + temp[i] = FFEMultiplyLog(temp[i], (LogLUT[temp[i] ^ 1] + temp[m]) % kFieldModulus); } - for (unsigned i = 0; i < kFieldSize; ++i) - skewVec[i] = GFLog[skewVec[i]]; + for (unsigned i = 0; i < kOrder; ++i) + FFTSkew[i] = LogLUT[FFTSkew[i]]; temp[0] = kFieldModulus - temp[0]; - for (unsigned i = 1; i < (kGFBits - 1); ++i) + for (unsigned i = 1; i < (kBits - 1); ++i) temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; - B[0] = 
0; - for (unsigned i = 0; i < (kGFBits - 1); ++i) - { - const unsigned depart = ((unsigned)1 << i); + for (unsigned i = 0; i < kOrder; ++i) + LogWalsh[i] = LogLUT[i]; - for (unsigned j = 0; j < depart; ++j) - B[j + depart] = (B[j] + temp[i]) % kFieldModulus; - } + LogWalsh[0] = 0; - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh[i] = GFLog[i]; - - log_walsh[0] = 0; - - FWHT(log_walsh, kGFBits); + FWHT(LogWalsh, kBits); } //------------------------------------------------------------------------------ -// Encoder +// Encode -// Encoding alg for k/n<0.5: message is a power of two -static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) +void Encode( + uint64_t buffer_bytes, + unsigned original_count, + unsigned recovery_count, + unsigned m, + void* const * const data, + void** work) { - memcpy(codeword, data, sizeof(GFSymbol) * k); + // work <- data - IFLT(codeword, k, 0); + // FIXME: Unroll first loop to eliminate this + for (unsigned i = 0; i < m; ++i) + memcpy(work[i], data[i], buffer_bytes); - for (unsigned i = k; i < kFieldSize; i += k) + // work <- IFFT(data, m, m) + + for (unsigned width = 1; width < m; width <<= 1) { - memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); - - FLT(&codeword[i], k, i); - } - - memcpy(codeword, data, sizeof(GFSymbol) * k); -} - -// Encoding alg for k/n>0.5: parity is a power of two. -// data: message array. parity: parity array. mem: buffer(size>= n-k) -static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) -{ - const unsigned t = kFieldSize - k; - - memset(parity, 0, sizeof(GFSymbol) * t); - - for (unsigned i = t; i < kFieldSize; i += t) - { - memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); - - IFLT(mem, t, i); - - xor_mem(parity, mem, t * sizeof(GFSymbol)); - } - - FLT(parity, t, 0); -} - - -//------------------------------------------------------------------------------ -// Decoder - -static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) -{ - fwht_t log_walsh2[kFieldSize]; - - // Compute the evaluations of the error locator polynomial - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = erasure[i] ? 1 : 0; - - FWHT(log_walsh2, kGFBits); - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; - - FWHT(log_walsh2, kGFBits); - - // k2 can be replaced with k - const unsigned k2 = kFieldSize; - //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? 
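// A hedged sketch of what the IFFT pass being assembled here performs.
// Per the contracts in LeopardFF16.h, ifft_butterfly(x, y, skew) applies
// y ^= x and then x ^= y * Exp(skew); when skew equals kFieldModulus (the
// log-of-zero sentinel) the multiply is skipped, which is why the loop
// falls back to plain xor_mem(). Unrolled for m = 4 with index offset m,
// using skew index j + m - 1 at each pair start j, it performs exactly:
//
//     width = 1:  ifft_butterfly(work[0], work[1], FFTSkew[4], buffer_bytes);
//                 ifft_butterfly(work[2], work[3], FFTSkew[6], buffer_bytes);
//     width = 2:  ifft_butterfly(work[0], work[2], FFTSkew[5], buffer_bytes);
//                 ifft_butterfly(work[1], work[3], FFTSkew[5], buffer_bytes);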
- - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) + for (unsigned j = width; j < m; j += (width << 1)) { - codeword[i] = 0; - } - else - { - codeword[i] = mulE(codeword[i], log_walsh2[i]); - } - } + const ffe_t skew = FFTSkew[j + m - 1]; - IFLT(codeword, kFieldSize, 0); - - // formal derivative - for (unsigned i = 0; i < kFieldSize; i += 2) - { - codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); - } - - formal_derivative(codeword, k2); - - for (unsigned i = 0; i < k2; i += 2) - { - codeword[i] = mulE(codeword[i], B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); - } - - FLT(codeword, k2, 0); - - for (unsigned i = 0; i < k2; ++i) - { - if (erasure[i]) - { - codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); - } - } -} - - -//------------------------------------------------------------------------------ -// Test Application - -void test(unsigned k, unsigned seed) -{ - srand(seed); - - //-----------Generating message---------- - - // Message array - GFSymbol data[kFieldSize] = {0}; - - // Filled with random numbers - for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) - data[i] = (GFSymbol)rand(); - - - //---------encoding---------- - - GFSymbol codeword[kFieldSize]; - encodeH(&data[kFieldSize - k], k, data, codeword); - //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? - - memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); - - - //--------erasure simulation--------- - - // Array indicating erasures - bool erasure[kFieldSize] = { - false - }; - - for (unsigned i = k; i < kFieldSize; ++i) - erasure[i] = true; - - // permuting the erasure array - for (unsigned i = kFieldSize - 1; i > 0; --i) - { - unsigned pos = rand() % (i + 1); - - if (i != pos) - { - bool tmp = erasure[i]; - erasure[i] = erasure[pos]; - erasure[pos] = tmp; - } - } - - // erasure codeword symbols - for (unsigned i = 0; i < kFieldSize; ++i) - if (erasure[i]) - codeword[i] = 0; - - - //---------main processing---------- - decode(codeword, k, erasure); - - // Check the correctness of the result - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i] == 1) - { - if (data[i] != codeword[i]) + if (skew != kFieldModulus) { - printf("Decoding Error with seed = %d!\n", seed); - LEO_DEBUG_BREAK; - return; + for (unsigned i = j - width; i < j; ++i) + ifft_butterfly(work[i], work[i + width], skew, buffer_bytes); + } + else + { + for (unsigned i = j - width; i < j; ++i) + xor_mem(work[i + width], work[i], buffer_bytes); } } } - //printf("Decoding is successful!\n"); + for (unsigned i = m; i + m <= original_count; i += m) + { + // temp <- data + i + + void** temp = work + m; + + // FIXME: Unroll first loop to eliminate this + for (unsigned j = 0; j < m; ++j) + memcpy(temp[j], data[j], buffer_bytes); + + // temp <- IFFT(temp, m, m + i) + + for (unsigned width = 1; width < m; width <<= 1) + { + for (unsigned j = width; j < m; j += (width << 1)) + { + const ffe_t skew = FFTSkew[j + m + i - 1]; + + if (skew != kFieldModulus) + { + for (unsigned k = j - width; k < j; ++k) + ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes); + } + else + { + for (unsigned k = j - width; k < j; ++k) + xor_mem(temp[k + width], temp[k], buffer_bytes); + } + } + } + + // work <- work XOR temp + + // FIXME: Unroll last loop to eliminate this + for (unsigned j = 0; j < m; ++j) + xor_mem(work[j], temp[j], buffer_bytes); + } + + const unsigned last_count = original_count 
% m;
+    if (last_count != 0)
+    {
+        const unsigned i = original_count - last_count;
+
+        // temp <- data + i
+
+        void** temp = work + m;
+
+        for (unsigned j = 0; j < last_count; ++j)
+            memcpy(temp[j], data[i + j], buffer_bytes);
+        for (unsigned j = last_count; j < m; ++j)
+            memset(temp[j], 0, buffer_bytes);
+
+        // temp <- IFFT(temp, m, m + i)
+
+        for (unsigned width = 1, shift = 1; width < m; width <<= 1, ++shift)
+        {
+            // Calculate the stop offset, since the data to the right is all zeroes
+            const unsigned stop = ((last_count + width - 1) >> shift) << shift;
+
+            for (unsigned j = width; j < stop; j += (width << 1))
+            {
+                const ffe_t skew = FFTSkew[j + m + i - 1];
+
+                if (skew != kFieldModulus)
+                {
+                    for (unsigned k = j - width; k < j; ++k)
+                        ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes);
+                }
+                else
+                {
+                    for (unsigned k = j - width; k < j; ++k)
+                        xor_mem(temp[k + width], temp[k], buffer_bytes);
+                }
+            }
+        }
+
+        // work <- work XOR temp
+
+        // FIXME: Unroll last loop to eliminate this
+        for (unsigned j = 0; j < m; ++j)
+            xor_mem(work[j], temp[j], buffer_bytes);
+    }
+
+    // work <- FFT(work, m, 0)
+
+    for (unsigned width = (m >> 1); width > 0; width >>= 1)
+    {
+        const ffe_t* skewLUT = FFTSkew + width - 1;
+        const unsigned range = width << 1;
+
+        for (unsigned j = 0; j < m; j += range)
+        {
+            const ffe_t skew = skewLUT[j];
+
+            if (skew != kFieldModulus)
+            {
+                for (unsigned k = j, count = j + width; k < count; ++k)
+                    fft_butterfly(work[k], work[k + width], skew, buffer_bytes);
+            }
+            else
+            {
+                for (unsigned k = j, count = j + width; k < count; ++k)
+                    xor_mem(work[k + width], work[k], buffer_bytes);
+            }
+        }
+    }
 }
 
 
 //------------------------------------------------------------------------------
-// Entrypoint
+// Decode
 
-int main(int argc, char **argv)
+void Decode(
+    uint64_t buffer_bytes,
+    unsigned original_count,
+    unsigned recovery_count,
+    unsigned m, // NextPow2(recovery_count)
+    unsigned n, // NextPow2(m + original_count) = work_count
+    void* const * const original, // original_count entries
+    void* const * const recovery, // recovery_count entries
+    void** work) // n entries
 {
-    // Initialize architecture-specific code
-    leo_architecture_init();
+    // Fill in error locations
 
-    // Fill GFLog table and GFExp table
-    InitField();
+    ffe_t ErrorLocations[kOrder];
+    for (unsigned i = 0; i < recovery_count; ++i)
+        ErrorLocations[i] = recovery[i] ? 0 : 1;
+    for (unsigned i = recovery_count; i < m; ++i)
+        ErrorLocations[i] = 1;
+    for (unsigned i = 0; i < original_count; ++i)
+        ErrorLocations[i + m] = original[i] ? 0 : 1;
+    memset(ErrorLocations + m + original_count, 0, (n - original_count - m) * sizeof(ffe_t));
 
-    // Compute factors used in erasure decoder
-    InitFieldOperations();
+    // Evaluate error locator polynomial
 
-    unsigned seed = (unsigned)time(NULL);
-    for (;;)
+    FWHT(ErrorLocations, kBits);
+
+    for (unsigned i = 0; i < kOrder; ++i)
+        ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kFieldModulus;
+
+    FWHT(ErrorLocations, kBits);
+
+    // work <- recovery data
+
+    for (unsigned i = 0; i < recovery_count; ++i)
    {
-        // test(int k), k: message size
-        /*
-            EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc,
-            s.t. 
the number of recovery pieces is a power of two - */ - test(kFieldSize / 2, seed); + if (recovery[i]) + mul_mem_set(work[i], recovery[i], ErrorLocations[i], buffer_bytes); + else + memset(work[i], 0, buffer_bytes); + } + for (unsigned i = recovery_count; i < m; ++i) + memset(work[i], 0, buffer_bytes); - ++seed; + // work <- original data + + for (unsigned i = 0; i < original_count; ++i) + { + if (original[i]) + mul_mem_set(work[m + i], original[i], ErrorLocations[m + i], buffer_bytes); + else + memset(work[m + i], 0, buffer_bytes); + } + for (unsigned i = m + original_count; i < n; ++i) + memset(work[i], 0, buffer_bytes); + + // work <- IFFT(work, n, 0) + + for (unsigned width = 1; width < n; width <<= 1) + { + for (unsigned j = width; j < n; j += (width << 1)) + { + const ffe_t skew = FFTSkew[j - 1]; + + if (skew != kFieldModulus) + { + for (unsigned i = j - width; i < j; ++i) + ifft_butterfly(work[i], work[i + width], skew, buffer_bytes); + } + else + { + for (unsigned i = j - width; i < j; ++i) + xor_mem(work[i + width], work[i], buffer_bytes); + } + } } - return 0; + // work <- FormalDerivative(work, n) + + for (unsigned i = 1; i < n; ++i) + { + const unsigned width = ((i ^ (i - 1)) + 1) >> 1; + + // If a large number of values are being XORed: + for (unsigned j = i - width; j < i; ++j) + xor_mem(work[j], work[j + width], buffer_bytes); + } + + // work <- FFT(work, n, 0) truncated to m + original_count + + const unsigned output_count = m + original_count; + for (unsigned width = (n >> 1); width > 0; width >>= 1) + { + const ffe_t* skewLUT = FFTSkew + width - 1; + const unsigned range = width << 1; + + for (unsigned j = (m < range) ? 0 : m; j < output_count; j += range) + { + const ffe_t skew = skewLUT[j]; + + if (skew != kFieldModulus) + { + for (unsigned i = j; i < j + width; ++i) + fft_butterfly(work[i], work[i + width], skew, buffer_bytes); + } + else + { + for (unsigned i = j; i < j + width; ++i) + xor_mem(work[i + width], work[i], buffer_bytes); + } + } + } + + // Reveal erasures + + for (unsigned i = 0; i < original_count; ++i) + if (!original[i]) + mul_mem_set(work[i], work[i + m], kFieldModulus - ErrorLocations[i], buffer_bytes); } + + +//------------------------------------------------------------------------------ +// API + +static bool IsInitialized = false; + +bool Initialize() +{ + if (IsInitialized) + return true; + + if (!CpuHasSSSE3) + return false; + + InitializeLogarithmTables(); + FFTInitialize(); + + IsInitialized = true; + return true; +} + + +}} // namespace leopard::ff16 diff --git a/LeopardFF16.h b/LeopardFF16.h index 71d22e2..981b9a9 100644 --- a/LeopardFF16.h +++ b/LeopardFF16.h @@ -9,7 +9,7 @@ * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of LHC-RS nor the names of its contributors may be + * Neither the name of Leopard-RS nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. @@ -26,1195 +26,133 @@ POSSIBILITY OF SUCH DAMAGE. */ -#include -#include -#include -#include -#include +#pragma once +#include "LeopardCommon.h" /* - TODO: - + Write C API and unit tester - + Limit input to multiples of 64 bytes - + Replace GFSymbol with a file data pointer - + New 16-bit Muladd inner loops - + Class to contain the (large) muladd tables - + Preliminary benchmarks for large data! 
- + New 8-bit Muladd inner loops - + Benchmarks for smaller data! - + Refactor software - + Pick a name for the software better than LEO_RS - + I think it should be split up into several C++ modules - + Write detailed comments for all the routines - + Look into getting EncodeL working so we can support smaller data (Ask Lin) - + Look into using k instead of k2 to speed up decoder (Ask Lin) - + Avoid performing FFT/IFFT intermediate calculations we're not going to use - + Benchmarks, fun! - + Add multi-threading to split up long parallelizable calculations - + Final benchmarks! - + Finish up documentation - + Release version 1 + 16-bit Finite Field Math - - Muladd implementation notes: - - Specialize for 1-3 rows at a time since often times we're multiplying by - the same (skew) value repeatedly, as the ISA-L library does here: - - https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 - - Except we should be doing it for 16-bit Galois Field. - To implement that use the ALTMAP trick from Jerasure: - - http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 - - Except we should also support AVX2 since that is a 40% perf boost, so put - the high and low bytes 32 bytes instead of 16 bytes apart. - - Also I think we should go ahead and precompute the multiply tables since - it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. + This finite field contains 65536 elements and so each element is one byte. + This library is designed for data that is a multiple of 64 bytes in size. */ - -//------------------------------------------------------------------------------ -// Debug - -// Some bugs only repro in release mode, so this can be helpful -//#define LEO_DEBUG_IN_RELEASE - -#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) - #define LEO_DEBUG - #ifdef _WIN32 - #define LEO_DEBUG_BREAK __debugbreak() - #else - #define LEO_DEBUG_BREAK __builtin_trap() - #endif - #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } -#else - #define LEO_DEBUG_BREAK ; - #define LEO_DEBUG_ASSERT(cond) ; -#endif +namespace leopard { namespace ff16 { //------------------------------------------------------------------------------ -// Platform/Architecture +// Datatypes and Constants -#if defined(ANDROID) || defined(IOS) - #define LEO_TARGET_MOBILE -#endif // ANDROID +// Finite field element type +typedef uint16_t ffe_t; -#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) - #define LEO_TRY_AVX2 /* 256-bit */ - #include - #define LEO_ALIGN_BYTES 32 -#else // __AVX2__ - #define LEO_ALIGN_BYTES 16 -#endif // __AVX2__ +// Number of bits per element +static const unsigned kBits = 16; -#if !defined(LEO_TARGET_MOBILE) - // Note: MSVC currently only supports SSSE3 but not AVX2 - #include // SSSE3: _mm_shuffle_epi8 - #include // SSE2 -#endif // LEO_TARGET_MOBILE - -#if defined(HAVE_ARM_NEON_H) - #include -#endif // HAVE_ARM_NEON_H - -#if defined(LEO_TARGET_MOBILE) - - #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ - -# if defined(HAVE_ARM_NEON_H) - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 uint8x16_t - #define LEO_TRY_NEON -#else - #define LEO_M128 uint64_t -# endif - -#else // LEO_TARGET_MOBILE - - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 __m128i - -#endif // LEO_TARGET_MOBILE - -#ifdef LEO_TRY_AVX2 - // Compiler-specific 256-bit SIMD register keyword - #define LEO_M256 __m256i -#endif - -// Compiler-specific C++11 restrict 
keyword -#define LEO_RESTRICT __restrict - -// Compiler-specific force inline keyword -#ifdef _MSC_VER - #define LEO_FORCE_INLINE inline __forceinline -#else - #define LEO_FORCE_INLINE inline __attribute__((always_inline)) -#endif - -// Compiler-specific alignment keyword -// Note: Alignment only matters for ARM NEON where it should be 16 -#ifdef _MSC_VER - #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) -#else // _MSC_VER - #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) -#endif // _MSC_VER +// Finite field order: Number of elements in the field +static const unsigned kOrder = 65536; //------------------------------------------------------------------------------ -// Runtime CPU Architecture Check -// -// Feature checks stolen shamelessly from -// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c +// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus) -#if defined(HAVE_ANDROID_GETCPUFEATURES) - #include -#endif +// Transform for a variable number of bits (up to kOrder) +void FWHT(ffe_t* data, const unsigned bits); -#if defined(LEO_TRY_NEON) -# if defined(IOS) && defined(__ARM_NEON__) - // Requires iPhone 5S or newer - static const bool CpuHasNeon = true; - static const bool CpuHasNeon64 = true; -# else - // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures - static bool CpuHasNeon = false; // V6 / V7 - static bool CpuHasNeon64 = false; // 64-bit -# endif -#endif - - -#if !defined(LEO_TARGET_MOBILE) - -#ifdef _MSC_VER - #include // __cpuid - #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX -#endif - -#ifdef LEO_TRY_AVX2 -static bool CpuHasAVX2 = false; -#endif -static bool CpuHasSSSE3 = false; - -#define CPUID_EBX_AVX2 0x00000020 -#define CPUID_ECX_SSSE3 0x00000200 - -static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) -{ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) - __cpuid((int *) cpu_info, cpu_info_type); -#else //if defined(HAVE_CPUID) - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -# ifdef __i386__ - __asm__ __volatile__ ("pushfl; pushfl; " - "popl %0; " - "movl %0, %1; xorl %2, %0; " - "pushl %0; " - "popfl; pushfl; popl %0; popfl" : - "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : - "i" (0x200000)); - if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { - return; /* LCOV_EXCL_LINE */ - } -# endif -# ifdef __i386__ - __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# elif defined(__x86_64__) - __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# else - __asm__ __volatile__ ("cpuid" : - "=a" (cpu_info[0]), "=b" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# endif -#endif -} - -#endif // defined(LEO_TARGET_MOBILE) - - -static void leo_architecture_init() -{ -#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) - AndroidCpuFamily family = android_getCpuFamily(); - if (family == ANDROID_CPU_FAMILY_ARM) - { - if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) - CpuHasNeon = true; - } - else if (family == ANDROID_CPU_FAMILY_ARM64) - { - CpuHasNeon = true; - if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) - CpuHasNeon64 = true; - } -#endif - -#if 
!defined(LEO_TARGET_MOBILE) - unsigned int cpu_info[4]; - - _cpuid(cpu_info, 1); - CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); - -#if defined(LEO_TRY_AVX2) - _cpuid(cpu_info, 7); - CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); -#endif // LEO_TRY_AVX2 - -#endif // LEO_TARGET_MOBILE -} +// Transform specialized for the finite field order +void FWHT(ffe_t data[kOrder]); //------------------------------------------------------------------------------ -// SIMD-Safe Aligned Memory Allocations +// Multiplies -static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; +// x[] = y[] * m +void mul_mem_set( + void * LEO_RESTRICT x, const void * LEO_RESTRICT y, + ffe_t m, uint64_t bytes); -LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) -{ - return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); -} - -static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) -{ - uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); - if (!data) - return nullptr; - unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); - data += kAlignmentBytes - offset; - data[-1] = (uint8_t)offset; - return data; -} - -static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) -{ - if (!ptr) - return; - uint8_t* data = (uint8_t*)ptr; - unsigned offset = data[-1]; - if (offset >= kAlignmentBytes) - { - LEO_DEBUG_BREAK; // Should never happen - return; - } - data -= kAlignmentBytes - offset; - free(data); -} +// For i = {0, 1}: x_i[] *= m +void mul_mem2_inplace( + void * LEO_RESTRICT x_0, + void * LEO_RESTRICT x_1, + ffe_t m, uint64_t bytes); //------------------------------------------------------------------------------ -// Field +// FFT Operations -//#define LEO_SHORT_FIELD +// x[] ^= y[] * m, y[] ^= x[] +void fft_butterfly( + void * LEO_RESTRICT x, void * LEO_RESTRICT y, + ffe_t m, uint64_t bytes); -#ifdef LEO_SHORT_FIELD -typedef uint8_t GFSymbol; -static const unsigned kGFBits = 8; -static const unsigned kGFPolynomial = 0x11D; -GFSymbol kGFBasis[kGFBits] = { - 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis -}; -#else -typedef uint16_t GFSymbol; -static const unsigned kGFBits = 16; -static const unsigned kGFPolynomial = 0x1002D; -GFSymbol kGFBasis[kGFBits] = { - 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis - 0xC582, 0xED2E, 0x914C, 0x4012, - 0x6C98, 0x10D8, 0x6A72, 0xB900, - 0xFDB8, 0xFB34, 0xFF38, 0x991E -}; -#endif +// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] +void fft_butterfly2( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + ffe_t m, uint64_t bytes); -/* - Cantor Basis introduced by: - D. G. Cantor, "On arithmetical algorithms over finite fields", - Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. 
-*/ - -static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size -static const unsigned kFieldModulus = kFieldSize - 1; - -static GFSymbol GFLog[kFieldSize]; -static GFSymbol GFExp[kFieldSize]; - -// Initialize GFLog[], GFExp[] -static void InitField() -{ - unsigned state = 1; - for (unsigned i = 0; i < kFieldModulus; ++i) - { - GFExp[state] = static_cast(i); - state <<= 1; - if (state >= kFieldSize) - state ^= kGFPolynomial; - } - GFExp[0] = kFieldModulus; - - // Conversion to chosen basis: - - GFLog[0] = 0; - for (unsigned i = 0; i < kGFBits; ++i) - { - const GFSymbol basis = kGFBasis[i]; - const unsigned width = (unsigned)(1UL << i); - - for (unsigned j = 0; j < width; ++j) - GFLog[j + width] = GFLog[j] ^ basis; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - GFLog[i] = GFExp[GFLog[i]]; - - for (unsigned i = 0; i < kFieldSize; ++i) - GFExp[GFLog[i]] = i; - - GFExp[kFieldModulus] = GFExp[0]; -} +// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] +void fft_butterfly3( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, + ffe_t m, uint64_t bytes); //------------------------------------------------------------------------------ -// Mod Q Field Operations -// -// Q is the maximum symbol value, e.g. 255 or 65535. +// IFFT Operations -// z = x + y (mod Q) -static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b) -{ - const unsigned sum = (unsigned)a + b; +// y[] ^= x[], x[] ^= y[] * m +void ifft_butterfly( + void * LEO_RESTRICT x, void * LEO_RESTRICT y, + ffe_t m, uint64_t bytes); - // Partial reduction step, allowing for Q to be returned - return static_cast(sum + (sum >> kGFBits)); -} +// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m +void ifft_butterfly2( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + ffe_t m, uint64_t bytes); -// z = x - y (mod Q) -static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) -{ - const unsigned dif = (unsigned)a - b; - - // Partial reduction step, allowing for Q to be returned - return static_cast(dif + (dif >> kGFBits)); -} - -// vx[] += vy[] * z -static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) -{ - for (unsigned i = 0; i < symbolCount; ++i) - { - const GFSymbol a = vy[i]; - if (a == 0) - continue; - - GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); - GFSymbol value1 = GFExp[sum1]; - if ((a & 0x0f) == 0) - { - value1 = 0; - } - GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); - GFSymbol value2 = GFExp[sum2]; - if ((a & 0xf0) == 0) - { - value2 = 0; - } - GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); - GFSymbol value3 = GFExp[sum3]; - if ((a & 0x0f00) == 0) - { - value3 = 0; - } - GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); - GFSymbol value4 = GFExp[sum4]; - if ((a & 0xf000) == 0) - { - value4 = 0; - } - - vx[i] ^= value1; - vx[i] ^= value2; - vx[i] ^= value3; - vx[i] ^= value4; - } -} - -// return a*GFExp[b] over GF(2^r) -static GFSymbol mulE(GFSymbol a, GFSymbol b) -{ - if (a == 0) - return 0; - - const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); - return GFExp[sum]; -} +// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m +void ifft_butterfly3( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, + ffe_t m, uint64_t bytes); 
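// A reference sketch (not part of the public header) of the butterfly
// contracts declared above, written out for a single scalar symbol.
// FFEMultiplyLog(a, log_m) = a * Exp(log_m) is assumed visible here for
// illustration only; in this codebase it is a static helper inside
// LeopardFF16.cpp.
//
// static inline void fft_butterfly_ref(ffe_t& x, ffe_t& y, ffe_t log_m)
// {
//     x ^= FFEMultiplyLog(y, log_m); // x' = x + y * m
//     y ^= x;                        // y' = y + x'
// }
//
// static inline void ifft_butterfly_ref(ffe_t& x, ffe_t& y, ffe_t log_m)
// {
//     y ^= x;                        // y' + x' = y
//     x ^= FFEMultiplyLog(y, log_m); // x' + y * m = x
// }
//
// The pair is a round trip: applying ifft_butterfly_ref after
// fft_butterfly_ref restores the original (x, y), since adding y * m a
// second time cancels it over GF(2^16).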
//------------------------------------------------------------------------------ -// Fast Walsh-Hadamard Transform (FWHT) Mod Q -// -// Q is the maximum symbol value, e.g. 255 or 65535. +// Encode -// Define this to enable the optimized version of FWHT() -#define LEO_FWHT_OPTIMIZED - -typedef GFSymbol fwht_t; - -// {a, b} = {a + b, a - b} (Mod Q) -static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) -{ - const fwht_t sum = AddModQ(a, b); - const fwht_t dif = SubModQ(a, b); - a = sum; - b = dif; -} - -/* - FWHT is a minor slice of the runtime and does not grow with data size, - but I did attempt a few additional optimizations that failed: - - I've attempted to vectorize (with partial reductions) FWHT_4(data, s), - which is 70% of the algorithm, but it was slower. Left in _attic_. - - I've attempted to avoid reductions in all or parts of the FWHT. - The final modular reduction ends up being slower than the savings. - Specifically I tried doing it for the whole FWHT and also I tried - doing it just for the FWHT_2 loop in the main routine, but both - approaches are slower than partial reductions. - - Replacing word reads with wider reads does speed up the operation, but - at too high a complexity cost relative to minor perf improvement. -*/ - -#ifndef LEO_FWHT_OPTIMIZED - -// Reference implementation -static void FWHT(fwht_t* data, const unsigned bits) -{ - const unsigned size = (unsigned)(1UL << bits); - for (unsigned width = 1; width < size; width <<= 1) - for (unsigned i = 0; i < size; i += (width << 1)) - for (unsigned j = i; j < (width + i); ++j) - FWHT_2(data[j], data[j + width]); -} - -#else - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; -} - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) -{ - unsigned x = 0; - fwht_t t0 = data[x]; x += s; - fwht_t t1 = data[x]; x += s; - fwht_t t2 = data[x]; x += s; - fwht_t t3 = data[x]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - unsigned y = 0; - data[y] = t0; y += s; - data[y] = t1; y += s; - data[y] = t2; y += s; - data[y] = t3; -} - -static inline void FWHT_8(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; -} - -static inline void FWHT_16(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - fwht_t t8 = data[8]; - fwht_t t9 = data[9]; - fwht_t t10 = data[10]; - fwht_t t11 = data[11]; - fwht_t t12 = data[12]; - fwht_t t13 = data[13]; - fwht_t t14 = data[14]; - fwht_t t15 = data[15]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t8, t9); - FWHT_2(t10, t11); - FWHT_2(t12, t13); - FWHT_2(t14, t15); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - 
FWHT_2(t8, t10); - FWHT_2(t9, t11); - FWHT_2(t12, t14); - FWHT_2(t13, t15); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - FWHT_2(t8, t12); - FWHT_2(t9, t13); - FWHT_2(t10, t14); - FWHT_2(t11, t15); - FWHT_2(t0, t8); - FWHT_2(t1, t9); - FWHT_2(t2, t10); - FWHT_2(t3, t11); - FWHT_2(t4, t12); - FWHT_2(t5, t13); - FWHT_2(t6, t14); - FWHT_2(t7, t15); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; - data[8] = t8; - data[9] = t9; - data[10] = t10; - data[11] = t11; - data[12] = t12; - data[13] = t13; - data[14] = t14; - data[15] = t15; -} - -static void FWHT_SmallData(fwht_t* data, unsigned ldn) -{ - const unsigned n = (1UL << ldn); - - if (n <= 2) - { - if (n == 2) - FWHT_2(data[0], data[1]); - return; - } - - for (unsigned ldm = ldn; ldm > 3; ldm -= 2) - { - unsigned m = (1UL << ldm); - unsigned m4 = (m >> 2); - for (unsigned r = 0; r < n; r += m) - for (unsigned j = 0; j < m4; j++) - FWHT_4(data + j + r, m4); - } - - if (ldn & 1) - { - for (unsigned i0 = 0; i0 < n; i0 += 8) - FWHT_8(data + i0); - } - else - { - for (unsigned i0 = 0; i0 < n; i0 += 4) - FWHT_4(data + i0); - } -} - -// Decimation in time (DIT) version -static void FWHT(fwht_t* data, const unsigned ldn) -{ - if (ldn <= 13) - { - FWHT_SmallData(data, ldn); - return; - } - - FWHT_2(data[2], data[3]); - FWHT_4(data + 4); - FWHT_8(data + 8); - FWHT_16(data + 16); - for (unsigned ldm = 5; ldm < ldn; ++ldm) - FWHT(data + (unsigned)(1UL << ldm), ldm); - - for (unsigned ldm = 0; ldm < ldn; ++ldm) - { - const unsigned mh = (1UL << ldm); - for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) - FWHT_2(data[t1], data[t2]); - } -} - -#endif +void Encode( + uint64_t buffer_bytes, + unsigned original_count, + unsigned recovery_count, + unsigned m, // = NextPow2(recovery_count) * 2 = work_count + void* const * const data, + void** work); // Size of GetEncodeWorkCount() //------------------------------------------------------------------------------ -// Memory Buffer XOR +// Decode -static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) -{ - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); - -#if defined(LEO_TARGET_MOBILE) -# if defined(LEO_TRY_NEON) - // Handle multiples of 64 bytes - if (CpuHasNeon) - { - while (bytes >= 64) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 x1 = vld1q_u8(x16 + 1); - LEO_M128 x2 = vld1q_u8(x16 + 2); - LEO_M128 x3 = vld1q_u8(x16 + 3); - LEO_M128 y0 = vld1q_u8(y16); - LEO_M128 y1 = vld1q_u8(y16 + 1); - LEO_M128 y2 = vld1q_u8(y16 + 2); - LEO_M128 y3 = vld1q_u8(y16 + 3); - - vst1q_u8(x16, veorq_u8(x0, y0)); - vst1q_u8(x16 + 1, veorq_u8(x1, y1)); - vst1q_u8(x16 + 2, veorq_u8(x2, y2)); - vst1q_u8(x16 + 3, veorq_u8(x3, y3)); - - bytes -= 64, x16 += 4, y16 += 4; - } - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 y0 = vld1q_u8(y16); - - vst1q_u8(x16, veorq_u8(x0, y0)); - - bytes -= 16, ++x16, ++y16; - } - } - else -# endif // LEO_TRY_NEON - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); - - const unsigned count = (unsigned)bytes / 8; - for (unsigned ii = 0; ii < count; ++ii) - x8[ii] ^= y8[ii]; - - x16 = reinterpret_cast(x8 + count); - y16 = reinterpret_cast(y8 + count); - } -#else // LEO_TARGET_MOBILE -# if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) - { - LEO_M256 * LEO_RESTRICT x32 = 
reinterpret_cast(x16); - const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); - - while (bytes >= 128) - { - LEO_M256 x0 = _mm256_loadu_si256(x32); - LEO_M256 y0 = _mm256_loadu_si256(y32); - x0 = _mm256_xor_si256(x0, y0); - LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); - LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); - x1 = _mm256_xor_si256(x1, y1); - LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); - LEO_M256 y2 = _mm256_loadu_si256(y32 + 2); - x2 = _mm256_xor_si256(x2, y2); - LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); - LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); - x3 = _mm256_xor_si256(x3, y3); - - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - _mm256_storeu_si256(x32 + 2, x2); - _mm256_storeu_si256(x32 + 3, x3); - - bytes -= 128, x32 += 4, y32 += 4; - } - - // Handle multiples of 32 bytes - while (bytes >= 32) - { - // x[i] = x[i] xor y[i] - _mm256_storeu_si256(x32, - _mm256_xor_si256( - _mm256_loadu_si256(x32), - _mm256_loadu_si256(y32))); - - bytes -= 32, ++x32, ++y32; - } - - x16 = reinterpret_cast(x32); - y16 = reinterpret_cast(y32); - } - else -# endif // LEO_TRY_AVX2 - { - while (bytes >= 64) - { - LEO_M128 x0 = _mm_loadu_si128(x16); - LEO_M128 y0 = _mm_loadu_si128(y16); - x0 = _mm_xor_si128(x0, y0); - LEO_M128 x1 = _mm_loadu_si128(x16 + 1); - LEO_M128 y1 = _mm_loadu_si128(y16 + 1); - x1 = _mm_xor_si128(x1, y1); - LEO_M128 x2 = _mm_loadu_si128(x16 + 2); - LEO_M128 y2 = _mm_loadu_si128(y16 + 2); - x2 = _mm_xor_si128(x2, y2); - LEO_M128 x3 = _mm_loadu_si128(x16 + 3); - LEO_M128 y3 = _mm_loadu_si128(y16 + 3); - x3 = _mm_xor_si128(x3, y3); - - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); - - bytes -= 64, x16 += 4, y16 += 4; - } - } -#endif // LEO_TARGET_MOBILE - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - // x[i] = x[i] xor y[i] - _mm_storeu_si128(x16, - _mm_xor_si128( - _mm_loadu_si128(x16), - _mm_loadu_si128(y16))); - - bytes -= 16, ++x16, ++y16; - } - - uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); - const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); - - // Handle a block of 8 bytes - const unsigned eight = bytes & 8; - if (eight) - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); - *x8 ^= *y8; - } - - // Handle a block of 4 bytes - const unsigned four = bytes & 4; - if (four) - { - uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); - const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); - *x4 ^= *y4; - } - - // Handle final bytes - const unsigned offset = eight + four; - switch (bytes & 3) - { - case 3: x1[offset + 2] ^= y1[offset + 2]; - case 2: x1[offset + 1] ^= y1[offset + 1]; - case 1: x1[offset] ^= y1[offset]; - default: - break; - } -} +void Decode( + uint64_t buffer_bytes, + unsigned original_count, + unsigned recovery_count, + unsigned m, // = NextPow2(recovery_count) + unsigned n, // = NextPow2(m + original_count) = work_count + void* const * const original, // original_count entries + void* const * const recovery, // recovery_count entries + void** work); // n entries //------------------------------------------------------------------------------ -// Formal Derivative +// API -// Formal derivative of polynomial in the new basis -static void formal_derivative(GFSymbol* cos, const unsigned size) -{ - for (unsigned i = 1; i < size; ++i) - { - const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; +// Returns false if the self-test fails +bool Initialize(); - // If a large 
number of values are being XORed: - if (leng >= 8) - xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); - else - for (unsigned j = i - leng; j < i; j++) - cos[j] ^= cos[j + leng]; - } - for (unsigned i = size; i < kFieldSize; i <<= 1) - xor_mem(cos, cos + i, size * sizeof(GFSymbol)); -} - - -//------------------------------------------------------------------------------ -// Fast Fourier Transform - -static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT - -// IFFT in the proposed basis -static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - } - } -} - -// FFT in the proposed basis -static void FLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - } - } -} - - -//------------------------------------------------------------------------------ -// FFT Initialization - -static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative -static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial - -// Initialize skewVec[], B[], log_walsh[] -static void InitFieldOperations() -{ - GFSymbol temp[kGFBits - 1]; - - for (unsigned i = 1; i < kGFBits; ++i) - temp[i - 1] = (GFSymbol)((unsigned)1 << i); - - for (unsigned m = 0; m < (kGFBits - 1); ++m) - { - const unsigned step = (unsigned)1 << (m + 1); - - skewVec[((unsigned)1 << m) - 1] = 0; - - for (unsigned i = m; i < (kGFBits - 1); ++i) - { - const unsigned s = ((unsigned)1 << (i + 1)); - - for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) - skewVec[j + s] = skewVec[j] ^ temp[i]; - } - - temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; - - for (unsigned i = m + 1; i < (kGFBits - 1); ++i) - temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); - } - - for (unsigned i = 0; i < kFieldSize; ++i) - skewVec[i] = GFLog[skewVec[i]]; - - temp[0] = kFieldModulus - temp[0]; - - for (unsigned i = 1; i < (kGFBits - 1); ++i) - temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; - - B[0] = 0; - for (unsigned i = 0; i < (kGFBits - 1); ++i) - { - const unsigned depart = ((unsigned)1 << i); - - for (unsigned j = 0; j < depart; ++j) - B[j + depart] = (B[j] + temp[i]) % kFieldModulus; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh[i] = GFLog[i]; - - log_walsh[0] = 0; - - FWHT(log_walsh, kGFBits); -} - - -//------------------------------------------------------------------------------ -// Encoder - -// Encoding alg for k/n<0.5: message is a power of two -static void 
encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) -{ - memcpy(codeword, data, sizeof(GFSymbol) * k); - - IFLT(codeword, k, 0); - - for (unsigned i = k; i < kFieldSize; i += k) - { - memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); - - FLT(&codeword[i], k, i); - } - - memcpy(codeword, data, sizeof(GFSymbol) * k); -} - -// Encoding alg for k/n>0.5: parity is a power of two. -// data: message array. parity: parity array. mem: buffer(size>= n-k) -static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) -{ - const unsigned t = kFieldSize - k; - - memset(parity, 0, sizeof(GFSymbol) * t); - - for (unsigned i = t; i < kFieldSize; i += t) - { - memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); - - IFLT(mem, t, i); - - xor_mem(parity, mem, t * sizeof(GFSymbol)); - } - - FLT(parity, t, 0); -} - - -//------------------------------------------------------------------------------ -// Decoder - -static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) -{ - fwht_t log_walsh2[kFieldSize]; - - // Compute the evaluations of the error locator polynomial - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = erasure[i] ? 1 : 0; - - FWHT(log_walsh2, kGFBits); - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; - - FWHT(log_walsh2, kGFBits); - - // k2 can be replaced with k - const unsigned k2 = kFieldSize; - //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? - - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) - { - codeword[i] = 0; - } - else - { - codeword[i] = mulE(codeword[i], log_walsh2[i]); - } - } - - IFLT(codeword, kFieldSize, 0); - - // formal derivative - for (unsigned i = 0; i < kFieldSize; i += 2) - { - codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); - } - - formal_derivative(codeword, k2); - - for (unsigned i = 0; i < k2; i += 2) - { - codeword[i] = mulE(codeword[i], B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); - } - - FLT(codeword, k2, 0); - - for (unsigned i = 0; i < k2; ++i) - { - if (erasure[i]) - { - codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); - } - } -} - - -//------------------------------------------------------------------------------ -// Test Application - -void test(unsigned k, unsigned seed) -{ - srand(seed); - - //-----------Generating message---------- - - // Message array - GFSymbol data[kFieldSize] = {0}; - - // Filled with random numbers - for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) - data[i] = (GFSymbol)rand(); - - - //---------encoding---------- - - GFSymbol codeword[kFieldSize]; - encodeH(&data[kFieldSize - k], k, data, codeword); - //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? 
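// A note on the encodeH() call above (a reading of the code, not part of the
// original): the k message symbols occupy the top of data[], the t = kFieldSize - k
// parity symbols are written into the bottom of data[], and codeword[] is only
// scratch at this point. Because the IFFT is linear, encodeH() XOR-accumulates
// IFLT(block, offset) over the message taken t symbols at a time, then applies
// a single FLT to the sum to produce the parity.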
- - memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); - - - //--------erasure simulation--------- - - // Array indicating erasures - bool erasure[kFieldSize] = { - false - }; - - for (unsigned i = k; i < kFieldSize; ++i) - erasure[i] = true; - - // permuting the erasure array - for (unsigned i = kFieldSize - 1; i > 0; --i) - { - unsigned pos = rand() % (i + 1); - - if (i != pos) - { - bool tmp = erasure[i]; - erasure[i] = erasure[pos]; - erasure[pos] = tmp; - } - } - - // erasure codeword symbols - for (unsigned i = 0; i < kFieldSize; ++i) - if (erasure[i]) - codeword[i] = 0; - - - //---------main processing---------- - decode(codeword, k, erasure); - - // Check the correctness of the result - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i] == 1) - { - if (data[i] != codeword[i]) - { - printf("Decoding Error with seed = %d!\n", seed); - LEO_DEBUG_BREAK; - return; - } - } - } - - //printf("Decoding is successful!\n"); -} - - -//------------------------------------------------------------------------------ -// Entrypoint - -int main(int argc, char **argv) -{ - // Initialize architecture-specific code - leo_architecture_init(); - - // Fill GFLog table and GFExp table - InitField(); - - // Compute factors used in erasure decoder - InitFieldOperations(); - - unsigned seed = (unsigned)time(NULL); - for (;;) - { - // test(int k), k: message size - /* - EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, - s.t. the number of recovery pieces is a power of two - */ - test(kFieldSize / 2, seed); - - ++seed; - } - - return 0; -} +}} // namespace leopard::ff16 diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp index 030a555..1e7d7cd 100644 --- a/LeopardFF8.cpp +++ b/LeopardFF8.cpp @@ -9,7 +9,7 @@ * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of LHC-RS nor the names of its contributors may be + * Neither the name of Leopard-RS nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
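The hunks below reorder the multiply machinery and add a log-argument variant of the field multiply. The two forms agree whenever the second operand is first converted to log form, which makes for a cheap table self-check. A minimal sketch (hypothetical helper that would live inside LeopardFF8.cpp, assuming LogLUT[]/ExpLUT[] have already been initialized):

static bool FFEMultiplySelfCheck()
{
    // Skip zero operands: multiply by zero is special-cased, and LogLUT[0]
    // is not a meaningful logarithm
    for (unsigned a = 1; a < kOrder; ++a)
        for (unsigned b = 1; b < kOrder; ++b)
            if (FFEMultiply((ffe_t)a, (ffe_t)b) != FFEMultiplyLog((ffe_t)a, LogLUT[b]))
                return false; // tables or basis conversion disagree
    return true;
}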
@@ -27,6 +27,10 @@ */ #include "LeopardFF8.h" +#include + +// Define this to enable the optimized version of FWHT() +#define LEO_FF8_FWHT_OPTIMIZED namespace leopard { namespace ff8 { @@ -34,6 +38,9 @@ namespace leopard { namespace ff8 { //------------------------------------------------------------------------------ // Datatypes and Constants +// Modulus for field operations +static const ffe_t kModulus = 255; + // LFSR Polynomial that generates the field elements static const unsigned kPolynomial = 0x11D; @@ -47,9 +54,6 @@ static const ffe_t kBasis[kBits] = { //------------------------------------------------------------------------------ // Field Operations -// Modulus for field operations -static const ffe_t kModulus = 255; - // z = x + y (mod kModulus) static inline ffe_t AddMod(const ffe_t a, const ffe_t b) { @@ -69,50 +73,6 @@ static inline ffe_t SubMod(const ffe_t a, const ffe_t b) } -//------------------------------------------------------------------------------ -// Logarithm Tables - -static ffe_t LogLUT[kOrder]; -static ffe_t ExpLUT[kOrder]; - - -// Initialize LogLUT[], ExpLUT[] -static void InitializeLogarithmTables() -{ - // LFSR table generation: - - unsigned state = 1; - for (unsigned i = 0; i < kModulus; ++i) - { - ExpLUT[state] = static_cast(i); - state <<= 1; - if (state >= kOrder) - state ^= kPolynomial; - } - ExpLUT[0] = kModulus; - - // Conversion to chosen basis: - - LogLUT[0] = 0; - for (unsigned i = 0; i < kBits; ++i) - { - const ffe_t basis = kBasis[i]; - const unsigned width = static_cast(1UL << i); - - for (unsigned j = 0; j < width; ++j) - LogLUT[j + width] = LogLUT[j] ^ basis; - } - - for (unsigned i = 0; i < kOrder; ++i) - LogLUT[i] = ExpLUT[LogLUT[i]]; - - for (unsigned i = 0; i < kOrder; ++i) - ExpLUT[LogLUT[i]] = i; - - ExpLUT[kModulus] = ExpLUT[0]; -} - - //------------------------------------------------------------------------------ // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus) @@ -248,234 +208,47 @@ void FWHT(ffe_t data[kOrder]) //------------------------------------------------------------------------------ -// XOR Memory +// Logarithm Tables -void xor_mem( - void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, - unsigned bytes) +static ffe_t LogLUT[kOrder]; +static ffe_t ExpLUT[kOrder]; + + +// Initialize LogLUT[], ExpLUT[] +static void InitializeLogarithmTables() { -#if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) - { - LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(vx); - const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(vy); - do - { - const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32)); - const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1)); - const LEO_M256 x2 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 2), _mm256_loadu_si256(y32 + 2)); - const LEO_M256 x3 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 3), _mm256_loadu_si256(y32 + 3)); - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - _mm256_storeu_si256(x32 + 2, x2); - _mm256_storeu_si256(x32 + 3, x3); - bytes -= 128, x32 += 4, y32 += 4; - } while (bytes >= 128); - if (bytes > 0) - { - const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32)); - const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1)); - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - } - return; - } -#endif // LEO_TRY_AVX2 - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); - do - 
{ - const LEO_M128 x0 = _mm_xor_si128(_mm_loadu_si128(x16), _mm_loadu_si128(y16)); - const LEO_M128 x1 = _mm_xor_si128(_mm_loadu_si128(x16 + 1), _mm_loadu_si128(y16 + 1)); - const LEO_M128 x2 = _mm_xor_si128(_mm_loadu_si128(x16 + 2), _mm_loadu_si128(y16 + 2)); - const LEO_M128 x3 = _mm_xor_si128(_mm_loadu_si128(x16 + 3), _mm_loadu_si128(y16 + 3)); - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); - bytes -= 64, x16 += 4, y16 += 4; - } while (bytes > 0); -} + // LFSR table generation: -void xor_mem2( - void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0, - void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1, - unsigned bytes) -{ -#if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) + unsigned state = 1; + for (unsigned i = 0; i < kModulus; ++i) { - LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast (vx_0); - const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast(vy_0); - LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast (vx_1); - const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast(vy_1); - do - { - const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); - const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); - const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2)); - const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3)); - const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); - const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); - const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2)); - const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3)); - _mm256_storeu_si256(x32_0, x0_0); - _mm256_storeu_si256(x32_0 + 1, x1_0); - _mm256_storeu_si256(x32_0 + 2, x2_0); - _mm256_storeu_si256(x32_0 + 3, x3_0); - _mm256_storeu_si256(x32_1, x0_1); - _mm256_storeu_si256(x32_1 + 1, x1_1); - _mm256_storeu_si256(x32_1 + 2, x2_1); - _mm256_storeu_si256(x32_1 + 3, x3_1); - x32_0 += 4, y32_0 += 4; - x32_1 += 4, y32_1 += 4; - bytes -= 128; - } while (bytes >= 128); - if (bytes > 0) - { - const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); - const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); - const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); - const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); - _mm256_storeu_si256(x32_0, x0_0); - _mm256_storeu_si256(x32_0 + 1, x1_0); - _mm256_storeu_si256(x32_1, x0_1); - _mm256_storeu_si256(x32_1 + 1, x1_1); - } - return; + ExpLUT[state] = static_cast(i); + state <<= 1; + if (state >= kOrder) + state ^= kPolynomial; } -#endif // LEO_TRY_AVX2 - LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast (vx_0); - const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast(vy_0); - LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast (vx_1); - const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast(vy_1); - do - { - const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0)); - const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1)); - const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2)); - 
const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3)); - const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1)); - const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1)); - const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2)); - const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3)); - _mm_storeu_si128(x16_0, x0_0); - _mm_storeu_si128(x16_0 + 1, x1_0); - _mm_storeu_si128(x16_0 + 2, x2_0); - _mm_storeu_si128(x16_0 + 3, x3_0); - _mm_storeu_si128(x16_1, x0_1); - _mm_storeu_si128(x16_1 + 1, x1_1); - _mm_storeu_si128(x16_1 + 2, x2_1); - _mm_storeu_si128(x16_1 + 3, x3_1); - x16_0 += 4, y16_0 += 4; - x16_1 += 4, y16_1 += 4; - bytes -= 64; - } while (bytes > 0); -} + ExpLUT[0] = kModulus; -void xor_mem3( - void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0, - void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1, - void * LEO_RESTRICT vx_2, const void * LEO_RESTRICT vy_2, - unsigned bytes) -{ -#if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) + // Conversion to chosen basis: + + LogLUT[0] = 0; + for (unsigned i = 0; i < kBits; ++i) { - LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast (vx_0); - const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast(vy_0); - LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast (vx_1); - const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast(vy_1); - LEO_M256 * LEO_RESTRICT x32_2 = reinterpret_cast (vx_2); - const LEO_M256 * LEO_RESTRICT y32_2 = reinterpret_cast(vy_2); - do - { - const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); - const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); - const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2)); - const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3)); - const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); - const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); - const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2)); - const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3)); - const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2)); - const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1)); - const LEO_M256 x2_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 2), _mm256_loadu_si256(y32_2 + 2)); - const LEO_M256 x3_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 3), _mm256_loadu_si256(y32_2 + 3)); - _mm256_storeu_si256(x32_0, x0_0); - _mm256_storeu_si256(x32_0 + 1, x1_0); - _mm256_storeu_si256(x32_0 + 2, x2_0); - _mm256_storeu_si256(x32_0 + 3, x3_0); - _mm256_storeu_si256(x32_1, x0_1); - _mm256_storeu_si256(x32_1 + 1, x1_1); - _mm256_storeu_si256(x32_1 + 2, x2_1); - _mm256_storeu_si256(x32_1 + 3, x3_1); - _mm256_storeu_si256(x32_2, x0_2); - _mm256_storeu_si256(x32_2 + 1, x1_2); - _mm256_storeu_si256(x32_2 + 2, x2_2); - _mm256_storeu_si256(x32_2 + 3, x3_2); - x32_0 += 4, y32_0 += 4; - x32_1 += 4, y32_1 += 4; - x32_2 += 4, y32_2 += 4; - bytes -= 128; - } while (bytes >= 128); - if (bytes > 0) - { - const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); - const LEO_M256 x1_0 
= _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); - const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); - const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); - const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2)); - const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1)); - _mm256_storeu_si256(x32_0, x0_0); - _mm256_storeu_si256(x32_0 + 1, x1_0); - _mm256_storeu_si256(x32_1, x0_1); - _mm256_storeu_si256(x32_1 + 1, x1_1); - _mm256_storeu_si256(x32_2, x0_2); - _mm256_storeu_si256(x32_2 + 1, x1_2); - } - return; + const ffe_t basis = kBasis[i]; + const unsigned width = static_cast(1UL << i); + + for (unsigned j = 0; j < width; ++j) + LogLUT[j + width] = LogLUT[j] ^ basis; } -#endif // LEO_TRY_AVX2 - LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast (vx_0); - const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast(vy_0); - LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast (vx_1); - const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast(vy_1); - LEO_M128 * LEO_RESTRICT x16_2 = reinterpret_cast (vx_2); - const LEO_M128 * LEO_RESTRICT y16_2 = reinterpret_cast(vy_2); - do - { - const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0)); - const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1)); - const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2)); - const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3)); - const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1)); - const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1)); - const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2)); - const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3)); - const LEO_M128 x0_2 = _mm_xor_si128(_mm_loadu_si128(x16_2), _mm_loadu_si128(y16_2)); - const LEO_M128 x1_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 1), _mm_loadu_si128(y16_2 + 1)); - const LEO_M128 x2_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 2), _mm_loadu_si128(y16_2 + 2)); - const LEO_M128 x3_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 3), _mm_loadu_si128(y16_2 + 3)); - _mm_storeu_si128(x16_0, x0_0); - _mm_storeu_si128(x16_0 + 1, x1_0); - _mm_storeu_si128(x16_0 + 2, x2_0); - _mm_storeu_si128(x16_0 + 3, x3_0); - _mm_storeu_si128(x16_1, x0_1); - _mm_storeu_si128(x16_1 + 1, x1_1); - _mm_storeu_si128(x16_1 + 2, x2_1); - _mm_storeu_si128(x16_1 + 3, x3_1); - _mm_storeu_si128(x16_2, x0_2); - _mm_storeu_si128(x16_2 + 1, x1_2); - _mm_storeu_si128(x16_2 + 2, x2_2); - _mm_storeu_si128(x16_2 + 3, x3_2); - x16_0 += 4, y16_0 += 4; - x16_1 += 4, y16_1 += 4; - x16_2 += 4, y16_2 += 4; - bytes -= 64; - } while (bytes > 0); -} + for (unsigned i = 0; i < kOrder; ++i) + LogLUT[i] = ExpLUT[LogLUT[i]]; + + for (unsigned i = 0; i < kOrder; ++i) + ExpLUT[LogLUT[i]] = i; + + ExpLUT[kModulus] = ExpLUT[0]; +} //------------------------------------------------------------------------------ // Multiplies @@ -485,12 +258,12 @@ void xor_mem3( struct { LEO_ALIGNED LEO_M128 Lo[256]; LEO_ALIGNED LEO_M128 Hi[256]; -} Multiply128LUT; +} static Multiply128LUT; #if defined(LEO_TRY_AVX2) struct { LEO_ALIGNED LEO_M256 Lo[256]; LEO_ALIGNED LEO_M256 Hi[256]; -} Multiply256LUT; +} static Multiply256LUT; #endif // LEO_TRY_AVX2 // Returns 
a * b @@ -501,14 +274,19 @@ static ffe_t FFEMultiply(ffe_t a, ffe_t b) { return ExpLUT[AddMod(LogLUT[a], LogLUT[b])]; } +// Returns a * Log(b) +static ffe_t FFEMultiplyLog(ffe_t a, ffe_t log_b) +{ + if (a == 0) + return 0; + return ExpLUT[AddMod(LogLUT[a], log_b)]; +} + bool InitializeMultiplyTables() { - // Reuse aligned self test buffers to load table data - uint8_t* lo = m_SelfTestBuffers.A; - uint8_t* hi = m_SelfTestBuffers.B; - for (int y = 0; y < 256; ++y) { + uint8_t lo[16], hi[16]; for (unsigned char x = 0; x < 16; ++x) { lo[x] = FFEMultiply(x, static_cast<ffe_t>(y)); @@ -517,15 +295,17 @@ bool InitializeMultiplyTables() const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo); const LEO_M128 table_hi = _mm_loadu_si128((LEO_M128*)hi); + _mm_storeu_si128(Multiply128LUT.Lo + y, table_lo); _mm_storeu_si128(Multiply128LUT.Hi + y, table_hi); + #if defined(LEO_TRY_AVX2) if (CpuHasAVX2) { - const LEO_M256 table_lo2 = _mm256_broadcastsi128_si256(table_lo); - const LEO_M256 table_hi2 = _mm256_broadcastsi128_si256(table_hi); - _mm256_storeu_si256(Multiply256LUT.Lo + y, table_lo2); - _mm256_storeu_si256(Multiply256LUT.Hi + y, table_hi2); + _mm256_storeu_si256(Multiply256LUT.Lo + y, + _mm256_broadcastsi128_si256(table_lo)); + _mm256_storeu_si256(Multiply256LUT.Hi + y, + _mm256_broadcastsi128_si256(table_hi)); } #endif // LEO_TRY_AVX2 } @@ -536,7 +316,7 @@ bool InitializeMultiplyTables() // vx[] = vy[] * m void mul_mem_set( void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { if (m <= 1) { @@ -633,7 +413,7 @@ void mul_mem_set( void mul_mem2_inplace( void * LEO_RESTRICT vx_0, void * LEO_RESTRICT vx_1, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { if (m <= 1) { @@ -759,28 +539,28 @@ void mul_mem2_inplace( // FFT Operations // x[] ^= y[] * m, y[] ^= x[] -void mul_fft( +void fft_butterfly( void * LEO_RESTRICT x, void * LEO_RESTRICT y, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { } // For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] -void mul_fft2( +void fft_butterfly2( void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { } // For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] -void mul_fft3( +void fft_butterfly3( void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { } @@ -790,33 +570,348 @@ void mul_fft3( // IFFT Operations // y[] ^= x[], x[] ^= y[] * m -void mul_ifft( +void ifft_butterfly( void * LEO_RESTRICT x, void * LEO_RESTRICT y, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { } // For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m -void mul_ifft2( +void ifft_butterfly2( void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { } // For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m -void mul_ifft3( +void ifft_butterfly3( void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { } +//------------------------------------------------------------------------------ +// FFT + +static ffe_t FFTSkew[kFieldModulus]; // twisted factors used in FFT +static ffe_t LogWalsh[kOrder]; // factors used in the evaluation of the error locator polynomial
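The fft_butterfly*() and ifft_butterfly*() routines above are stubs at this stage; their header comments fix the intended semantics. Per symbol they reduce to the two-line kernels of the scalar prototype in tests/experiments.cpp, shown here as a sketch (the _ref names are placeholders, and m is a log-domain skew as stored in FFTSkew[]):

// FFT butterfly on one symbol pair: x ^= y * exp(m), then y ^= x
static LEO_FORCE_INLINE void fft_butterfly_ref(ffe_t& x, ffe_t& y, ffe_t m)
{
    x ^= FFEMultiplyLog(y, m);
    y ^= x;
}

// IFFT butterfly on one symbol pair: y ^= x first, then x ^= y * exp(m)
static LEO_FORCE_INLINE void ifft_butterfly_ref(ffe_t& x, ffe_t& y, ffe_t m)
{
    y ^= x;
    x ^= FFEMultiplyLog(y, m);
}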
+ +void FFTInitialize() +{ + ffe_t temp[kBits - 1]; + + for (unsigned i = 1; i < kBits; ++i) + temp[i - 1] = (ffe_t)((unsigned)1 << i); + + for (unsigned m = 0; m < (kBits - 1); ++m) + { + const unsigned step = (unsigned)1 << (m + 1); + + FFTSkew[((unsigned)1 << m) - 1] = 0; + + for (unsigned i = m; i < (kBits - 1); ++i) + { + const unsigned s = ((unsigned)1 << (i + 1)); + + for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) + FFTSkew[j + s] = FFTSkew[j] ^ temp[i]; + } + + // TBD: This can be cleaned up + temp[m] = kFieldModulus - LogLUT[FFEMultiply(temp[m], temp[m] ^ 1)]; + + for (unsigned i = m + 1; i < (kBits - 1); ++i) + temp[i] = FFEMultiplyLog(temp[i], (LogLUT[temp[i] ^ 1] + temp[m]) % kFieldModulus); + } + + for (unsigned i = 0; i < kOrder; ++i) + FFTSkew[i] = LogLUT[FFTSkew[i]]; + + // Precalculate FWHT(Log[i]): + + for (unsigned i = 0; i < kOrder; ++i) + LogWalsh[i] = LogLUT[i]; + LogWalsh[0] = 0; + FWHT(LogWalsh, kBits); +} + + +//------------------------------------------------------------------------------ +// Encode + +void Encode( + uint64_t buffer_bytes, + unsigned original_count, + unsigned recovery_count, + unsigned m, + void* const * const data, + void** work) +{ + // work <- data + + // FIXME: Unroll first loop to eliminate this + for (unsigned i = 0; i < m; ++i) + memcpy(work[i], data[i], buffer_bytes); + + // work <- IFFT(data, m, m) + + for (unsigned width = 1; width < m; width <<= 1) + { + for (unsigned j = width; j < m; j += (width << 1)) + { + const ffe_t skew = FFTSkew[j + m - 1]; + + if (skew != kFieldModulus) + { + for (unsigned i = j - width; i < j; ++i) + ifft_butterfly(work[i], work[i + width], skew, buffer_bytes); + } + else + { + for (unsigned i = j - width; i < j; ++i) + xor_mem(work[i + width], work[i], buffer_bytes); + } + } + } + + for (unsigned i = m; i + m <= original_count; i += m) + { + // temp <- data + i + + void** temp = work + m; + + // FIXME: Unroll first loop to eliminate this + for (unsigned j = 0; j < m; ++j) + memcpy(temp[j], data[i + j], buffer_bytes); + + // temp <- IFFT(temp, m, m + i) + + for (unsigned width = 1; width < m; width <<= 1) + { + for (unsigned j = width; j < m; j += (width << 1)) + { + const ffe_t skew = FFTSkew[j + m + i - 1]; + + if (skew != kFieldModulus) + { + for (unsigned k = j - width; k < j; ++k) + ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes); + } + else + { + for (unsigned k = j - width; k < j; ++k) + xor_mem(temp[k + width], temp[k], buffer_bytes); + } + } + } + + // work <- work XOR temp + + // FIXME: Unroll last loop to eliminate this + for (unsigned j = 0; j < m; ++j) + xor_mem(work[j], temp[j], buffer_bytes); + } + + const unsigned last_count = original_count % m; + if (last_count != 0) + { + const unsigned i = original_count - last_count; + + // temp <- data + i + + void** temp = work + m; + + for (unsigned j = 0; j < last_count; ++j) + memcpy(temp[j], data[i + j], buffer_bytes); + for (unsigned j = last_count; j < m; ++j) + memset(temp[j], 0, buffer_bytes); + + // temp <- IFFT(temp, m, m + i) + + for (unsigned width = 1, shift = 1; width < m; width <<= 1, ++shift) + { + // Calculate stop considering that the right is all zeroes + const unsigned stop = ((last_count + width - 1) >> shift) << shift; + + for (unsigned j = width; j < stop; j += (width << 1)) + { + const ffe_t skew = FFTSkew[j + m + i - 1]; + + if (skew != kFieldModulus) + { + for (unsigned k = j - width; k < j; ++k) + ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes); + } + else + { + for (unsigned k 
= j - width; k < j; ++k) + xor_mem(temp[k + width], temp[k], buffer_bytes); + } + } + } + + // work <- work XOR temp + + // FIXME: Unroll last loop to eliminate this + for (unsigned j = 0; j < m; ++j) + xor_mem(work[j], temp[j], buffer_bytes); + } + + // work <- FFT(work, m, 0) + + for (unsigned width = (m >> 1); width > 0; width >>= 1) + { + const ffe_t* skewLUT = FFTSkew + width - 1; + const unsigned range = width << 1; + + for (unsigned j = 0; j < m; j += range) + { + const ffe_t skew = skewLUT[j]; + + if (skew != kFieldModulus) + { + for (unsigned k = j, count = j + width; k < count; ++k) + fft_butterfly(work[k], work[k + width], skew, buffer_bytes); + } + else + { + for (unsigned k = j, count = j + width; k < count; ++k) + xor_mem(work[k + width], work[k], buffer_bytes); + } + } + } +} + + +//------------------------------------------------------------------------------ +// Decode + +void Decode( + uint64_t buffer_bytes, + unsigned original_count, + unsigned recovery_count, + unsigned m, // NextPow2(recovery_count) + unsigned n, // NextPow2(m + original_count) = work_count + void* const * const original, // original_count entries + void* const * const recovery, // recovery_count entries + void** work) // n entries +{ + // Fill in error locations + + ffe_t ErrorLocations[kOrder]; + for (unsigned i = 0; i < recovery_count; ++i) + ErrorLocations[i] = recovery[i] ? 0 : 1; + for (unsigned i = recovery_count; i < m; ++i) + ErrorLocations[i] = 1; + for (unsigned i = 0; i < original_count; ++i) + ErrorLocations[i + m] = original[i] ? 0 : 1; + // Zero through kOrder (not just n) so the FWHT below reads initialized data + memset(ErrorLocations + m + original_count, 0, (kOrder - original_count - m) * sizeof(ffe_t)); + + // Evaluate error locator polynomial + + FWHT(ErrorLocations, kBits); + + for (unsigned i = 0; i < kOrder; ++i) + ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kFieldModulus; + + FWHT(ErrorLocations, kBits); + + // work <- recovery data + + for (unsigned i = 0; i < recovery_count; ++i) + { + if (recovery[i]) + mul_mem_set(work[i], recovery[i], ErrorLocations[i], buffer_bytes); + else + memset(work[i], 0, buffer_bytes); + } + for (unsigned i = recovery_count; i < m; ++i) + memset(work[i], 0, buffer_bytes); + + // work <- original data + + for (unsigned i = 0; i < original_count; ++i) + { + if (original[i]) + mul_mem_set(work[m + i], original[i], ErrorLocations[m + i], buffer_bytes); + else + memset(work[m + i], 0, buffer_bytes); + } + for (unsigned i = m + original_count; i < n; ++i) + memset(work[i], 0, buffer_bytes); + + // work <- IFFT(work, n, 0) + + for (unsigned width = 1; width < n; width <<= 1) + { + for (unsigned j = width; j < n; j += (width << 1)) + { + const ffe_t skew = FFTSkew[j - 1]; + + if (skew != kFieldModulus) + { + for (unsigned i = j - width; i < j; ++i) + ifft_butterfly(work[i], work[i + width], skew, buffer_bytes); + } + else + { + for (unsigned i = j - width; i < j; ++i) + xor_mem(work[i + width], work[i], buffer_bytes); + } + } + } + + // work <- FormalDerivative(work, n) + + for (unsigned i = 1; i < n; ++i) + { + const unsigned width = ((i ^ (i - 1)) + 1) >> 1; + + // These spans can be wide, so xor_mem() is used unconditionally + for (unsigned j = i - width; j < i; ++j) + xor_mem(work[j], work[j + width], buffer_bytes); + } + + // work <- FFT(work, n, 0) truncated to m + original_count + + const unsigned output_count = m + original_count; + for (unsigned width = (n >> 1); width > 0; width >>= 1) + { + const ffe_t* skewLUT = FFTSkew + width - 1; + const unsigned range = width << 1; + + for (unsigned j = (m < range) ? 
0 : m; j < output_count; j += range) + { + const ffe_t skew = skewLUT[j]; + + if (skew != kFieldModulus) + { + for (unsigned i = j; i < j + width; ++i) + fft_butterfly(work[i], work[i + width], skew, buffer_bytes); + } + else + { + for (unsigned i = j; i < j + width; ++i) + xor_mem(work[i + width], work[i], buffer_bytes); + } + } + } + + // Reveal erasures + + for (unsigned i = 0; i < original_count; ++i) + if (!original[i]) + mul_mem_set(work[i], work[i + m], kFieldModulus - ErrorLocations[i], buffer_bytes); +} + + //------------------------------------------------------------------------------ // API @@ -831,6 +926,7 @@ bool Initialize() return false; InitializeLogarithmTables(); + FFTInitialize(); IsInitialized = true; return true; diff --git a/LeopardFF8.h b/LeopardFF8.h index 1ef933b..88efa3f 100644 --- a/LeopardFF8.h +++ b/LeopardFF8.h @@ -56,9 +56,6 @@ static const unsigned kOrder = 256; //------------------------------------------------------------------------------ // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus) -// Define this to enable the optimized version of FWHT() -#define LEO_FF8_FWHT_OPTIMIZED - // Transform for a variable number of bits (up to kOrder) void FWHT(ffe_t* data, const unsigned bits); @@ -66,85 +63,89 @@ void FWHT(ffe_t* data, const unsigned bits); void FWHT(ffe_t data[kOrder]); -//------------------------------------------------------------------------------ -// XOR Memory - -// x[] ^= y[] -void xor_mem( - void * LEO_RESTRICT x, const void * LEO_RESTRICT y, - unsigned bytes); - -// For i = {0, 1}: x_i[] ^= x_i[] -void xor_mem2( - void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0, - void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1, - unsigned bytes); - -// For i = {0, 1, 2}: x_i[] ^= x_i[] -void xor_mem3( - void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0, - void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1, - void * LEO_RESTRICT x_2, const void * LEO_RESTRICT y_2, - unsigned bytes); - - //------------------------------------------------------------------------------ // Multiplies // x[] = y[] * m void mul_mem_set( void * LEO_RESTRICT x, const void * LEO_RESTRICT y, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); // For i = {0, 1}: x_i[] *= m void mul_mem2_inplace( void * LEO_RESTRICT x_0, void * LEO_RESTRICT x_1, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); //------------------------------------------------------------------------------ // FFT Operations // x[] ^= y[] * m, y[] ^= x[] -void mul_fft( +void fft_butterfly( void * LEO_RESTRICT x, void * LEO_RESTRICT y, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); // For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] -void mul_fft2( +void fft_butterfly2( void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); // For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] -void mul_fft3( +void fft_butterfly3( void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); //------------------------------------------------------------------------------ // IFFT Operations // y[] ^= x[], x[] ^= y[] * m -void mul_ifft( +void ifft_butterfly( void * LEO_RESTRICT x, void * LEO_RESTRICT y, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); // For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m -void mul_ifft2( +void ifft_butterfly2( 
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); // For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m -void mul_ifft3( +void ifft_butterfly3( void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); + + +//------------------------------------------------------------------------------ +// Encode + +void Encode( + uint64_t buffer_bytes, + unsigned original_count, + unsigned recovery_count, + unsigned m, // = NextPow2(recovery_count) * 2 = work_count + void* const * const data, + void** work); // Size of GetEncodeWorkCount() + + +//------------------------------------------------------------------------------ +// Decode + +void Decode( + uint64_t buffer_bytes, + unsigned original_count, + unsigned recovery_count, + unsigned m, // = NextPow2(recovery_count) + unsigned n, // = NextPow2(m + original_count) = work_count + void* const * const original, // original_count entries + void* const * const recovery, // recovery_count entries + void** work); // n entries //------------------------------------------------------------------------------ diff --git a/docs/HighRateDecoder.pdf b/docs/HighRateDecoder.pdf new file mode 100644 index 0000000..6ce5054 Binary files /dev/null and b/docs/HighRateDecoder.pdf differ diff --git a/docs/LowRateDecoder.pdf b/docs/LowRateDecoder.pdf new file mode 100644 index 0000000..93ba65e Binary files /dev/null and b/docs/LowRateDecoder.pdf differ diff --git a/leopard.cpp b/leopard.cpp index 5c694fd..51850f9 100644 --- a/leopard.cpp +++ b/leopard.cpp @@ -27,8 +27,8 @@ */ #include "leopard.h" -#include "FecalEncoder.h" -#include "FecalDecoder.h" +#include "LeopardFF8.h" +#include "LeopardFF16.h" extern "C" { @@ -38,134 +38,152 @@ extern "C" { static bool m_Initialized = false; -FECAL_EXPORT int fecal_init_(int version) +LEO_EXPORT int leo_init_(int version) { - if (version != FECAL_VERSION) - return Fecal_InvalidInput; + if (version != LEO_VERSION) + return Leopard_InvalidInput; - if (0 != gf256_init()) - return Fecal_Platform; + if (!leopard::ff8::Initialize()) + return Leopard_Platform; + + if (!leopard::ff16::Initialize()) + return Leopard_Platform; m_Initialized = true; - return Fecal_Success; + return Leopard_Success; } //------------------------------------------------------------------------------ // Encoder API -FECAL_EXPORT FecalEncoder fecal_encoder_create(unsigned input_count, void* const * const input_data, uint64_t total_bytes) +LEO_EXPORT unsigned leo_encode_work_count( + unsigned original_count, + unsigned recovery_count) { - if (input_count <= 0 || !input_data || total_bytes < input_count) - { - FECAL_DEBUG_BREAK; // Invalid input - return nullptr; - } - - FECAL_DEBUG_ASSERT(m_Initialized); // Must call fecal_init() first - if (!m_Initialized) - return nullptr; - - fecal::Encoder* encoder = new(std::nothrow) fecal::Encoder; - if (!encoder) - { - FECAL_DEBUG_BREAK; // Out of memory - return nullptr; - } - - if (Fecal_Success != encoder->Initialize(input_count, input_data, total_bytes)) - { - delete encoder; - return nullptr; - } - - return reinterpret_cast( encoder ); + return leopard::NextPow2(recovery_count) * 2; } -FECAL_EXPORT int fecal_encode(FecalEncoder encoder_v, FecalSymbol* symbol) +LEO_EXPORT LeopardResult leo_encode( + uint64_t buffer_bytes, // Number of bytes in each data buffer + unsigned 
original_count, // Number of original_data[] buffer pointers + unsigned recovery_count, // Number of recovery_data[] buffer pointers + unsigned work_count, // Number of work_data[] buffer pointers, from leo_encode_work_count() + void* const * const original_data, // Array of pointers to original data buffers + void** work_data, // Array of work buffers + unsigned flags) // Operation flags { - fecal::Encoder* encoder = reinterpret_cast( encoder_v ); - if (!encoder || !symbol) - return Fecal_InvalidInput; + if (buffer_bytes <= 0 || buffer_bytes % 64 != 0) + return Leopard_InvalidSize; - return encoder->Encode(*symbol); -} + if (recovery_count <= 0 || recovery_count > original_count) + return Leopard_InvalidCounts; -FECAL_EXPORT void fecal_free(void* codec_v) -{ - if (codec_v) + if (!original_data || !work_data) + return Leopard_InvalidInput; + + const unsigned m = leopard::NextPow2(recovery_count); + const unsigned n = leopard::NextPow2(m + original_count); + + if (work_count != m * 2) + return Leopard_InvalidCounts; + + const bool mt = (flags & LeopardFlags_Multithreaded) != 0; + + if (n <= leopard::ff8::kOrder) { - fecal::ICodec* icodec = reinterpret_cast( codec_v ); - delete icodec; + leopard::ff8::Encode( + buffer_bytes, + original_count, + recovery_count, + m, + original_data, + work_data); } + else if (n <= leopard::ff16::kOrder) + { + leopard::ff16::Encode( + buffer_bytes, + original_count, + recovery_count, + m, + original_data, + work_data); + } + else + return Leopard_TooMuchData; + + return Leopard_Success; } //------------------------------------------------------------------------------ // Decoder API -FECAL_EXPORT FecalDecoder fecal_decoder_create(unsigned input_count, uint64_t total_bytes) +LEO_EXPORT unsigned leo_decode_work_count( + unsigned original_count, + unsigned recovery_count) { - if (input_count <= 0 || total_bytes < input_count) + const unsigned m = leopard::NextPow2(recovery_count); + const unsigned n = leopard::NextPow2(m + original_count); + return n; +} + +LEO_EXPORT LeopardResult leo_decode( + uint64_t buffer_bytes, // Number of bytes in each data buffer + unsigned original_count, // Number of original_data[] buffer pointers + unsigned recovery_count, // Number of recovery_data[] buffer pointers + unsigned work_count, // Number of buffer pointers in work_data[] + void* const * const original_data, // Array of original data buffers + void* const * const recovery_data, // Array of recovery data buffers + void** work_data, // Array of work data buffers + unsigned flags) // Operation flags +{ + if (buffer_bytes <= 0 || buffer_bytes % 64 != 0) + return Leopard_InvalidSize; + + if (recovery_count <= 0 || recovery_count > original_count) + return Leopard_InvalidCounts; + + if (!original_data || !recovery_data || !work_data) + return Leopard_InvalidInput; + + const unsigned m = leopard::NextPow2(recovery_count); + const unsigned n = leopard::NextPow2(m + original_count); + + if (work_count != n) + return Leopard_InvalidCounts; + + const bool mt = (flags & LeopardFlags_Multithreaded) != 0; + + if (n <= leopard::ff8::kOrder) { - FECAL_DEBUG_BREAK; // Invalid input - return nullptr; + leopard::ff8::Decode( + buffer_bytes, + original_count, + recovery_count, + m, + n, + original_data, + recovery_data, + work_data); } - - FECAL_DEBUG_ASSERT(m_Initialized); // Must call fecal_init() first - if (!m_Initialized) - return nullptr; - - fecal::Decoder* decoder = new(std::nothrow) fecal::Decoder; - if (!decoder) + else if (n <= leopard::ff16::kOrder) { - FECAL_DEBUG_BREAK; // Out 
of memory - return nullptr; + leopard::ff16::Decode( + buffer_bytes, + original_count, + recovery_count, + m, + n, + original_data, + recovery_data, + work_data); } + else + return Leopard_TooMuchData; - if (Fecal_Success != decoder->Initialize(input_count, total_bytes)) - { - delete decoder; - return nullptr; - } - - return reinterpret_cast( decoder ); -} - -FECAL_EXPORT int fecal_decoder_add_original(FecalDecoder decoder_v, const FecalSymbol* symbol) -{ - fecal::Decoder* decoder = reinterpret_cast( decoder_v ); - if (!decoder || !symbol) - return Fecal_InvalidInput; - - return decoder->AddOriginal(*symbol); -} - -FECAL_EXPORT int fecal_decoder_add_recovery(FecalDecoder decoder_v, const FecalSymbol* symbol) -{ - fecal::Decoder* decoder = reinterpret_cast( decoder_v ); - if (!decoder || !symbol) - return Fecal_InvalidInput; - - return decoder->AddRecovery(*symbol); -} - -FECAL_EXPORT int fecal_decode(FecalDecoder decoder_v, RecoveredSymbols* symbols) -{ - fecal::Decoder* decoder = reinterpret_cast( decoder_v ); - if (!decoder || !symbols) - return Fecal_InvalidInput; - - return decoder->Decode(*symbols); -} - -FECAL_EXPORT int fecal_decoder_get(FecalDecoder decoder_v, unsigned input_index, FecalSymbol* symbol) -{ - fecal::Decoder* decoder = reinterpret_cast( decoder_v ); - if (!decoder || !symbol) - return Fecal_InvalidInput; - - return decoder->GetOriginal(input_index, *symbol); + return Leopard_Success; } diff --git a/leopard.h b/leopard.h index 8c0e85f..e8a6b4f 100644 --- a/leopard.h +++ b/leopard.h @@ -59,6 +59,7 @@ # endif #endif +#include #ifdef __cplusplus extern "C" { @@ -90,14 +91,13 @@ typedef enum LeopardResultT Leopard_Success = 0, // Operation succeeded Leopard_TooMuchData = -1, // Buffer counts are too high - Leopard_InvalidBlockSize = -2, // Buffer size must be a multiple of 64 bytes - Leopard_InvalidInput = -3, // A function parameter was invalid - Leopard_Platform = -4, // Platform is unsupported - Leopard_OutOfMemory = -5, // Out of memory error occurred - Leopard_Unexpected = -6, // Unexpected error - Software bug? + Leopard_InvalidSize = -2, // Buffer size must be a multiple of 64 bytes + Leopard_InvalidCounts = -3, // Invalid counts provided + Leopard_InvalidInput = -4, // A function parameter was invalid + Leopard_Platform = -5, // Platform is unsupported } LeopardResult; -// Results +// Flags typedef enum LeopardFlagsT { LeopardFlags_Defaults = 0, // Default settings @@ -119,7 +119,6 @@ typedef enum LeopardFlagsT Returns the work_count value to pass into leo_encode(). Returns 0 on invalid input. */ - LEO_EXPORT unsigned leo_encode_work_count( unsigned original_count, unsigned recovery_count); @@ -138,6 +137,8 @@ LEO_EXPORT unsigned leo_encode_work_count( flags: Flags for encoding e.g. LeopardFlag_Multithreaded The sum of original_count + recovery_count must not exceed 65536. + The recovery_count <= original_count. + The buffer_bytes must be a multiple of 64. Each buffer should have the same number of bytes. Even the last piece must be rounded up to the block size. @@ -153,15 +154,11 @@ LEO_EXPORT unsigned leo_encode_work_count( ((uint64_t)total_bytes + original_count - 1) / original_count); Returns Leopard_Success on success. - The first set of recovery_count buffers in work_data will be the result. - - Returns Leopard_TooMuchData if the data is too large. - Returns Leopard_InvalidBlockSize if the data is the wrong size. - Returns Leopard_InvalidInput on invalid input. + * The first set of recovery_count buffers in work_data will be the result. 
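    For example, a minimal call sequence (a sketch: allocation and error
    handling omitted, buffer setup assumed):

        leo_init_(LEO_VERSION);
        const unsigned work_count = leo_encode_work_count(original_count, recovery_count);
        // allocate work_count work buffers of buffer_bytes each, alongside
        // the original_count original buffers
        LeopardResult r = leo_encode(
            buffer_bytes, original_count, recovery_count, work_count,
            original_data, work_data, LeopardFlags_Defaults);
        // on success, work_data[0..recovery_count-1] hold the recovery pieces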
Returns other values on errors. */ LEO_EXPORT LeopardResult leo_encode( - unsigned buffer_bytes, // Number of bytes in each data buffer + uint64_t buffer_bytes, // Number of bytes in each data buffer unsigned original_count, // Number of original_data[] buffer pointers unsigned recovery_count, // Number of recovery_data[] buffer pointers unsigned work_count, // Number of work_data[] buffer pointers, from leo_encode_work_count() @@ -183,7 +180,6 @@ LEO_EXPORT LeopardResult leo_encode( Returns the work_count value to pass into leo_encode(). Returns 0 on invalid input. */ - LEO_EXPORT unsigned leo_decode_work_count( unsigned original_count, unsigned recovery_count); @@ -211,7 +207,7 @@ LEO_EXPORT unsigned leo_decode_work_count( Returns other values on errors. */ LEO_EXPORT LeopardResult leo_decode( - unsigned buffer_bytes, // Number of bytes in each data buffer + uint64_t buffer_bytes, // Number of bytes in each data buffer unsigned original_count, // Number of original_data[] buffer pointers unsigned recovery_count, // Number of recovery_data[] buffer pointers unsigned work_count, // Number of buffer pointers in work_data[] diff --git a/proj/Leopard.sln b/proj/Leopard.sln index bafad8e..daa9f58 100644 --- a/proj/Leopard.sln +++ b/proj/Leopard.sln @@ -1,12 +1,14 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 15 -VisualStudioVersion = 15.0.26127.3 +# Visual Studio 14 +VisualStudioVersion = 14.0.25420.1 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Leopard", "Leopard.vcxproj", "{32176592-2F30-4BD5-B645-EB11C8D3453E}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LeopardBenchmark", "..\tests\proj\Benchmark.vcxproj", "{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LeopardExperiments", "..\tests\proj\Experiments.vcxproj", "{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Win32 = Debug|Win32 @@ -31,6 +33,14 @@ Global {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|Win32.Build.0 = Release|Win32 {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|x64.ActiveCfg = Release|x64 {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|x64.Build.0 = Release|x64 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Debug|Win32.ActiveCfg = Debug|Win32 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Debug|Win32.Build.0 = Debug|Win32 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Debug|x64.ActiveCfg = Debug|x64 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Debug|x64.Build.0 = Debug|x64 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Release|Win32.ActiveCfg = Release|Win32 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Release|Win32.Build.0 = Release|Win32 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Release|x64.ActiveCfg = Release|x64 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/proj/Leopard.vcxproj b/proj/Leopard.vcxproj index da9a8ad..c5c69b5 100644 --- a/proj/Leopard.vcxproj +++ b/proj/Leopard.vcxproj @@ -21,16 +21,12 @@ - - - - @@ -38,34 +34,33 @@ {32176592-2F30-4BD5-B645-EB11C8D3453E} GF65536 Leopard - 10.0.14393.0 StaticLibrary true MultiByte - v141 + v140 StaticLibrary true MultiByte - v141 + v140 StaticLibrary false true MultiByte - v141 + v140 StaticLibrary false true MultiByte - v141 + v140 diff --git a/proj/Leopard.vcxproj.filters b/proj/Leopard.vcxproj.filters index 079edb1..df7d586 100644 --- 
a/proj/Leopard.vcxproj.filters +++ b/proj/Leopard.vcxproj.filters @@ -21,12 +21,6 @@ Source Files - - Source Files - - - Source Files - Source Files @@ -35,12 +29,6 @@ - - Source Files - - - Source Files - Source Files diff --git a/tests/experiments.cpp b/tests/experiments.cpp new file mode 100644 index 0000000..f2c6a4e --- /dev/null +++ b/tests/experiments.cpp @@ -0,0 +1,615 @@ +/* + Copyright (c) 2017 Christopher A. Taylor. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of LHC-RS nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include +#include +#include +#include +#include + + +//------------------------------------------------------------------------------ +// Debug + +// Some bugs only repro in release mode, so this can be helpful +//#define LEO_DEBUG_IN_RELEASE + +#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) + #define LEO_DEBUG + #ifdef _WIN32 + #define LEO_DEBUG_BREAK __debugbreak() + #else + #define LEO_DEBUG_BREAK __builtin_trap() + #endif + #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } +#else + #define LEO_DEBUG_BREAK ; + #define LEO_DEBUG_ASSERT(cond) ; +#endif + + +//------------------------------------------------------------------------------ +// Platform/Architecture + +// Compiler-specific C++11 restrict keyword +#define LEO_RESTRICT __restrict + +// Compiler-specific force inline keyword +#ifdef _MSC_VER + #define LEO_FORCE_INLINE inline __forceinline +#else + #define LEO_FORCE_INLINE inline __attribute__((always_inline)) +#endif + + + + +//------------------------------------------------------------------------------ +// Field + +//#define LEO_SHORT_FIELD + +#ifdef LEO_SHORT_FIELD +typedef uint8_t ffe_t; +static const unsigned kGFBits = 8; +static const unsigned kGFPolynomial = 0x11D; +ffe_t kGFBasis[kGFBits] = { + 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis +}; +#else +typedef uint16_t ffe_t; +static const unsigned kGFBits = 16; +static const unsigned kGFPolynomial = 0x1002D; +ffe_t kGFBasis[kGFBits] = { + 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis + 0xC582, 0xED2E, 0x914C, 0x4012, + 0x6C98, 0x10D8, 0x6A72, 0xB900, + 0xFDB8, 0xFB34, 0xFF38, 0x991E +}; +#endif + +/* + Cantor Basis introduced by: + D. G. Cantor, "On arithmetical algorithms over finite fields", + Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. +*/ + +static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size +static const unsigned kFieldModulus = kFieldSize - 1; + +static ffe_t GFLog[kFieldSize]; +static ffe_t GFExp[kFieldSize]; + +// Initialize GFLog[], GFExp[] +static void InitField() +{ + unsigned state = 1; + for (unsigned i = 0; i < kFieldModulus; ++i) + { + GFExp[state] = static_cast(i); + state <<= 1; + if (state >= kFieldSize) + state ^= kGFPolynomial; + } + GFExp[0] = kFieldModulus; + + // Conversion to chosen basis: + + GFLog[0] = 0; + for (unsigned i = 0; i < kGFBits; ++i) + { + const ffe_t basis = kGFBasis[i]; + const unsigned width = (unsigned)(1UL << i); + + for (unsigned j = 0; j < width; ++j) + GFLog[j + width] = GFLog[j] ^ basis; + } + + for (unsigned i = 0; i < kFieldSize; ++i) + GFLog[i] = GFExp[GFLog[i]]; + + for (unsigned i = 0; i < kFieldSize; ++i) + GFExp[GFLog[i]] = i; + + GFExp[kFieldModulus] = GFExp[0]; +} + + +//------------------------------------------------------------------------------ +// Mod Q Field Operations +// +// Q is the maximum symbol value, e.g. 255 or 65535. 
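// Worked example of the partial reduction below, for kGFBits = 8 (Q = 255):
// a = 200, b = 100 gives sum = 300 = 0x12C; (sum >> 8) = 1 folds the carry
// back in, so the uint8_t result is (300 + 1) & 0xFF = 45 = 300 mod 255.
// The reduction is only partial because a sum of exactly 255 comes back as
// 255 rather than 0, which is why the tables treat index Q as valid.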
+ +// z = x + y (mod Q) +static inline ffe_t AddModQ(ffe_t a, ffe_t b) +{ + const unsigned sum = (unsigned)a + b; + + // Partial reduction step, allowing for Q to be returned + return static_cast(sum + (sum >> kGFBits)); +} + +// z = x - y (mod Q) +static inline ffe_t SubModQ(ffe_t a, ffe_t b) +{ + const unsigned dif = (unsigned)a - b; + + // Partial reduction step, allowing for Q to be returned + return static_cast(dif + (dif >> kGFBits)); +} + +// return a*GFExp[b] over GF(2^r) +static ffe_t mulE(ffe_t a, ffe_t b) +{ + if (a == 0) + return 0; + + const ffe_t sum = static_cast(AddModQ(GFLog[a], b)); + return GFExp[sum]; +} + + +//------------------------------------------------------------------------------ +// Fast Walsh-Hadamard Transform (FWHT) Mod Q +// +// Q is the maximum symbol value, e.g. 255 or 65535. + +// Define this to enable the optimized version of FWHT() +#define LEO_FWHT_OPTIMIZED + +typedef ffe_t fwht_t; + +// {a, b} = {a + b, a - b} (Mod Q) +static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) +{ + const fwht_t sum = AddModQ(a, b); + const fwht_t dif = SubModQ(a, b); + a = sum; + b = dif; +} + +// Reference implementation +static void FWHT(fwht_t* data, const unsigned bits) +{ + const unsigned size = (unsigned)(1UL << bits); + for (unsigned width = 1; width < size; width <<= 1) + for (unsigned i = 0; i < size; i += (width << 1)) + for (unsigned j = i; j < (width + i); ++j) + FWHT_2(data[j], data[j + width]); +} + + +//------------------------------------------------------------------------------ +// Formal Derivative + +// Formal derivative of polynomial in the new basis +static void formal_derivative(ffe_t* cos, const unsigned size) +{ + /* + Left to right xoring data ahead into data behind. + + If the data ends in all zeroes, this can simply stop. 
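    The XOR width at step i, ((i ^ (i - 1)) + 1) >> 1, is exactly the lowest
    set bit of i: for i = 6 (binary 110), 6 ^ 5 = 3 and (3 + 1) >> 1 = 2.
    Each step therefore folds a span whose width matches the alignment of i
    into the data just behind it.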
+
+//------------------------------------------------------------------------------
+// Formal Derivative
+
+// Formal derivative of polynomial in the new basis
+static void formal_derivative(ffe_t* cos, const unsigned size)
+{
+    /*
+        Left-to-right, XOR data ahead into data behind.
+
+        If the data ends in all zeroes, this can simply stop.
+    */
+    for (unsigned i = 1; i < size; ++i)
+    {
+        // leng is the lowest set bit of i: the run length to XOR at this step
+        const unsigned leng = ((i ^ (i - 1)) + 1) >> 1;
+
+        for (unsigned j = i - leng; j < i; ++j)
+            cos[j] ^= cos[j + leng];
+    }
+
+    // Doesn't seem to be needed
+#if 0
+    /*
+        Same here - Zeroes on the right are preserved
+    */
+    for (unsigned i = size; i < kFieldSize; i <<= 1)
+    {
+        for (unsigned j = 0; j < size; ++j)
+            cos[j] ^= cos[j + i];
+    }
+#endif
+}
+
+
+//------------------------------------------------------------------------------
+// Fast Fourier Transform
+
+// Twisted factors used in FFT.
+// Note: Sized kFieldSize (not kFieldModulus) because the initialization loop
+// in InitFieldOperations() below writes index kFieldModulus.
+static ffe_t skewVec[kFieldSize];
+
+static LEO_FORCE_INLINE void ifft_butterfly(ffe_t& a, ffe_t& b, ffe_t skew)
+{
+    b ^= a;
+    a ^= mulE(b, skew);
+}
+
+// IFFT in the proposed basis
+static void IFLT(ffe_t* data, const unsigned size, const unsigned index)
+{
+    for (unsigned width = 1; width < size; width <<= 1)
+    {
+        for (unsigned j = width; j < size; j += (width << 1))
+        {
+            const ffe_t skew = skewVec[j + index - 1];
+
+            if (skew != kFieldModulus)
+            {
+                for (unsigned i = j - width; i < j; ++i)
+                    ifft_butterfly(data[i], data[i + width], skew);
+            }
+            else
+            {
+                // A skew of kFieldModulus encodes log(0): the multiply
+                // contributes nothing, so only the XOR half is applied
+                for (unsigned i = j - width; i < j; ++i)
+                    data[i + width] ^= data[i];
+            }
+        }
+    }
+}
+
+static LEO_FORCE_INLINE void fft_butterfly(ffe_t& a, ffe_t& b, ffe_t skew)
+{
+    a ^= mulE(b, skew);
+    b ^= a;
+}
+
+// FFT in the proposed basis
+static void FLT(ffe_t* data, const unsigned size, const unsigned skewIndex, const unsigned output_elements)
+{
+    for (unsigned width = (size >> 1); width > 0; width >>= 1)
+    {
+        const ffe_t* skewLUT = skewVec + width + skewIndex - 1;
+
+        for (unsigned j = 0; j < output_elements; j += (width << 1))
+        {
+            const ffe_t skew = skewLUT[j];
+
+            if (skew != kFieldModulus)
+            {
+                for (unsigned i = j; i < j + width; ++i)
+                    fft_butterfly(data[i], data[i + width], skew);
+            }
+            else
+            {
+                for (unsigned i = j; i < j + width; ++i)
+                    data[i + width] ^= data[i];
+            }
+        }
+    }
+}
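+
+// Editor's sketch (hypothetical helper, not in the original file): FLT()
+// inverts IFLT() when given the same size and skew index, since the two
+// butterflies above are inverses and the width loops run in opposite orders.
+// Requires InitField() and InitFieldOperations() to have been called first.
+static bool TestFFTRoundTrip()
+{
+    ffe_t buf[64], ref[64];
+    for (unsigned i = 0; i < 64; ++i)
+        ref[i] = buf[i] = (ffe_t)rand();
+
+    IFLT(buf, 64, 0);    // evaluations -> coefficients
+    FLT(buf, 64, 0, 64); // coefficients -> evaluations
+
+    return 0 == memcmp(buf, ref, sizeof(buf));
+}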
+
+//------------------------------------------------------------------------------
+// FFT Initialization
+
+//static ffe_t B[kFieldSize >> 1]; // factors used in formal derivative
+static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial
+
+// Initialize skewVec[], log_walsh[] (and B[] when it is enabled)
+static void InitFieldOperations()
+{
+    ffe_t temp[kGFBits - 1];
+
+    for (unsigned i = 1; i < kGFBits; ++i)
+        temp[i - 1] = (ffe_t)((unsigned)1 << i);
+
+    for (unsigned m = 0; m < (kGFBits - 1); ++m)
+    {
+        const unsigned step = (unsigned)1 << (m + 1);
+
+        skewVec[((unsigned)1 << m) - 1] = 0;
+
+        for (unsigned i = m; i < (kGFBits - 1); ++i)
+        {
+            const unsigned s = ((unsigned)1 << (i + 1));
+
+            for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step)
+                skewVec[j + s] = skewVec[j] ^ temp[i];
+        }
+
+        temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])];
+
+        for (unsigned i = m + 1; i < (kGFBits - 1); ++i)
+            temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus);
+    }
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        skewVec[i] = GFLog[skewVec[i]];
+
+#if 0
+    temp[0] = kFieldModulus - temp[0];
+
+    for (unsigned i = 1; i < (kGFBits - 1); ++i)
+        temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus;
+
+    B[0] = 0;
+    for (unsigned i = 0; i < (kGFBits - 1); ++i)
+    {
+        const unsigned depart = ((unsigned)1 << i);
+
+        for (unsigned j = 0; j < depart; ++j)
+            B[j + depart] = (B[j] + temp[i]) % kFieldModulus;
+    }
+#endif
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        log_walsh[i] = GFLog[i];
+
+    log_walsh[0] = 0;
+
+    FWHT(log_walsh, kGFBits);
+}
+
+
+//------------------------------------------------------------------------------
+// Encoder
+
+// Encoding algorithm for k/n < 0.5: the message length k is a power of two
+static void encodeL(ffe_t* data, const unsigned k, ffe_t* codeword)
+{
+    memcpy(codeword, data, sizeof(ffe_t) * k);
+
+    IFLT(codeword, k, 0);
+
+    for (unsigned i = k; i < kFieldSize; i += k)
+    {
+        memcpy(&codeword[i], codeword, sizeof(ffe_t) * k);
+
+        FLT(&codeword[i], k, i, k);
+    }
+
+    memcpy(codeword, data, sizeof(ffe_t) * k);
+}
+
+// Encoding algorithm for k/n > 0.5: the parity count m is a power of two.
+// data: message array. parity: parity array. mem: scratch buffer (size >= m)
+static void encodeH(const ffe_t* data, const unsigned m, const unsigned original_count, ffe_t* parity, ffe_t* mem)
+{
+    // Note: Assumes data is padded with zeroes out to the next multiple of m
+
+    memcpy(parity, data, m * sizeof(ffe_t));
+    IFLT(parity, m, m);
+
+    for (unsigned i = m; i < original_count; i += m)
+    {
+        memcpy(mem, data + i, m * sizeof(ffe_t));
+        IFLT(mem, m, m + i);
+        for (unsigned j = 0; j < m; ++j)
+            parity[j] ^= mem[j];
+    }
+
+    FLT(parity, m, 0, m);
+}
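+
+// Usage sketch (editor's addition; ExampleEncode is hypothetical): produce
+// m = 16 recovery symbols for a message. Assumes the field tables have been
+// initialized and `message` is zero-padded out to a multiple of 16 symbols.
+static void ExampleEncode(const ffe_t* message, unsigned original_count, ffe_t parity[16])
+{
+    ffe_t work[16]; // scratch buffer of m entries
+    encodeH(message, 16, original_count, parity, work);
+    // A full codeword is the 16 parity symbols followed by the message
+    // symbols -- the same layout test() builds below.
+}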
+
+//------------------------------------------------------------------------------
+// Decoder
+
+static void decode(ffe_t* codeword, const unsigned m, const unsigned original_count, const unsigned n, const bool* erasure)
+{
+    fwht_t log_walsh2[kFieldSize];
+
+    // Compute the evaluations of the error locator polynomial
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        log_walsh2[i] = erasure[i] ? 1 : 0;
+
+    FWHT(log_walsh2, kGFBits);
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus;
+
+    FWHT(log_walsh2, kGFBits);
+
+    // k2 can be replaced with k
+    //const unsigned k2 = kFieldSize;
+    //const unsigned k2 = k; // cannot actually be replaced with k. maybe for encodeL() only?
+
+    for (unsigned i = 0; i < m + original_count; ++i)
+    {
+        if (erasure[i])
+        {
+            codeword[i] = 0;
+        }
+        else
+        {
+            codeword[i] = mulE(codeword[i], log_walsh2[i]);
+        }
+    }
+    for (unsigned i = m + original_count; i < n; ++i)
+        codeword[i] = 0;
+
+    IFLT(codeword, n, 0);
+
+    // Note: This is not needed to recover successfully...
+#if 0
+    // formal derivative
+    // Note: Preserves zeroes on the right
+    for (unsigned i = 0; i < m + original_count; i += 2)
+    {
+        codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]);
+        codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]);
+    }
+#endif
+
+    formal_derivative(codeword, n);
+
+#if 0
+    // Note: Preserves zeroes on the right
+    for (unsigned i = 0; i < m + original_count; i += 2)
+    {
+        codeword[i] = mulE(codeword[i], B[i >> 1]);
+        codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]);
+    }
+#endif
+
+    FLT(codeword, n, 0, m + original_count);
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+    {
+        if (erasure[i])
+        {
+            codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]);
+        }
+    }
+}
+
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+// Returns the index (0..63) of the highest nonzero bit
+// Precondition: x != 0
+LEO_FORCE_INLINE unsigned LastNonzeroBit64(uint64_t x)
+{
+#ifdef _MSC_VER
+#ifdef _WIN64
+    unsigned long index;
+    // Note: Ignoring result because x != 0
+    _BitScanReverse64(&index, x);
+    return (unsigned)index;
+#else
+    unsigned long index;
+    // Scan the high 32 bits first, since we want the highest set bit
+    if (0 != _BitScanReverse(&index, (uint32_t)(x >> 32)))
+        return (unsigned)index + 32;
+    // Note: Ignoring result because x != 0
+    _BitScanReverse(&index, (uint32_t)x);
+    return (unsigned)index;
+#endif
+#else
+    // Note: Ignoring return value of 0 because x != 0
+    return 63 - (unsigned)__builtin_clzll(x);
+#endif
+}
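+
+// Editor's note: test() below uses LastNonzeroBit64 to round up to a power
+// of two, via  m = 2UL << LastNonzeroBit64(recovery_count - 1).
+// Worked example: recovery_count = 20 -> 19 = 0b10011, highest bit index 4,
+// m = 2 << 4 = 32, the least power of two >= 20. (This assumes the count is
+// at least 2, since LastNonzeroBit64(0) violates the precondition.)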
+
+//------------------------------------------------------------------------------
+// Test Application
+
+void test(unsigned original_count, unsigned recovery_count, unsigned seed)
+{
+    // Round the parity count and codeword length up to powers of two
+    unsigned m = 2UL << LastNonzeroBit64(recovery_count - 1);
+    unsigned n = 2UL << LastNonzeroBit64(m + original_count - 1);
+
+    srand(seed);
+
+    //-----------Generating message----------
+
+    // Message array
+    ffe_t data[kFieldSize] = {0};
+
+    // Filled with random numbers
+    for (unsigned i = m; i < m + original_count; ++i)
+        data[i] = (ffe_t)rand();
+
+
+    //---------encoding----------
+
+    ffe_t codeword[kFieldSize] = {};
+    // The first m codeword positions hold the parity data
+    encodeH(data + m, m, original_count, data, codeword);
+    //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change?
+
+    memcpy(codeword, data, sizeof(ffe_t) * kFieldSize);
+
+
+    //--------erasure simulation---------
+
+    // Array indicating erasures
+    bool erasure[kFieldSize] = {
+        false
+    };
+
+    // Mark "recovery_count" positions as erasures, starting at offset m
+    for (unsigned i = m; i < m + recovery_count; ++i)
+        erasure[i] = true;
+
+    // Shuffle the erasure positions across the codeword (Fisher-Yates)
+    for (unsigned i = m + original_count - 1; i > 0; --i)
+    {
+        unsigned pos = rand() % (i + 1);
+
+        if (i != pos)
+        {
+            bool tmp = erasure[i];
+            erasure[i] = erasure[pos];
+            erasure[pos] = tmp;
+        }
+    }
+
+
+    //---------main processing----------
+    decode(codeword, m, original_count, n, erasure);
+
+    // Check the correctness of the result
+    for (unsigned i = 0; i < kFieldSize; ++i)
+    {
+        if (erasure[i])
+        {
+            if (data[i] != codeword[i])
+            {
+                printf("Decoding Error with seed = %u!\n", seed);
+                LEO_DEBUG_BREAK;
+                return;
+            }
+        }
+    }
+
+    printf(":D ");
+}
+
+
+//------------------------------------------------------------------------------
+// Entrypoint
+
+int main(int argc, char **argv)
+{
+    // Fill GFLog table and GFExp table
+    InitField();
+
+    // Compute factors used in erasure decoder
+    InitFieldOperations();
+
+    unsigned seed = (unsigned)time(NULL);
+    for (;;)
+    {
+#ifdef LEO_SHORT_FIELD
+        const unsigned input_count = 100;
+        const unsigned recovery_count = 20;
+#else // LEO_SHORT_FIELD
+        const unsigned input_count = 10000;
+        const unsigned recovery_count = 2000;
+#endif // LEO_SHORT_FIELD
+
+        test(input_count, recovery_count, seed);
+
+        ++seed;
+    }
+
+    return 0;
+}
diff --git a/tests/proj/Benchmark.vcxproj b/tests/proj/Benchmark.vcxproj
index 6c008f5..41583ff 100644
--- a/tests/proj/Benchmark.vcxproj
+++ b/tests/proj/Benchmark.vcxproj
@@ -20,36 +20,35 @@
     <ProjectGuid>{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}</ProjectGuid>
-    <RootNamespace>Fecal</RootNamespace>
+    <RootNamespace>Leopard</RootNamespace>
     <ProjectName>LeopardBenchmark</ProjectName>
-    <WindowsTargetPlatformVersion>10.0.14393.0</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
diff --git a/tests/proj/Experiments.filters b/tests/proj/Experiments.filters
new file mode 100644
index 0000000..50a05dd
--- /dev/null
+++ b/tests/proj/Experiments.filters
@@ -0,0 +1,22 @@
+
+
+
+
+    {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
+    cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx
+
+
+    {93995380-89BD-4b04-88EB-625FBE52EBFB}
+    h;hh;hpp;hxx;hm;inl;inc;xsd
+
+
+    {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
+    rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
+
+
+
+
+    Source Files
+
+
\ No newline at end of file
diff --git a/tests/proj/Experiments.vcxproj b/tests/proj/Experiments.vcxproj
new file mode 100644
index 0000000..187d804
--- /dev/null
+++ b/tests/proj/Experiments.vcxproj
@@ -0,0 +1,181 @@
+
+
+
+
+    Debug
+    Win32
+
+
+    Debug
+    x64
+
+
+    Release
+    Win32
+
+
+    Release
+    x64
+
+
+
+    {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}
+    Leopard
+    LeopardExperiments
+
+
+
+    Application
+    true
+    MultiByte
+    v140
+
+
+    Application
+    true
+    MultiByte
+    v140
+
+
+    Application
+    false
+    true
+    MultiByte
+    v140
+
+
+    Application
+    false
+    true
+    MultiByte
+    v140
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    Output/$(ProjectName)/$(Configuration)/$(Platform)/
+    Obj/$(ProjectName)/$(Configuration)/$(Platform)/
+
+
+    Output/$(ProjectName)/$(Configuration)/$(Platform)/
+    Obj/$(ProjectName)/$(Configuration)/$(Platform)/
+
+
+    Output/$(ProjectName)/$(Configuration)/$(Platform)/
+    Obj/$(ProjectName)/$(Configuration)/$(Platform)/
+
+
+    Output/$(ProjectName)/$(Configuration)/$(Platform)/
+    Obj/$(ProjectName)/$(Configuration)/$(Platform)/
+
+
+ Level3 + Disabled + true + MultiThreadedDebug + _MBCS;%(PreprocessorDefinitions) + + + true + + + + + + + + + + + Level3 + Disabled + true + MultiThreadedDebug + _MBCS;%(PreprocessorDefinitions) + + + true + + + + + + + + + + + Level3 + MaxSpeed + true + true + true + AnySuitable + Speed + false + MultiThreaded + true + _MBCS;%(PreprocessorDefinitions) + + + true + true + true + + + + + + + + + + + Level3 + MaxSpeed + true + true + true + AnySuitable + Speed + false + MultiThreaded + true + _MBCS;%(PreprocessorDefinitions) + + + true + true + true + + + + + + + + + + + + + + + + + \ No newline at end of file