mirror of https://github.com/status-im/leopard.git

Project structure

This commit is contained in:
parent 4d78561689
commit 49dbcdc8b1

@@ -0,0 +1,957 @@
/*
    Copyright (c) 2017 Christopher A. Taylor. All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice,
      this list of conditions and the following disclaimer in the documentation
      and/or other materials provided with the distribution.
    * Neither the name of Leopard-RS nor the names of its contributors may be
      used to endorse or promote products derived from this software without
      specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
*/

#include "LeopardCommon.h"

namespace leopard {
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Runtime CPU Architecture Check
|
||||
//
|
||||
// Feature checks stolen shamelessly from
|
||||
// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c
|
||||
|
||||
#if defined(HAVE_ANDROID_GETCPUFEATURES)
|
||||
#include <cpu-features.h>
|
||||
#endif
|
||||
|
||||
#if defined(LEO_TRY_NEON)
|
||||
# if defined(IOS) && defined(__ARM_NEON__)
|
||||
// Requires iPhone 5S or newer
|
||||
# else
|
||||
// Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
|
||||
bool CpuHasNeon = false; // V6 / V7
|
||||
bool CpuHasNeon64 = false; // 64-bit
|
||||
# endif
|
||||
#endif
|
||||
|
||||
|
||||
#if !defined(LEO_TARGET_MOBILE)
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h> // __cpuid
|
||||
#pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
|
||||
#endif
|
||||
|
||||
#ifdef LEO_TRY_AVX2
|
||||
bool CpuHasAVX2 = false;
|
||||
#endif
|
||||
bool CpuHasSSSE3 = false;
|
||||
|
||||
#define CPUID_EBX_AVX2 0x00000020
|
||||
#define CPUID_ECX_SSSE3 0x00000200
|
||||
|
||||
static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type)
|
||||
{
|
||||
#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
|
||||
__cpuid((int *) cpu_info, cpu_info_type);
|
||||
#else //if defined(HAVE_CPUID)
|
||||
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
|
||||
# ifdef __i386__
|
||||
__asm__ __volatile__ ("pushfl; pushfl; "
|
||||
"popl %0; "
|
||||
"movl %0, %1; xorl %2, %0; "
|
||||
"pushl %0; "
|
||||
"popfl; pushfl; popl %0; popfl" :
|
||||
"=&r" (cpu_info[0]), "=&r" (cpu_info[1]) :
|
||||
"i" (0x200000));
|
||||
if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) {
|
||||
return; /* LCOV_EXCL_LINE */
|
||||
}
|
||||
# endif
|
||||
# ifdef __i386__
|
||||
__asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" :
|
||||
"=a" (cpu_info[0]), "=&r" (cpu_info[1]),
|
||||
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
|
||||
"0" (cpu_info_type), "2" (0U));
|
||||
# elif defined(__x86_64__)
|
||||
__asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" :
|
||||
"=a" (cpu_info[0]), "=&r" (cpu_info[1]),
|
||||
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
|
||||
"0" (cpu_info_type), "2" (0U));
|
||||
# else
|
||||
__asm__ __volatile__ ("cpuid" :
|
||||
"=a" (cpu_info[0]), "=b" (cpu_info[1]),
|
||||
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
|
||||
"0" (cpu_info_type), "2" (0U));
|
||||
# endif
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // defined(LEO_TARGET_MOBILE)
|
||||
|
||||
|
||||
void InitializeCPUArch()
|
||||
{
|
||||
#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES)
|
||||
AndroidCpuFamily family = android_getCpuFamily();
|
||||
if (family == ANDROID_CPU_FAMILY_ARM)
|
||||
{
|
||||
if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON)
|
||||
CpuHasNeon = true;
|
||||
}
|
||||
else if (family == ANDROID_CPU_FAMILY_ARM64)
|
||||
{
|
||||
CpuHasNeon = true;
|
||||
if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD)
|
||||
CpuHasNeon64 = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined(LEO_TARGET_MOBILE)
|
||||
unsigned int cpu_info[4];
|
||||
|
||||
_cpuid(cpu_info, 1);
|
||||
CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0);
|
||||
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
_cpuid(cpu_info, 7);
|
||||
CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0);
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
}
|
||||
|
||||
|
||||
|
||||
// vx[] += vy[] * z
static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount)
{
    for (unsigned i = 0; i < symbolCount; ++i)
    {
        const GFSymbol a = vy[i];
        if (a == 0)
            continue;

        GFSymbol sum1 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f], z));
        GFSymbol value1 = GFExp[sum1];
        if ((a & 0x0f) == 0)
        {
            value1 = 0;
        }
        GFSymbol sum2 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf0], z));
        GFSymbol value2 = GFExp[sum2];
        if ((a & 0xf0) == 0)
        {
            value2 = 0;
        }
        GFSymbol sum3 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f00], z));
        GFSymbol value3 = GFExp[sum3];
        if ((a & 0x0f00) == 0)
        {
            value3 = 0;
        }
        GFSymbol sum4 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf000], z));
        GFSymbol value4 = GFExp[sum4];
        if ((a & 0xf000) == 0)
        {
            value4 = 0;
        }

        vx[i] ^= value1;
        vx[i] ^= value2;
        vx[i] ^= value3;
        vx[i] ^= value4;
    }
}

// return a*GFExp[b] over GF(2^r)
static GFSymbol mulE(GFSymbol a, GFSymbol b)
{
    if (a == 0)
        return 0;

    const GFSymbol sum = static_cast<GFSymbol>(AddModQ(GFLog[a], b));
    return GFExp[sum];
}
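
// --- Illustrative sketch (added for exposition; not part of the original source) ---
// mulE(a, b) returns a * GFExp[b], so an ordinary field multiply a * b can be
// written as mulE(a, GFLog[b]) once InitField() has filled the log/exp tables:
static LEO_FORCE_INLINE GFSymbol gf_mul_example(GFSymbol a, GFSymbol b)
{
    if (a == 0 || b == 0)
        return 0;

    // exp(log(a) + log(b)) mod kFieldModulus
    return mulE(a, GFLog[b]);
}

// muladd_mem() above relies on the same identity applied piecewise: multiplying by
// the fixed constant GFExp[z] is GF(2)-linear, so the four masked parts of vy[i]
// ((a & 0x0f), (a & 0xf0), (a & 0x0f00), (a & 0xf000)) can each be sent through the
// log/exp tables separately and XORed into vx[i].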


//------------------------------------------------------------------------------
// Fast Walsh-Hadamard Transform (FWHT) Mod Q
//
// Q is the maximum symbol value, e.g. 255 or 65535.

// Define this to enable the optimized version of FWHT()
#define LEO_FWHT_OPTIMIZED

typedef GFSymbol fwht_t;

// {a, b} = {a + b, a - b} (Mod Q)
static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b)
{
    const fwht_t sum = AddModQ(a, b);
    const fwht_t dif = SubModQ(a, b);
    a = sum;
    b = dif;
}

/*
    FWHT is a minor slice of the runtime and does not grow with data size,
    but I did attempt a few additional optimizations that failed:

    I've attempted to vectorize (with partial reductions) FWHT_4(data, s),
    which is 70% of the algorithm, but it was slower.  Left in _attic_.

    I've attempted to avoid reductions in all or parts of the FWHT.
    The final modular reduction ends up being slower than the savings.
    Specifically I tried doing it for the whole FWHT and also I tried
    doing it just for the FWHT_2 loop in the main routine, but both
    approaches are slower than partial reductions.

    Replacing word reads with wider reads does speed up the operation, but
    at too high a complexity cost relative to minor perf improvement.
*/
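
/*
    Note (added for exposition; not part of the original source):

    The transform length kFieldSize is one more than the modulus Q = kFieldModulus
    (256 vs. 255, or 65536 vs. 65535), so kFieldSize == 1 (mod Q).  Applying FWHT()
    twice therefore reproduces the input vector mod Q -- the usual 1/N scaling
    factor vanishes.  The decoder below exploits this: FWHT, pointwise multiply
    mod Q, FWHT again yields an XOR-convolution mod Q with no explicit
    normalization step.
*/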

#ifndef LEO_FWHT_OPTIMIZED

// Reference implementation
static void FWHT(fwht_t* data, const unsigned bits)
{
    const unsigned size = (unsigned)(1UL << bits);
    for (unsigned width = 1; width < size; width <<= 1)
        for (unsigned i = 0; i < size; i += (width << 1))
            for (unsigned j = i; j < (width + i); ++j)
                FWHT_2(data[j], data[j + width]);
}

#else
|
||||
|
||||
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data)
|
||||
{
|
||||
fwht_t t0 = data[0];
|
||||
fwht_t t1 = data[1];
|
||||
fwht_t t2 = data[2];
|
||||
fwht_t t3 = data[3];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
data[0] = t0;
|
||||
data[1] = t1;
|
||||
data[2] = t2;
|
||||
data[3] = t3;
|
||||
}
|
||||
|
||||
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s)
|
||||
{
|
||||
unsigned x = 0;
|
||||
fwht_t t0 = data[x]; x += s;
|
||||
fwht_t t1 = data[x]; x += s;
|
||||
fwht_t t2 = data[x]; x += s;
|
||||
fwht_t t3 = data[x];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
unsigned y = 0;
|
||||
data[y] = t0; y += s;
|
||||
data[y] = t1; y += s;
|
||||
data[y] = t2; y += s;
|
||||
data[y] = t3;
|
||||
}
|
||||
|
||||
static inline void FWHT_8(fwht_t* data)
|
||||
{
|
||||
fwht_t t0 = data[0];
|
||||
fwht_t t1 = data[1];
|
||||
fwht_t t2 = data[2];
|
||||
fwht_t t3 = data[3];
|
||||
fwht_t t4 = data[4];
|
||||
fwht_t t5 = data[5];
|
||||
fwht_t t6 = data[6];
|
||||
fwht_t t7 = data[7];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t4, t5);
|
||||
FWHT_2(t6, t7);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
FWHT_2(t4, t6);
|
||||
FWHT_2(t5, t7);
|
||||
FWHT_2(t0, t4);
|
||||
FWHT_2(t1, t5);
|
||||
FWHT_2(t2, t6);
|
||||
FWHT_2(t3, t7);
|
||||
data[0] = t0;
|
||||
data[1] = t1;
|
||||
data[2] = t2;
|
||||
data[3] = t3;
|
||||
data[4] = t4;
|
||||
data[5] = t5;
|
||||
data[6] = t6;
|
||||
data[7] = t7;
|
||||
}
|
||||
|
||||
static inline void FWHT_16(fwht_t* data)
|
||||
{
|
||||
fwht_t t0 = data[0];
|
||||
fwht_t t1 = data[1];
|
||||
fwht_t t2 = data[2];
|
||||
fwht_t t3 = data[3];
|
||||
fwht_t t4 = data[4];
|
||||
fwht_t t5 = data[5];
|
||||
fwht_t t6 = data[6];
|
||||
fwht_t t7 = data[7];
|
||||
fwht_t t8 = data[8];
|
||||
fwht_t t9 = data[9];
|
||||
fwht_t t10 = data[10];
|
||||
fwht_t t11 = data[11];
|
||||
fwht_t t12 = data[12];
|
||||
fwht_t t13 = data[13];
|
||||
fwht_t t14 = data[14];
|
||||
fwht_t t15 = data[15];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t4, t5);
|
||||
FWHT_2(t6, t7);
|
||||
FWHT_2(t8, t9);
|
||||
FWHT_2(t10, t11);
|
||||
FWHT_2(t12, t13);
|
||||
FWHT_2(t14, t15);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
FWHT_2(t4, t6);
|
||||
FWHT_2(t5, t7);
|
||||
FWHT_2(t8, t10);
|
||||
FWHT_2(t9, t11);
|
||||
FWHT_2(t12, t14);
|
||||
FWHT_2(t13, t15);
|
||||
FWHT_2(t0, t4);
|
||||
FWHT_2(t1, t5);
|
||||
FWHT_2(t2, t6);
|
||||
FWHT_2(t3, t7);
|
||||
FWHT_2(t8, t12);
|
||||
FWHT_2(t9, t13);
|
||||
FWHT_2(t10, t14);
|
||||
FWHT_2(t11, t15);
|
||||
FWHT_2(t0, t8);
|
||||
FWHT_2(t1, t9);
|
||||
FWHT_2(t2, t10);
|
||||
FWHT_2(t3, t11);
|
||||
FWHT_2(t4, t12);
|
||||
FWHT_2(t5, t13);
|
||||
FWHT_2(t6, t14);
|
||||
FWHT_2(t7, t15);
|
||||
data[0] = t0;
|
||||
data[1] = t1;
|
||||
data[2] = t2;
|
||||
data[3] = t3;
|
||||
data[4] = t4;
|
||||
data[5] = t5;
|
||||
data[6] = t6;
|
||||
data[7] = t7;
|
||||
data[8] = t8;
|
||||
data[9] = t9;
|
||||
data[10] = t10;
|
||||
data[11] = t11;
|
||||
data[12] = t12;
|
||||
data[13] = t13;
|
||||
data[14] = t14;
|
||||
data[15] = t15;
|
||||
}
|
||||
|
||||
static void FWHT_SmallData(fwht_t* data, unsigned ldn)
|
||||
{
|
||||
const unsigned n = (1UL << ldn);
|
||||
|
||||
if (n <= 2)
|
||||
{
|
||||
if (n == 2)
|
||||
FWHT_2(data[0], data[1]);
|
||||
return;
|
||||
}
|
||||
|
||||
for (unsigned ldm = ldn; ldm > 3; ldm -= 2)
|
||||
{
|
||||
unsigned m = (1UL << ldm);
|
||||
unsigned m4 = (m >> 2);
|
||||
for (unsigned r = 0; r < n; r += m)
|
||||
for (unsigned j = 0; j < m4; j++)
|
||||
FWHT_4(data + j + r, m4);
|
||||
}
|
||||
|
||||
if (ldn & 1)
|
||||
{
|
||||
for (unsigned i0 = 0; i0 < n; i0 += 8)
|
||||
FWHT_8(data + i0);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (unsigned i0 = 0; i0 < n; i0 += 4)
|
||||
FWHT_4(data + i0);
|
||||
}
|
||||
}
|
||||
|
||||
// Decimation in time (DIT) version
|
||||
static void FWHT(fwht_t* data, const unsigned ldn)
|
||||
{
|
||||
if (ldn <= 13)
|
||||
{
|
||||
FWHT_SmallData(data, ldn);
|
||||
return;
|
||||
}
|
||||
|
||||
FWHT_2(data[2], data[3]);
|
||||
FWHT_4(data + 4);
|
||||
FWHT_8(data + 8);
|
||||
FWHT_16(data + 16);
|
||||
for (unsigned ldm = 5; ldm < ldn; ++ldm)
|
||||
FWHT(data + (unsigned)(1UL << ldm), ldm);
|
||||
|
||||
for (unsigned ldm = 0; ldm < ldn; ++ldm)
|
||||
{
|
||||
const unsigned mh = (1UL << ldm);
|
||||
for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2)
|
||||
FWHT_2(data[t1], data[t2]);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Memory Buffer XOR
|
||||
|
||||
static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes)
|
||||
{
|
||||
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
|
||||
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
|
||||
|
||||
#if defined(LEO_TARGET_MOBILE)
|
||||
# if defined(LEO_TRY_NEON)
|
||||
// Handle multiples of 64 bytes
|
||||
if (CpuHasNeon)
|
||||
{
|
||||
while (bytes >= 64)
|
||||
{
|
||||
LEO_M128 x0 = vld1q_u8(x16);
|
||||
LEO_M128 x1 = vld1q_u8(x16 + 1);
|
||||
LEO_M128 x2 = vld1q_u8(x16 + 2);
|
||||
LEO_M128 x3 = vld1q_u8(x16 + 3);
|
||||
LEO_M128 y0 = vld1q_u8(y16);
|
||||
LEO_M128 y1 = vld1q_u8(y16 + 1);
|
||||
LEO_M128 y2 = vld1q_u8(y16 + 2);
|
||||
LEO_M128 y3 = vld1q_u8(y16 + 3);
|
||||
|
||||
vst1q_u8(x16, veorq_u8(x0, y0));
|
||||
vst1q_u8(x16 + 1, veorq_u8(x1, y1));
|
||||
vst1q_u8(x16 + 2, veorq_u8(x2, y2));
|
||||
vst1q_u8(x16 + 3, veorq_u8(x3, y3));
|
||||
|
||||
bytes -= 64, x16 += 4, y16 += 4;
|
||||
}
|
||||
|
||||
// Handle multiples of 16 bytes
|
||||
while (bytes >= 16)
|
||||
{
|
||||
LEO_M128 x0 = vld1q_u8(x16);
|
||||
LEO_M128 y0 = vld1q_u8(y16);
|
||||
|
||||
vst1q_u8(x16, veorq_u8(x0, y0));
|
||||
|
||||
bytes -= 16, ++x16, ++y16;
|
||||
}
|
||||
}
|
||||
else
|
||||
# endif // LEO_TRY_NEON
|
||||
{
|
||||
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
|
||||
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
|
||||
|
||||
const unsigned count = (unsigned)bytes / 8;
|
||||
for (unsigned ii = 0; ii < count; ++ii)
|
||||
x8[ii] ^= y8[ii];
|
||||
|
||||
x16 = reinterpret_cast<LEO_M128 *>(x8 + count);
|
||||
y16 = reinterpret_cast<const LEO_M128 *>(y8 + count);
|
||||
}
|
||||
#else // LEO_TARGET_MOBILE
|
||||
# if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(x16);
|
||||
const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(y16);
|
||||
|
||||
while (bytes >= 128)
|
||||
{
|
||||
LEO_M256 x0 = _mm256_loadu_si256(x32);
|
||||
LEO_M256 y0 = _mm256_loadu_si256(y32);
|
||||
x0 = _mm256_xor_si256(x0, y0);
|
||||
LEO_M256 x1 = _mm256_loadu_si256(x32 + 1);
|
||||
LEO_M256 y1 = _mm256_loadu_si256(y32 + 1);
|
||||
x1 = _mm256_xor_si256(x1, y1);
|
||||
LEO_M256 x2 = _mm256_loadu_si256(x32 + 2);
|
||||
LEO_M256 y2 = _mm256_loadu_si256(y32 + 2);
|
||||
x2 = _mm256_xor_si256(x2, y2);
|
||||
LEO_M256 x3 = _mm256_loadu_si256(x32 + 3);
|
||||
LEO_M256 y3 = _mm256_loadu_si256(y32 + 3);
|
||||
x3 = _mm256_xor_si256(x3, y3);
|
||||
|
||||
_mm256_storeu_si256(x32, x0);
|
||||
_mm256_storeu_si256(x32 + 1, x1);
|
||||
_mm256_storeu_si256(x32 + 2, x2);
|
||||
_mm256_storeu_si256(x32 + 3, x3);
|
||||
|
||||
bytes -= 128, x32 += 4, y32 += 4;
|
||||
}
|
||||
|
||||
// Handle multiples of 32 bytes
|
||||
while (bytes >= 32)
|
||||
{
|
||||
// x[i] = x[i] xor y[i]
|
||||
_mm256_storeu_si256(x32,
|
||||
_mm256_xor_si256(
|
||||
_mm256_loadu_si256(x32),
|
||||
_mm256_loadu_si256(y32)));
|
||||
|
||||
bytes -= 32, ++x32, ++y32;
|
||||
}
|
||||
|
||||
x16 = reinterpret_cast<LEO_M128 *>(x32);
|
||||
y16 = reinterpret_cast<const LEO_M128 *>(y32);
|
||||
}
|
||||
else
|
||||
# endif // LEO_TRY_AVX2
|
||||
{
|
||||
while (bytes >= 64)
|
||||
{
|
||||
LEO_M128 x0 = _mm_loadu_si128(x16);
|
||||
LEO_M128 y0 = _mm_loadu_si128(y16);
|
||||
x0 = _mm_xor_si128(x0, y0);
|
||||
LEO_M128 x1 = _mm_loadu_si128(x16 + 1);
|
||||
LEO_M128 y1 = _mm_loadu_si128(y16 + 1);
|
||||
x1 = _mm_xor_si128(x1, y1);
|
||||
LEO_M128 x2 = _mm_loadu_si128(x16 + 2);
|
||||
LEO_M128 y2 = _mm_loadu_si128(y16 + 2);
|
||||
x2 = _mm_xor_si128(x2, y2);
|
||||
LEO_M128 x3 = _mm_loadu_si128(x16 + 3);
|
||||
LEO_M128 y3 = _mm_loadu_si128(y16 + 3);
|
||||
x3 = _mm_xor_si128(x3, y3);
|
||||
|
||||
_mm_storeu_si128(x16, x0);
|
||||
_mm_storeu_si128(x16 + 1, x1);
|
||||
_mm_storeu_si128(x16 + 2, x2);
|
||||
_mm_storeu_si128(x16 + 3, x3);
|
||||
|
||||
bytes -= 64, x16 += 4, y16 += 4;
|
||||
}
|
||||
}
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
|
||||
// Handle multiples of 16 bytes
|
||||
while (bytes >= 16)
|
||||
{
|
||||
// x[i] = x[i] xor y[i]
|
||||
_mm_storeu_si128(x16,
|
||||
_mm_xor_si128(
|
||||
_mm_loadu_si128(x16),
|
||||
_mm_loadu_si128(y16)));
|
||||
|
||||
bytes -= 16, ++x16, ++y16;
|
||||
}
|
||||
|
||||
uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
|
||||
const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
|
||||
|
||||
// Handle a block of 8 bytes
|
||||
const unsigned eight = bytes & 8;
|
||||
if (eight)
|
||||
{
|
||||
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
|
||||
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
|
||||
*x8 ^= *y8;
|
||||
}
|
||||
|
||||
// Handle a block of 4 bytes
|
||||
const unsigned four = bytes & 4;
|
||||
if (four)
|
||||
{
|
||||
uint32_t * LEO_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
|
||||
const uint32_t * LEO_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
|
||||
*x4 ^= *y4;
|
||||
}
|
||||
|
||||
// Handle final bytes
|
||||
const unsigned offset = eight + four;
|
||||
switch (bytes & 3)
|
||||
{
|
||||
case 3: x1[offset + 2] ^= y1[offset + 2];
|
||||
case 2: x1[offset + 1] ^= y1[offset + 1];
|
||||
case 1: x1[offset] ^= y1[offset];
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
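
// Note (added for exposition; not part of the original source): xor_mem() computes
// vx[i] ^= vy[i] over `bytes` bytes.  The SIMD paths above (NEON, AVX2 or SSE2,
// depending on the build) process wide blocks first (128/64/32/16 bytes) and the
// scalar code at the end handles the remaining 8-, 4- and 1..3-byte tail.
// A plain-C equivalent is simply:  for (i = 0; i < bytes; ++i) x[i] ^= y[i];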


//------------------------------------------------------------------------------
// Formal Derivative

// Formal derivative of polynomial in the new basis
static void formal_derivative(GFSymbol* cos, const unsigned size)
{
    for (unsigned i = 1; i < size; ++i)
    {
        // leng is the lowest set bit of i (equivalent to i & -i)
        const unsigned leng = ((i ^ (i - 1)) + 1) >> 1;

        // If a large number of values are being XORed:
        if (leng >= 8)
            xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol));
        else
            for (unsigned j = i - leng; j < i; j++)
                cos[j] ^= cos[j + leng];
    }

    for (unsigned i = size; i < kFieldSize; i <<= 1)
        xor_mem(cos, cos + i, size * sizeof(GFSymbol));
}


//------------------------------------------------------------------------------
// Fast Fourier Transform

static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT

// IFFT in the proposed basis
static void IFLT(GFSymbol* data, const unsigned size, const unsigned index)
{
    for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1)
    {
        for (unsigned j = depart_no; j < size; j += (depart_no << 1))
        {
            // If a large number of values are being XORed:
            if (depart_no >= 8)
                xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol));
            else
                for (unsigned i = j - depart_no; i < j; ++i)
                    data[i + depart_no] ^= data[i];

            const GFSymbol skew = skewVec[j + index - 1];

            if (skew != kFieldModulus)
                muladd_mem(data + j - depart_no, data + j, skew, depart_no);
        }
    }
}

// FFT in the proposed basis
static void FLT(GFSymbol* data, const unsigned size, const unsigned index)
{
    for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1)
    {
        for (unsigned j = depart_no; j < size; j += (depart_no << 1))
        {
            const GFSymbol skew = skewVec[j + index - 1];

            if (skew != kFieldModulus)
                muladd_mem(data + j - depart_no, data + j, skew, depart_no);

            // If a large number of values are being XORed:
            if (depart_no >= 8)
                xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol));
            else
                for (unsigned i = j - depart_no; i < j; ++i)
                    data[i + depart_no] ^= data[i];
        }
    }
}
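
// Note (added for exposition; not part of the original source): IFLT() and FLT()
// are the inverse and forward transforms in the proposed polynomial basis, per the
// comments above.  The `index` argument offsets the lookups into skewVec[], which
// roughly selects the block (coset) of field positions being transformed.
// encodeL()/encodeH() below run IFLT on one block and then FLT the result at other
// indices to evaluate the same polynomial over the remaining positions.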


//------------------------------------------------------------------------------
// FFT Initialization

static GFSymbol B[kFieldSize >> 1];     // factors used in formal derivative
static fwht_t log_walsh[kFieldSize];    // factors used in the evaluation of the error locator polynomial

// Initialize skewVec[], B[], log_walsh[]
static void InitFieldOperations()
{
    GFSymbol temp[kGFBits - 1];

    for (unsigned i = 1; i < kGFBits; ++i)
        temp[i - 1] = (GFSymbol)((unsigned)1 << i);

    for (unsigned m = 0; m < (kGFBits - 1); ++m)
    {
        const unsigned step = (unsigned)1 << (m + 1);

        skewVec[((unsigned)1 << m) - 1] = 0;

        for (unsigned i = m; i < (kGFBits - 1); ++i)
        {
            const unsigned s = ((unsigned)1 << (i + 1));

            for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step)
                skewVec[j + s] = skewVec[j] ^ temp[i];
        }

        temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])];

        for (unsigned i = m + 1; i < (kGFBits - 1); ++i)
            temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus);
    }

    for (unsigned i = 0; i < kFieldSize; ++i)
        skewVec[i] = GFLog[skewVec[i]];

    temp[0] = kFieldModulus - temp[0];

    for (unsigned i = 1; i < (kGFBits - 1); ++i)
        temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus;

    B[0] = 0;
    for (unsigned i = 0; i < (kGFBits - 1); ++i)
    {
        const unsigned depart = ((unsigned)1 << i);

        for (unsigned j = 0; j < depart; ++j)
            B[j + depart] = (B[j] + temp[i]) % kFieldModulus;
    }

    for (unsigned i = 0; i < kFieldSize; ++i)
        log_walsh[i] = GFLog[i];

    log_walsh[0] = 0;

    FWHT(log_walsh, kGFBits);
}


//------------------------------------------------------------------------------
// Encoder

// Encoding algorithm for k/n < 0.5: the message length k is a power of two
static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword)
{
    memcpy(codeword, data, sizeof(GFSymbol) * k);

    IFLT(codeword, k, 0);

    for (unsigned i = k; i < kFieldSize; i += k)
    {
        memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k);

        FLT(&codeword[i], k, i);
    }

    memcpy(codeword, data, sizeof(GFSymbol) * k);
}

// Encoding algorithm for k/n > 0.5: the parity count (n - k) is a power of two.
// data: message array.  parity: parity array.  mem: scratch buffer (size >= n - k)
static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem)
{
    const unsigned t = kFieldSize - k;

    memset(parity, 0, sizeof(GFSymbol) * t);

    for (unsigned i = t; i < kFieldSize; i += t)
    {
        memcpy(mem, &data[i - t], sizeof(GFSymbol) * t);

        IFLT(mem, t, i);

        xor_mem(parity, mem, t * sizeof(GFSymbol));
    }

    FLT(parity, t, 0);
}
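
// --- Usage sketch (added for exposition; not part of the original source) ---
// encodeH() expects the recovery count t = kFieldSize - k to be a power of two,
// as noted in main() below.  A minimal call, mirroring how test() invokes it
// (filling message[] with real data is omitted here):
static void encodeH_usage_example()
{
    static GFSymbol message[kFieldSize];    // last k entries hold the data symbols
    static GFSymbol scratch[kFieldSize];    // work buffer, must hold at least t symbols

    const unsigned k = kFieldSize / 2;      // t = kFieldSize - k is then a power of two

    // Parity is written over the first t entries of message[] here, exactly as in
    // test(); any other buffer of t symbols could receive it instead.
    encodeH(&message[kFieldSize - k], k, message, scratch);
}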


//------------------------------------------------------------------------------
// Decoder

static void decode(GFSymbol* codeword, unsigned k, const bool* erasure)
{
    fwht_t log_walsh2[kFieldSize];

    // Compute the evaluations of the error locator polynomial
    for (unsigned i = 0; i < kFieldSize; ++i)
        log_walsh2[i] = erasure[i] ? 1 : 0;

    FWHT(log_walsh2, kGFBits);

    for (unsigned i = 0; i < kFieldSize; ++i)
        log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus;

    FWHT(log_walsh2, kGFBits);

    // k2 should eventually be replaceable with k, but that does not work yet (see below)
    const unsigned k2 = kFieldSize;
    //const unsigned k2 = k; // cannot actually be replaced with k -- what else needs to change?

    for (unsigned i = 0; i < kFieldSize; ++i)
    {
        if (erasure[i])
        {
            codeword[i] = 0;
        }
        else
        {
            codeword[i] = mulE(codeword[i], log_walsh2[i]);
        }
    }

    IFLT(codeword, kFieldSize, 0);

    // Formal derivative, with the B[] factors applied before and after
    for (unsigned i = 0; i < kFieldSize; i += 2)
    {
        codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]);
        codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]);
    }

    formal_derivative(codeword, k2);

    for (unsigned i = 0; i < k2; i += 2)
    {
        codeword[i] = mulE(codeword[i], B[i >> 1]);
        codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]);
    }

    FLT(codeword, k2, 0);

    for (unsigned i = 0; i < k2; ++i)
    {
        if (erasure[i])
        {
            codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]);
        }
    }
}
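
/*
    Note (added for exposition; not part of the original source) -- rough outline
    of decode(), following the Lin et al. construction:

    1. log_walsh2[] starts as the 0/1 erasure indicator.  FWHT, a pointwise
       multiply by log_walsh[] (the transformed GFLog table), and a second FWHT
       compute, mod Q, the XOR-convolution: for each position i, the sum of
       GFLog[i ^ j] over erased positions j -- i.e. the logarithm of the error
       locator evaluated at i.
    2. Surviving symbols are scaled by those locator values with mulE(); erased
       symbols are zeroed.
    3. An inverse transform (IFLT), the formal derivative in the new basis (with
       the B[] factors applied before and after), and a forward transform (FLT)
       reconstruct the codeword, after which the erased positions are divided
       back out by their locator values (multiplication by
       kFieldModulus - log_walsh2[i]).
*/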


//------------------------------------------------------------------------------
// Test Application

void test(unsigned k, unsigned seed)
{
    srand(seed);

    //----------- Generating message ----------

    // Message array
    GFSymbol data[kFieldSize] = {0};

    // Fill the message with random numbers
    for (unsigned i = kFieldSize - k; i < kFieldSize; ++i)
        data[i] = (GFSymbol)rand();


    //----------- Encoding ----------

    GFSymbol codeword[kFieldSize];
    encodeH(&data[kFieldSize - k], k, data, codeword);
    //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change?

    memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize);


    //----------- Erasure simulation ----------

    // Array indicating erasures
    bool erasure[kFieldSize] = {
        false
    };

    for (unsigned i = k; i < kFieldSize; ++i)
        erasure[i] = true;

    // Randomly permute the erasure array (Fisher-Yates shuffle)
    for (unsigned i = kFieldSize - 1; i > 0; --i)
    {
        unsigned pos = rand() % (i + 1);

        if (i != pos)
        {
            bool tmp = erasure[i];
            erasure[i] = erasure[pos];
            erasure[pos] = tmp;
        }
    }

    // Zero out the erased codeword symbols
    for (unsigned i = 0; i < kFieldSize; ++i)
        if (erasure[i])
            codeword[i] = 0;


    //----------- Main processing ----------
    decode(codeword, k, erasure);

    // Check the correctness of the result
    for (unsigned i = 0; i < kFieldSize; ++i)
    {
        if (erasure[i])
        {
            if (data[i] != codeword[i])
            {
                printf("Decoding Error with seed = %u!\n", seed);
                LEO_DEBUG_BREAK;
                return;
            }
        }
    }

    //printf("Decoding is successful!\n");
}


//------------------------------------------------------------------------------
// Entrypoint

int main(int argc, char **argv)
{
    // Initialize architecture-specific code
    leo_architecture_init();

    // Fill GFLog table and GFExp table
    InitField();

    // Compute factors used in erasure decoder
    InitFieldOperations();

    unsigned seed = (unsigned)time(NULL);
    for (;;)
    {
        // test(k, seed), k: message size
        /*
            encodeH() works for k = kFieldSize / 2, kFieldSize * 3 / 4, etc.,
            such that the number of recovery pieces is a power of two
        */
        test(kFieldSize / 2, seed);

        ++seed;
    }

    return 0;
}


} // namespace leopard

@@ -0,0 +1,194 @@
/*
|
||||
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of Leopard-RS nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
/*
|
||||
TODO:
|
||||
+ Refactor software
|
||||
+ I think it should be split up into several C++ modules
|
||||
+ Replace GFSymbol with a file data pointer
|
||||
+ New 16-bit Muladd inner loops
|
||||
+ Class to contain the (large) muladd tables
|
||||
+ Preliminary benchmarks for large data!
|
||||
+ New 8-bit Muladd inner loops
|
||||
+ Benchmarks for smaller data!
|
||||
+ Write detailed comments for all the routines
|
||||
+ Look into getting EncodeL working so we can support smaller data (Ask Lin)
|
||||
+ Look into using k instead of k2 to speed up decoder (Ask Lin)
|
||||
+ Avoid performing FFT/IFFT intermediate calculations we're not going to use
|
||||
+ Benchmarks, fun!
|
||||
+ Add multi-threading to split up long parallelizable calculations
|
||||
+ Final benchmarks!
|
||||
+ Finish up documentation
|
||||
+ Release version 1
|
||||
|
||||
|
||||
Muladd implementation notes:
|
||||
|
||||
Specialize for 1-3 rows at a time since often times we're multiplying by
|
||||
the same (skew) value repeatedly, as the ISA-L library does here:
|
||||
|
||||
https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258
|
||||
|
||||
Except we should be doing it for 16-bit Galois Field.
|
||||
To implement that use the ALTMAP trick from Jerasure:
|
||||
|
||||
http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140
|
||||
|
||||
Except we should also support AVX2 since that is a 40% perf boost, so put
|
||||
the high and low bytes 32 bytes instead of 16 bytes apart.
|
||||
|
||||
Also I think we should go ahead and precompute the multiply tables since
|
||||
it avoids a bunch of memory lookups for each muladd, and only costs 8 MB.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Debug
|
||||
|
||||
// Some bugs only repro in release mode, so this can be helpful
|
||||
//#define LEO_DEBUG_IN_RELEASE
|
||||
|
||||
#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE)
|
||||
#define LEO_DEBUG
|
||||
#ifdef _WIN32
|
||||
#define LEO_DEBUG_BREAK __debugbreak()
|
||||
#else
|
||||
#define LEO_DEBUG_BREAK __builtin_trap()
|
||||
#endif
|
||||
#define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } }
|
||||
#else
|
||||
#define LEO_DEBUG_BREAK ;
|
||||
#define LEO_DEBUG_ASSERT(cond) ;
|
||||
#endif
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Platform/Architecture
|
||||
|
||||
#if defined(ANDROID) || defined(IOS)
|
||||
#define LEO_TARGET_MOBILE
|
||||
#endif // ANDROID
|
||||
|
||||
#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900)
|
||||
#define LEO_TRY_AVX2 /* 256-bit */
|
||||
#include <immintrin.h>
|
||||
#define LEO_ALIGN_BYTES 32
|
||||
#else // __AVX2__
|
||||
#define LEO_ALIGN_BYTES 16
|
||||
#endif // __AVX2__
|
||||
|
||||
#if !defined(LEO_TARGET_MOBILE)
|
||||
// Note: MSVC currently only supports SSSE3 but not AVX2
|
||||
#include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8
|
||||
#include <emmintrin.h> // SSE2
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
|
||||
#if defined(HAVE_ARM_NEON_H)
|
||||
#include <arm_neon.h>
|
||||
#endif // HAVE_ARM_NEON_H
|
||||
|
||||
#if defined(LEO_TARGET_MOBILE)
|
||||
|
||||
#define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */
|
||||
|
||||
# if defined(HAVE_ARM_NEON_H)
|
||||
// Compiler-specific 128-bit SIMD register keyword
|
||||
#define LEO_M128 uint8x16_t
|
||||
#define LEO_TRY_NEON
|
||||
#else
|
||||
#define LEO_M128 uint64_t
|
||||
# endif
|
||||
|
||||
#else // LEO_TARGET_MOBILE
|
||||
|
||||
// Compiler-specific 128-bit SIMD register keyword
|
||||
#define LEO_M128 __m128i
|
||||
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
|
||||
#ifdef LEO_TRY_AVX2
|
||||
// Compiler-specific 256-bit SIMD register keyword
|
||||
#define LEO_M256 __m256i
|
||||
#endif
|
||||
|
||||
// Compiler-specific C++11 restrict keyword
|
||||
#define LEO_RESTRICT __restrict
|
||||
|
||||
// Compiler-specific force inline keyword
|
||||
#ifdef _MSC_VER
|
||||
#define LEO_FORCE_INLINE inline __forceinline
|
||||
#else
|
||||
#define LEO_FORCE_INLINE inline __attribute__((always_inline))
|
||||
#endif
|
||||
|
||||
// Compiler-specific alignment keyword
|
||||
// Note: Alignment only matters for ARM NEON where it should be 16
|
||||
#ifdef _MSC_VER
|
||||
#define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES))
|
||||
#else // _MSC_VER
|
||||
#define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES)))
|
||||
#endif // _MSC_VER
|
||||
|
||||
|
||||
namespace leopard {
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Runtime CPU Architecture Check
|
||||
|
||||
// Initialize CPU architecture flags
|
||||
void InitializeCPUArch();
|
||||
|
||||
#if defined(LEO_TRY_NEON)
|
||||
# if defined(IOS) && defined(__ARM_NEON__)
|
||||
// Does device support NEON?
|
||||
static const bool CpuHasNeon = true;
|
||||
static const bool CpuHasNeon64 = true;
|
||||
# else
|
||||
// Does device support NEON?
|
||||
// Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
|
||||
extern bool CpuHasNeon; // V6 / V7
|
||||
extern bool CpuHasNeon64; // 64-bit
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if !defined(LEO_TARGET_MOBILE)
|
||||
# if defined(LEO_TRY_AVX2)
|
||||
// Does CPU support AVX2?
|
||||
extern bool CpuHasAVX2;
|
||||
# endif
|
||||
// Does CPU support SSSE3?
|
||||
extern bool CpuHasSSSE3;
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
|
||||
|
||||
} // namespace leopard
|
|
@ -1,8 +1,29 @@
|
|||
/*
|
||||
S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
|
||||
"Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
|
||||
IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
|
||||
http://ct.ee.ntust.edu.tw/it2016-2.pdf
|
||||
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of LHC-RS nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
@ -23,7 +44,7 @@
|
|||
+ New 8-bit Muladd inner loops
|
||||
+ Benchmarks for smaller data!
|
||||
+ Refactor software
|
||||
+ Pick a name for the software better than LHC_RS
|
||||
+ Pick a name for the software better than LEO_RS
|
||||
+ I think it should be split up into several C++ modules
|
||||
+ Write detailed comments for all the routines
|
||||
+ Look into getting EncodeL working so we can support smaller data (Ask Lin)
|
||||
|
@ -60,19 +81,19 @@
|
|||
// Debug
|
||||
|
||||
// Some bugs only repro in release mode, so this can be helpful
|
||||
//#define LHC_DEBUG_IN_RELEASE
|
||||
//#define LEO_DEBUG_IN_RELEASE
|
||||
|
||||
#if defined(_DEBUG) || defined(DEBUG) || defined(LHC_DEBUG_IN_RELEASE)
|
||||
#define LHC_DEBUG
|
||||
#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE)
|
||||
#define LEO_DEBUG
|
||||
#ifdef _WIN32
|
||||
#define LHC_DEBUG_BREAK __debugbreak()
|
||||
#define LEO_DEBUG_BREAK __debugbreak()
|
||||
#else
|
||||
#define LHC_DEBUG_BREAK __builtin_trap()
|
||||
#define LEO_DEBUG_BREAK __builtin_trap()
|
||||
#endif
|
||||
#define LHC_DEBUG_ASSERT(cond) { if (!(cond)) { LHC_DEBUG_BREAK; } }
|
||||
#define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } }
|
||||
#else
|
||||
#define LHC_DEBUG_BREAK ;
|
||||
#define LHC_DEBUG_ASSERT(cond) ;
|
||||
#define LEO_DEBUG_BREAK ;
|
||||
#define LEO_DEBUG_ASSERT(cond) ;
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -80,67 +101,67 @@
|
|||
// Platform/Architecture
|
||||
|
||||
#if defined(ANDROID) || defined(IOS)
|
||||
#define LHC_TARGET_MOBILE
|
||||
#define LEO_TARGET_MOBILE
|
||||
#endif // ANDROID
|
||||
|
||||
#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900)
|
||||
#define LHC_TRY_AVX2 /* 256-bit */
|
||||
#define LEO_TRY_AVX2 /* 256-bit */
|
||||
#include <immintrin.h>
|
||||
#define LHC_ALIGN_BYTES 32
|
||||
#define LEO_ALIGN_BYTES 32
|
||||
#else // __AVX2__
|
||||
#define LHC_ALIGN_BYTES 16
|
||||
#define LEO_ALIGN_BYTES 16
|
||||
#endif // __AVX2__
|
||||
|
||||
#if !defined(LHC_TARGET_MOBILE)
|
||||
#if !defined(LEO_TARGET_MOBILE)
|
||||
// Note: MSVC currently only supports SSSE3 but not AVX2
|
||||
#include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8
|
||||
#include <emmintrin.h> // SSE2
|
||||
#endif // LHC_TARGET_MOBILE
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
|
||||
#if defined(HAVE_ARM_NEON_H)
|
||||
#include <arm_neon.h>
|
||||
#endif // HAVE_ARM_NEON_H
|
||||
|
||||
#if defined(LHC_TARGET_MOBILE)
|
||||
#if defined(LEO_TARGET_MOBILE)
|
||||
|
||||
#define LHC_ALIGNED_ACCESSES /* Inputs must be aligned to LHC_ALIGN_BYTES */
|
||||
#define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */
|
||||
|
||||
# if defined(HAVE_ARM_NEON_H)
|
||||
// Compiler-specific 128-bit SIMD register keyword
|
||||
#define LHC_M128 uint8x16_t
|
||||
#define LHC_TRY_NEON
|
||||
#define LEO_M128 uint8x16_t
|
||||
#define LEO_TRY_NEON
|
||||
#else
|
||||
#define LHC_M128 uint64_t
|
||||
#define LEO_M128 uint64_t
|
||||
# endif
|
||||
|
||||
#else // LHC_TARGET_MOBILE
|
||||
#else // LEO_TARGET_MOBILE
|
||||
|
||||
// Compiler-specific 128-bit SIMD register keyword
|
||||
#define LHC_M128 __m128i
|
||||
#define LEO_M128 __m128i
|
||||
|
||||
#endif // LHC_TARGET_MOBILE
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
|
||||
#ifdef LHC_TRY_AVX2
|
||||
#ifdef LEO_TRY_AVX2
|
||||
// Compiler-specific 256-bit SIMD register keyword
|
||||
#define LHC_M256 __m256i
|
||||
#define LEO_M256 __m256i
|
||||
#endif
|
||||
|
||||
// Compiler-specific C++11 restrict keyword
|
||||
#define LHC_RESTRICT __restrict
|
||||
#define LEO_RESTRICT __restrict
|
||||
|
||||
// Compiler-specific force inline keyword
|
||||
#ifdef _MSC_VER
|
||||
#define LHC_FORCE_INLINE inline __forceinline
|
||||
#define LEO_FORCE_INLINE inline __forceinline
|
||||
#else
|
||||
#define LHC_FORCE_INLINE inline __attribute__((always_inline))
|
||||
#define LEO_FORCE_INLINE inline __attribute__((always_inline))
|
||||
#endif
|
||||
|
||||
// Compiler-specific alignment keyword
|
||||
// Note: Alignment only matters for ARM NEON where it should be 16
|
||||
#ifdef _MSC_VER
|
||||
#define LHC_ALIGNED __declspec(align(LHC_ALIGN_BYTES))
|
||||
#define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES))
|
||||
#else // _MSC_VER
|
||||
#define LHC_ALIGNED __attribute__((aligned(LHC_ALIGN_BYTES)))
|
||||
#define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES)))
|
||||
#endif // _MSC_VER
|
||||
|
||||
|
||||
|
@ -154,7 +175,7 @@
|
|||
#include <cpu-features.h>
|
||||
#endif
|
||||
|
||||
#if defined(LHC_TRY_NEON)
|
||||
#if defined(LEO_TRY_NEON)
|
||||
# if defined(IOS) && defined(__ARM_NEON__)
|
||||
// Requires iPhone 5S or newer
|
||||
static const bool CpuHasNeon = true;
|
||||
|
@ -167,14 +188,14 @@
|
|||
#endif
|
||||
|
||||
|
||||
#if !defined(LHC_TARGET_MOBILE)
|
||||
#if !defined(LEO_TARGET_MOBILE)
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h> // __cpuid
|
||||
#pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
|
||||
#endif
|
||||
|
||||
#ifdef LHC_TRY_AVX2
|
||||
#ifdef LEO_TRY_AVX2
|
||||
static bool CpuHasAVX2 = false;
|
||||
#endif
|
||||
static bool CpuHasSSSE3 = false;
|
||||
|
@ -219,12 +240,12 @@ static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type)
|
|||
#endif
|
||||
}
|
||||
|
||||
#endif // defined(LHC_TARGET_MOBILE)
|
||||
#endif // defined(LEO_TARGET_MOBILE)
|
||||
|
||||
|
||||
static void lhc_architecture_init()
|
||||
static void leo_architecture_init()
|
||||
{
|
||||
#if defined(LHC_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES)
|
||||
#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES)
|
||||
AndroidCpuFamily family = android_getCpuFamily();
|
||||
if (family == ANDROID_CPU_FAMILY_ARM)
|
||||
{
|
||||
|
@ -239,32 +260,32 @@ static void lhc_architecture_init()
|
|||
}
|
||||
#endif
|
||||
|
||||
#if !defined(LHC_TARGET_MOBILE)
|
||||
#if !defined(LEO_TARGET_MOBILE)
|
||||
unsigned int cpu_info[4];
|
||||
|
||||
_cpuid(cpu_info, 1);
|
||||
CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0);
|
||||
|
||||
#if defined(LHC_TRY_AVX2)
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
_cpuid(cpu_info, 7);
|
||||
CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0);
|
||||
#endif // LHC_TRY_AVX2
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
#endif // LHC_TARGET_MOBILE
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// SIMD-Safe Aligned Memory Allocations
|
||||
|
||||
static const unsigned kAlignmentBytes = LHC_ALIGN_BYTES;
|
||||
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
|
||||
|
||||
LHC_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
|
||||
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
|
||||
{
|
||||
return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
|
||||
}
|
||||
|
||||
static LHC_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
|
||||
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
|
||||
{
|
||||
uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
|
||||
if (!data)
|
||||
|
@ -275,7 +296,7 @@ static LHC_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
|
|||
return data;
|
||||
}
|
||||
|
||||
static LHC_FORCE_INLINE void SIMDSafeFree(void* ptr)
|
||||
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
|
||||
{
|
||||
if (!ptr)
|
||||
return;
|
||||
|
@ -283,7 +304,7 @@ static LHC_FORCE_INLINE void SIMDSafeFree(void* ptr)
|
|||
unsigned offset = data[-1];
|
||||
if (offset >= kAlignmentBytes)
|
||||
{
|
||||
LHC_DEBUG_BREAK; // Should never happen
|
||||
LEO_DEBUG_BREAK; // Should never happen
|
||||
return;
|
||||
}
|
||||
data -= kAlignmentBytes - offset;
|
||||
|
@ -294,9 +315,9 @@ static LHC_FORCE_INLINE void SIMDSafeFree(void* ptr)
|
|||
//------------------------------------------------------------------------------
|
||||
// Field
|
||||
|
||||
//#define LHC_SHORT_FIELD
|
||||
//#define LEO_SHORT_FIELD
|
||||
|
||||
#ifdef LHC_SHORT_FIELD
|
||||
#ifdef LEO_SHORT_FIELD
|
||||
typedef uint8_t GFSymbol;
|
||||
static const unsigned kGFBits = 8;
|
||||
static const unsigned kGFPolynomial = 0x11D;
|
||||
|
@ -386,7 +407,7 @@ static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b)
|
|||
}
|
||||
|
||||
// vx[] += vy[] * z
|
||||
static void muladd_mem(GFSymbol * LHC_RESTRICT vx, const GFSymbol * LHC_RESTRICT vy, GFSymbol z, unsigned symbolCount)
|
||||
static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount)
|
||||
{
|
||||
for (unsigned i = 0; i < symbolCount; ++i)
|
||||
{
|
||||
|
@ -443,12 +464,12 @@ static GFSymbol mulE(GFSymbol a, GFSymbol b)
|
|||
// Q is the maximum symbol value, e.g. 255 or 65535.
|
||||
|
||||
// Define this to enable the optimized version of FWHT()
|
||||
#define LHC_FWHT_OPTIMIZED
|
||||
#define LEO_FWHT_OPTIMIZED
|
||||
|
||||
typedef GFSymbol fwht_t;
|
||||
|
||||
// {a, b} = {a + b, a - b} (Mod Q)
|
||||
static LHC_FORCE_INLINE void FWHT_2(fwht_t& LHC_RESTRICT a, fwht_t& LHC_RESTRICT b)
|
||||
static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b)
|
||||
{
|
||||
const fwht_t sum = AddModQ(a, b);
|
||||
const fwht_t dif = SubModQ(a, b);
|
||||
|
@ -473,7 +494,7 @@ static LHC_FORCE_INLINE void FWHT_2(fwht_t& LHC_RESTRICT a, fwht_t& LHC_RESTRICT
|
|||
at too high a complexity cost relative to minor perf improvement.
|
||||
*/
|
||||
|
||||
#ifndef LHC_FWHT_OPTIMIZED
|
||||
#ifndef LEO_FWHT_OPTIMIZED
|
||||
|
||||
// Reference implementation
|
||||
static void FWHT(fwht_t* data, const unsigned bits)
|
||||
|
@ -487,7 +508,7 @@ static void FWHT(fwht_t* data, const unsigned bits)
|
|||
|
||||
#else
|
||||
|
||||
static LHC_FORCE_INLINE void FWHT_4(fwht_t* data)
|
||||
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data)
|
||||
{
|
||||
fwht_t t0 = data[0];
|
||||
fwht_t t1 = data[1];
|
||||
|
@ -503,7 +524,7 @@ static LHC_FORCE_INLINE void FWHT_4(fwht_t* data)
|
|||
data[3] = t3;
|
||||
}
|
||||
|
||||
static LHC_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s)
|
||||
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s)
|
||||
{
|
||||
unsigned x = 0;
|
||||
fwht_t t0 = data[x]; x += s;
|
||||
|
@ -683,26 +704,26 @@ static void FWHT(fwht_t* data, const unsigned ldn)
|
|||
//------------------------------------------------------------------------------
|
||||
// Memory Buffer XOR
|
||||
|
||||
static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsigned bytes)
|
||||
static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes)
|
||||
{
|
||||
LHC_M128 * LHC_RESTRICT x16 = reinterpret_cast<LHC_M128 *>(vx);
|
||||
const LHC_M128 * LHC_RESTRICT y16 = reinterpret_cast<const LHC_M128 *>(vy);
|
||||
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
|
||||
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
|
||||
|
||||
#if defined(LHC_TARGET_MOBILE)
|
||||
# if defined(LHC_TRY_NEON)
|
||||
#if defined(LEO_TARGET_MOBILE)
|
||||
# if defined(LEO_TRY_NEON)
|
||||
// Handle multiples of 64 bytes
|
||||
if (CpuHasNeon)
|
||||
{
|
||||
while (bytes >= 64)
|
||||
{
|
||||
LHC_M128 x0 = vld1q_u8(x16);
|
||||
LHC_M128 x1 = vld1q_u8(x16 + 1);
|
||||
LHC_M128 x2 = vld1q_u8(x16 + 2);
|
||||
LHC_M128 x3 = vld1q_u8(x16 + 3);
|
||||
LHC_M128 y0 = vld1q_u8(y16);
|
||||
LHC_M128 y1 = vld1q_u8(y16 + 1);
|
||||
LHC_M128 y2 = vld1q_u8(y16 + 2);
|
||||
LHC_M128 y3 = vld1q_u8(y16 + 3);
|
||||
LEO_M128 x0 = vld1q_u8(x16);
|
||||
LEO_M128 x1 = vld1q_u8(x16 + 1);
|
||||
LEO_M128 x2 = vld1q_u8(x16 + 2);
|
||||
LEO_M128 x3 = vld1q_u8(x16 + 3);
|
||||
LEO_M128 y0 = vld1q_u8(y16);
|
||||
LEO_M128 y1 = vld1q_u8(y16 + 1);
|
||||
LEO_M128 y2 = vld1q_u8(y16 + 2);
|
||||
LEO_M128 y3 = vld1q_u8(y16 + 3);
|
||||
|
||||
vst1q_u8(x16, veorq_u8(x0, y0));
|
||||
vst1q_u8(x16 + 1, veorq_u8(x1, y1));
|
||||
|
@ -715,8 +736,8 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
|
|||
// Handle multiples of 16 bytes
|
||||
while (bytes >= 16)
|
||||
{
|
||||
LHC_M128 x0 = vld1q_u8(x16);
|
||||
LHC_M128 y0 = vld1q_u8(y16);
|
||||
LEO_M128 x0 = vld1q_u8(x16);
|
||||
LEO_M128 y0 = vld1q_u8(y16);
|
||||
|
||||
vst1q_u8(x16, veorq_u8(x0, y0));
|
||||
|
||||
|
@ -724,38 +745,38 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
|
|||
}
|
||||
}
|
||||
else
|
||||
# endif // LHC_TRY_NEON
|
||||
# endif // LEO_TRY_NEON
|
||||
{
|
||||
uint64_t * LHC_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
|
||||
const uint64_t * LHC_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
|
||||
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
|
||||
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
|
||||
|
||||
const unsigned count = (unsigned)bytes / 8;
|
||||
for (unsigned ii = 0; ii < count; ++ii)
|
||||
x8[ii] ^= y8[ii];
|
||||
|
||||
x16 = reinterpret_cast<LHC_M128 *>(x8 + count);
|
||||
y16 = reinterpret_cast<const LHC_M128 *>(y8 + count);
|
||||
x16 = reinterpret_cast<LEO_M128 *>(x8 + count);
|
||||
y16 = reinterpret_cast<const LEO_M128 *>(y8 + count);
|
||||
}
|
||||
#else // LHC_TARGET_MOBILE
|
||||
# if defined(LHC_TRY_AVX2)
|
||||
#else // LEO_TARGET_MOBILE
|
||||
# if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
LHC_M256 * LHC_RESTRICT x32 = reinterpret_cast<LHC_M256 *>(x16);
|
||||
const LHC_M256 * LHC_RESTRICT y32 = reinterpret_cast<const LHC_M256 *>(y16);
|
||||
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(x16);
|
||||
const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(y16);
|
||||
|
||||
while (bytes >= 128)
|
||||
{
|
||||
LHC_M256 x0 = _mm256_loadu_si256(x32);
|
||||
LHC_M256 y0 = _mm256_loadu_si256(y32);
|
||||
LEO_M256 x0 = _mm256_loadu_si256(x32);
|
||||
LEO_M256 y0 = _mm256_loadu_si256(y32);
|
||||
x0 = _mm256_xor_si256(x0, y0);
|
||||
LHC_M256 x1 = _mm256_loadu_si256(x32 + 1);
|
||||
LHC_M256 y1 = _mm256_loadu_si256(y32 + 1);
|
||||
LEO_M256 x1 = _mm256_loadu_si256(x32 + 1);
|
||||
LEO_M256 y1 = _mm256_loadu_si256(y32 + 1);
|
||||
x1 = _mm256_xor_si256(x1, y1);
|
||||
LHC_M256 x2 = _mm256_loadu_si256(x32 + 2);
|
||||
LHC_M256 y2 = _mm256_loadu_si256(y32 + 2);
|
||||
LEO_M256 x2 = _mm256_loadu_si256(x32 + 2);
|
||||
LEO_M256 y2 = _mm256_loadu_si256(y32 + 2);
|
||||
x2 = _mm256_xor_si256(x2, y2);
|
||||
LHC_M256 x3 = _mm256_loadu_si256(x32 + 3);
|
||||
LHC_M256 y3 = _mm256_loadu_si256(y32 + 3);
|
||||
LEO_M256 x3 = _mm256_loadu_si256(x32 + 3);
|
||||
LEO_M256 y3 = _mm256_loadu_si256(y32 + 3);
|
||||
x3 = _mm256_xor_si256(x3, y3);
|
||||
|
||||
_mm256_storeu_si256(x32, x0);
|
||||
|
@ -778,25 +799,25 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
|
|||
bytes -= 32, ++x32, ++y32;
|
||||
}
|
||||
|
||||
x16 = reinterpret_cast<LHC_M128 *>(x32);
|
||||
y16 = reinterpret_cast<const LHC_M128 *>(y32);
|
||||
x16 = reinterpret_cast<LEO_M128 *>(x32);
|
||||
y16 = reinterpret_cast<const LEO_M128 *>(y32);
|
||||
}
|
||||
else
|
||||
# endif // LHC_TRY_AVX2
|
||||
# endif // LEO_TRY_AVX2
|
||||
{
|
||||
while (bytes >= 64)
|
||||
{
|
||||
LHC_M128 x0 = _mm_loadu_si128(x16);
|
||||
LHC_M128 y0 = _mm_loadu_si128(y16);
|
||||
LEO_M128 x0 = _mm_loadu_si128(x16);
|
||||
LEO_M128 y0 = _mm_loadu_si128(y16);
|
||||
x0 = _mm_xor_si128(x0, y0);
|
||||
LHC_M128 x1 = _mm_loadu_si128(x16 + 1);
|
||||
LHC_M128 y1 = _mm_loadu_si128(y16 + 1);
|
||||
LEO_M128 x1 = _mm_loadu_si128(x16 + 1);
|
||||
LEO_M128 y1 = _mm_loadu_si128(y16 + 1);
|
||||
x1 = _mm_xor_si128(x1, y1);
|
||||
LHC_M128 x2 = _mm_loadu_si128(x16 + 2);
|
||||
LHC_M128 y2 = _mm_loadu_si128(y16 + 2);
|
||||
LEO_M128 x2 = _mm_loadu_si128(x16 + 2);
|
||||
LEO_M128 y2 = _mm_loadu_si128(y16 + 2);
|
||||
x2 = _mm_xor_si128(x2, y2);
|
||||
LHC_M128 x3 = _mm_loadu_si128(x16 + 3);
|
||||
LHC_M128 y3 = _mm_loadu_si128(y16 + 3);
|
||||
LEO_M128 x3 = _mm_loadu_si128(x16 + 3);
|
||||
LEO_M128 y3 = _mm_loadu_si128(y16 + 3);
|
||||
x3 = _mm_xor_si128(x3, y3);
|
||||
|
||||
_mm_storeu_si128(x16, x0);
|
||||
|
@ -807,7 +828,7 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
|
|||
bytes -= 64, x16 += 4, y16 += 4;
|
||||
}
|
||||
}
|
||||
#endif // LHC_TARGET_MOBILE
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
|
||||
// Handle multiples of 16 bytes
|
||||
while (bytes >= 16)
|
||||
|
@ -821,15 +842,15 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
|
|||
bytes -= 16, ++x16, ++y16;
|
||||
}
|
||||
|
||||
uint8_t * LHC_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
|
||||
const uint8_t * LHC_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
|
||||
uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
|
||||
const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
|
||||
|
||||
// Handle a block of 8 bytes
|
||||
const unsigned eight = bytes & 8;
|
||||
if (eight)
|
||||
{
|
||||
uint64_t * LHC_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
|
||||
const uint64_t * LHC_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
|
||||
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
|
||||
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
|
||||
*x8 ^= *y8;
|
||||
}
|
||||
|
||||
|
@ -837,8 +858,8 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
|
|||
const unsigned four = bytes & 4;
|
||||
if (four)
|
||||
{
|
||||
uint32_t * LHC_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
|
||||
const uint32_t * LHC_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
|
||||
uint32_t * LEO_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
|
||||
const uint32_t * LEO_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
|
||||
*x4 ^= *y4;
|
||||
}
|
||||
|
||||
|
@ -1158,7 +1179,7 @@ void test(unsigned k, unsigned seed)
|
|||
if (data[i] != codeword[i])
|
||||
{
|
||||
printf("Decoding Error with seed = %d!\n", seed);
|
||||
LHC_DEBUG_BREAK;
|
||||
LEO_DEBUG_BREAK;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
@ -1174,7 +1195,7 @@ void test(unsigned k, unsigned seed)
|
|||
int main(int argc, char **argv)
|
||||
{
|
||||
// Initialize architecture-specific code
|
||||
lhc_architecture_init();
|
||||
leo_architecture_init();
|
||||
|
||||
// Fill GFLog table and GFExp table
|
||||
InitField();
|
|
@@ -0,0 +1,840 @@
|
|||
/*
|
||||
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of LHC-RS nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "LeopardFF8.h"
|
||||
|
||||
namespace leopard { namespace ff8 {
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Datatypes and Constants
|
||||
|
||||
// LFSR Polynomial that generates the field elements
|
||||
static const unsigned kPolynomial = 0x11D;
|
||||
|
||||
// Basis used for generating logarithm tables
|
||||
static const ffe_t kBasis[kBits] = {
|
||||
1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis
|
||||
// 1, 2, 4, 8, 16, 32, 64, 128 // Monomial basis
|
||||
};
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Field Operations
|
||||
|
||||
// Modulus for field operations
|
||||
static const ffe_t kModulus = 255;
|
||||
|
||||
// z = x + y (mod kModulus)
|
||||
static inline ffe_t AddMod(const ffe_t a, const ffe_t b)
|
||||
{
|
||||
const unsigned sum = (unsigned)a + b;
|
||||
|
||||
// Partial reduction step, allowing for kModulus to be returned
|
||||
return static_cast<ffe_t>(sum + (sum >> kBits));
|
||||
}
|
||||
|
||||
// z = x - y (mod kModulus)
|
||||
static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
|
||||
{
|
||||
const unsigned dif = (unsigned)a - b;
|
||||
|
||||
// Partial reduction step, allowing for kModulus to be returned
|
||||
return static_cast<ffe_t>(dif + (dif >> kBits));
|
||||
}
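// Worked example of the partial reduction above: with kBits = 8,
// AddMod(250, 10) computes sum = 260, sum >> 8 = 1, and the ffe_t cast keeps
// the low byte of 261, which is 5 = 260 mod 255. SubMod(3, 10) wraps dif to
// 0xFFFFFFF9, and the low byte of dif + (dif >> 8) is 0xF8 = 248 = -7 mod 255.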
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Logarithm Tables
|
||||
|
||||
static ffe_t LogLUT[kOrder];
|
||||
static ffe_t ExpLUT[kOrder];
|
||||
|
||||
|
||||
// Initialize LogLUT[], ExpLUT[]
|
||||
static void InitializeLogarithmTables()
|
||||
{
|
||||
// LFSR table generation:
|
||||
|
||||
unsigned state = 1;
|
||||
for (unsigned i = 0; i < kModulus; ++i)
|
||||
{
|
||||
ExpLUT[state] = static_cast<ffe_t>(i);
|
||||
state <<= 1;
|
||||
if (state >= kOrder)
|
||||
state ^= kPolynomial;
|
||||
}
|
||||
ExpLUT[0] = kModulus;
|
||||
|
||||
// Conversion to chosen basis:
|
||||
|
||||
LogLUT[0] = 0;
|
||||
for (unsigned i = 0; i < kBits; ++i)
|
||||
{
|
||||
const ffe_t basis = kBasis[i];
|
||||
const unsigned width = static_cast<unsigned>(1UL << i);
|
||||
|
||||
for (unsigned j = 0; j < width; ++j)
|
||||
LogLUT[j + width] = LogLUT[j] ^ basis;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < kOrder; ++i)
|
||||
LogLUT[i] = ExpLUT[LogLUT[i]];
|
||||
|
||||
for (unsigned i = 0; i < kOrder; ++i)
|
||||
ExpLUT[LogLUT[i]] = i;
|
||||
|
||||
ExpLUT[kModulus] = ExpLUT[0];
|
||||
}
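// After initialization, LogLUT[x] holds the discrete logarithm of a nonzero
// element x (in the chosen basis representation) and ExpLUT[] maps logarithms
// back to elements, so a product of nonzero elements can be computed as
// ExpLUT[AddMod(LogLUT[a], LogLUT[b])] -- see FFEMultiply() below.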
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
|
||||
|
||||
#if defined(LEO_FF8_FWHT_OPTIMIZED)
|
||||
|
||||
// {a, b} = {a + b, a - b} (Mod Q)
|
||||
static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
|
||||
{
|
||||
const ffe_t sum = AddMod(a, b);
|
||||
const ffe_t dif = SubMod(a, b);
|
||||
a = sum;
|
||||
b = dif;
|
||||
}
|
||||
|
||||
static LEO_FORCE_INLINE void FWHT_4(ffe_t* data)
|
||||
{
|
||||
ffe_t t0 = data[0];
|
||||
ffe_t t1 = data[1];
|
||||
ffe_t t2 = data[2];
|
||||
ffe_t t3 = data[3];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
data[0] = t0;
|
||||
data[1] = t1;
|
||||
data[2] = t2;
|
||||
data[3] = t3;
|
||||
}
|
||||
|
||||
static LEO_FORCE_INLINE void FWHT_4(ffe_t* data, unsigned s)
|
||||
{
|
||||
unsigned x = 0;
|
||||
ffe_t t0 = data[x]; x += s;
|
||||
ffe_t t1 = data[x]; x += s;
|
||||
ffe_t t2 = data[x]; x += s;
|
||||
ffe_t t3 = data[x];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
unsigned y = 0;
|
||||
data[y] = t0; y += s;
|
||||
data[y] = t1; y += s;
|
||||
data[y] = t2; y += s;
|
||||
data[y] = t3;
|
||||
}
|
||||
|
||||
static inline void FWHT_8(ffe_t* data)
|
||||
{
|
||||
ffe_t t0 = data[0];
|
||||
ffe_t t1 = data[1];
|
||||
ffe_t t2 = data[2];
|
||||
ffe_t t3 = data[3];
|
||||
ffe_t t4 = data[4];
|
||||
ffe_t t5 = data[5];
|
||||
ffe_t t6 = data[6];
|
||||
ffe_t t7 = data[7];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t4, t5);
|
||||
FWHT_2(t6, t7);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
FWHT_2(t4, t6);
|
||||
FWHT_2(t5, t7);
|
||||
FWHT_2(t0, t4);
|
||||
FWHT_2(t1, t5);
|
||||
FWHT_2(t2, t6);
|
||||
FWHT_2(t3, t7);
|
||||
data[0] = t0;
|
||||
data[1] = t1;
|
||||
data[2] = t2;
|
||||
data[3] = t3;
|
||||
data[4] = t4;
|
||||
data[5] = t5;
|
||||
data[6] = t6;
|
||||
data[7] = t7;
|
||||
}
|
||||
|
||||
// Decimation in time (DIT) version
|
||||
static void FWHT(ffe_t* data, const unsigned ldn)
|
||||
{
|
||||
const unsigned n = (1UL << ldn);
|
||||
|
||||
if (n <= 2)
|
||||
{
|
||||
if (n == 2)
|
||||
FWHT_2(data[0], data[1]);
|
||||
return;
|
||||
}
|
||||
|
||||
for (unsigned ldm = ldn; ldm > 3; ldm -= 2)
|
||||
{
|
||||
unsigned m = (1UL << ldm);
|
||||
unsigned m4 = (m >> 2);
|
||||
for (unsigned r = 0; r < n; r += m)
|
||||
for (unsigned j = 0; j < m4; j++)
|
||||
FWHT_4(data + j + r, m4);
|
||||
}
|
||||
|
||||
if (ldn & 1)
|
||||
{
|
||||
for (unsigned i0 = 0; i0 < n; i0 += 8)
|
||||
FWHT_8(data + i0);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (unsigned i0 = 0; i0 < n; i0 += 4)
|
||||
FWHT_4(data + i0);
|
||||
}
|
||||
}
|
||||
|
||||
#else // LEO_FF8_FWHT_OPTIMIZED
|
||||
|
||||
// Reference implementation
|
||||
void FWHT(ffe_t* data, const unsigned bits)
|
||||
{
|
||||
const unsigned size = (unsigned)(1UL << bits);
|
||||
for (unsigned width = 1; width < size; width <<= 1)
|
||||
for (unsigned i = 0; i < size; i += (width << 1))
|
||||
for (unsigned j = i; j < (width + i); ++j)
|
||||
FWHT_2(data[j], data[j + width]);
|
||||
}
|
||||
|
||||
#endif // LEO_FF8_FWHT_OPTIMIZED
|
||||
|
||||
// Transform specialized for the finite field order
|
||||
void FWHT(ffe_t data[kOrder])
|
||||
{
|
||||
FWHT(data, kBits);
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// XOR Memory
|
||||
|
||||
void xor_mem(
|
||||
void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy,
|
||||
unsigned bytes)
|
||||
{
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(vx);
|
||||
const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(vy);
|
||||
do
|
||||
{
|
||||
const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32));
|
||||
const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1));
|
||||
const LEO_M256 x2 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 2), _mm256_loadu_si256(y32 + 2));
|
||||
const LEO_M256 x3 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 3), _mm256_loadu_si256(y32 + 3));
|
||||
_mm256_storeu_si256(x32, x0);
|
||||
_mm256_storeu_si256(x32 + 1, x1);
|
||||
_mm256_storeu_si256(x32 + 2, x2);
|
||||
_mm256_storeu_si256(x32 + 3, x3);
|
||||
bytes -= 128, x32 += 4, y32 += 4;
|
||||
} while (bytes >= 128);
|
||||
if (bytes > 0)
|
||||
{
|
||||
const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32));
|
||||
const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1));
|
||||
_mm256_storeu_si256(x32, x0);
|
||||
_mm256_storeu_si256(x32 + 1, x1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
|
||||
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
|
||||
do
|
||||
{
|
||||
const LEO_M128 x0 = _mm_xor_si128(_mm_loadu_si128(x16), _mm_loadu_si128(y16));
|
||||
const LEO_M128 x1 = _mm_xor_si128(_mm_loadu_si128(x16 + 1), _mm_loadu_si128(y16 + 1));
|
||||
const LEO_M128 x2 = _mm_xor_si128(_mm_loadu_si128(x16 + 2), _mm_loadu_si128(y16 + 2));
|
||||
const LEO_M128 x3 = _mm_xor_si128(_mm_loadu_si128(x16 + 3), _mm_loadu_si128(y16 + 3));
|
||||
_mm_storeu_si128(x16, x0);
|
||||
_mm_storeu_si128(x16 + 1, x1);
|
||||
_mm_storeu_si128(x16 + 2, x2);
|
||||
_mm_storeu_si128(x16 + 3, x3);
|
||||
bytes -= 64, x16 += 4, y16 += 4;
|
||||
} while (bytes > 0);
|
||||
}
|
||||
|
||||
void xor_mem2(
|
||||
void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
|
||||
void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
|
||||
unsigned bytes)
|
||||
{
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *> (vx_0);
|
||||
const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<const LEO_M256 *>(vy_0);
|
||||
LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *> (vx_1);
|
||||
const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1);
|
||||
do
|
||||
{
|
||||
const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
|
||||
const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
|
||||
const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2));
|
||||
const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3));
|
||||
const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
|
||||
const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
|
||||
const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2));
|
||||
const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3));
|
||||
_mm256_storeu_si256(x32_0, x0_0);
|
||||
_mm256_storeu_si256(x32_0 + 1, x1_0);
|
||||
_mm256_storeu_si256(x32_0 + 2, x2_0);
|
||||
_mm256_storeu_si256(x32_0 + 3, x3_0);
|
||||
_mm256_storeu_si256(x32_1, x0_1);
|
||||
_mm256_storeu_si256(x32_1 + 1, x1_1);
|
||||
_mm256_storeu_si256(x32_1 + 2, x2_1);
|
||||
_mm256_storeu_si256(x32_1 + 3, x3_1);
|
||||
x32_0 += 4, y32_0 += 4;
|
||||
x32_1 += 4, y32_1 += 4;
|
||||
bytes -= 128;
|
||||
} while (bytes >= 128);
|
||||
if (bytes > 0)
|
||||
{
|
||||
const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
|
||||
const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
|
||||
const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
|
||||
const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
|
||||
_mm256_storeu_si256(x32_0, x0_0);
|
||||
_mm256_storeu_si256(x32_0 + 1, x1_0);
|
||||
_mm256_storeu_si256(x32_1, x0_1);
|
||||
_mm256_storeu_si256(x32_1 + 1, x1_1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *> (vx_0);
|
||||
const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<const LEO_M128 *>(vy_0);
|
||||
LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *> (vx_1);
|
||||
const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1);
|
||||
do
|
||||
{
|
||||
const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0));
|
||||
const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1));
|
||||
const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2));
|
||||
const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3));
|
||||
const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1));
|
||||
const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1));
|
||||
const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2));
|
||||
const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3));
|
||||
_mm_storeu_si128(x16_0, x0_0);
|
||||
_mm_storeu_si128(x16_0 + 1, x1_0);
|
||||
_mm_storeu_si128(x16_0 + 2, x2_0);
|
||||
_mm_storeu_si128(x16_0 + 3, x3_0);
|
||||
_mm_storeu_si128(x16_1, x0_1);
|
||||
_mm_storeu_si128(x16_1 + 1, x1_1);
|
||||
_mm_storeu_si128(x16_1 + 2, x2_1);
|
||||
_mm_storeu_si128(x16_1 + 3, x3_1);
|
||||
x16_0 += 4, y16_0 += 4;
|
||||
x16_1 += 4, y16_1 += 4;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
}
|
||||
|
||||
void xor_mem3(
|
||||
void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
|
||||
void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
|
||||
void * LEO_RESTRICT vx_2, const void * LEO_RESTRICT vy_2,
|
||||
unsigned bytes)
|
||||
{
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *> (vx_0);
|
||||
const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<const LEO_M256 *>(vy_0);
|
||||
LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *> (vx_1);
|
||||
const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1);
|
||||
LEO_M256 * LEO_RESTRICT x32_2 = reinterpret_cast<LEO_M256 *> (vx_2);
|
||||
const LEO_M256 * LEO_RESTRICT y32_2 = reinterpret_cast<const LEO_M256 *>(vy_2);
|
||||
do
|
||||
{
|
||||
const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
|
||||
const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
|
||||
const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2));
|
||||
const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3));
|
||||
const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
|
||||
const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
|
||||
const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2));
|
||||
const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3));
|
||||
const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2));
|
||||
const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1));
|
||||
const LEO_M256 x2_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 2), _mm256_loadu_si256(y32_2 + 2));
|
||||
const LEO_M256 x3_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 3), _mm256_loadu_si256(y32_2 + 3));
|
||||
_mm256_storeu_si256(x32_0, x0_0);
|
||||
_mm256_storeu_si256(x32_0 + 1, x1_0);
|
||||
_mm256_storeu_si256(x32_0 + 2, x2_0);
|
||||
_mm256_storeu_si256(x32_0 + 3, x3_0);
|
||||
_mm256_storeu_si256(x32_1, x0_1);
|
||||
_mm256_storeu_si256(x32_1 + 1, x1_1);
|
||||
_mm256_storeu_si256(x32_1 + 2, x2_1);
|
||||
_mm256_storeu_si256(x32_1 + 3, x3_1);
|
||||
_mm256_storeu_si256(x32_2, x0_2);
|
||||
_mm256_storeu_si256(x32_2 + 1, x1_2);
|
||||
_mm256_storeu_si256(x32_2 + 2, x2_2);
|
||||
_mm256_storeu_si256(x32_2 + 3, x3_2);
|
||||
x32_0 += 4, y32_0 += 4;
|
||||
x32_1 += 4, y32_1 += 4;
|
||||
x32_2 += 4, y32_2 += 4;
|
||||
bytes -= 128;
|
||||
} while (bytes >= 128);
|
||||
if (bytes > 0)
|
||||
{
|
||||
const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
|
||||
const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
|
||||
const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
|
||||
const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
|
||||
const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2));
|
||||
const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1));
|
||||
_mm256_storeu_si256(x32_0, x0_0);
|
||||
_mm256_storeu_si256(x32_0 + 1, x1_0);
|
||||
_mm256_storeu_si256(x32_1, x0_1);
|
||||
_mm256_storeu_si256(x32_1 + 1, x1_1);
|
||||
_mm256_storeu_si256(x32_2, x0_2);
|
||||
_mm256_storeu_si256(x32_2 + 1, x1_2);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *> (vx_0);
|
||||
const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<const LEO_M128 *>(vy_0);
|
||||
LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *> (vx_1);
|
||||
const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1);
|
||||
LEO_M128 * LEO_RESTRICT x16_2 = reinterpret_cast<LEO_M128 *> (vx_2);
|
||||
const LEO_M128 * LEO_RESTRICT y16_2 = reinterpret_cast<const LEO_M128 *>(vy_2);
|
||||
do
|
||||
{
|
||||
const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0));
|
||||
const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1));
|
||||
const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2));
|
||||
const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3));
|
||||
const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1));
|
||||
const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1));
|
||||
const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2));
|
||||
const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3));
|
||||
const LEO_M128 x0_2 = _mm_xor_si128(_mm_loadu_si128(x16_2), _mm_loadu_si128(y16_2));
|
||||
const LEO_M128 x1_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 1), _mm_loadu_si128(y16_2 + 1));
|
||||
const LEO_M128 x2_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 2), _mm_loadu_si128(y16_2 + 2));
|
||||
const LEO_M128 x3_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 3), _mm_loadu_si128(y16_2 + 3));
|
||||
_mm_storeu_si128(x16_0, x0_0);
|
||||
_mm_storeu_si128(x16_0 + 1, x1_0);
|
||||
_mm_storeu_si128(x16_0 + 2, x2_0);
|
||||
_mm_storeu_si128(x16_0 + 3, x3_0);
|
||||
_mm_storeu_si128(x16_1, x0_1);
|
||||
_mm_storeu_si128(x16_1 + 1, x1_1);
|
||||
_mm_storeu_si128(x16_1 + 2, x2_1);
|
||||
_mm_storeu_si128(x16_1 + 3, x3_1);
|
||||
_mm_storeu_si128(x16_2, x0_2);
|
||||
_mm_storeu_si128(x16_2 + 1, x1_2);
|
||||
_mm_storeu_si128(x16_2 + 2, x2_2);
|
||||
_mm_storeu_si128(x16_2 + 3, x3_2);
|
||||
x16_0 += 4, y16_0 += 4;
|
||||
x16_1 += 4, y16_1 += 4;
|
||||
x16_2 += 4, y16_2 += 4;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Multiplies
|
||||
|
||||
// We require memory to be aligned since the SIMD instructions benefit from
|
||||
// or require aligned accesses to the table data.
|
||||
struct {
|
||||
LEO_ALIGNED LEO_M128 Lo[256];
|
||||
LEO_ALIGNED LEO_M128 Hi[256];
|
||||
} Multiply128LUT;
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
struct {
|
||||
LEO_ALIGNED LEO_M256 Lo[256];
|
||||
LEO_ALIGNED LEO_M256 Hi[256];
|
||||
} Multiply256LUT;
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
// Returns a * b
|
||||
static ffe_t FFEMultiply(ffe_t a, ffe_t b)
|
||||
{
|
||||
if (a == 0 || b == 0)
|
||||
return 0;
|
||||
return ExpLUT[AddMod(LogLUT[a], LogLUT[b])];
|
||||
}
|
||||
|
||||
bool InitializeMultiplyTables()
|
||||
{
|
||||
// Reuse aligned self test buffers to load table data
|
||||
uint8_t* lo = m_SelfTestBuffers.A;
|
||||
uint8_t* hi = m_SelfTestBuffers.B;
|
||||
|
||||
for (int y = 0; y < 256; ++y)
|
||||
{
|
||||
for (unsigned char x = 0; x < 16; ++x)
|
||||
{
|
||||
lo[x] = FFEMultiply(x, static_cast<uint8_t>(y));
|
||||
hi[x] = FFEMultiply(x << 4, static_cast<uint8_t>(y));
|
||||
}
|
||||
|
||||
const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo);
|
||||
const LEO_M128 table_hi = _mm_loadu_si128((LEO_M128*)hi);
|
||||
_mm_storeu_si128(Multiply128LUT.Lo + y, table_lo);
|
||||
_mm_storeu_si128(Multiply128LUT.Hi + y, table_hi);
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
const LEO_M256 table_lo2 = _mm256_broadcastsi128_si256(table_lo);
|
||||
const LEO_M256 table_hi2 = _mm256_broadcastsi128_si256(table_hi);
|
||||
_mm256_storeu_si256(Multiply256LUT.Lo + y, table_lo2);
|
||||
_mm256_storeu_si256(Multiply256LUT.Hi + y, table_hi2);
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
}
|
||||
|
||||
return true;
|
||||
}
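// The kernels below use these tables to multiply 16 (SSSE3) or 32 (AVX2) bytes
// at a time: each byte x is split into nibbles, and since multiplication by a
// fixed m distributes over XOR, x * m = Lo[m][x & 15] ^ Hi[m][x >> 4], where
// the 16-entry tables hold Lo[m][n] = n * m and Hi[m][n] = (n << 4) * m.
// The PSHUFB/VPSHUFB shuffle performs all of the nibble lookups in a single
// instruction.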
|
||||
|
||||
// vx[] = vy[] * m
|
||||
void mul_mem_set(
|
||||
void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
if (m <= 1)
|
||||
{
|
||||
if (m == 1)
|
||||
memcpy(vx, vy, bytes);
|
||||
else
|
||||
memset(vx, 0, bytes);
|
||||
return;
|
||||
}
|
||||
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m);
|
||||
const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m);
|
||||
|
||||
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
|
||||
|
||||
LEO_M256 * LEO_RESTRICT z32 = reinterpret_cast<LEO_M256 *>(vx);
|
||||
const LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<const LEO_M256 *>(vy);
|
||||
|
||||
const unsigned count = bytes / 64;
|
||||
for (unsigned i = 0; i < count; ++i)
|
||||
{
|
||||
LEO_M256 x0 = _mm256_loadu_si256(x32 + i * 2);
|
||||
LEO_M256 l0 = _mm256_and_si256(x0, clr_mask);
|
||||
x0 = _mm256_srli_epi64(x0, 4);
|
||||
LEO_M256 h0 = _mm256_and_si256(x0, clr_mask);
|
||||
l0 = _mm256_shuffle_epi8(table_lo_y, l0);
|
||||
h0 = _mm256_shuffle_epi8(table_hi_y, h0);
|
||||
_mm256_storeu_si256(z32 + i * 2, _mm256_xor_si256(l0, h0));
|
||||
|
||||
LEO_M256 x1 = _mm256_loadu_si256(x32 + i * 2 + 1);
|
||||
LEO_M256 l1 = _mm256_and_si256(x1, clr_mask);
|
||||
x1 = _mm256_srli_epi64(x1, 4);
|
||||
LEO_M256 h1 = _mm256_and_si256(x1, clr_mask);
|
||||
l1 = _mm256_shuffle_epi8(table_lo_y, l1);
|
||||
h1 = _mm256_shuffle_epi8(table_hi_y, h1);
|
||||
_mm256_storeu_si256(z32 + i * 2 + 1, _mm256_xor_si256(l1, h1));
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m);
|
||||
const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m);
|
||||
|
||||
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
|
||||
|
||||
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *> (vx);
|
||||
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
|
||||
|
||||
do
|
||||
{
|
||||
LEO_M128 x3 = _mm_loadu_si128(y16 + 3);
|
||||
LEO_M128 l3 = _mm_and_si128(x3, clr_mask);
|
||||
x3 = _mm_srli_epi64(x3, 4);
|
||||
LEO_M128 h3 = _mm_and_si128(x3, clr_mask);
|
||||
l3 = _mm_shuffle_epi8(table_lo_y, l3);
|
||||
h3 = _mm_shuffle_epi8(table_hi_y, h3);
|
||||
|
||||
LEO_M128 x2 = _mm_loadu_si128(y16 + 2);
|
||||
LEO_M128 l2 = _mm_and_si128(x2, clr_mask);
|
||||
x2 = _mm_srli_epi64(x2, 4);
|
||||
LEO_M128 h2 = _mm_and_si128(x2, clr_mask);
|
||||
l2 = _mm_shuffle_epi8(table_lo_y, l2);
|
||||
h2 = _mm_shuffle_epi8(table_hi_y, h2);
|
||||
|
||||
LEO_M128 x1 = _mm_loadu_si128(y16 + 1);
|
||||
LEO_M128 l1 = _mm_and_si128(x1, clr_mask);
|
||||
x1 = _mm_srli_epi64(x1, 4);
|
||||
LEO_M128 h1 = _mm_and_si128(x1, clr_mask);
|
||||
l1 = _mm_shuffle_epi8(table_lo_y, l1);
|
||||
h1 = _mm_shuffle_epi8(table_hi_y, h1);
|
||||
|
||||
LEO_M128 x0 = _mm_loadu_si128(y16);
|
||||
LEO_M128 l0 = _mm_and_si128(x0, clr_mask);
|
||||
x0 = _mm_srli_epi64(x0, 4);
|
||||
LEO_M128 h0 = _mm_and_si128(x0, clr_mask);
|
||||
l0 = _mm_shuffle_epi8(table_lo_y, l0);
|
||||
h0 = _mm_shuffle_epi8(table_hi_y, h0);
|
||||
|
||||
_mm_storeu_si128(x16 + 3, _mm_xor_si128(l3, h3));
|
||||
_mm_storeu_si128(x16 + 2, _mm_xor_si128(l2, h2));
|
||||
_mm_storeu_si128(x16 + 1, _mm_xor_si128(l1, h1));
|
||||
_mm_storeu_si128(x16, _mm_xor_si128(l0, h0));
|
||||
|
||||
x16 += 4, y16 += 4;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
}
|
||||
|
||||
// vx0[] *= m, vx1[] *= m
|
||||
void mul_mem2_inplace(
|
||||
void * LEO_RESTRICT vx_0,
|
||||
void * LEO_RESTRICT vx_1,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
if (m <= 1)
|
||||
{
|
||||
if (m == 0)
|
||||
{
|
||||
memset(vx_0, 0, bytes);
|
||||
memset(vx_1, 0, bytes);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m);
|
||||
const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m);
|
||||
|
||||
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
|
||||
|
||||
LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(vx_0);
|
||||
LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(vx_1);
|
||||
|
||||
do
|
||||
{
|
||||
LEO_M256 x0_0 = _mm256_loadu_si256(x32_0 + 1);
|
||||
LEO_M256 l0_0 = _mm256_and_si256(x0_0, clr_mask);
|
||||
x0_0 = _mm256_srli_epi64(x0_0, 4);
|
||||
LEO_M256 h0_0 = _mm256_and_si256(x0_0, clr_mask);
|
||||
l0_0 = _mm256_shuffle_epi8(table_lo_y, l0_0);
|
||||
h0_0 = _mm256_shuffle_epi8(table_hi_y, h0_0);
|
||||
l0_0 = _mm256_xor_si256(l0_0, h0_0);
|
||||
|
||||
LEO_M256 x1_0 = _mm256_loadu_si256(x32_0);
|
||||
LEO_M256 l1_0 = _mm256_and_si256(x1_0, clr_mask);
|
||||
x1_0 = _mm256_srli_epi64(x1_0, 4);
|
||||
LEO_M256 h1_0 = _mm256_and_si256(x1_0, clr_mask);
|
||||
l1_0 = _mm256_shuffle_epi8(table_lo_y, l1_0);
|
||||
h1_0 = _mm256_shuffle_epi8(table_hi_y, h1_0);
|
||||
l1_0 = _mm256_xor_si256(l1_0, h1_0);
|
||||
|
||||
LEO_M256 x0_1 = _mm256_loadu_si256(x32_1 + 1);
|
||||
LEO_M256 l0_1 = _mm256_and_si256(x0_1, clr_mask);
|
||||
x0_1 = _mm256_srli_epi64(x0_1, 4);
|
||||
LEO_M256 h0_1 = _mm256_and_si256(x0_1, clr_mask);
|
||||
l0_1 = _mm256_shuffle_epi8(table_lo_y, l0_1);
|
||||
h0_1 = _mm256_shuffle_epi8(table_hi_y, h0_1);
|
||||
l0_1 = _mm256_xor_si256(l0_1, h0_1);
|
||||
|
||||
LEO_M256 x1_1 = _mm256_loadu_si256(x32_1);
|
||||
LEO_M256 l1_1 = _mm256_and_si256(x1_1, clr_mask);
|
||||
x1_1 = _mm256_srli_epi64(x1_1, 4);
|
||||
LEO_M256 h1_1 = _mm256_and_si256(x1_1, clr_mask);
|
||||
l1_1 = _mm256_shuffle_epi8(table_lo_y, l1_1);
|
||||
h1_1 = _mm256_shuffle_epi8(table_hi_y, h1_1);
|
||||
l1_1 = _mm256_xor_si256(l1_1, h1_1);
|
||||
|
||||
_mm256_storeu_si256(x32_0 + 1, l0_0);
|
||||
_mm256_storeu_si256(x32_0, l1_0);
|
||||
_mm256_storeu_si256(x32_1 + 1, l0_1);
|
||||
_mm256_storeu_si256(x32_1, l1_1);
|
||||
|
||||
x32_0 += 2;
|
||||
x32_1 += 2;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
return;
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m);
|
||||
const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m);
|
||||
|
||||
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
|
||||
|
||||
LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(vx_0);
|
||||
LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(vx_1);
|
||||
|
||||
do
|
||||
{
|
||||
LEO_M128 x3 = _mm_loadu_si128(x16_0 + 3);
|
||||
LEO_M128 l3 = _mm_and_si128(x3, clr_mask);
|
||||
x3 = _mm_srli_epi64(x3, 4);
|
||||
LEO_M128 h3 = _mm_and_si128(x3, clr_mask);
|
||||
l3 = _mm_shuffle_epi8(table_lo_y, l3);
|
||||
h3 = _mm_shuffle_epi8(table_hi_y, h3);
|
||||
|
||||
LEO_M128 x2 = _mm_loadu_si128(x16_0 + 2);
|
||||
LEO_M128 l2 = _mm_and_si128(x2, clr_mask);
|
||||
x2 = _mm_srli_epi64(x2, 4);
|
||||
LEO_M128 h2 = _mm_and_si128(x2, clr_mask);
|
||||
l2 = _mm_shuffle_epi8(table_lo_y, l2);
|
||||
h2 = _mm_shuffle_epi8(table_hi_y, h2);
|
||||
|
||||
LEO_M128 x1 = _mm_loadu_si128(x16_0 + 1);
|
||||
LEO_M128 l1 = _mm_and_si128(x1, clr_mask);
|
||||
x1 = _mm_srli_epi64(x1, 4);
|
||||
LEO_M128 h1 = _mm_and_si128(x1, clr_mask);
|
||||
l1 = _mm_shuffle_epi8(table_lo_y, l1);
|
||||
h1 = _mm_shuffle_epi8(table_hi_y, h1);
|
||||
|
||||
LEO_M128 x0 = _mm_loadu_si128(x16_0);
|
||||
LEO_M128 l0 = _mm_and_si128(x0, clr_mask);
|
||||
x0 = _mm_srli_epi64(x0, 4);
|
||||
LEO_M128 h0 = _mm_and_si128(x0, clr_mask);
|
||||
l0 = _mm_shuffle_epi8(table_lo_y, l0);
|
||||
h0 = _mm_shuffle_epi8(table_hi_y, h0);
|
||||
|
||||
_mm_storeu_si128(x16_0 + 3, _mm_xor_si128(l3, h3));
|
||||
_mm_storeu_si128(x16_0 + 2, _mm_xor_si128(l2, h2));
|
||||
_mm_storeu_si128(x16_0 + 1, _mm_xor_si128(l1, h1));
|
||||
_mm_storeu_si128(x16_0, _mm_xor_si128(l0, h0));
|
||||
|
||||
// Second buffer, processed with the same table lookups as the block above:
x3 = _mm_loadu_si128(x16_1 + 3);
l3 = _mm_and_si128(x3, clr_mask);
x3 = _mm_srli_epi64(x3, 4);
h3 = _mm_and_si128(x3, clr_mask);
l3 = _mm_shuffle_epi8(table_lo_y, l3);
h3 = _mm_shuffle_epi8(table_hi_y, h3);
x2 = _mm_loadu_si128(x16_1 + 2);
l2 = _mm_and_si128(x2, clr_mask);
x2 = _mm_srli_epi64(x2, 4);
h2 = _mm_and_si128(x2, clr_mask);
l2 = _mm_shuffle_epi8(table_lo_y, l2);
h2 = _mm_shuffle_epi8(table_hi_y, h2);
x1 = _mm_loadu_si128(x16_1 + 1);
l1 = _mm_and_si128(x1, clr_mask);
x1 = _mm_srli_epi64(x1, 4);
h1 = _mm_and_si128(x1, clr_mask);
l1 = _mm_shuffle_epi8(table_lo_y, l1);
h1 = _mm_shuffle_epi8(table_hi_y, h1);
x0 = _mm_loadu_si128(x16_1);
l0 = _mm_and_si128(x0, clr_mask);
x0 = _mm_srli_epi64(x0, 4);
h0 = _mm_and_si128(x0, clr_mask);
l0 = _mm_shuffle_epi8(table_lo_y, l0);
h0 = _mm_shuffle_epi8(table_hi_y, h0);
_mm_storeu_si128(x16_1 + 3, _mm_xor_si128(l3, h3));
_mm_storeu_si128(x16_1 + 2, _mm_xor_si128(l2, h2));
_mm_storeu_si128(x16_1 + 1, _mm_xor_si128(l1, h1));
_mm_storeu_si128(x16_1, _mm_xor_si128(l0, h0));
|
||||
|
||||
x16_0 += 4;
|
||||
x16_1 += 4;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// FFT Operations
|
||||
|
||||
// x[] ^= y[] * m, y[] ^= x[]
|
||||
void mul_fft(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
|
||||
void mul_fft2(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
|
||||
void mul_fft3(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// IFFT Operations
|
||||
|
||||
// y[] ^= x[], x[] ^= y[] * m
|
||||
void mul_ifft(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
|
||||
void mul_ifft2(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
|
||||
void mul_ifft3(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// API
|
||||
|
||||
static bool IsInitialized = false;
|
||||
|
||||
bool Initialize()
|
||||
{
|
||||
if (IsInitialized)
|
||||
return true;
|
||||
|
||||
if (!CpuHasSSSE3)
|
||||
return false;
|
||||
|
||||
InitializeLogarithmTables();
|
||||
|
||||
IsInitialized = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
}} // namespace leopard::ff8
|
|
@@ -0,0 +1,157 @@
|
|||
/*
|
||||
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of Leopard-RS nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "LeopardCommon.h"
|
||||
|
||||
/*
|
||||
8-bit Finite Field Math
|
||||
|
||||
This finite field contains 256 elements and so each element is one byte.
|
||||
This library is designed for data that is a multiple of 64 bytes in size.
|
||||
*/
|
||||
|
||||
namespace leopard { namespace ff8 {
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Datatypes and Constants
|
||||
|
||||
// Finite field element type
|
||||
typedef uint8_t ffe_t;
|
||||
|
||||
// Number of bits per element
|
||||
static const unsigned kBits = 8;
|
||||
|
||||
// Finite field order: Number of elements in the field
|
||||
static const unsigned kOrder = 256;
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
|
||||
|
||||
// Define this to enable the optimized version of FWHT()
|
||||
#define LEO_FF8_FWHT_OPTIMIZED
|
||||
|
||||
// Transform for a variable number of bits (up to kOrder)
|
||||
void FWHT(ffe_t* data, const unsigned bits);
|
||||
|
||||
// Transform specialized for the finite field order
|
||||
void FWHT(ffe_t data[kOrder]);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// XOR Memory
|
||||
|
||||
// x[] ^= y[]
|
||||
void xor_mem(
|
||||
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
|
||||
unsigned bytes);
|
||||
|
||||
// For i = {0, 1}: x_i[] ^= y_i[]
|
||||
void xor_mem2(
|
||||
void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
|
||||
unsigned bytes);
|
||||
|
||||
// For i = {0, 1, 2}: x_i[] ^= y_i[]
|
||||
void xor_mem3(
|
||||
void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
|
||||
void * LEO_RESTRICT x_2, const void * LEO_RESTRICT y_2,
|
||||
unsigned bytes);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Multiplies
|
||||
|
||||
// x[] = y[] * m
|
||||
void mul_mem_set(
|
||||
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
// For i = {0, 1}: x_i[] *= m
|
||||
void mul_mem2_inplace(
|
||||
void * LEO_RESTRICT x_0,
|
||||
void * LEO_RESTRICT x_1,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// FFT Operations
|
||||
|
||||
// x[] ^= y[] * m, y[] ^= x[]
|
||||
void mul_fft(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
|
||||
void mul_fft2(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
|
||||
void mul_fft3(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// IFFT Operations
|
||||
|
||||
// y[] ^= x[], x[] ^= y[] * m
|
||||
void mul_ifft(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
|
||||
void mul_ifft2(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
|
||||
void mul_ifft3(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// API
|
||||
|
||||
// Returns false if the self-test fails
|
||||
bool Initialize();
|
||||
|
||||
|
||||
}} // namespace leopard::ff8
|
|
@@ -0,0 +1,29 @@
|
|||
BSD 3-Clause License
|
||||
|
||||
Copyright (c) 2017, Christopher A. Taylor
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
88
README.md
88
README.md
|
@@ -1,9 +1,91 @@
|
|||
# Lin-Han-Chung RS Codes
|
||||
This is an attempt at implementing a fast version of the algorithm described here:
|
||||
# Leopard-RS
|
||||
## Leopard Reed-Solomon Error Correction Codes in C
|
||||
|
||||
Leopard-RS is a portable, fast library for Forward Error Correction.
|
||||
From a block of equally sized original data pieces, it generates recovery
|
||||
symbols that can be used to recover lost original data.
|
||||
|
||||
* It requires that data pieces are all a fixed size, a multiple of 64 bytes.
|
||||
* The original and recovery data must not exceed 65536 pieces.
|
||||
|
||||
|
||||
#### Motivation:
|
||||
|
||||
It scales as O(N log N) with the input data size, and its inner loops are
|
||||
vectorized using the best approaches available on modern processors, using the
|
||||
fastest finite fields (8-bit or 16-bit Galois fields) for bulk data.
|
||||
|
||||
It sets new speed records for MDS encoding and decoding of large data.
|
||||
It is also the only open-source, production-ready software for this purpose
|
||||
available today.
|
||||
|
||||
Example applications are data recovery software and data center replication.
|
||||
|
||||
|
||||
#### Encoder API:
|
||||
|
||||
```
|
||||
#include "leopard.h"
|
||||
```
|
||||
|
||||
For full documentation please read `leopard.h`.
|
||||
|
||||
+ `leo_init()` : Initialize library.
|
||||
+ `leo_encode_work_count()` : Calculate the number of work_data buffers to provide to leo_encode().
|
||||
+ `leo_encode()`: Generate recovery data.
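
A minimal encoder sketch (illustrative only -- the block size and counts are
arbitrary, and real code should check every return value):

```
#include "leopard.h"
#include <cstdint>
#include <vector>

bool encode_example()
{
    if (0 != leo_init())
        return false; // Unsupported platform

    const unsigned buffer_bytes   = 6400; // Must be a multiple of 64
    const unsigned original_count = 100;
    const unsigned recovery_count = 10;

    const unsigned work_count = leo_encode_work_count(original_count, recovery_count);

    // Caller-allocated buffers, each buffer_bytes in size
    std::vector<std::vector<uint8_t>> original(original_count, std::vector<uint8_t>(buffer_bytes));
    std::vector<std::vector<uint8_t>> work(work_count, std::vector<uint8_t>(buffer_bytes));
    std::vector<void*> original_data(original_count), work_data(work_count);
    for (unsigned i = 0; i < original_count; ++i)
        original_data[i] = original[i].data();
    for (unsigned i = 0; i < work_count; ++i)
        work_data[i] = work[i].data();
    // ... fill original[] with the data to protect ...

    const LeopardResult result = leo_encode(
        buffer_bytes, original_count, recovery_count, work_count,
        original_data.data(), work_data.data(), LeopardFlags_Defaults);

    // On success the first recovery_count entries of work_data[] hold the recovery blocks
    return result == Leopard_Success;
}
```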
|
||||
|
||||
|
||||
#### Decoder API:
|
||||
|
||||
```
|
||||
#include "leopard.h"
|
||||
```
|
||||
|
||||
For full documentation please read `leopard.h`.
|
||||
|
||||
+ `leo_init()` : Initialize library.
|
||||
+ `leo_decode_work_count()` : Calculate the number of work_data buffers to provide to leo_decode().
|
||||
+ `leo_decode()` : Recover the original data.
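
A matching decoder sketch, continuing the illustrative setup from the encoder
example above (lost blocks are represented by NULL pointers):

```
// original_data[i] : block i of the original data, or NULL if it was lost
// recovery_data[i] : received recovery block i, or NULL if it was lost
// (both are std::vector<void*> of size original_count / recovery_count here)
const unsigned work_count = leo_decode_work_count(original_count, recovery_count);

std::vector<std::vector<uint8_t>> work(work_count, std::vector<uint8_t>(buffer_bytes));
std::vector<void*> work_data(work_count);
for (unsigned i = 0; i < work_count; ++i)
    work_data[i] = work[i].data();

const LeopardResult result = leo_decode(
    buffer_bytes, original_count, recovery_count, work_count,
    original_data.data(), recovery_data.data(), work_data.data(),
    LeopardFlags_Defaults);
// Leopard_Success means enough blocks survived and the lost data was reconstructed
```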
|
||||
|
||||
|
||||
#### Benchmarks:
|
||||
|
||||
```
|
||||
TODO
|
||||
```
|
||||
|
||||
|
||||
#### Comparisons:
|
||||
|
||||
```
|
||||
TODO
|
||||
```
|
||||
|
||||
|
||||
#### Background
|
||||
|
||||
This library implements an MDS erasure code introduced in this paper:
|
||||
|
||||
~~~
|
||||
S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
|
||||
"Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
|
||||
IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
|
||||
~~~
|
||||
Available here: [http://ct.ee.ntust.edu.tw/it2016-2.pdf](http://ct.ee.ntust.edu.tw/it2016-2.pdf)
|
||||
|
||||
The paper is available here: [http://ct.ee.ntust.edu.tw/it2016-2.pdf](http://ct.ee.ntust.edu.tw/it2016-2.pdf)
|
||||
And also mirrored in the /docs/ folder.
|
||||
|
||||
The high-level summary is that instead of using complicated fields,
|
||||
an additive FFT was introduced that works with familiar Galois fields for the first time.
|
||||
This is actually a huge new result that will change how Reed-Solomon codecs will be written.
|
||||
|
||||
My contribution is extending the ALTMAP approach from Jerasure
|
||||
for 16-bit Galois fields out to 64 bytes to enable AVX2 speedups,
|
||||
and marrying it with the row parallelism introduced by ISA-L.
|
||||
|
||||
|
||||
#### Credits
|
||||
|
||||
The idea is the brain-child of S.-J. Lin. He is a super bright guy who should be recognized more widely!
|
||||
|
||||
This software was written entirely by me (Christopher A. Taylor, mrcatid@gmail.com). If you find it useful and would like to buy me a coffee, consider tipping.
|
||||
|
|
|
@@ -0,0 +1,172 @@
|
|||
/*
|
||||
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of Leopard-RS nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "leopard.h"
|
||||
#include "FecalEncoder.h"
|
||||
#include "FecalDecoder.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Initialization API
|
||||
|
||||
static bool m_Initialized = false;
|
||||
|
||||
FECAL_EXPORT int fecal_init_(int version)
|
||||
{
|
||||
if (version != FECAL_VERSION)
|
||||
return Fecal_InvalidInput;
|
||||
|
||||
if (0 != gf256_init())
|
||||
return Fecal_Platform;
|
||||
|
||||
m_Initialized = true;
|
||||
return Fecal_Success;
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Encoder API
|
||||
|
||||
FECAL_EXPORT FecalEncoder fecal_encoder_create(unsigned input_count, void* const * const input_data, uint64_t total_bytes)
|
||||
{
|
||||
if (input_count <= 0 || !input_data || total_bytes < input_count)
|
||||
{
|
||||
FECAL_DEBUG_BREAK; // Invalid input
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
FECAL_DEBUG_ASSERT(m_Initialized); // Must call fecal_init() first
|
||||
if (!m_Initialized)
|
||||
return nullptr;
|
||||
|
||||
fecal::Encoder* encoder = new(std::nothrow) fecal::Encoder;
|
||||
if (!encoder)
|
||||
{
|
||||
FECAL_DEBUG_BREAK; // Out of memory
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (Fecal_Success != encoder->Initialize(input_count, input_data, total_bytes))
|
||||
{
|
||||
delete encoder;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return reinterpret_cast<FecalEncoder>( encoder );
|
||||
}
|
||||
|
||||
FECAL_EXPORT int fecal_encode(FecalEncoder encoder_v, FecalSymbol* symbol)
|
||||
{
|
||||
fecal::Encoder* encoder = reinterpret_cast<fecal::Encoder*>( encoder_v );
|
||||
if (!encoder || !symbol)
|
||||
return Fecal_InvalidInput;
|
||||
|
||||
return encoder->Encode(*symbol);
|
||||
}
|
||||
|
||||
FECAL_EXPORT void fecal_free(void* codec_v)
|
||||
{
|
||||
if (codec_v)
|
||||
{
|
||||
fecal::ICodec* icodec = reinterpret_cast<fecal::ICodec*>( codec_v );
|
||||
delete icodec;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Decoder API
|
||||
|
||||
FECAL_EXPORT FecalDecoder fecal_decoder_create(unsigned input_count, uint64_t total_bytes)
|
||||
{
|
||||
if (input_count <= 0 || total_bytes < input_count)
|
||||
{
|
||||
FECAL_DEBUG_BREAK; // Invalid input
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
FECAL_DEBUG_ASSERT(m_Initialized); // Must call fecal_init() first
|
||||
if (!m_Initialized)
|
||||
return nullptr;
|
||||
|
||||
fecal::Decoder* decoder = new(std::nothrow) fecal::Decoder;
|
||||
if (!decoder)
|
||||
{
|
||||
FECAL_DEBUG_BREAK; // Out of memory
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (Fecal_Success != decoder->Initialize(input_count, total_bytes))
|
||||
{
|
||||
delete decoder;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return reinterpret_cast<FecalDecoder>( decoder );
|
||||
}
|
||||
|
||||
FECAL_EXPORT int fecal_decoder_add_original(FecalDecoder decoder_v, const FecalSymbol* symbol)
|
||||
{
|
||||
fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
|
||||
if (!decoder || !symbol)
|
||||
return Fecal_InvalidInput;
|
||||
|
||||
return decoder->AddOriginal(*symbol);
|
||||
}
|
||||
|
||||
FECAL_EXPORT int fecal_decoder_add_recovery(FecalDecoder decoder_v, const FecalSymbol* symbol)
|
||||
{
|
||||
fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
|
||||
if (!decoder || !symbol)
|
||||
return Fecal_InvalidInput;
|
||||
|
||||
return decoder->AddRecovery(*symbol);
|
||||
}
|
||||
|
||||
FECAL_EXPORT int fecal_decode(FecalDecoder decoder_v, RecoveredSymbols* symbols)
|
||||
{
|
||||
fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
|
||||
if (!decoder || !symbols)
|
||||
return Fecal_InvalidInput;
|
||||
|
||||
return decoder->Decode(*symbols);
|
||||
}
|
||||
|
||||
FECAL_EXPORT int fecal_decoder_get(FecalDecoder decoder_v, unsigned input_index, FecalSymbol* symbol)
|
||||
{
|
||||
fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
|
||||
if (!decoder || !symbol)
|
||||
return Fecal_InvalidInput;
|
||||
|
||||
return decoder->GetOriginal(input_index, *symbol);
|
||||
}
|
||||
|
||||
|
||||
} // extern "C"
|
|
@@ -0,0 +1,229 @@
|
|||
/*
|
||||
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of Leopard-RS nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef CAT_LEOPARD_RS_H
|
||||
#define CAT_LEOPARD_RS_H
|
||||
|
||||
/*
|
||||
Leopard-RS: Reed-Solomon Error Correction Coding for Extremely Large Data
|
||||
|
||||
S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
|
||||
"Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
|
||||
IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
|
||||
http://ct.ee.ntust.edu.tw/it2016-2.pdf
|
||||
*/
|
||||
|
||||
// Library version
|
||||
#define LEO_VERSION 1
|
||||
|
||||
// Tweak if the functions are exported or statically linked
|
||||
//#define LEO_DLL /* Defined when building/linking as DLL */
|
||||
//#define LEO_BUILDING /* Defined by the library makefile */
|
||||
|
||||
#if defined(LEO_BUILDING)
|
||||
# if defined(LEO_DLL)
|
||||
#define LEO_EXPORT __declspec(dllexport)
|
||||
# else
|
||||
#define LEO_EXPORT
|
||||
# endif
|
||||
#else
|
||||
# if defined(LEO_DLL)
|
||||
#define LEO_EXPORT __declspec(dllimport)
|
||||
# else
|
||||
#define LEO_EXPORT extern
|
||||
# endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Initialization API
|
||||
|
||||
/*
|
||||
leo_init()
|
||||
|
||||
Perform static initialization for the library, verifying that the platform
|
||||
is supported.
|
||||
|
||||
Returns 0 on success and other values on failure.
|
||||
*/
|
||||
|
||||
LEO_EXPORT int leo_init_(int version);
|
||||
#define leo_init() leo_init_(LEO_VERSION)
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Shared Constants / Datatypes
|
||||
|
||||
// Results
|
||||
typedef enum LeopardResultT
|
||||
{
|
||||
Leopard_Success = 0, // Operation succeeded
|
||||
|
||||
Leopard_TooMuchData = -1, // Buffer counts are too high
|
||||
Leopard_InvalidBlockSize = -2, // Buffer size must be a multiple of 64 bytes
|
||||
Leopard_InvalidInput = -3, // A function parameter was invalid
|
||||
Leopard_Platform = -4, // Platform is unsupported
|
||||
Leopard_OutOfMemory = -5, // Out of memory error occurred
|
||||
Leopard_Unexpected = -6, // Unexpected error - Software bug?
|
||||
} LeopardResult;
|
||||
|
||||
// Flags
|
||||
typedef enum LeopardFlagsT
|
||||
{
|
||||
LeopardFlags_Defaults = 0, // Default settings
|
||||
|
||||
LeopardFlags_Multithreaded = 1, // Enable multiple threads
|
||||
} LeopardFlags;
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Encoder API
|
||||
|
||||
/*
|
||||
leo_encode_work_count()
|
||||
|
||||
Calculate the number of work_data buffers to provide to leo_encode().
|
||||
|
||||
The sum of original_count + recovery_count must not exceed 65536.
|
||||
|
||||
Returns the work_count value to pass into leo_encode().
|
||||
Returns 0 on invalid input.
|
||||
*/
|
||||
|
||||
LEO_EXPORT unsigned leo_encode_work_count(
|
||||
unsigned original_count,
|
||||
unsigned recovery_count);
|
||||
|
||||
/*
|
||||
leo_encode()
|
||||
|
||||
Generate recovery data.
|
||||
|
||||
original_count: Number of original_data[] buffers provided.
|
||||
recovery_count: Number of desired recovery data buffers.
|
||||
buffer_bytes: Number of bytes in each data buffer.
|
||||
original_data: Array of pointers to original data buffers.
|
||||
work_count: Number of work_data[] buffers, from leo_encode_work_count().
|
||||
work_data: Array of pointers to work data buffers.
|
||||
flags: Flags for encoding e.g. LeopardFlags_Multithreaded
|
||||
|
||||
The sum of original_count + recovery_count must not exceed 65536.
|
||||
The buffer_bytes must be a multiple of 64.
|
||||
Each buffer should have the same number of bytes.
|
||||
Even the last piece must be rounded up to the block size.
|
||||
|
||||
Let buffer_bytes = The number of bytes in each buffer:
|
||||
|
||||
original_count = static_cast<unsigned>(
|
||||
((uint64_t)total_bytes + buffer_bytes - 1) / buffer_bytes);
|
||||
|
||||
Or if the number of pieces is known:
|
||||
|
||||
buffer_bytes = static_cast<unsigned>(
|
||||
((uint64_t)total_bytes + original_count - 1) / original_count);
|
||||
|
||||
Returns Leopard_Success on success.
|
||||
The first set of recovery_count buffers in work_data will be the result.
|
||||
|
||||
Returns Leopard_TooMuchData if the data is too large.
|
||||
Returns Leopard_InvalidBlockSize if the data is the wrong size.
|
||||
Returns Leopard_InvalidInput on invalid input.
|
||||
Returns other values on errors.
|
||||
*/
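// Worked example of the sizing math above (illustrative numbers): to protect
// total_bytes = 1,000,000 with buffer_bytes = 64,000 (a multiple of 64),
// original_count = (1,000,000 + 64,000 - 1) / 64,000 = 16 buffers, and the
// last buffer is padded up to the full 64,000 bytes.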
|
||||
LEO_EXPORT LeopardResult leo_encode(
|
||||
unsigned buffer_bytes, // Number of bytes in each data buffer
|
||||
unsigned original_count, // Number of original_data[] buffer pointers
|
||||
unsigned recovery_count, // Number of recovery_data[] buffer pointers
|
||||
unsigned work_count, // Number of work_data[] buffer pointers, from leo_encode_work_count()
|
||||
void* const * const original_data, // Array of pointers to original data buffers
|
||||
void** work_data, // Array of work buffers
|
||||
unsigned flags); // Operation flags


//------------------------------------------------------------------------------
// Decoder API

/*
    leo_decode_work_count()

    Calculate the number of work_data buffers to provide to leo_decode().

    The sum of original_count + recovery_count must not exceed 65536.

    Returns the work_count value to pass into leo_decode().
    Returns 0 on invalid input.
*/

LEO_EXPORT unsigned leo_decode_work_count(
    unsigned original_count,
    unsigned recovery_count);

/*
    leo_decode()

    Decode original data from recovery data.

    buffer_bytes:   Number of bytes in each data buffer.
    original_count: Number of original_data[] buffers provided.
    recovery_count: Number of recovery_data[] buffers provided.
    work_count:     Number of work_data[] buffers, from leo_decode_work_count().
    original_data:  Array of pointers to original data buffers.
    recovery_data:  Array of pointers to recovery data buffers.
    work_data:      Array of pointers to work data buffers.
    flags:          Flags for decoding, e.g. LeopardFlags_Multithreaded.

    Lost original/recovery data should be set to NULL.

    The sum of recovery_count and the number of non-NULL original buffers must
    be at least original_count in order to perform recovery.

    Returns Leopard_Success on success.
    Returns other values on errors.
*/
LEO_EXPORT LeopardResult leo_decode(
    unsigned buffer_bytes,              // Number of bytes in each data buffer
    unsigned original_count,            // Number of original_data[] buffer pointers
    unsigned recovery_count,            // Number of recovery_data[] buffer pointers
    unsigned work_count,                // Number of buffer pointers in work_data[]
    void* const * const original_data,  // Array of original data buffers
    void* const * const recovery_data,  // Array of recovery data buffers
    void** work_data,                   // Array of work data buffers
    unsigned flags);                    // Operation flags
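
/*
    Example (illustrative sketch only, not part of the API): `original` and
    `recovery` are arrays of buffer pointers as used with leo_encode() above;
    every buffer that was lost has had its pointer set to NULL. The variable
    names are assumptions for the example.

        const unsigned work_count =
            leo_decode_work_count(original_count, recovery_count);

        std::vector<void*> work(work_count);
        for (unsigned i = 0; i < work_count; ++i)
            work[i] = malloc(buffer_bytes);

        const LeopardResult r = leo_decode(
            buffer_bytes, original_count, recovery_count, work_count,
            (void**)&original[0], (void**)&recovery[0], &work[0],
            LeopardFlags_Defaults);

        // On Leopard_Success, the recovered contents of each original buffer
        // that was passed in as NULL are expected in the work buffer with the
        // same index.
*/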


#ifdef __cplusplus
}
#endif


#endif // CAT_LEOPARD_RS_H

@ -1,9 +1,11 @@
|
|||
|
||||
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||
# Visual Studio 14
|
||||
VisualStudioVersion = 14.0.25420.1
|
||||
# Visual Studio 15
|
||||
VisualStudioVersion = 15.0.26127.3
|
||||
MinimumVisualStudioVersion = 10.0.40219.1
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LHC_RS", "LHC_RS.vcxproj", "{32176592-2F30-4BD5-B645-EB11C8D3453E}"
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Leopard", "Leopard.vcxproj", "{32176592-2F30-4BD5-B645-EB11C8D3453E}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LeopardBenchmark", "..\tests\proj\Benchmark.vcxproj", "{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
|
@ -21,6 +23,14 @@ Global
|
|||
{32176592-2F30-4BD5-B645-EB11C8D3453E}.Release|Win32.Build.0 = Release|Win32
|
||||
{32176592-2F30-4BD5-B645-EB11C8D3453E}.Release|x64.ActiveCfg = Release|x64
|
||||
{32176592-2F30-4BD5-B645-EB11C8D3453E}.Release|x64.Build.0 = Release|x64
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|x64.Build.0 = Debug|x64
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|Win32.Build.0 = Release|Win32
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|x64.ActiveCfg = Release|x64
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
|
@ -0,0 +1,193 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="..\leopard.h" />
|
||||
<ClInclude Include="..\LeopardCommon.h" />
|
||||
<ClInclude Include="..\LeopardDecoder.h" />
|
||||
<ClInclude Include="..\LeopardEncoder.h" />
|
||||
<ClInclude Include="..\LeopardFF8.h" />
|
||||
<ClInclude Include="..\LeopardFF16.h" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\leopard.cpp" />
|
||||
<ClCompile Include="..\LeopardCommon.cpp" />
|
||||
<ClCompile Include="..\LeopardDecoder.cpp" />
|
||||
<ClCompile Include="..\LeopardEncoder.cpp" />
|
||||
<ClCompile Include="..\LeopardFF8.cpp" />
|
||||
<ClCompile Include="..\LeopardFF16.cpp" />
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{32176592-2F30-4BD5-B645-EB11C8D3453E}</ProjectGuid>
|
||||
<RootNamespace>GF65536</RootNamespace>
|
||||
<ProjectName>Leopard</ProjectName>
|
||||
<WindowsTargetPlatformVersion>10.0.14393.0</WindowsTargetPlatformVersion>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>StaticLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>StaticLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>StaticLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>StaticLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
|
||||
<IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
|
||||
<IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
|
||||
<IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
|
||||
<IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||
<PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalLibraryDirectories>
|
||||
</AdditionalLibraryDirectories>
|
||||
</Link>
|
||||
<PostBuildEvent>
|
||||
<Command>
|
||||
</Command>
|
||||
</PostBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||
<PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalLibraryDirectories>
|
||||
</AdditionalLibraryDirectories>
|
||||
</Link>
|
||||
<PostBuildEvent>
|
||||
<Command>
|
||||
</Command>
|
||||
</PostBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
|
||||
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
|
||||
<OmitFramePointers>false</OmitFramePointers>
|
||||
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||
<BufferSecurityCheck>true</BufferSecurityCheck>
|
||||
<PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>
|
||||
</AdditionalLibraryDirectories>
|
||||
</Link>
|
||||
<PostBuildEvent>
|
||||
<Command>
|
||||
</Command>
|
||||
</PostBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
|
||||
<FavorSizeOrSpeed>Size</FavorSizeOrSpeed>
|
||||
<OmitFramePointers>false</OmitFramePointers>
|
||||
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||
<BufferSecurityCheck>true</BufferSecurityCheck>
|
||||
<PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>
|
||||
</AdditionalLibraryDirectories>
|
||||
</Link>
|
||||
<PostBuildEvent>
|
||||
<Command>
|
||||
</Command>
|
||||
</PostBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
|
@ -0,0 +1,57 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup>
|
||||
<Filter Include="Source Files">
|
||||
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
|
||||
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
|
||||
</Filter>
|
||||
<Filter Include="Header Files">
|
||||
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
|
||||
<Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
|
||||
</Filter>
|
||||
<Filter Include="Resource Files">
|
||||
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
|
||||
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
|
||||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="..\leopard.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\LeopardCommon.h">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\LeopardDecoder.h">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\LeopardEncoder.h">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\LeopardFF16.h">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\LeopardFF8.h">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\LeopardDecoder.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\LeopardEncoder.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\leopard.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\LeopardCommon.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\LeopardFF16.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\LeopardFF8.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
</Project>
|
|
@ -0,0 +1,567 @@
|
|||
/*
|
||||
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of Leopard nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "../LeopardCommon.h"
|
||||
#include "../leopard.h"
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
using namespace std;
|
||||
|
||||
//#define TEST_DATA_ALL_SAME
|
||||
//#define TEST_LOSE_FIRST_K_PACKETS
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Windows
|
||||
|
||||
#ifdef _WIN32
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
|
||||
#ifndef _WINSOCKAPI_
|
||||
#define DID_DEFINE_WINSOCKAPI
|
||||
#define _WINSOCKAPI_
|
||||
#endif
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#ifndef _WIN32_WINNT
|
||||
#define _WIN32_WINNT 0x0601 /* Windows 7+ */
|
||||
#endif
|
||||
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#ifdef DID_DEFINE_WINSOCKAPI
|
||||
#undef _WINSOCKAPI_
|
||||
#undef DID_DEFINE_WINSOCKAPI
|
||||
#endif
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Threads
|
||||
|
||||
static bool SetCurrentThreadPriority()
|
||||
{
|
||||
#ifdef _WIN32
|
||||
return 0 != ::SetThreadPriority(::GetCurrentThread(), THREAD_PRIORITY_ABOVE_NORMAL);
|
||||
#else
|
||||
return -1 != nice(2);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Timing
|
||||
|
||||
static uint64_t GetTimeUsec()
|
||||
{
|
||||
#ifdef _WIN32
|
||||
LARGE_INTEGER timeStamp = {};
|
||||
if (!::QueryPerformanceCounter(&timeStamp))
|
||||
return 0;
|
||||
static double PerfFrequencyInverse = 0.;
|
||||
if (PerfFrequencyInverse == 0.)
|
||||
{
|
||||
LARGE_INTEGER freq = {};
|
||||
if (!::QueryPerformanceFrequency(&freq) || freq.QuadPart == 0)
|
||||
return 0;
|
||||
PerfFrequencyInverse = 1000000. / (double)freq.QuadPart;
|
||||
}
|
||||
return (uint64_t)(PerfFrequencyInverse * timeStamp.QuadPart);
|
||||
#else
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, nullptr);
|
||||
return 1000000 * tv.tv_sec + tv.tv_usec;
|
||||
#endif // _WIN32
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// PCG PRNG
|
||||
// From http://www.pcg-random.org/
|
||||
|
||||
class PCGRandom
|
||||
{
|
||||
public:
|
||||
inline void Seed(uint64_t y, uint64_t x = 0)
|
||||
{
|
||||
State = 0;
|
||||
Inc = (y << 1u) | 1u;
|
||||
Next();
|
||||
State += x;
|
||||
Next();
|
||||
}
|
||||
|
||||
inline uint32_t Next()
|
||||
{
|
||||
const uint64_t oldstate = State;
|
||||
State = oldstate * UINT64_C(6364136223846793005) + Inc;
|
||||
const uint32_t xorshifted = (uint32_t)(((oldstate >> 18) ^ oldstate) >> 27);
|
||||
const uint32_t rot = oldstate >> 59;
|
||||
return (xorshifted >> rot) | (xorshifted << ((uint32_t)(-(int32_t)rot) & 31));
|
||||
}
|
||||
|
||||
uint64_t State = 0, Inc = 0;
|
||||
};
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Self-Checking Packet
|
||||
|
||||
static void WriteRandomSelfCheckingPacket(PCGRandom& prng, void* packet, unsigned bytes)
|
||||
{
|
||||
uint8_t* buffer = (uint8_t*)packet;
|
||||
#ifdef TEST_DATA_ALL_SAME
|
||||
if (bytes != 0)
|
||||
#else
|
||||
if (bytes < 16)
|
||||
#endif
|
||||
{
|
||||
LEO_DEBUG_ASSERT(bytes >= 2);
|
||||
buffer[0] = (uint8_t)prng.Next();
|
||||
for (unsigned i = 1; i < bytes; ++i)
|
||||
{
|
||||
buffer[i] = buffer[0];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
uint32_t crc = bytes;
|
||||
*(uint32_t*)(buffer + 4) = bytes;
|
||||
for (unsigned i = 8; i < bytes; ++i)
|
||||
{
|
||||
uint8_t v = (uint8_t)prng.Next();
|
||||
buffer[i] = v;
|
||||
crc = (crc << 3) | (crc >> (32 - 3));
|
||||
crc += v;
|
||||
}
|
||||
*(uint32_t*)buffer = crc;
|
||||
}
|
||||
}
|
||||
|
||||
static bool CheckPacket(const void* packet, unsigned bytes)
|
||||
{
|
||||
uint8_t* buffer = (uint8_t*)packet;
|
||||
#ifdef TEST_DATA_ALL_SAME
|
||||
if (bytes != 0)
|
||||
#else
|
||||
if (bytes < 16)
|
||||
#endif
|
||||
{
|
||||
if (bytes < 2)
|
||||
return false;
|
||||
|
||||
uint8_t v = buffer[0];
|
||||
for (unsigned i = 1; i < bytes; ++i)
|
||||
{
|
||||
if (buffer[i] != v)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
uint32_t crc = bytes;
|
||||
uint32_t readBytes = *(uint32_t*)(buffer + 4);
|
||||
if (readBytes != bytes)
|
||||
return false;
|
||||
for (unsigned i = 8; i < bytes; ++i)
|
||||
{
|
||||
uint8_t v = buffer[i];
|
||||
crc = (crc << 3) | (crc >> (32 - 3));
|
||||
crc += v;
|
||||
}
|
||||
uint32_t readCRC = *(uint32_t*)buffer;
|
||||
if (readCRC != crc)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// FunctionTimer
|
||||
|
||||
class FunctionTimer
{
public:
    FunctionTimer(const std::string& name)
    {
        FunctionName = name;
    }
    void BeginCall()
    {
        LEO_DEBUG_ASSERT(t0 == 0);
        t0 = GetTimeUsec();
    }
    void EndCall()
    {
        LEO_DEBUG_ASSERT(t0 != 0);
        uint64_t t1 = GetTimeUsec();
        ++Invocations;
        TotalUsec += t1 - t0;
        t0 = 0;
    }
    void Reset()
    {
        LEO_DEBUG_ASSERT(t0 == 0);
        t0 = 0;
        Invocations = 0;
        TotalUsec = 0;
    }
    void Print(unsigned trials)
    {
        cout << FunctionName << " called " << Invocations / (float)trials
             << " times per trial (avg). " << TotalUsec / (double)Invocations
             << " usec avg for all invocations. " << TotalUsec / (float)trials
             << " usec (avg) of " << trials << " trials" << endl;
    }

    uint64_t t0 = 0;
    uint64_t Invocations = 0;
    uint64_t TotalUsec = 0;
    std::string FunctionName;
};
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Utility: Deck Shuffling function
|
||||
|
||||
/*
|
||||
Given a PRNG, generate a deck of cards in a random order.
|
||||
The deck will contain elements with values between 0 and count - 1.
|
||||
*/
|
||||
|
||||
static void ShuffleDeck16(PCGRandom &prng, uint16_t * LEO_RESTRICT deck, uint32_t count)
|
||||
{
|
||||
deck[0] = 0;
|
||||
|
||||
// If we can unroll 4 times,
|
||||
if (count <= 256)
|
||||
{
|
||||
for (uint32_t ii = 1;;)
|
||||
{
|
||||
uint32_t jj, rv = prng.Next();
|
||||
|
||||
// 8-bit unroll
|
||||
switch (count - ii)
|
||||
{
|
||||
default:
|
||||
jj = (uint8_t)rv % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
jj = (uint8_t)(rv >> 8) % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
jj = (uint8_t)(rv >> 16) % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
jj = (uint8_t)(rv >> 24) % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
break;
|
||||
|
||||
case 3:
|
||||
jj = (uint8_t)rv % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
case 2:
|
||||
jj = (uint8_t)(rv >> 8) % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
case 1:
|
||||
jj = (uint8_t)(rv >> 16) % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
case 0:
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// For each deck entry,
|
||||
for (uint32_t ii = 1;;)
|
||||
{
|
||||
uint32_t jj, rv = prng.Next();
|
||||
|
||||
// 16-bit unroll
|
||||
switch (count - ii)
|
||||
{
|
||||
default:
|
||||
jj = (uint16_t)rv % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
jj = (uint16_t)(rv >> 16) % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
break;
|
||||
|
||||
case 1:
|
||||
jj = (uint16_t)rv % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
case 0:
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// SIMD-Safe Aligned Memory Allocations
|
||||
|
||||
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
|
||||
|
||||
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
|
||||
{
|
||||
return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
|
||||
}
|
||||
|
||||
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
|
||||
{
|
||||
uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
|
||||
if (!data)
|
||||
return nullptr;
|
||||
unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
|
||||
data += kAlignmentBytes - offset;
|
||||
data[-1] = (uint8_t)offset;
|
||||
return data;
|
||||
}
|
||||
|
||||
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
|
||||
{
|
||||
if (!ptr)
|
||||
return;
|
||||
uint8_t* data = (uint8_t*)ptr;
|
||||
unsigned offset = data[-1];
|
||||
if (offset >= kAlignmentBytes)
|
||||
{
|
||||
LEO_DEBUG_BREAK; // Should never happen
|
||||
return;
|
||||
}
|
||||
data -= kAlignmentBytes - offset;
|
||||
free(data);
|
||||
}
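
/*
    Illustrative note (not part of the benchmark logic): SIMDSafeAllocate()
    stores the alignment offset in the byte just before the pointer it
    returns, which is how SIMDSafeFree() recovers the original calloc()
    address. A minimal round trip looks like:

        uint8_t* block = SIMDSafeAllocate(1024);
        // ((uintptr_t)block % LEO_ALIGN_BYTES) == 0 at this point
        SIMDSafeFree(block);
*/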
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Tests
|
||||
|
||||
struct TestParameters
|
||||
{
|
||||
unsigned original_count = 200; // under 65536
|
||||
unsigned recovery_count = 100; // under 65536 - original_count
|
||||
unsigned buffer_bytes = 64000; // multiple of 64 bytes
|
||||
unsigned loss_count = 20; // some fraction of original_count
|
||||
unsigned seed = 0;
|
||||
bool multithreaded = true;
|
||||
};
|
||||
|
||||
static void BasicTest(const TestParameters& params)
|
||||
{
|
||||
static const unsigned kTrials = 4;
|
||||
|
||||
std::vector<uint8_t*> original_data(params.original_count);
|
||||
|
||||
const unsigned encode_work_count = leo_encode_work_count(params.original_count, params.recovery_count);
|
||||
const unsigned decode_work_count = leo_decode_work_count(params.original_count, params.recovery_count);
|
||||
|
||||
std::vector<uint8_t*> encode_work_data(encode_work_count);
|
||||
std::vector<uint8_t*> decode_work_data(decode_work_count);
|
||||
|
||||
FunctionTimer t_mem_alloc("memory_allocation");
|
||||
FunctionTimer t_leo_encode("leo_encode");
|
||||
FunctionTimer t_leo_decode("leo_decode");
|
||||
FunctionTimer t_mem_free("memory_free");
|
||||
|
||||
const uint64_t total_bytes = (uint64_t)params.buffer_bytes * params.original_count;
|
||||
|
||||
for (unsigned trial = 0; trial < kTrials; ++trial)
|
||||
{
|
||||
// Allocate memory:
|
||||
|
||||
t_mem_alloc.BeginCall();
|
||||
for (unsigned i = 0, count = params.original_count; i < count; ++i)
|
||||
original_data[i] = SIMDSafeAllocate(params.buffer_bytes);
|
||||
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
|
||||
encode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
|
||||
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
|
||||
decode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
|
||||
t_mem_alloc.EndCall();
|
||||
|
||||
// Generate data:
|
||||
|
||||
PCGRandom prng;
|
||||
prng.Seed(params.seed, trial);
|
||||
|
||||
for (unsigned i = 0; i < params.original_count; ++i)
|
||||
WriteRandomSelfCheckingPacket(prng, original_data[i], params.buffer_bytes);
|
||||
|
||||
// Encode:
|
||||
|
||||
t_leo_encode.BeginCall();
|
||||
LeopardResult encodeResult = leo_encode(
|
||||
params.buffer_bytes,
|
||||
params.original_count,
|
||||
params.recovery_count,
|
||||
encode_work_count,
|
||||
(void**)&original_data[0],
|
||||
(void**)&encode_work_data[0], // recovery data written here
|
||||
params.multithreaded ? LeopardFlags_Multithreaded : LeopardFlags_Defaults
|
||||
);
|
||||
t_leo_encode.EndCall();
|
||||
|
||||
if (encodeResult != Leopard_Success)
|
||||
{
|
||||
cout << "Error: Leopard encode failed with result=" << encodeResult << endl;
|
||||
LEO_DEBUG_BREAK;
|
||||
return;
|
||||
}
|
||||
|
||||
// Lose random original data:

std::vector<uint16_t> original_losses(params.original_count);
ShuffleDeck16(prng, &original_losses[0], params.original_count);

for (unsigned i = 0, count = params.loss_count; i < count; ++i)
{
    const unsigned loss_index = original_losses[i];
    // These buffers came from SIMDSafeAllocate(), so release them with
    // SIMDSafeFree() rather than delete[]
    SIMDSafeFree(original_data[loss_index]);
    original_data[loss_index] = nullptr;
}

// Lose random recovery data, keeping only loss_count recovery buffers -
// the minimum needed to recover the lost originals:

const unsigned recovery_loss_count = params.recovery_count - params.loss_count;

std::vector<uint16_t> recovery_losses(params.recovery_count);
ShuffleDeck16(prng, &recovery_losses[0], params.recovery_count);

for (unsigned i = 0, count = recovery_loss_count; i < count; ++i)
{
    const unsigned loss_index = recovery_losses[i];
    SIMDSafeFree(encode_work_data[loss_index]);
    encode_work_data[loss_index] = nullptr;
}
|
||||
|
||||
// Decode:
|
||||
|
||||
t_leo_decode.BeginCall();
|
||||
LeopardResult decodeResult = leo_decode(
|
||||
params.buffer_bytes,
|
||||
params.original_count,
|
||||
params.recovery_count,
|
||||
decode_work_count,
|
||||
(void**)&original_data[0],
|
||||
(void**)&encode_work_data[0],
|
||||
(void**)&decode_work_data[0],
|
||||
params.multithreaded ? LeopardFlags_Multithreaded : LeopardFlags_Defaults);
|
||||
t_leo_decode.EndCall();
|
||||
|
||||
if (decodeResult != Leopard_Success)
|
||||
{
|
||||
cout << "Error: Leopard decode failed with result=" << decodeResult << endl;
|
||||
LEO_DEBUG_BREAK;
|
||||
return;
|
||||
}
|
||||
|
||||
// Free memory:
|
||||
|
||||
t_mem_free.BeginCall();
|
||||
for (unsigned i = 0, count = params.original_count; i < count; ++i)
|
||||
SIMDSafeFree(original_data[i]);
|
||||
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
|
||||
SIMDSafeFree(encode_work_data[i]);
|
||||
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
|
||||
SIMDSafeFree(decode_work_data[i]);
|
||||
t_mem_free.EndCall();
|
||||
}
|
||||
|
||||
t_mem_alloc.Print(kTrials);
|
||||
t_leo_encode.Print(kTrials);
|
||||
t_leo_decode.Print(kTrials);
|
||||
t_mem_free.Print(kTrials);
|
||||
|
||||
float encode_input_MBPS = total_bytes * kTrials / (float)(t_leo_encode.TotalUsec);
|
||||
float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count * kTrials / (float)(t_leo_encode.TotalUsec);
|
||||
float decode_input_MBPS = total_bytes * kTrials / (float)(t_leo_decode.TotalUsec);
|
||||
float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count * kTrials / (float)(t_leo_decode.TotalUsec);
|
||||
|
||||
cout << "Leopard Encoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << encode_input_MBPS << " MB/s, Output=" << encode_output_MBPS << " MB/s" << endl;
|
||||
cout << "Leopard Decoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << decode_input_MBPS << " MB/s, Output=" << decode_output_MBPS << " MB/s" << endl << endl;
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Entrypoint
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
SetCurrentThreadPriority();
|
||||
|
||||
FunctionTimer t_leo_init("leo_init");
|
||||
|
||||
t_leo_init.BeginCall();
|
||||
if (0 != leo_init())
|
||||
{
|
||||
cout << "Failed to initialize" << endl;
|
||||
return -1;
|
||||
}
|
||||
t_leo_init.EndCall();
|
||||
t_leo_init.Print(1);
|
||||
|
||||
TestParameters params;
|
||||
|
||||
if (argc >= 2)
|
||||
params.original_count = atoi(argv[1]);
|
||||
if (argc >= 3)
|
||||
params.recovery_count = atoi(argv[2]);
|
||||
if (argc >= 4)
|
||||
params.buffer_bytes = atoi(argv[3]);
|
||||
if (argc >= 5)
|
||||
params.loss_count = atoi(argv[4]);
|
||||
if (argc >= 6)
|
||||
params.multithreaded = (atoi(argv[5]) != 0);
|
||||
|
||||
cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;
|
||||
|
||||
BasicTest(params);
|
||||
|
||||
getchar();
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -18,41 +18,38 @@
|
|||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\lhc_rs.cpp" />
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{32176592-2F30-4BD5-B645-EB11C8D3453E}</ProjectGuid>
|
||||
<RootNamespace>GF65536</RootNamespace>
|
||||
<ProjectName>LHC_RS</ProjectName>
|
||||
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
|
||||
<ProjectGuid>{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}</ProjectGuid>
|
||||
<RootNamespace>Fecal</RootNamespace>
|
||||
<ProjectName>LeopardBenchmark</ProjectName>
|
||||
<WindowsTargetPlatformVersion>10.0.14393.0</WindowsTargetPlatformVersion>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
|
@ -155,8 +152,8 @@
|
|||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
|
||||
<FavorSizeOrSpeed>Size</FavorSizeOrSpeed>
|
||||
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
|
||||
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
|
||||
<OmitFramePointers>false</OmitFramePointers>
|
||||
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||
<BufferSecurityCheck>true</BufferSecurityCheck>
|
||||
|
@ -174,6 +171,14 @@
|
|||
</Command>
|
||||
</PostBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\benchmark.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\proj\Leopard.vcxproj">
|
||||
<Project>{32176592-2f30-4bd5-b645-eb11c8d3453e}</Project>
|
||||
</ProjectReference>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
|
@ -15,7 +15,7 @@
|
|||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\lhc_rs.cpp">
|
||||
<ClCompile Include="..\benchmark.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|