diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp
index 033c7c3..f8892d7 100644
--- a/LeopardFF16.cpp
+++ b/LeopardFF16.cpp
@@ -119,10 +119,11 @@ static void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated)
     {
         // For each set of dist*4 elements:
 #pragma omp parallel for
-        for (int r = 0; r < m_truncated; r += dist4)
+        for (int r = 0; r < (int)m_truncated; r += dist4)
         {
             // For each set of dist elements:
-            for (int i = r; i < r + dist; ++i)
+            const int i_end = r + dist;
+            for (int i = r; i < i_end; ++i)
                 FWHT_4(data + i, dist);
         }
     }
@@ -130,7 +131,7 @@ static void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated)
     // If there is one layer left:
     if (dist < m)
 #pragma omp parallel for
-        for (int i = 0; i < dist; ++i)
+        for (int i = 0; i < (int)dist; ++i)
             FWHT_2(data[i], data[i + dist]);
 }
 
@@ -294,12 +295,110 @@ static const Multiply256LUT_t* Multiply256LUT = nullptr;
 
 #endif // LEO_TRY_AVX2
 
+// Stores the partial products of x * y at offset x + y * 65536
+// Repeated accesses from the same y value are faster
+struct Product16Table
+{
+    ffe_t LUT[4 * 16];
+};
+static const Product16Table* Multiply16LUT = nullptr;
+
+
+// Reference version of muladd: x[] ^= y[] * log_m
+static LEO_FORCE_INLINE void RefMulAdd(
+    void* LEO_RESTRICT x,
+    const void* LEO_RESTRICT y,
+    ffe_t log_m,
+    uint64_t bytes)
+{
+    const ffe_t* LEO_RESTRICT lut = Multiply16LUT[log_m].LUT;
+    const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y);
+    uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x);
+
+    do
+    {
+        for (unsigned i = 0; i < 32; ++i)
+        {
+            const unsigned lo = y1[i];
+            const unsigned hi = y1[i + 32];
+
+            const ffe_t prod = \
+                lut[(lo & 15)] ^ \
+                lut[(lo >> 4) + 16] ^ \
+                lut[(hi & 15) + 32] ^ \
+                lut[(hi >> 4) + 48];
+
+            x1[i] ^= (uint8_t)prod;
+            x1[i + 32] ^= (uint8_t)(prod >> 8);
+        }
+
+        x1 += 64, y1 += 64;
+        bytes -= 64;
+    } while (bytes > 0);
+
+}
+
+// Reference version of mul: x[] = y[] * log_m
+static LEO_FORCE_INLINE void RefMul(
+    void* LEO_RESTRICT x,
+    const void* LEO_RESTRICT y,
+    ffe_t log_m,
+    uint64_t bytes)
+{
+    const ffe_t* LEO_RESTRICT lut = Multiply16LUT[log_m].LUT;
+    const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y);
+    uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x);
+
+    do
+    {
+        for (unsigned i = 0; i < 32; ++i)
+        {
+            const unsigned lo = y1[i];
+            const unsigned hi = y1[i + 32];
+
+            const ffe_t prod = \
+                lut[(lo & 15)] ^ \
+                lut[(lo >> 4) + 16] ^ \
+                lut[(hi & 15) + 32] ^ \
+                lut[(hi >> 4) + 48];
+
+            x1[i] = (uint8_t)prod;
+            x1[i + 32] = (uint8_t)(prod >> 8);
+        }
+
+        x1 += 64, y1 += 64;
+        bytes -= 64;
+    } while (bytes > 0);
+}
+
 static void InitializeMultiplyTables()
 {
     // If we cannot use the PSHUFB instruction, generate Multiply8LUT:
     if (!CpuHasSSSE3)
+    {
+        Multiply16LUT = new Product16Table[65536];
+
+        // For each log_m multiplicand:
+#pragma omp parallel for
+        for (int log_m = 0; log_m < kOrder; ++log_m)
+        {
+            const Product16Table& lut = Multiply16LUT[log_m];
+
+            for (unsigned nibble = 0, shift = 0; nibble < 4; ++nibble, shift += 4)
+            {
+                ffe_t* nibble_lut = (ffe_t*)&lut.LUT[nibble * 16];
+
+                for (unsigned x_nibble = 0; x_nibble < 16; ++x_nibble)
+                {
+                    const ffe_t prod = MultiplyLog(x_nibble << shift, static_cast<ffe_t>(log_m));
+                    nibble_lut[x_nibble] = prod;
+                }
+            }
+        }
+        return;
+    }
 
     if (CpuHasAVX2)
         Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
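
Note on the Product16Table path above: it relies only on multiplication by a fixed log_m being linear over GF(2), so a 16-bit operand can be split into four 4-bit nibbles and the four partial products XORed together. A minimal self-contained check of that identity is sketched here; linear_map() is a hypothetical stand-in for MultiplyLog() with a fixed log_m (Leopard's actual field multiply is not reproduced), while the table fill and recombination mirror InitializeMultiplyTables() and RefMul().

// Sketch only, not part of the patch. Any GF(2)-linear map (f(a ^ b) == f(a) ^ f(b))
// obeys the same nibble-decomposition identity the table exploits.
#include <cassert>
#include <cstdint>

using ffe_t = uint16_t;

static ffe_t linear_map(ffe_t x)
{
    const ffe_t k = 0x1b2d; // arbitrary fixed constant
    ffe_t out = 0;
    for (unsigned bit = 0; bit < 16; ++bit)
        if (x & (1u << bit))
            out ^= (ffe_t)(k << bit); // carry-less multiply, truncated to 16 bits
    return out;
}

int main()
{
    // Fill the 4 x 16 nibble table the same way InitializeMultiplyTables() does.
    ffe_t lut[4 * 16];
    for (unsigned nibble = 0, shift = 0; nibble < 4; ++nibble, shift += 4)
        for (unsigned x_nibble = 0; x_nibble < 16; ++x_nibble)
            lut[nibble * 16 + x_nibble] = linear_map((ffe_t)(x_nibble << shift));

    // Recombine the partial products as RefMul()/RefMulAdd() do.
    for (unsigned x = 0; x < 65536; ++x)
    {
        const ffe_t prod =
            lut[(x & 15)] ^
            lut[((x >> 4) & 15) + 16] ^
            lut[((x >> 8) & 15) + 32] ^
            lut[(x >> 12) + 48];
        assert(prod == linear_map((ffe_t)x));
    }
    return 0;
}
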
@@ -381,29 +480,36 @@ static void mul_mem(
     }
 #endif // LEO_TRY_AVX2
 
-    LEO_MUL_TABLES_128(0, log_m);
-
-    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
-
-    LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
-    const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(y);
-
-    do
+    if (CpuHasSSSE3)
     {
+        LEO_MUL_TABLES_128(0, log_m);
+
+        const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
+
+        LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
+        const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(y);
+
+        do
+        {
 #define LEO_MUL_128_LS(x_ptr, y_ptr) { \
-        const LEO_M128 data_lo = _mm_loadu_si128(y_ptr); \
-        const LEO_M128 data_hi = _mm_loadu_si128(y_ptr + 2); \
-        LEO_M128 prod_lo, prod_hi; \
-        LEO_MUL_128(data_lo, data_hi, 0); \
-        _mm_storeu_si128(x_ptr, prod_lo); \
-        _mm_storeu_si128(x_ptr + 2, prod_hi); }
+            const LEO_M128 data_lo = _mm_loadu_si128(y_ptr); \
+            const LEO_M128 data_hi = _mm_loadu_si128(y_ptr + 2); \
+            LEO_M128 prod_lo, prod_hi; \
+            LEO_MUL_128(data_lo, data_hi, 0); \
+            _mm_storeu_si128(x_ptr, prod_lo); \
+            _mm_storeu_si128(x_ptr + 2, prod_hi); }
 
-        LEO_MUL_128_LS(x16 + 1, y16 + 1);
-        LEO_MUL_128_LS(x16, y16);
-        x16 += 4, y16 += 4;
+            LEO_MUL_128_LS(x16 + 1, y16 + 1);
+            LEO_MUL_128_LS(x16, y16);
+            x16 += 4, y16 += 4;
 
-        bytes -= 64;
-    } while (bytes > 0);
+            bytes -= 64;
+        } while (bytes > 0);
+
+        return;
+    }
+
+    RefMul(x, y, log_m, bytes);
 }
 
 
@@ -555,34 +661,43 @@ static void IFFT_DIT2(
     }
 #endif // LEO_TRY_AVX2
 
-    LEO_MUL_TABLES_128(0, log_m);
-
-    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
-
-    LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
-    LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<LEO_M128 *>(y);
-
-    do
+    if (CpuHasSSSE3)
     {
+        LEO_MUL_TABLES_128(0, log_m);
+
+        const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
+
+        LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
+        LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<LEO_M128 *>(y);
+
+        do
+        {
 #define LEO_IFFTB_128(x_ptr, y_ptr) { \
-        LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
-        LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
-        LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
-        LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \
-        y_lo = _mm_xor_si128(y_lo, x_lo); \
-        y_hi = _mm_xor_si128(y_hi, x_hi); \
-        _mm_storeu_si128(y_ptr, y_lo); \
-        _mm_storeu_si128(y_ptr + 2, y_hi); \
-        LEO_MULADD_128(x_lo, x_hi, y_lo, y_hi, 0); \
-        _mm_storeu_si128(x_ptr, x_lo); \
-        _mm_storeu_si128(x_ptr + 2, x_hi); }
+            LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
+            LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
+            LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
+            LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \
+            y_lo = _mm_xor_si128(y_lo, x_lo); \
+            y_hi = _mm_xor_si128(y_hi, x_hi); \
+            _mm_storeu_si128(y_ptr, y_lo); \
+            _mm_storeu_si128(y_ptr + 2, y_hi); \
+            LEO_MULADD_128(x_lo, x_hi, y_lo, y_hi, 0); \
+            _mm_storeu_si128(x_ptr, x_lo); \
+            _mm_storeu_si128(x_ptr + 2, x_hi); }
 
-        LEO_IFFTB_128(x16 + 1, y16 + 1);
-        LEO_IFFTB_128(x16, y16);
-        x16 += 4, y16 += 4;
+            LEO_IFFTB_128(x16 + 1, y16 + 1);
+            LEO_IFFTB_128(x16, y16);
+            x16 += 4, y16 += 4;
 
-        bytes -= 64;
-    } while (bytes > 0);
+            bytes -= 64;
+        } while (bytes > 0);
+
+        return;
+    }
+
+    // Reference version:
+    xor_mem(y, x, bytes);
+    RefMulAdd(x, y, log_m, bytes);
 }
 
 
@@ -774,10 +889,10 @@ static void IFFT_DIT_Encoder(
     // found that it only yields a 4% performance improvement, which is not
     // worth the extra complexity.
 #pragma omp parallel for
-    for (int i = 0; i < m_truncated; ++i)
+    for (int i = 0; i < (int)m_truncated; ++i)
         memcpy(work[i], data[i], bytes);
 #pragma omp parallel for
-    for (int i = m_truncated; i < m; ++i)
+    for (int i = m_truncated; i < (int)m; ++i)
         memset(work[i], 0, bytes);
 
     // I tried splitting up the first few layers into L3-cache sized blocks but
@@ -790,7 +905,7 @@
     {
         // For each set of dist*4 elements:
 #pragma omp parallel for
-        for (int r = 0; r < m_truncated; r += dist4)
+        for (int r = 0; r < (int)m_truncated; r += dist4)
         {
             const unsigned i_end = r + dist;
             const ffe_t log_m01 = skewLUT[i_end];
@@ -798,7 +913,7 @@
             const ffe_t log_m23 = skewLUT[i_end + dist * 2];
 
             // For each set of dist elements:
-            for (int i = r; i < i_end; ++i)
+            for (int i = r; i < (int)i_end; ++i)
             {
                 IFFT_DIT4(
                     bytes,
@@ -828,7 +943,7 @@
     else
     {
 #pragma omp parallel for
-        for (int i = 0; i < dist; ++i)
+        for (int i = 0; i < (int)dist; ++i)
         {
             IFFT_DIT2(
                 work[i],
@@ -860,7 +975,7 @@ static void IFFT_DIT_Decoder(
     {
         // For each set of dist*4 elements:
 #pragma omp parallel for
-        for (int r = 0; r < m_truncated; r += dist4)
+        for (int r = 0; r < (int)m_truncated; r += dist4)
        {
             const unsigned i_end = r + dist;
             const ffe_t log_m01 = skewLUT[i_end];
@@ -868,7 +983,7 @@
             const ffe_t log_m23 = skewLUT[i_end + dist * 2];
 
             // For each set of dist elements:
-            for (int i = r; i < i_end; ++i)
+            for (int i = r; i < (int)i_end; ++i)
             {
                 IFFT_DIT4(
                     bytes,
@@ -894,7 +1009,7 @@
     else
     {
 #pragma omp parallel for
-        for (int i = 0; i < dist; ++i)
+        for (int i = 0; i < (int)dist; ++i)
         {
             IFFT_DIT2(
                 work[i],
@@ -1000,34 +1115,43 @@
     }
 #endif // LEO_TRY_AVX2
 
-    LEO_MUL_TABLES_128(0, log_m);
-
-    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
-
-    LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
-    LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<LEO_M128 *>(y);
-
-    do
+    if (CpuHasSSSE3)
     {
+        LEO_MUL_TABLES_128(0, log_m);
+
+        const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
+
+        LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
+        LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<LEO_M128 *>(y);
+
+        do
+        {
 #define LEO_FFTB_128(x_ptr, y_ptr) { \
-        LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
-        LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
-        LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
-        LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \
-        LEO_MULADD_128(x_lo, x_hi, y_lo, y_hi, 0); \
-        _mm_storeu_si128(x_ptr, x_lo); \
-        _mm_storeu_si128(x_ptr + 2, x_hi); \
-        y_lo = _mm_xor_si128(y_lo, x_lo); \
-        y_hi = _mm_xor_si128(y_hi, x_hi); \
-        _mm_storeu_si128(y_ptr, y_lo); \
-        _mm_storeu_si128(y_ptr + 2, y_hi); }
+            LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
+            LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
+            LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
+            LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \
+            LEO_MULADD_128(x_lo, x_hi, y_lo, y_hi, 0); \
+            _mm_storeu_si128(x_ptr, x_lo); \
+            _mm_storeu_si128(x_ptr + 2, x_hi); \
+            y_lo = _mm_xor_si128(y_lo, x_lo); \
+            y_hi = _mm_xor_si128(y_hi, x_hi); \
+            _mm_storeu_si128(y_ptr, y_lo); \
+            _mm_storeu_si128(y_ptr + 2, y_hi); }
 
-        LEO_FFTB_128(x16 + 1, y16 + 1);
-        LEO_FFTB_128(x16, y16);
-        x16 += 4, y16 += 4;
+            LEO_FFTB_128(x16 + 1, y16 + 1);
+            LEO_FFTB_128(x16, y16);
+            x16 += 4, y16 += 4;
 
-        bytes -= 64;
-    } while (bytes > 0);
+            bytes -= 64;
+        } while (bytes > 0);
+
+        return;
+    }
+
+    // Reference version:
+    RefMulAdd(x, y, log_m, bytes);
+    xor_mem(y, x, bytes);
 }
 
 
@@ -1222,7 +1346,7 @@ static void FFT_DIT(
         // For each set of dist*4 elements:
 #pragma omp parallel for
-        for (int r = 0; r < m_truncated; r += dist4)
+        for (int r = 0; r < (int)m_truncated; r += dist4)
         {
             const unsigned i_end = r + dist;
             const ffe_t log_m01 = skewLUT[i_end];
@@ -1230,7 +1354,7 @@
             const ffe_t log_m23 = skewLUT[i_end + dist * 2];
 
             // For each set of dist elements:
-            for (int i = r; i < i_end; ++i)
+            for (int i = r; i < (int)i_end; ++i)
             {
                 FFT_DIT4(
                     bytes,
@@ -1247,7 +1371,7 @@
     if (dist4 == 2)
     {
 #pragma omp parallel for
-        for (int r = 0; r < m_truncated; r += 2)
+        for (int r = 0; r < (int)m_truncated; r += 2)
         {
             const ffe_t log_m = skewLUT[r + 1];
 
@@ -1470,7 +1594,7 @@ static void FFT_DIT_ErrorBits(
     {
         // For each set of dist*4 elements:
 #pragma omp parallel for
-        for (int r = 0; r < n_truncated; r += dist4)
+        for (int r = 0; r < (int)n_truncated; r += dist4)
         {
             if (!error_bits.IsNeeded(mip_level, r))
                 continue;
@@ -1482,7 +1606,7 @@
 
             // For each set of dist elements:
 #pragma omp parallel for
-            for (int i = r; i < i_end; ++i)
+            for (int i = r; i < (int)i_end; ++i)
             {
                 FFT_DIT4(
                     bytes,
@@ -1499,7 +1623,7 @@
     if (dist4 == 2)
     {
 #pragma omp parallel for
-        for (int r = 0; r < n_truncated; r += 2)
+        for (int r = 0; r < (int)n_truncated; r += 2)
         {
             if (!error_bits.IsNeeded(mip_level, r))
                 continue;
@@ -1543,10 +1667,10 @@ void ReedSolomonDecode(
 #endif // LEO_ERROR_BITFIELD_OPT
 
     ffe_t error_locations[kOrder] = {};
-    for (int i = 0; i < recovery_count; ++i)
+    for (unsigned i = 0; i < recovery_count; ++i)
         if (!recovery[i])
             error_locations[i] = 1;
-    for (int i = recovery_count; i < m; ++i)
+    for (unsigned i = recovery_count; i < m; ++i)
         error_locations[i] = 1;
     for (unsigned i = 0; i < original_count; ++i)
     {
@@ -1576,7 +1700,7 @@
 
     // work <- recovery data
 #pragma omp parallel for
-    for (int i = 0; i < recovery_count; ++i)
+    for (int i = 0; i < (int)recovery_count; ++i)
     {
         if (recovery[i])
             mul_mem(work[i], recovery[i], error_locations[i], buffer_bytes);
@@ -1584,13 +1708,13 @@
             memset(work[i], 0, buffer_bytes);
     }
 #pragma omp parallel for
-    for (int i = recovery_count; i < m; ++i)
+    for (int i = recovery_count; i < (int)m; ++i)
         memset(work[i], 0, buffer_bytes);
 
     // work <- original data
 #pragma omp parallel for
-    for (int i = 0; i < original_count; ++i)
+    for (int i = 0; i < (int)original_count; ++i)
     {
         if (original[i])
             mul_mem(work[m + i], original[i], error_locations[m + i], buffer_bytes);
@@ -1598,7 +1722,7 @@
             memset(work[m + i], 0, buffer_bytes);
     }
 #pragma omp parallel for
-    for (int i = m + original_count; i < n; ++i)
+    for (int i = m + original_count; i < (int)n; ++i)
         memset(work[i], 0, buffer_bytes);
 
     // work <- IFFT(work, n, 0)
@@ -1646,7 +1770,7 @@
 
     // Reveal erasures
-    for (int i = 0; i < original_count; ++i)
+    for (unsigned i = 0; i < original_count; ++i)
         if (!original[i])
             mul_mem(work[i], work[i + m], kModulus - error_locations[i + m], buffer_bytes);
 }
 
@@ -1662,9 +1786,6 @@ bool Initialize()
     if (IsInitialized)
         return true;
 
-    if (!CpuHasSSSE3)
-        return false;
-
     InitializeLogarithmTables();
     InitializeMultiplyTables();
     FFTInitialize();
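
The LeopardFF8.cpp changes below mirror the FF16 ones: the existing scalar fallbacks are factored into RefMul()/RefMulAdd() so the SIMD kernels can return early when SSSE3 is unavailable. The non-mobile branch packs eight table-lookup bytes into a single 64-bit store. A rough, self-contained illustration of that packing, assuming little-endian layout (as the patch's uint64_t path does) and using a hypothetical identity table in place of Multiply8LUT:

// Sketch only, not part of the patch.
#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
    uint8_t lut[256];
    for (unsigned v = 0; v < 256; ++v)
        lut[v] = (uint8_t)v; // identity stand-in for one row of Multiply8LUT

    const uint8_t y[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    uint8_t x[8] = { 0 };

    // Packed form: accumulate eight looked-up bytes into one 64-bit word.
    uint64_t x_0;
    std::memcpy(&x_0, x, 8);
    for (unsigned j = 0; j < 8; ++j)
        x_0 ^= (uint64_t)lut[y[j]] << (8 * j);
    std::memcpy(x, &x_0, 8);

    // Matches the byte-at-a-time form used on LEO_TARGET_MOBILE (little-endian only).
    for (unsigned j = 0; j < 8; ++j)
        assert(x[j] == lut[y[j]]);
    return 0;
}
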
diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp
index 07061ab..b87eda1 100644
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@@ -243,6 +243,99 @@ static const Multiply256LUT_t* Multiply256LUT = nullptr;
 
 static const ffe_t* Multiply8LUT = nullptr;
 
+// Reference version of muladd: x[] ^= y[] * log_m
+static LEO_FORCE_INLINE void RefMulAdd(
+    void* LEO_RESTRICT x,
+    const void* LEO_RESTRICT y,
+    ffe_t log_m,
+    uint64_t bytes)
+{
+    const ffe_t* LEO_RESTRICT lut = Multiply8LUT + (unsigned)log_m * 256;
+    const ffe_t * LEO_RESTRICT y1 = reinterpret_cast<const ffe_t *>(y);
+
+#ifdef LEO_TARGET_MOBILE
+    ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
+
+    do
+    {
+        for (unsigned j = 0; j < 64; ++j)
+            x1[j] ^= lut[y1[j]];
+
+        x1 += 64, y1 += 64;
+        bytes -= 64;
+    } while (bytes > 0);
+#else
+    uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x);
+
+    do
+    {
+        for (unsigned j = 0; j < 8; ++j)
+        {
+            uint64_t x_0 = x8[j];
+            x_0 ^= (uint64_t)lut[y1[0]];
+            x_0 ^= (uint64_t)lut[y1[1]] << 8;
+            x_0 ^= (uint64_t)lut[y1[2]] << 16;
+            x_0 ^= (uint64_t)lut[y1[3]] << 24;
+            x_0 ^= (uint64_t)lut[y1[4]] << 32;
+            x_0 ^= (uint64_t)lut[y1[5]] << 40;
+            x_0 ^= (uint64_t)lut[y1[6]] << 48;
+            x_0 ^= (uint64_t)lut[y1[7]] << 56;
+            x8[j] = x_0;
+            y1 += 8;
+        }
+
+        x8 += 8;
+        bytes -= 64;
+    } while (bytes > 0);
+#endif
+}
+
+// Reference version of mul: x[] = y[] * log_m
+static LEO_FORCE_INLINE void RefMul(
+    void* LEO_RESTRICT x,
+    const void* LEO_RESTRICT y,
+    ffe_t log_m,
+    uint64_t bytes)
+{
+    const ffe_t* LEO_RESTRICT lut = Multiply8LUT + (unsigned)log_m * 256;
+    const ffe_t * LEO_RESTRICT y1 = reinterpret_cast<const ffe_t *>(y);
+
+#ifdef LEO_TARGET_MOBILE
+    ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
+
+    do
+    {
+        for (unsigned j = 0; j < 64; ++j)
+            x1[j] = lut[y1[j]];
+
+        x1 += 64, y1 += 64;
+        bytes -= 64;
+    } while (bytes > 0);
+#else
+    uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x);
+
+    do
+    {
+        for (unsigned j = 0; j < 8; ++j)
+        {
+            uint64_t x_0 = (uint64_t)lut[y1[0]];
+            x_0 ^= (uint64_t)lut[y1[1]] << 8;
+            x_0 ^= (uint64_t)lut[y1[2]] << 16;
+            x_0 ^= (uint64_t)lut[y1[3]] << 24;
+            x_0 ^= (uint64_t)lut[y1[4]] << 32;
+            x_0 ^= (uint64_t)lut[y1[5]] << 40;
+            x_0 ^= (uint64_t)lut[y1[6]] << 48;
+            x_0 ^= (uint64_t)lut[y1[7]] << 56;
+            x8[j] = x_0;
+            y1 += 8;
+        }
+
+        x8 += 8;
+        bytes -= 64;
+    } while (bytes > 0);
+#endif
+}
+
 static void InitializeMultiplyTables()
 {
     // If we cannot use the PSHUFB instruction, generate Multiply8LUT:
@@ -382,18 +475,7 @@ static void mul_mem(
     }
 
     // Reference version:
-    const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256;
-    ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
-    const ffe_t * LEO_RESTRICT y1 = reinterpret_cast<const ffe_t *>(y);
-
-    do
-    {
-        for (unsigned j = 0; j < 64; ++j)
-            x1[j] = lut[y1[j]];
-
-        x1 += 64, y1 += 64;
-        bytes -= 64;
-    } while (bytes > 0);
+    RefMul(x, y, log_m, bytes);
 }
 
 
@@ -575,47 +657,8 @@ static void IFFT_DIT2(
     }
 
     // Reference version:
-    const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256;
-    xor_mem(y, x, bytes);
-
-#ifdef LEO_TARGET_MOBILE
-    ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
-    ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);
-
-    do
-    {
-        for (unsigned j = 0; j < 64; ++j)
-            x1[j] ^= lut[y1[j]];
-
-        x1 += 64, y1 += 64;
-        bytes -= 64;
-    } while (bytes > 0);
-#else
-    uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x);
-    ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);
-
-    do
-    {
-        for (unsigned j = 0; j < 8; ++j)
-        {
-            uint64_t x_0 = x8[j];
-            x_0 ^= (uint64_t)lut[y1[0]];
-            x_0 ^= (uint64_t)lut[y1[1]] << 8;
-            x_0 ^= (uint64_t)lut[y1[2]] << 16;
-            x_0 ^= (uint64_t)lut[y1[3]] << 24;
-            x_0 ^= (uint64_t)lut[y1[4]] << 32;
-            x_0 ^= (uint64_t)lut[y1[5]] << 40;
-            x_0 ^= (uint64_t)lut[y1[6]] << 48;
-            x_0 ^= (uint64_t)lut[y1[7]] << 56;
-            x8[j] = x_0;
-            y1 += 8;
-        }
-
-        x8 += 8;
-        bytes -= 64;
-    } while (bytes > 0);
-#endif
+    xor_mem(y, x, bytes);
+    RefMulAdd(x, y, log_m, bytes);
 }
 
 
@@ -852,49 +895,8 @@ static void IFFT_DIT2_xor(
     }
 
     // Reference version:
-    const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256;
-
     xor_mem(y_in, x_in, bytes);
-
-    uint64_t count = bytes;
-    ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y_in);
-
-#ifdef LEO_TARGET_MOBILE
-    ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x_in);
-
-    do
-    {
-        for (unsigned j = 0; j < 64; ++j)
-            x1[j] ^= lut[y1[j]];
-
-        x1 += 64, y1 += 64;
-        count -= 64;
-    } while (count > 0);
-#else
-    uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x_in);
-
-    do
-    {
-        for (unsigned j = 0; j < 8; ++j)
-        {
-            uint64_t x_0 = x8[j];
-            x_0 ^= (uint64_t)lut[y1[0]];
-            x_0 ^= (uint64_t)lut[y1[1]] << 8;
-            x_0 ^= (uint64_t)lut[y1[2]] << 16;
-            x_0 ^= (uint64_t)lut[y1[3]] << 24;
-            x_0 ^= (uint64_t)lut[y1[4]] << 32;
-            x_0 ^= (uint64_t)lut[y1[5]] << 40;
-            x_0 ^= (uint64_t)lut[y1[6]] << 48;
-            x_0 ^= (uint64_t)lut[y1[7]] << 56;
-            x8[j] = x_0;
-            y1 += 8;
-        }
-
-        x8 += 8;
-        count -= 64;
-    } while (count > 0);
-#endif
-
+    RefMulAdd(x_in, y_in, log_m, bytes);
     xor_mem(y_out, y_in, bytes);
     xor_mem(x_out, x_in, bytes);
 }
 
 
@@ -1379,52 +1381,8 @@ static void FFT_DIT2(
     }
 
     // Reference version:
-    const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256;
-
-#ifdef LEO_TARGET_MOBILE
-    ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
-    ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);
-
-    do
-    {
-        for (unsigned j = 0; j < 64; ++j)
-        {
-            ffe_t x_0 = x1[j];
-            ffe_t y_0 = y1[j];
-            x_0 ^= lut[y_0];
-            x1[j] = x_0;
-            y1[j] = y_0 ^ x_0;
-        }
-
-        x1 += 64, y1 += 64;
-        bytes -= 64;
-    } while (bytes > 0);
-#else
-    uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x);
-    uint64_t * LEO_RESTRICT y8 = reinterpret_cast<uint64_t *>(y);
-    ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);
-
-    do
-    {
-        for (unsigned j = 0; j < 8; ++j)
-        {
-            uint64_t x_0 = x8[j], y_0 = y8[j];
-            x_0 ^= (uint64_t)lut[y1[0]];
-            x_0 ^= (uint64_t)lut[y1[1]] << 8;
-            x_0 ^= (uint64_t)lut[y1[2]] << 16;
-            x_0 ^= (uint64_t)lut[y1[3]] << 24;
-            x_0 ^= (uint64_t)lut[y1[4]] << 32;
-            x_0 ^= (uint64_t)lut[y1[5]] << 40;
-            x_0 ^= (uint64_t)lut[y1[6]] << 48;
-            x_0 ^= (uint64_t)lut[y1[7]] << 56;
-            x8[j] = x_0, y8[j] = y_0 ^ x_0;
-            y1 += 8;
-        }
-
-        x8 += 8, y8 += 8;
-        bytes -= 64;
-    } while (bytes > 0);
-#endif
+    RefMulAdd(x, y, log_m, bytes);
+    xor_mem(y, x, bytes);
 }
 
 
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
index 5ad1110..b5b7ed6 100644
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -48,13 +48,13 @@ struct TestParameters
     unsigned original_count = 100; // under 65536
     unsigned recovery_count = 10; // under 65536 - original_count
 #endif
-    unsigned buffer_bytes = 1344; // multiple of 64 bytes
+    unsigned buffer_bytes = 64000; // multiple of 64 bytes
     unsigned loss_count = 32768; // some fraction of original_count
     unsigned seed = 2;
 };
 
 static const unsigned kLargeTrialCount = 1;
-static const unsigned kSmallTrialCount = 300;
+static const unsigned kSmallTrialCount = 1;
 
 
 //------------------------------------------------------------------------------
@@ -564,19 +564,34 @@ int main(int argc, char **argv)
             goto Failed;
 
 #if 1
-    static const unsigned kMaxRandomData = 32768;
+    static const unsigned kMaxLargeRandomData = 32768;
+    static const unsigned kMaxSmallRandomData = 128;
 
     prng.Seed(params.seed, 8);
 
     for (;; ++params.seed)
     {
-        params.original_count = prng.Next() % kMaxRandomData + 1;
-        params.recovery_count = prng.Next() % params.original_count + 1;
-        params.loss_count = prng.Next() % params.recovery_count + 1;
+        // Large:
+        {
+            params.original_count = prng.Next() % kMaxLargeRandomData + 1;
+            params.recovery_count = prng.Next() % params.original_count + 1;
+            params.loss_count = prng.Next() % params.recovery_count + 1;
 
-        cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;
+            cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;
 
-        if (!Benchmark(params))
-            goto Failed;
+            if (!Benchmark(params))
+                goto Failed;
+        }
+        // Small:
+        {
+            params.original_count = prng.Next() % kMaxSmallRandomData + 1;
+            params.recovery_count = prng.Next() % params.original_count + 1;
+            params.loss_count = prng.Next() % params.recovery_count + 1;
+
+            cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;
+
+            if (!Benchmark(params))
+                goto Failed;
+        }
     }
 #endif
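
One relationship worth noting across both files: the reference IFFT_DIT2 path applies the XOR before RefMulAdd(), while the reference FFT_DIT2 path multiplies first and XORs after, matching the SIMD butterflies. With the same log_m, the FFT butterfly undoes the IFFT butterfly regardless of how the multiply is implemented. A hedged scalar sketch of that ordering; mul() below is an arbitrary placeholder, not Leopard's field arithmetic:

// Sketch only, not part of the patch.
#include <cassert>
#include <cstdint>

using ffe_t = uint16_t;

static ffe_t mul(ffe_t y, ffe_t log_m) // hypothetical stand-in for the table-driven multiply
{
    return (ffe_t)((y * 0x2f5bu) ^ log_m ^ (y >> 3));
}

// IFFT_DIT2 reference order: xor_mem() first, then RefMulAdd().
static void ifft_butterfly(ffe_t& x, ffe_t& y, ffe_t log_m)
{
    y ^= x;             // y <- x + y
    x ^= mul(y, log_m); // x <- x + y * m
}

// FFT_DIT2 reference order: RefMulAdd() first, then xor_mem().
static void fft_butterfly(ffe_t& x, ffe_t& y, ffe_t log_m)
{
    x ^= mul(y, log_m); // x <- x + y * m
    y ^= x;             // y <- x + y
}

int main()
{
    ffe_t x = 0x1234, y = 0xabcd;
    ifft_butterfly(x, y, 0x0777);
    fft_butterfly(x, y, 0x0777); // undoes the IFFT butterfly
    assert(x == 0x1234 && y == 0xabcd);
    return 0;
}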