Add fallbacks for processors that do not support SSSE3

Christopher Taylor 2017-06-06 20:45:50 -07:00
parent a4a00679a6
commit dee7d414de
3 changed files with 337 additions and 243 deletions

File 1 of 3

@@ -119,10 +119,11 @@ static void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated)
{
// For each set of dist*4 elements:
#pragma omp parallel for
- for (int r = 0; r < m_truncated; r += dist4)
+ for (int r = 0; r < (int)m_truncated; r += dist4)
{
// For each set of dist elements:
- for (int i = r; i < r + dist; ++i)
+ const int i_end = r + dist;
+ for (int i = r; i < i_end; ++i)
FWHT_4(data + i, dist);
}
}
@@ -130,7 +131,7 @@ static void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated)
// If there is one layer left:
if (dist < m)
#pragma omp parallel for
- for (int i = 0; i < dist; ++i)
+ for (int i = 0; i < (int)dist; ++i)
FWHT_2(data[i], data[i + dist]);
}
@@ -294,12 +295,110 @@ static const Multiply256LUT_t* Multiply256LUT = nullptr;
#endif // LEO_TRY_AVX2
// Stores the partial products of x * log_m for each 4-bit nibble of x
// Repeated accesses from the same log_m value are faster
struct Product16Table
{
ffe_t LUT[4 * 16];
};
static const Product16Table* Multiply16LUT = nullptr;
// Reference version of muladd: x[] ^= y[] * log_m
static LEO_FORCE_INLINE void RefMulAdd(
void* LEO_RESTRICT x,
const void* LEO_RESTRICT y,
ffe_t log_m,
uint64_t bytes)
{
const ffe_t* LEO_RESTRICT lut = Multiply16LUT[log_m].LUT;
const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y);
uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x);
do
{
for (unsigned i = 0; i < 32; ++i)
{
const unsigned lo = y1[i];
const unsigned hi = y1[i + 32];
const ffe_t prod =
lut[(lo & 15)] ^
lut[(lo >> 4) + 16] ^
lut[(hi & 15) + 32] ^
lut[(hi >> 4) + 48];
x1[i] ^= (uint8_t)prod;
x1[i + 32] ^= (uint8_t)(prod >> 8);
}
x1 += 64, y1 += 64;
bytes -= 64;
} while (bytes > 0);
}
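The four 16-entry lookups above work because multiplication by a fixed log_m is XOR-linear over the nibbles of x. A minimal scalar sketch of that identity, assuming the MultiplyLog(x, log_m) helper used elsewhere in this file (illustration only, not part of the diff):

// Sketch: the nibble decomposition that Product16Table caches.
static ffe_t SlowMulViaNibbles(ffe_t x, ffe_t log_m)
{
    ffe_t prod = 0;
    for (unsigned shift = 0; shift < 16; shift += 4)
        prod ^= MultiplyLog((ffe_t)(((x >> shift) & 15) << shift), log_m);
    return prod; // equals MultiplyLog(x, log_m) for all x
}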
// Reference version of mul: x[] = y[] * log_m
static LEO_FORCE_INLINE void RefMul(
void* LEO_RESTRICT x,
const void* LEO_RESTRICT y,
ffe_t log_m,
uint64_t bytes)
{
const ffe_t* LEO_RESTRICT lut = Multiply16LUT[log_m].LUT;
const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y);
uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x);
do
{
for (unsigned i = 0; i < 32; ++i)
{
const unsigned lo = y1[i];
const unsigned hi = y1[i + 32];
const ffe_t prod =
lut[(lo & 15)] ^
lut[(lo >> 4) + 16] ^
lut[(hi & 15) + 32] ^
lut[(hi >> 4) + 48];
x1[i] = (uint8_t)prod;
x1[i + 32] = (uint8_t)(prod >> 8);
}
x1 += 64, y1 += 64;
bytes -= 64;
} while (bytes > 0);
}
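Both reference loops assume a split byte layout: within each 64-byte block, the low bytes of 32 words occupy offsets [0, 32) and the high bytes occupy [32, 64). A hypothetical packing helper to illustrate (PackSplit16 is not a library function):

// Hypothetical: builds the layout that RefMulAdd/RefMul index into.
static void PackSplit16(const ffe_t words[32], uint8_t block[64])
{
    for (unsigned i = 0; i < 32; ++i)
    {
        block[i] = (uint8_t)words[i];             // low byte lane
        block[i + 32] = (uint8_t)(words[i] >> 8); // high byte lane
    }
}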
static void InitializeMultiplyTables()
{
// If we cannot use the PSHUFB instruction, generate Multiply16LUT:
if (!CpuHasSSSE3)
{
Multiply16LUT = new Product16Table[65536];
// For each log_m multiplicand:
#pragma omp parallel for
for (int log_m = 0; log_m < (int)kOrder; ++log_m)
{
const Product16Table& lut = Multiply16LUT[log_m];
for (unsigned nibble = 0, shift = 0; nibble < 4; ++nibble, shift += 4)
{
ffe_t* nibble_lut = (ffe_t*)&lut.LUT[nibble * 16];
for (unsigned x_nibble = 0; x_nibble < 16; ++x_nibble)
{
const ffe_t prod = MultiplyLog(x_nibble << shift, static_cast<ffe_t>(log_m));
nibble_lut[x_nibble] = prod;
}
}
}
return;
}
if (CpuHasAVX2)
Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
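For scale, the new fallback table is sizable. A compile-time check of its footprint, assuming ffe_t is uint16_t as in this 16-bit field file (the static_asserts are illustrative, not from the commit):

#include <cstdint>

using ffe_t = uint16_t; // assumption: this file's field element type
struct Product16Table { ffe_t LUT[4 * 16]; };

static_assert(sizeof(Product16Table) == 128, "4 nibble LUTs * 16 entries * 2 bytes");
static_assert(65536 * sizeof(Product16Table) == 8u << 20, "fallback table totals 8 MiB");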
@@ -381,29 +480,36 @@ static void mul_mem(
}
#endif // LEO_TRY_AVX2
if (CpuHasSSSE3)
{
    LEO_MUL_TABLES_128(0, log_m);

    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);

    LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
    const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(y);

    do
    {
#define LEO_MUL_128_LS(x_ptr, y_ptr) { \
    const LEO_M128 data_lo = _mm_loadu_si128(y_ptr); \
    const LEO_M128 data_hi = _mm_loadu_si128(y_ptr + 2); \
    LEO_M128 prod_lo, prod_hi; \
    LEO_MUL_128(data_lo, data_hi, 0); \
    _mm_storeu_si128(x_ptr, prod_lo); \
    _mm_storeu_si128(x_ptr + 2, prod_hi); }

        LEO_MUL_128_LS(x16 + 1, y16 + 1);
        LEO_MUL_128_LS(x16, y16);
        x16 += 4, y16 += 4;

        bytes -= 64;
    } while (bytes > 0);

    return;
}

RefMul(x, y, log_m, bytes);
}
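mul_mem now selects the SSSE3 path at runtime and otherwise falls back to RefMul. A minimal sketch of how a CpuHasSSSE3-style flag can be populated on GCC/Clang x86 targets (the library's actual detection lives elsewhere and may differ):

#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
static bool DetectSSSE3()
{
    return __builtin_cpu_supports("ssse3") != 0; // GCC/Clang builtin
}
#else
static bool DetectSSSE3()
{
    return false; // conservative: take the reference path
}
#endif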
@@ -555,34 +661,43 @@ static void IFFT_DIT2(
}
#endif // LEO_TRY_AVX2
if (CpuHasSSSE3)
{
    LEO_MUL_TABLES_128(0, log_m);

    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);

    LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
    LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<LEO_M128 *>(y);

    do
    {
#define LEO_IFFTB_128(x_ptr, y_ptr) { \
    LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
    LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
    LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
    LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \
    y_lo = _mm_xor_si128(y_lo, x_lo); \
    y_hi = _mm_xor_si128(y_hi, x_hi); \
    _mm_storeu_si128(y_ptr, y_lo); \
    _mm_storeu_si128(y_ptr + 2, y_hi); \
    LEO_MULADD_128(x_lo, x_hi, y_lo, y_hi, 0); \
    _mm_storeu_si128(x_ptr, x_lo); \
    _mm_storeu_si128(x_ptr + 2, x_hi); }

        LEO_IFFTB_128(x16 + 1, y16 + 1);
        LEO_IFFTB_128(x16, y16);
        x16 += 4, y16 += 4;

        bytes -= 64;
    } while (bytes > 0);

    return;
}

// Reference version:
xor_mem(y, x, bytes);
RefMulAdd(x, y, log_m, bytes);
}
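IFFT_DIT2 is a two-point inverse butterfly: first y ^= x, then x ^= y * m. A scalar sketch of the butterfly pair, again assuming MultiplyLog (not the library API):

static void IFFT_Butterfly2(ffe_t& x, ffe_t& y, ffe_t log_m)
{
    y ^= x;                     // y' = x + y (XOR is addition in GF(2^16))
    x ^= MultiplyLog(y, log_m); // x' = x + y' * m
}

static void FFT_Butterfly2(ffe_t& x, ffe_t& y, ffe_t log_m)
{
    x ^= MultiplyLog(y, log_m); // x' = x + y * m
    y ^= x;                     // y' = x' + y
}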
@@ -774,10 +889,10 @@ static void IFFT_DIT_Encoder(
// found that it only yields a 4% performance improvement, which is not
// worth the extra complexity.
#pragma omp parallel for
- for (int i = 0; i < m_truncated; ++i)
+ for (int i = 0; i < (int)m_truncated; ++i)
memcpy(work[i], data[i], bytes);
#pragma omp parallel for
- for (int i = m_truncated; i < m; ++i)
+ for (int i = m_truncated; i < (int)m; ++i)
memset(work[i], 0, bytes);
// I tried splitting up the first few layers into L3-cache sized blocks but
@@ -790,7 +905,7 @@ static void IFFT_DIT_Encoder(
{
// For each set of dist*4 elements:
#pragma omp parallel for
- for (int r = 0; r < m_truncated; r += dist4)
+ for (int r = 0; r < (int)m_truncated; r += dist4)
{
const unsigned i_end = r + dist;
const ffe_t log_m01 = skewLUT[i_end];
@@ -798,7 +913,7 @@ static void IFFT_DIT_Encoder(
const ffe_t log_m23 = skewLUT[i_end + dist * 2];
// For each set of dist elements:
- for (int i = r; i < i_end; ++i)
+ for (int i = r; i < (int)i_end; ++i)
{
IFFT_DIT4(
bytes,
@@ -828,7 +943,7 @@ static void IFFT_DIT_Encoder(
else
{
#pragma omp parallel for
- for (int i = 0; i < dist; ++i)
+ for (int i = 0; i < (int)dist; ++i)
{
IFFT_DIT2(
work[i],
@@ -860,7 +975,7 @@ static void IFFT_DIT_Decoder(
{
// For each set of dist*4 elements:
#pragma omp parallel for
- for (int r = 0; r < m_truncated; r += dist4)
+ for (int r = 0; r < (int)m_truncated; r += dist4)
{
const unsigned i_end = r + dist;
const ffe_t log_m01 = skewLUT[i_end];
@@ -868,7 +983,7 @@ static void IFFT_DIT_Decoder(
const ffe_t log_m23 = skewLUT[i_end + dist * 2];
// For each set of dist elements:
- for (int i = r; i < i_end; ++i)
+ for (int i = r; i < (int)i_end; ++i)
{
IFFT_DIT4(
bytes,
@@ -894,7 +1009,7 @@ static void IFFT_DIT_Decoder(
else
{
#pragma omp parallel for
- for (int i = 0; i < dist; ++i)
+ for (int i = 0; i < (int)dist; ++i)
{
IFFT_DIT2(
work[i],
@@ -1000,34 +1115,43 @@ static void FFT_DIT2(
}
#endif // LEO_TRY_AVX2
if (CpuHasSSSE3)
{
    LEO_MUL_TABLES_128(0, log_m);

    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);

    LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
    LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<LEO_M128 *>(y);

    do
    {
#define LEO_FFTB_128(x_ptr, y_ptr) { \
    LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
    LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
    LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
    LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \
    LEO_MULADD_128(x_lo, x_hi, y_lo, y_hi, 0); \
    _mm_storeu_si128(x_ptr, x_lo); \
    _mm_storeu_si128(x_ptr + 2, x_hi); \
    y_lo = _mm_xor_si128(y_lo, x_lo); \
    y_hi = _mm_xor_si128(y_hi, x_hi); \
    _mm_storeu_si128(y_ptr, y_lo); \
    _mm_storeu_si128(y_ptr + 2, y_hi); }

        LEO_FFTB_128(x16 + 1, y16 + 1);
        LEO_FFTB_128(x16, y16);
        x16 += 4, y16 += 4;

        bytes -= 64;
    } while (bytes > 0);

    return;
}

// Reference version:
RefMulAdd(x, y, log_m, bytes);
xor_mem(y, x, bytes);
}
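FFT_DIT2 mirrors the inverse butterfly with the two operations in the opposite order, so the forward step undoes the inverse step for the same log_m. A quick round-trip check using the sketches above (illustrative only):

static bool ButterflyRoundTrip(ffe_t x, ffe_t y, ffe_t log_m)
{
    const ffe_t x0 = x, y0 = y;
    IFFT_Butterfly2(x, y, log_m);
    FFT_Butterfly2(x, y, log_m);
    return x == x0 && y == y0; // holds for all inputs
}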
@@ -1222,7 +1346,7 @@ static void FFT_DIT(
// For each set of dist*4 elements:
#pragma omp parallel for
- for (int r = 0; r < m_truncated; r += dist4)
+ for (int r = 0; r < (int)m_truncated; r += dist4)
{
const unsigned i_end = r + dist;
const ffe_t log_m01 = skewLUT[i_end];
@@ -1230,7 +1354,7 @@ static void FFT_DIT(
const ffe_t log_m23 = skewLUT[i_end + dist * 2];
// For each set of dist elements:
- for (int i = r; i < i_end; ++i)
+ for (int i = r; i < (int)i_end; ++i)
{
FFT_DIT4(
bytes,
@@ -1247,7 +1371,7 @@ static void FFT_DIT(
if (dist4 == 2)
{
#pragma omp parallel for
- for (int r = 0; r < m_truncated; r += 2)
+ for (int r = 0; r < (int)m_truncated; r += 2)
{
const ffe_t log_m = skewLUT[r + 1];
@@ -1470,7 +1594,7 @@ static void FFT_DIT_ErrorBits(
{
// For each set of dist*4 elements:
#pragma omp parallel for
- for (int r = 0; r < n_truncated; r += dist4)
+ for (int r = 0; r < (int)n_truncated; r += dist4)
{
if (!error_bits.IsNeeded(mip_level, r))
continue;
@@ -1482,7 +1606,7 @@ static void FFT_DIT_ErrorBits(
// For each set of dist elements:
#pragma omp parallel for
- for (int i = r; i < i_end; ++i)
+ for (int i = r; i < (int)i_end; ++i)
{
FFT_DIT4(
bytes,
@@ -1499,7 +1623,7 @@ static void FFT_DIT_ErrorBits(
if (dist4 == 2)
{
#pragma omp parallel for
- for (int r = 0; r < n_truncated; r += 2)
+ for (int r = 0; r < (int)n_truncated; r += 2)
{
if (!error_bits.IsNeeded(mip_level, r))
continue;
@@ -1543,10 +1667,10 @@ void ReedSolomonDecode(
#endif // LEO_ERROR_BITFIELD_OPT
ffe_t error_locations[kOrder] = {};
- for (int i = 0; i < recovery_count; ++i)
+ for (unsigned i = 0; i < recovery_count; ++i)
if (!recovery[i])
error_locations[i] = 1;
- for (int i = recovery_count; i < m; ++i)
+ for (unsigned i = recovery_count; i < m; ++i)
error_locations[i] = 1;
for (unsigned i = 0; i < original_count; ++i)
{
@@ -1576,7 +1700,7 @@ void ReedSolomonDecode(
// work <- recovery data
#pragma omp parallel for
- for (int i = 0; i < recovery_count; ++i)
+ for (int i = 0; i < (int)recovery_count; ++i)
{
if (recovery[i])
mul_mem(work[i], recovery[i], error_locations[i], buffer_bytes);
@@ -1584,13 +1708,13 @@ void ReedSolomonDecode(
memset(work[i], 0, buffer_bytes);
}
#pragma omp parallel for
- for (int i = recovery_count; i < m; ++i)
+ for (int i = recovery_count; i < (int)m; ++i)
memset(work[i], 0, buffer_bytes);
// work <- original data
#pragma omp parallel for
- for (int i = 0; i < original_count; ++i)
+ for (int i = 0; i < (int)original_count; ++i)
{
if (original[i])
mul_mem(work[m + i], original[i], error_locations[m + i], buffer_bytes);
@@ -1598,7 +1722,7 @@ void ReedSolomonDecode(
memset(work[m + i], 0, buffer_bytes);
}
#pragma omp parallel for
- for (int i = m + original_count; i < n; ++i)
+ for (int i = m + original_count; i < (int)n; ++i)
memset(work[i], 0, buffer_bytes);
// work <- IFFT(work, n, 0)
@@ -1646,7 +1770,7 @@ void ReedSolomonDecode(
// Reveal erasures
- for (int i = 0; i < original_count; ++i)
+ for (unsigned i = 0; i < original_count; ++i)
if (!original[i])
mul_mem(work[i], work[i + m], kModulus - error_locations[i + m], buffer_bytes);
}
@@ -1662,9 +1786,6 @@ bool Initialize()
if (IsInitialized)
return true;
- if (!CpuHasSSSE3)
-     return false;
InitializeLogarithmTables();
InitializeMultiplyTables();
FFTInitialize();

File 2 of 3

@@ -243,6 +243,99 @@ static const Multiply256LUT_t* Multiply256LUT = nullptr;
static const ffe_t* Multiply8LUT = nullptr;
// Reference version of muladd: x[] ^= y[] * log_m
static LEO_FORCE_INLINE void RefMulAdd(
void* LEO_RESTRICT x,
const void* LEO_RESTRICT y,
ffe_t log_m,
uint64_t bytes)
{
const ffe_t* LEO_RESTRICT lut = Multiply8LUT + (unsigned)log_m * 256;
const ffe_t * LEO_RESTRICT y1 = reinterpret_cast<const ffe_t *>(y);
#ifdef LEO_TARGET_MOBILE
ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
do
{
for (unsigned j = 0; j < 64; ++j)
x1[j] ^= lut[y1[j]];
x1 += 64, y1 += 64;
bytes -= 64;
} while (bytes > 0);
#else
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x);
do
{
for (unsigned j = 0; j < 8; ++j)
{
uint64_t x_0 = x8[j];
x_0 ^= (uint64_t)lut[y1[0]];
x_0 ^= (uint64_t)lut[y1[1]] << 8;
x_0 ^= (uint64_t)lut[y1[2]] << 16;
x_0 ^= (uint64_t)lut[y1[3]] << 24;
x_0 ^= (uint64_t)lut[y1[4]] << 32;
x_0 ^= (uint64_t)lut[y1[5]] << 40;
x_0 ^= (uint64_t)lut[y1[6]] << 48;
x_0 ^= (uint64_t)lut[y1[7]] << 56;
x8[j] = x_0;
y1 += 8;
}
x8 += 8;
bytes -= 64;
} while (bytes > 0);
#endif
}
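On non-mobile targets the loop above assembles eight byte lookups into one 64-bit store, which relies on the little-endian view of x as uint64_t. The shift pattern, isolated as a hypothetical helper:

// Hypothetical: gathers eight table bytes into one little-endian word.
static uint64_t Pack8Lookups(const ffe_t* LEO_RESTRICT lut, const ffe_t* LEO_RESTRICT y)
{
    uint64_t packed = 0;
    for (unsigned k = 0; k < 8; ++k)
        packed |= (uint64_t)lut[y[k]] << (8 * k); // disjoint bytes, so | == ^
    return packed;
}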
// Reference version of mul: x[] = y[] * log_m
static LEO_FORCE_INLINE void RefMul(
void* LEO_RESTRICT x,
const void* LEO_RESTRICT y,
ffe_t log_m,
uint64_t bytes)
{
const ffe_t* LEO_RESTRICT lut = Multiply8LUT + (unsigned)log_m * 256;
const ffe_t * LEO_RESTRICT y1 = reinterpret_cast<const ffe_t *>(y);
#ifdef LEO_TARGET_MOBILE
ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
do
{
for (unsigned j = 0; j < 64; ++j)
x1[j] = lut[y1[j]];
x1 += 64, y1 += 64;
bytes -= 64;
} while (bytes > 0);
#else
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x);
do
{
for (unsigned j = 0; j < 8; ++j)
{
uint64_t x_0 = (uint64_t)lut[y1[0]];
x_0 ^= (uint64_t)lut[y1[1]] << 8;
x_0 ^= (uint64_t)lut[y1[2]] << 16;
x_0 ^= (uint64_t)lut[y1[3]] << 24;
x_0 ^= (uint64_t)lut[y1[4]] << 32;
x_0 ^= (uint64_t)lut[y1[5]] << 40;
x_0 ^= (uint64_t)lut[y1[6]] << 48;
x_0 ^= (uint64_t)lut[y1[7]] << 56;
x8[j] = x_0;
y1 += 8;
}
x8 += 8;
bytes -= 64;
} while (bytes > 0);
#endif
}
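For contrast with the 16-bit fallback's 8 MiB table, the flat 8-bit table is tiny, assuming ffe_t is uint8_t in this file:

#include <cstdint>

// 256 log_m values * 256 inputs * 1 byte per product:
static_assert(256 * 256 * sizeof(uint8_t) == 64u << 10, "Multiply8LUT totals 64 KiB");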
static void InitializeMultiplyTables()
{
// If we cannot use the PSHUFB instruction, generate Multiply8LUT:
@@ -382,18 +475,7 @@ static void mul_mem(
}
// Reference version:
RefMul(x, y, log_m, bytes);
}
@@ -575,47 +657,8 @@ static void IFFT_DIT2(
}
// Reference version:
xor_mem(y, x, bytes);

RefMulAdd(x, y, log_m, bytes);
}
@@ -852,49 +895,8 @@ static void IFFT_DIT2_xor(
}
// Reference version:
xor_mem(y_in, x_in, bytes);

RefMulAdd(x_in, y_in, log_m, bytes);
xor_mem(y_out, y_in, bytes);
xor_mem(x_out, x_in, bytes);
}
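The _xor variant runs the same butterfly on the _in buffers and then accumulates both results into separate _out buffers, which is what the two trailing xor_mem calls do. Schematically (argument order assumed for illustration):

// IFFT_DIT2_xor(x_in, y_in, x_out, y_out, log_m, bytes) acts like:
//     IFFT_DIT2(x_in, y_in, log_m, bytes);
//     xor_mem(y_out, y_in, bytes);
//     xor_mem(x_out, x_in, bytes);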
@@ -1379,52 +1381,8 @@ static void FFT_DIT2(
}
// Reference version:
RefMulAdd(x, y, log_m, bytes);
xor_mem(y, x, bytes);
}

File 3 of 3

@@ -48,13 +48,13 @@ struct TestParameters
unsigned original_count = 100; // under 65536
unsigned recovery_count = 10; // under 65536 - original_count
#endif
- unsigned buffer_bytes = 1344; // multiple of 64 bytes
+ unsigned buffer_bytes = 64000; // multiple of 64 bytes
unsigned loss_count = 32768; // some fraction of original_count
unsigned seed = 2;
};
static const unsigned kLargeTrialCount = 1;
- static const unsigned kSmallTrialCount = 300;
+ static const unsigned kSmallTrialCount = 1;
//------------------------------------------------------------------------------
@@ -564,19 +564,34 @@ int main(int argc, char **argv)
goto Failed;
#if 1
- static const unsigned kMaxRandomData = 32768;
+ static const unsigned kMaxLargeRandomData = 32768;
+ static const unsigned kMaxSmallRandomData = 128;
prng.Seed(params.seed, 8);
for (;; ++params.seed)
{
    // Large:
    {
        params.original_count = prng.Next() % kMaxLargeRandomData + 1;
        params.recovery_count = prng.Next() % params.original_count + 1;
        params.loss_count = prng.Next() % params.recovery_count + 1;

        cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;

        if (!Benchmark(params))
            goto Failed;
    }

    // Small:
    {
        params.original_count = prng.Next() % kMaxSmallRandomData + 1;
        params.recovery_count = prng.Next() % params.original_count + 1;
        params.loss_count = prng.Next() % params.recovery_count + 1;

        cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;

        if (!Benchmark(params))
            goto Failed;
    }
}
#endif
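The chained sampling keeps every randomized trial valid by construction, since prng.Next() % bound + 1 draws uniformly from [1, bound]:

// Invariant of the sampling above:
//     1 <= loss_count <= recovery_count <= original_count <= kMax*RandomData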