From 63bfdadce4c85c771dd61e71ec50bdd76ab01f37 Mon Sep 17 00:00:00 2001 From: Christopher Taylor Date: Sun, 4 Jun 2017 19:26:26 -0700 Subject: [PATCH] Cleanups and copy pasta --- LeopardFF16.cpp | 477 ++++++++++++++++++++++++++++++++++++++++---- LeopardFF8.cpp | 368 ++++++++++++++++++++-------------- tests/benchmark.cpp | 6 +- 3 files changed, 659 insertions(+), 192 deletions(-) diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp index 74b5180..717b8a8 100644 --- a/LeopardFF16.cpp +++ b/LeopardFF16.cpp @@ -525,6 +525,7 @@ static void FFTInitialize() {1-5, 1'-5', 1-1', 5-5'}, */ +// 2-way butterfly static void IFFT_DIT2( void * LEO_RESTRICT x, void * LEO_RESTRICT y, ffe_t log_m, uint64_t bytes) @@ -624,6 +625,7 @@ static void IFFT_DIT2( } while (bytes > 0); } + // 4-way butterfly static void IFFT_DIT4( uint64_t bytes, @@ -817,7 +819,318 @@ static void IFFT_DIT4( } } -static void IFFT_DIT( + +// {x_out, y_out} ^= IFFT_DIT2( {x_in, y_in} ) +static void IFFT_DIT2_xor( + void * LEO_RESTRICT x_in, void * LEO_RESTRICT y_in, + void * LEO_RESTRICT x_out, void * LEO_RESTRICT y_out, + const ffe_t log_m, uint64_t bytes) +{ +#if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) + { + const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]); + const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]); + + const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f); + + const LEO_M256 * LEO_RESTRICT x32_in = reinterpret_cast(x_in); + const LEO_M256 * LEO_RESTRICT y32_in = reinterpret_cast(y_in); + LEO_M256 * LEO_RESTRICT x32_out = reinterpret_cast(x_out); + LEO_M256 * LEO_RESTRICT y32_out = reinterpret_cast(y_out); + + do + { +#define LEO_IFFTB_256_XOR(x_ptr_in, y_ptr_in, x_ptr_out, y_ptr_out) { \ + LEO_M256 x_data_out = _mm256_loadu_si256(x_ptr_out); \ + LEO_M256 y_data_out = _mm256_loadu_si256(y_ptr_out); \ + LEO_M256 x_data_in = _mm256_loadu_si256(x_ptr_in); \ + LEO_M256 y_data_in = _mm256_loadu_si256(y_ptr_in); \ + y_data_in = _mm256_xor_si256(y_data_in, x_data_in); \ + y_data_out = _mm256_xor_si256(y_data_out, y_data_in); \ + _mm256_storeu_si256(y_ptr_out, y_data_out); \ + LEO_MULADD_256(x_data_in, y_data_in, table_lo_y, table_hi_y); \ + x_data_out = _mm256_xor_si256(x_data_out, x_data_in); \ + _mm256_storeu_si256(x_ptr_out, x_data_out); } + + LEO_IFFTB_256_XOR(x32_in + 1, y32_in + 1, x32_out + 1, y32_out + 1); + LEO_IFFTB_256_XOR(x32_in, y32_in, x32_out, y32_out); + y32_in += 2, x32_in += 2, y32_out += 2, x32_out += 2; + + bytes -= 64; + } while (bytes > 0); + + return; + } +#endif // LEO_TRY_AVX2 + + if (CpuHasSSSE3) + { + const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]); + const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]); + + const LEO_M128 clr_mask = _mm_set1_epi8(0x0f); + + const LEO_M128 * LEO_RESTRICT x16_in = reinterpret_cast(x_in); + const LEO_M128 * LEO_RESTRICT y16_in = reinterpret_cast(y_in); + LEO_M128 * LEO_RESTRICT x16_out = reinterpret_cast(x_out); + LEO_M128 * LEO_RESTRICT y16_out = reinterpret_cast(y_out); + + do + { +#define LEO_IFFTB_128_XOR(x_ptr_in, y_ptr_in, x_ptr_out, y_ptr_out) { \ + LEO_M128 x_data_out = _mm_loadu_si128(x_ptr_out); \ + LEO_M128 y_data_out = _mm_loadu_si128(y_ptr_out); \ + LEO_M128 x_data_in = _mm_loadu_si128(x_ptr_in); \ + LEO_M128 y_data_in = _mm_loadu_si128(y_ptr_in); \ + y_data_in = _mm_xor_si128(y_data_in, x_data_in); \ + y_data_out = _mm_xor_si128(y_data_out, y_data_in); \ + _mm_storeu_si128(y_ptr_out, y_data_out); \ + LEO_MULADD_128(x_data_in, y_data_in, table_lo_y, table_hi_y); 
\ + x_data_out = _mm_xor_si128(x_data_out, x_data_in); \ + _mm_storeu_si128(x_ptr_out, x_data_out); } + + LEO_IFFTB_128_XOR(x16_in + 3, y16_in + 3, x16_out + 3, y16_out + 3); + LEO_IFFTB_128_XOR(x16_in + 2, y16_in + 2, x16_out + 2, y16_out + 2); + LEO_IFFTB_128_XOR(x16_in + 1, y16_in + 1, x16_out + 1, y16_out + 1); + LEO_IFFTB_128_XOR(x16_in, y16_in, x16_out, y16_out); + y16_in += 4, x16_in += 4, y16_out += 4, x16_out += 4; + + bytes -= 64; + } while (bytes > 0); + + return; + } + + // Reference version: + const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256; + + xor_mem(y_in, x_in, bytes); + + uint64_t count = bytes; + ffe_t * LEO_RESTRICT y1 = reinterpret_cast(y_in); + +#ifdef LEO_TARGET_MOBILE + ffe_t * LEO_RESTRICT x1 = reinterpret_cast(x_in); + + do + { + for (unsigned j = 0; j < 64; ++j) + x1[j] ^= lut[y1[j]]; + + x1 += 64, y1 += 64; + count -= 64; + } while (count > 0); +#else + uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x_in); + + do + { + for (unsigned j = 0; j < 8; ++j) + { + uint64_t x_0 = x8[j]; + x_0 ^= (uint64_t)lut[y1[0]]; + x_0 ^= (uint64_t)lut[y1[1]] << 8; + x_0 ^= (uint64_t)lut[y1[2]] << 16; + x_0 ^= (uint64_t)lut[y1[3]] << 24; + x_0 ^= (uint64_t)lut[y1[4]] << 32; + x_0 ^= (uint64_t)lut[y1[5]] << 40; + x_0 ^= (uint64_t)lut[y1[6]] << 48; + x_0 ^= (uint64_t)lut[y1[7]] << 56; + x8[j] = x_0; + y1 += 8; + } + + x8 += 8; + count -= 64; + } while (count > 0); +#endif + + xor_mem(y_out, y_in, bytes); + xor_mem(x_out, x_in, bytes); +} + + +// xor_result ^= IFFT_DIT4(work) +static void IFFT_DIT4_xor( + uint64_t bytes, + void** work_in, + void** xor_out, + unsigned dist, + const ffe_t log_m01, + const ffe_t log_m23, + const ffe_t log_m02) +{ +#ifdef LEO_INTERLEAVE_BUTTERFLY4_OPT + +#if defined(LEO_TRY_AVX2) + + if (CpuHasAVX2) + { + const LEO_M256 t01_lo = _mm256_loadu_si256(&Multiply256LUT[log_m01].Value[0]); + const LEO_M256 t01_hi = _mm256_loadu_si256(&Multiply256LUT[log_m01].Value[1]); + const LEO_M256 t23_lo = _mm256_loadu_si256(&Multiply256LUT[log_m23].Value[0]); + const LEO_M256 t23_hi = _mm256_loadu_si256(&Multiply256LUT[log_m23].Value[1]); + const LEO_M256 t02_lo = _mm256_loadu_si256(&Multiply256LUT[log_m02].Value[0]); + const LEO_M256 t02_hi = _mm256_loadu_si256(&Multiply256LUT[log_m02].Value[1]); + + const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f); + + const LEO_M256 * LEO_RESTRICT work0 = reinterpret_cast(work_in[0]); + const LEO_M256 * LEO_RESTRICT work1 = reinterpret_cast(work_in[dist]); + const LEO_M256 * LEO_RESTRICT work2 = reinterpret_cast(work_in[dist * 2]); + const LEO_M256 * LEO_RESTRICT work3 = reinterpret_cast(work_in[dist * 3]); + LEO_M256 * LEO_RESTRICT xor0 = reinterpret_cast(xor_out[0]); + LEO_M256 * LEO_RESTRICT xor1 = reinterpret_cast(xor_out[dist]); + LEO_M256 * LEO_RESTRICT xor2 = reinterpret_cast(xor_out[dist * 2]); + LEO_M256 * LEO_RESTRICT xor3 = reinterpret_cast(xor_out[dist * 3]); + + do + { + // First layer: + LEO_M256 work0_reg = _mm256_loadu_si256(work0); + LEO_M256 work1_reg = _mm256_loadu_si256(work1); + work0++, work1++; + + work1_reg = _mm256_xor_si256(work0_reg, work1_reg); + if (log_m01 != kModulus) + LEO_MULADD_256(work0_reg, work1_reg, t01_lo, t01_hi); + + LEO_M256 work2_reg = _mm256_loadu_si256(work2); + LEO_M256 work3_reg = _mm256_loadu_si256(work3); + work2++, work3++; + + work3_reg = _mm256_xor_si256(work2_reg, work3_reg); + if (log_m23 != kModulus) + LEO_MULADD_256(work2_reg, work3_reg, t23_lo, t23_hi); + + // Second layer: + work2_reg = _mm256_xor_si256(work0_reg, work2_reg); + work3_reg = _mm256_xor_si256(work1_reg, 
work3_reg); + if (log_m02 != kModulus) + { + LEO_MULADD_256(work0_reg, work2_reg, t02_lo, t02_hi); + LEO_MULADD_256(work1_reg, work3_reg, t02_lo, t02_hi); + } + + work0_reg = _mm256_xor_si256(work0_reg, _mm256_loadu_si256(xor0)); + work1_reg = _mm256_xor_si256(work1_reg, _mm256_loadu_si256(xor1)); + work2_reg = _mm256_xor_si256(work2_reg, _mm256_loadu_si256(xor2)); + work3_reg = _mm256_xor_si256(work3_reg, _mm256_loadu_si256(xor3)); + + _mm256_storeu_si256(xor0, work0_reg); + _mm256_storeu_si256(xor1, work1_reg); + _mm256_storeu_si256(xor2, work2_reg); + _mm256_storeu_si256(xor3, work3_reg); + xor0++, xor1++, xor2++, xor3++; + + bytes -= 32; + } while (bytes > 0); + + return; + } + +#endif // LEO_TRY_AVX2 + + if (CpuHasSSSE3) + { + const LEO_M128 t01_lo = _mm_loadu_si128(&Multiply128LUT[log_m01].Value[0]); + const LEO_M128 t01_hi = _mm_loadu_si128(&Multiply128LUT[log_m01].Value[1]); + const LEO_M128 t23_lo = _mm_loadu_si128(&Multiply128LUT[log_m23].Value[0]); + const LEO_M128 t23_hi = _mm_loadu_si128(&Multiply128LUT[log_m23].Value[1]); + const LEO_M128 t02_lo = _mm_loadu_si128(&Multiply128LUT[log_m02].Value[0]); + const LEO_M128 t02_hi = _mm_loadu_si128(&Multiply128LUT[log_m02].Value[1]); + + const LEO_M128 clr_mask = _mm_set1_epi8(0x0f); + + const LEO_M128 * LEO_RESTRICT work0 = reinterpret_cast(work_in[0]); + const LEO_M128 * LEO_RESTRICT work1 = reinterpret_cast(work_in[dist]); + const LEO_M128 * LEO_RESTRICT work2 = reinterpret_cast(work_in[dist * 2]); + const LEO_M128 * LEO_RESTRICT work3 = reinterpret_cast(work_in[dist * 3]); + LEO_M128 * LEO_RESTRICT xor0 = reinterpret_cast(xor_out[0]); + LEO_M128 * LEO_RESTRICT xor1 = reinterpret_cast(xor_out[dist]); + LEO_M128 * LEO_RESTRICT xor2 = reinterpret_cast(xor_out[dist * 2]); + LEO_M128 * LEO_RESTRICT xor3 = reinterpret_cast(xor_out[dist * 3]); + + do + { + // First layer: + LEO_M128 work0_reg = _mm_loadu_si128(work0); + LEO_M128 work1_reg = _mm_loadu_si128(work1); + work0++, work1++; + + work1_reg = _mm_xor_si128(work0_reg, work1_reg); + if (log_m01 != kModulus) + LEO_MULADD_128(work0_reg, work1_reg, t01_lo, t01_hi); + + LEO_M128 work2_reg = _mm_loadu_si128(work2); + LEO_M128 work3_reg = _mm_loadu_si128(work3); + work2++, work3++; + + work3_reg = _mm_xor_si128(work2_reg, work3_reg); + if (log_m23 != kModulus) + LEO_MULADD_128(work2_reg, work3_reg, t23_lo, t23_hi); + + // Second layer: + work2_reg = _mm_xor_si128(work0_reg, work2_reg); + work3_reg = _mm_xor_si128(work1_reg, work3_reg); + if (log_m02 != kModulus) + { + LEO_MULADD_128(work0_reg, work2_reg, t02_lo, t02_hi); + LEO_MULADD_128(work1_reg, work3_reg, t02_lo, t02_hi); + } + + work0_reg = _mm_xor_si128(work0_reg, _mm_loadu_si128(xor0)); + work1_reg = _mm_xor_si128(work1_reg, _mm_loadu_si128(xor1)); + work2_reg = _mm_xor_si128(work2_reg, _mm_loadu_si128(xor2)); + work3_reg = _mm_xor_si128(work3_reg, _mm_loadu_si128(xor3)); + + _mm_storeu_si128(xor0, work0_reg); + _mm_storeu_si128(xor1, work1_reg); + _mm_storeu_si128(xor2, work2_reg); + _mm_storeu_si128(xor3, work3_reg); + xor0++, xor1++, xor2++, xor3++; + + bytes -= 16; + } while (bytes > 0); + + return; + } + +#endif // LEO_INTERLEAVE_BUTTERFLY4_OPT + + // First layer: + if (log_m01 == kModulus) + xor_mem(work_in[dist], work_in[0], bytes); + else + IFFT_DIT2(work_in[0], work_in[dist], log_m01, bytes); + + if (log_m23 == kModulus) + xor_mem(work_in[dist * 3], work_in[dist * 2], bytes); + else + IFFT_DIT2(work_in[dist * 2], work_in[dist * 3], log_m23, bytes); + + // Second layer: + if (log_m02 == kModulus) + { + xor_mem(work_in[dist 
* 2], work_in[0], bytes); + xor_mem(work_in[dist * 3], work_in[dist], bytes); + } + else + { + IFFT_DIT2(work_in[0], work_in[dist * 2], log_m02, bytes); + IFFT_DIT2(work_in[dist], work_in[dist * 3], log_m02, bytes); + } + + xor_mem(xor_out[0], work_in[0], bytes); + xor_mem(xor_out[dist], work_in[dist], bytes); + xor_mem(xor_out[dist * 2], work_in[dist * 2], bytes); + xor_mem(xor_out[dist * 3], work_in[dist * 3], bytes); +} + + +// Unrolled IFFT for encoder +static void IFFT_DIT_Encoder( const uint64_t bytes, const void* const* data, const unsigned m_truncated, @@ -826,14 +1139,13 @@ static void IFFT_DIT( const unsigned m, const ffe_t* skewLUT) { - // FIXME: Roll into first layer - if (data) - { - for (unsigned i = 0; i < m_truncated; ++i) - memcpy(work[i], data[i], bytes); - for (unsigned i = m_truncated; i < m; ++i) - memset(work[i], 0, bytes); - } + // I tried rolling the memcpy/memset into the first layer of the FFT and + // found that it only yields a 4% performance improvement, which is not + // worth the extra complexity. + for (unsigned i = 0; i < m_truncated; ++i) + memcpy(work[i], data[i], bytes); + for (unsigned i = m_truncated; i < m; ++i) + memset(work[i], 0, bytes); // I tried splitting up the first few layers into L3-cache sized blocks but // found that it only provides about 5% performance boost, which is not @@ -846,12 +1158,117 @@ static void IFFT_DIT( // For each set of dist*4 elements: for (unsigned r = 0; r < m_truncated; r += dist4) { - const ffe_t log_m01 = skewLUT[r + dist]; - const ffe_t log_m23 = skewLUT[r + dist * 3]; - const ffe_t log_m02 = skewLUT[r + dist * 2]; + const unsigned i_end = r + dist; + const ffe_t log_m01 = skewLUT[i_end]; + const ffe_t log_m02 = skewLUT[i_end + dist]; + const ffe_t log_m23 = skewLUT[i_end + dist * 2]; + + if (dist4 == m && xor_result) + { + // For each set of dist elements: + for (unsigned i = r; i < i_end; ++i) + { + IFFT_DIT4_xor( + bytes, + work + i, + xor_result + i, + dist, + log_m01, + log_m23, + log_m02); + } + } + else + { + // For each set of dist elements: + for (unsigned i = r; i < i_end; ++i) + { + IFFT_DIT4( + bytes, + work + i, + dist, + log_m01, + log_m23, + log_m02); + } + } + } + + // I tried alternating sweeps left->right and right->left to reduce cache misses. + // It provides about 1% performance boost when done for both FFT and IFFT, so it + // does not seem to be worth the extra complexity. 
+ } + + // If there is one layer left: + if (dist < m) + { + // Assuming that dist = m / 2 + LEO_DEBUG_ASSERT(dist * 2 == m); + + const ffe_t log_m = skewLUT[dist]; + + if (xor_result) + { + if (log_m == kModulus) + { + for (unsigned i = 0; i < dist; ++i) + xor_mem_2to1(xor_result[i], work[i], work[i + dist], bytes); + } + else + { + for (unsigned i = 0; i < dist; ++i) + { + IFFT_DIT2_xor( + work[i], + work[i + dist], + xor_result[i], + xor_result[i + dist], + log_m, + bytes); + } + } + } + else + { + if (log_m == kModulus) + VectorXOR(bytes, dist, work + dist, work); + else + { + for (unsigned i = 0; i < dist; ++i) + { + IFFT_DIT2( + work[i], + work[i + dist], + log_m, + bytes); + } + } + } + } +} + + +// Basic no-frills version for decoder +static void IFFT_DIT_Decoder( + const uint64_t bytes, + const unsigned m_truncated, + void** work, + const unsigned m, + const ffe_t* skewLUT) +{ + // Decimation in time: Unroll 2 layers at a time + unsigned dist = 1, dist4 = 4; + for (; dist4 <= m; dist = dist4, dist4 <<= 2) + { + // For each set of dist*4 elements: + for (unsigned r = 0; r < m_truncated; r += dist4) + { + const unsigned i_end = r + dist; + const ffe_t log_m01 = skewLUT[i_end]; + const ffe_t log_m02 = skewLUT[i_end + dist]; + const ffe_t log_m23 = skewLUT[i_end + dist * 2]; // For each set of dist elements: - const unsigned i_end = r + dist; for (unsigned i = r; i < i_end; ++i) { IFFT_DIT4( @@ -863,13 +1280,6 @@ static void IFFT_DIT( log_m02); } } - - // I tried alternating sweeps left->right and right->left to reduce cache misses. - // It provides about 1% performance boost when done for both FFT and IFFT, so it - // does not seem to be worth the extra complexity. - - // Clear data after the first layer - data = nullptr; } // If there is one layer left: @@ -894,11 +1304,6 @@ static void IFFT_DIT( } } } - - // FIXME: Roll into last layer - if (xor_result) - for (unsigned i = 0; i < m; ++i) - xor_mem(xor_result[i], work[i], bytes); } /* @@ -955,6 +1360,7 @@ static void IFFT_DIT( {4-6, 5-7, 4-5, 6-7}, */ +// 2-way butterfly static void FFT_DIT2( void * LEO_RESTRICT x, void * LEO_RESTRICT y, ffe_t log_m, uint64_t bytes) @@ -1054,6 +1460,8 @@ static void FFT_DIT2( } while (bytes > 0); } + +// 4-way butterfly static void FFT_DIT4( uint64_t bytes, void** work, @@ -1227,6 +1635,7 @@ static void FFT_DIT4( } +// In-place FFT for encoder and decoder static void FFT_DIT( const uint64_t bytes, void** work, @@ -1241,12 +1650,12 @@ static void FFT_DIT( // For each set of dist*4 elements: for (unsigned r = 0; r < m_truncated; r += dist4) { - const ffe_t log_m01 = skewLUT[r + dist]; - const ffe_t log_m23 = skewLUT[r + dist * 3]; - const ffe_t log_m02 = skewLUT[r + dist * 2]; + const unsigned i_end = r + dist; + const ffe_t log_m01 = skewLUT[i_end]; + const ffe_t log_m02 = skewLUT[i_end + dist]; + const ffe_t log_m23 = skewLUT[i_end + dist * 2]; // For each set of dist elements: - const unsigned i_end = r + dist; for (unsigned i = r; i < i_end; ++i) { FFT_DIT4( @@ -1297,7 +1706,7 @@ void ReedSolomonEncode( const ffe_t* skewLUT = FFTSkew + m - 1; - IFFT_DIT( + IFFT_DIT_Encoder( buffer_bytes, data, original_count < m ? 
original_count : m, @@ -1318,7 +1727,7 @@ void ReedSolomonEncode( // work <- work xor IFFT(data + i, m, m + i) - IFFT_DIT( + IFFT_DIT_Encoder( buffer_bytes, data, // data source m, @@ -1338,7 +1747,7 @@ void ReedSolomonEncode( // work <- work xor IFFT(data + i, m, m + i) - IFFT_DIT( + IFFT_DIT_Encoder( buffer_bytes, data, // data source last_count, @@ -1607,12 +2016,10 @@ void ReedSolomonDecode( // work <- IFFT(work, n, 0) - IFFT_DIT( + IFFT_DIT_Decoder( buffer_bytes, - nullptr, m + original_count, work, - nullptr, n, FFTSkew - 1); diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp index 2121a0f..66fc567 100644 --- a/LeopardFF8.cpp +++ b/LeopardFF8.cpp @@ -505,6 +505,7 @@ static void FFTInitialize() {1-5, 1'-5', 1-1', 5-5'}, */ +// 2-way butterfly static void IFFT_DIT2( void * LEO_RESTRICT x, void * LEO_RESTRICT y, ffe_t log_m, uint64_t bytes) @@ -617,135 +618,6 @@ static void IFFT_DIT2( #endif } -// {x_out, y_out} ^= IFFT_DIT2( {x_in, y_in} ) -static void IFFT_DIT2_xor( - void * LEO_RESTRICT x_in, void * LEO_RESTRICT y_in, - void * LEO_RESTRICT x_out, void * LEO_RESTRICT y_out, - const ffe_t log_m, uint64_t bytes) -{ -#if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) - { - const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]); - const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]); - - const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f); - - const LEO_M256 * LEO_RESTRICT x32_in = reinterpret_cast(x_in); - const LEO_M256 * LEO_RESTRICT y32_in = reinterpret_cast(y_in); - LEO_M256 * LEO_RESTRICT x32_out = reinterpret_cast(x_out); - LEO_M256 * LEO_RESTRICT y32_out = reinterpret_cast(y_out); - - do - { -#define LEO_IFFTB_256_XOR(x_ptr_in, y_ptr_in, x_ptr_out, y_ptr_out) { \ - LEO_M256 x_data_out = _mm256_loadu_si256(x_ptr_out); \ - LEO_M256 y_data_out = _mm256_loadu_si256(y_ptr_out); \ - LEO_M256 x_data_in = _mm256_loadu_si256(x_ptr_in); \ - LEO_M256 y_data_in = _mm256_loadu_si256(y_ptr_in); \ - y_data_in = _mm256_xor_si256(y_data_in, x_data_in); \ - y_data_out = _mm256_xor_si256(y_data_out, y_data_in); \ - _mm256_storeu_si256(y_ptr_out, y_data_out); \ - LEO_MULADD_256(x_data_in, y_data_in, table_lo_y, table_hi_y); \ - x_data_out = _mm256_xor_si256(x_data_out, x_data_in); \ - _mm256_storeu_si256(x_ptr_out, x_data_out); } - - LEO_IFFTB_256_XOR(x32_in + 1, y32_in + 1, x32_out + 1, y32_out + 1); - LEO_IFFTB_256_XOR(x32_in, y32_in, x32_out, y32_out); - y32_in += 2, x32_in += 2, y32_out += 2, x32_out += 2; - - bytes -= 64; - } while (bytes > 0); - - return; - } -#endif // LEO_TRY_AVX2 - - if (CpuHasSSSE3) - { - const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]); - const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]); - - const LEO_M128 clr_mask = _mm_set1_epi8(0x0f); - - const LEO_M128 * LEO_RESTRICT x16_in = reinterpret_cast(x_in); - const LEO_M128 * LEO_RESTRICT y16_in = reinterpret_cast(y_in); - LEO_M128 * LEO_RESTRICT x16_out = reinterpret_cast(x_out); - LEO_M128 * LEO_RESTRICT y16_out = reinterpret_cast(y_out); - - do - { -#define LEO_IFFTB_128_XOR(x_ptr_in, y_ptr_in, x_ptr_out, y_ptr_out) { \ - LEO_M128 x_data_out = _mm_loadu_si128(x_ptr_out); \ - LEO_M128 y_data_out = _mm_loadu_si128(y_ptr_out); \ - LEO_M128 x_data_in = _mm_loadu_si128(x_ptr_in); \ - LEO_M128 y_data_in = _mm_loadu_si128(y_ptr_in); \ - y_data_in = _mm_xor_si128(y_data_in, x_data_in); \ - y_data_out = _mm_xor_si128(y_data_out, y_data_in); \ - _mm_storeu_si128(y_ptr_out, y_data_out); \ - LEO_MULADD_128(x_data_in, y_data_in, table_lo_y, 
table_hi_y); \ - x_data_out = _mm_xor_si128(x_data_out, x_data_in); \ - _mm_storeu_si128(x_ptr_out, x_data_out); } - - LEO_IFFTB_128_XOR(x16_in + 3, y16_in + 3, x16_out + 3, y16_out + 3); - LEO_IFFTB_128_XOR(x16_in + 2, y16_in + 2, x16_out + 2, y16_out + 2); - LEO_IFFTB_128_XOR(x16_in + 1, y16_in + 1, x16_out + 1, y16_out + 1); - LEO_IFFTB_128_XOR(x16_in, y16_in, x16_out, y16_out); - y16_in += 4, x16_in += 4, y16_out += 4, x16_out += 4; - - bytes -= 64; - } while (bytes > 0); - - return; - } - - // Reference version: - const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256; - - xor_mem(y_in, x_in, bytes); - - uint64_t count = bytes; - ffe_t * LEO_RESTRICT y1 = reinterpret_cast(y_in); - -#ifdef LEO_TARGET_MOBILE - ffe_t * LEO_RESTRICT x1 = reinterpret_cast(x_in); - - do - { - for (unsigned j = 0; j < 64; ++j) - x1[j] ^= lut[y1[j]]; - - x1 += 64, y1 += 64; - count -= 64; - } while (count > 0); -#else - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x_in); - - do - { - for (unsigned j = 0; j < 8; ++j) - { - uint64_t x_0 = x8[j]; - x_0 ^= (uint64_t)lut[y1[0]]; - x_0 ^= (uint64_t)lut[y1[1]] << 8; - x_0 ^= (uint64_t)lut[y1[2]] << 16; - x_0 ^= (uint64_t)lut[y1[3]] << 24; - x_0 ^= (uint64_t)lut[y1[4]] << 32; - x_0 ^= (uint64_t)lut[y1[5]] << 40; - x_0 ^= (uint64_t)lut[y1[6]] << 48; - x_0 ^= (uint64_t)lut[y1[7]] << 56; - x8[j] = x_0; - y1 += 8; - } - - x8 += 8; - count -= 64; - } while (count > 0); -#endif - - xor_mem(y_out, y_in, bytes); - xor_mem(x_out, x_in, bytes); -} // 4-way butterfly static void IFFT_DIT4( @@ -896,6 +768,138 @@ static void IFFT_DIT4( } } + +// {x_out, y_out} ^= IFFT_DIT2( {x_in, y_in} ) +static void IFFT_DIT2_xor( + void * LEO_RESTRICT x_in, void * LEO_RESTRICT y_in, + void * LEO_RESTRICT x_out, void * LEO_RESTRICT y_out, + const ffe_t log_m, uint64_t bytes) +{ +#if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) + { + const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]); + const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]); + + const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f); + + const LEO_M256 * LEO_RESTRICT x32_in = reinterpret_cast(x_in); + const LEO_M256 * LEO_RESTRICT y32_in = reinterpret_cast(y_in); + LEO_M256 * LEO_RESTRICT x32_out = reinterpret_cast(x_out); + LEO_M256 * LEO_RESTRICT y32_out = reinterpret_cast(y_out); + + do + { +#define LEO_IFFTB_256_XOR(x_ptr_in, y_ptr_in, x_ptr_out, y_ptr_out) { \ + LEO_M256 x_data_out = _mm256_loadu_si256(x_ptr_out); \ + LEO_M256 y_data_out = _mm256_loadu_si256(y_ptr_out); \ + LEO_M256 x_data_in = _mm256_loadu_si256(x_ptr_in); \ + LEO_M256 y_data_in = _mm256_loadu_si256(y_ptr_in); \ + y_data_in = _mm256_xor_si256(y_data_in, x_data_in); \ + y_data_out = _mm256_xor_si256(y_data_out, y_data_in); \ + _mm256_storeu_si256(y_ptr_out, y_data_out); \ + LEO_MULADD_256(x_data_in, y_data_in, table_lo_y, table_hi_y); \ + x_data_out = _mm256_xor_si256(x_data_out, x_data_in); \ + _mm256_storeu_si256(x_ptr_out, x_data_out); } + + LEO_IFFTB_256_XOR(x32_in + 1, y32_in + 1, x32_out + 1, y32_out + 1); + LEO_IFFTB_256_XOR(x32_in, y32_in, x32_out, y32_out); + y32_in += 2, x32_in += 2, y32_out += 2, x32_out += 2; + + bytes -= 64; + } while (bytes > 0); + + return; + } +#endif // LEO_TRY_AVX2 + + if (CpuHasSSSE3) + { + const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]); + const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]); + + const LEO_M128 clr_mask = _mm_set1_epi8(0x0f); + + const LEO_M128 * LEO_RESTRICT x16_in = reinterpret_cast(x_in); + const 
LEO_M128 * LEO_RESTRICT y16_in = reinterpret_cast(y_in); + LEO_M128 * LEO_RESTRICT x16_out = reinterpret_cast(x_out); + LEO_M128 * LEO_RESTRICT y16_out = reinterpret_cast(y_out); + + do + { +#define LEO_IFFTB_128_XOR(x_ptr_in, y_ptr_in, x_ptr_out, y_ptr_out) { \ + LEO_M128 x_data_out = _mm_loadu_si128(x_ptr_out); \ + LEO_M128 y_data_out = _mm_loadu_si128(y_ptr_out); \ + LEO_M128 x_data_in = _mm_loadu_si128(x_ptr_in); \ + LEO_M128 y_data_in = _mm_loadu_si128(y_ptr_in); \ + y_data_in = _mm_xor_si128(y_data_in, x_data_in); \ + y_data_out = _mm_xor_si128(y_data_out, y_data_in); \ + _mm_storeu_si128(y_ptr_out, y_data_out); \ + LEO_MULADD_128(x_data_in, y_data_in, table_lo_y, table_hi_y); \ + x_data_out = _mm_xor_si128(x_data_out, x_data_in); \ + _mm_storeu_si128(x_ptr_out, x_data_out); } + + LEO_IFFTB_128_XOR(x16_in + 3, y16_in + 3, x16_out + 3, y16_out + 3); + LEO_IFFTB_128_XOR(x16_in + 2, y16_in + 2, x16_out + 2, y16_out + 2); + LEO_IFFTB_128_XOR(x16_in + 1, y16_in + 1, x16_out + 1, y16_out + 1); + LEO_IFFTB_128_XOR(x16_in, y16_in, x16_out, y16_out); + y16_in += 4, x16_in += 4, y16_out += 4, x16_out += 4; + + bytes -= 64; + } while (bytes > 0); + + return; + } + + // Reference version: + const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256; + + xor_mem(y_in, x_in, bytes); + + uint64_t count = bytes; + ffe_t * LEO_RESTRICT y1 = reinterpret_cast(y_in); + +#ifdef LEO_TARGET_MOBILE + ffe_t * LEO_RESTRICT x1 = reinterpret_cast(x_in); + + do + { + for (unsigned j = 0; j < 64; ++j) + x1[j] ^= lut[y1[j]]; + + x1 += 64, y1 += 64; + count -= 64; + } while (count > 0); +#else + uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x_in); + + do + { + for (unsigned j = 0; j < 8; ++j) + { + uint64_t x_0 = x8[j]; + x_0 ^= (uint64_t)lut[y1[0]]; + x_0 ^= (uint64_t)lut[y1[1]] << 8; + x_0 ^= (uint64_t)lut[y1[2]] << 16; + x_0 ^= (uint64_t)lut[y1[3]] << 24; + x_0 ^= (uint64_t)lut[y1[4]] << 32; + x_0 ^= (uint64_t)lut[y1[5]] << 40; + x_0 ^= (uint64_t)lut[y1[6]] << 48; + x_0 ^= (uint64_t)lut[y1[7]] << 56; + x8[j] = x_0; + y1 += 8; + } + + x8 += 8; + count -= 64; + } while (count > 0); +#endif + + xor_mem(y_out, y_in, bytes); + xor_mem(x_out, x_in, bytes); +} + + // xor_result ^= IFFT_DIT4(work) static void IFFT_DIT4_xor( uint64_t bytes, @@ -1073,7 +1077,9 @@ static void IFFT_DIT4_xor( xor_mem(xor_out[dist * 3], work_in[dist * 3], bytes); } -static void IFFT_DIT( + +// Unrolled IFFT for encoder +static void IFFT_DIT_Encoder( const uint64_t bytes, const void* const* data, const unsigned m_truncated, @@ -1085,13 +1091,10 @@ static void IFFT_DIT( // I tried rolling the memcpy/memset into the first layer of the FFT and // found that it only yields a 4% performance improvement, which is not // worth the extra complexity. 
- if (data) - { - for (unsigned i = 0; i < m_truncated; ++i) - memcpy(work[i], data[i], bytes); - for (unsigned i = m_truncated; i < m; ++i) - memset(work[i], 0, bytes); - } + for (unsigned i = 0; i < m_truncated; ++i) + memcpy(work[i], data[i], bytes); + for (unsigned i = m_truncated; i < m; ++i) + memset(work[i], 0, bytes); // I tried splitting up the first few layers into L3-cache sized blocks but // found that it only provides about 5% performance boost, which is not @@ -1104,11 +1107,10 @@ static void IFFT_DIT( // For each set of dist*4 elements: for (unsigned r = 0; r < m_truncated; r += dist4) { - const ffe_t log_m01 = skewLUT[r + dist]; - const ffe_t log_m23 = skewLUT[r + dist * 3]; - const ffe_t log_m02 = skewLUT[r + dist * 2]; - const unsigned i_end = r + dist; + const ffe_t log_m01 = skewLUT[i_end]; + const ffe_t log_m02 = skewLUT[i_end + dist]; + const ffe_t log_m23 = skewLUT[i_end + dist * 2]; if (dist4 == m && xor_result) { @@ -1144,9 +1146,6 @@ static void IFFT_DIT( // I tried alternating sweeps left->right and right->left to reduce cache misses. // It provides about 1% performance boost when done for both FFT and IFFT, so it // does not seem to be worth the extra complexity. - - // Clear data after the first layer - data = nullptr; } // If there is one layer left: @@ -1197,6 +1196,65 @@ static void IFFT_DIT( } } + +// Basic no-frills version for decoder +static void IFFT_DIT_Decoder( + const uint64_t bytes, + const unsigned m_truncated, + void** work, + const unsigned m, + const ffe_t* skewLUT) +{ + // Decimation in time: Unroll 2 layers at a time + unsigned dist = 1, dist4 = 4; + for (; dist4 <= m; dist = dist4, dist4 <<= 2) + { + // For each set of dist*4 elements: + for (unsigned r = 0; r < m_truncated; r += dist4) + { + const unsigned i_end = r + dist; + const ffe_t log_m01 = skewLUT[i_end]; + const ffe_t log_m02 = skewLUT[i_end + dist]; + const ffe_t log_m23 = skewLUT[i_end + dist * 2]; + + // For each set of dist elements: + for (unsigned i = r; i < i_end; ++i) + { + IFFT_DIT4( + bytes, + work + i, + dist, + log_m01, + log_m23, + log_m02); + } + } + } + + // If there is one layer left: + if (dist < m) + { + // Assuming that dist = m / 2 + LEO_DEBUG_ASSERT(dist * 2 == m); + + const ffe_t log_m = skewLUT[dist]; + + if (log_m == kModulus) + VectorXOR(bytes, dist, work + dist, work); + else + { + for (unsigned i = 0; i < dist; ++i) + { + IFFT_DIT2( + work[i], + work[i + dist], + log_m, + bytes); + } + } + } +} + /* Decimation in time FFT: @@ -1251,6 +1309,7 @@ static void IFFT_DIT( {4-6, 5-7, 4-5, 6-7}, */ +// 2-way butterfly static void FFT_DIT2( void * LEO_RESTRICT x, void * LEO_RESTRICT y, ffe_t log_m, uint64_t bytes) @@ -1368,6 +1427,8 @@ static void FFT_DIT2( #endif } + +// 4-way butterfly static void FFT_DIT4( uint64_t bytes, void** work, @@ -1515,6 +1576,7 @@ static void FFT_DIT4( } +// In-place FFT for encoder and decoder static void FFT_DIT( const uint64_t bytes, void** work, @@ -1529,12 +1591,12 @@ static void FFT_DIT( // For each set of dist*4 elements: for (unsigned r = 0; r < m_truncated; r += dist4) { - const ffe_t log_m01 = skewLUT[r + dist]; - const ffe_t log_m23 = skewLUT[r + dist * 3]; - const ffe_t log_m02 = skewLUT[r + dist * 2]; + const unsigned i_end = r + dist; + const ffe_t log_m01 = skewLUT[i_end]; + const ffe_t log_m02 = skewLUT[i_end + dist]; + const ffe_t log_m23 = skewLUT[i_end + dist * 2]; // For each set of dist elements: - const unsigned i_end = r + dist; for (unsigned i = r; i < i_end; ++i) { FFT_DIT4( @@ -1585,7 +1647,7 @@ void 
ReedSolomonEncode( const ffe_t* skewLUT = FFTSkew + m - 1; - IFFT_DIT( + IFFT_DIT_Encoder( buffer_bytes, data, original_count < m ? original_count : m, @@ -1606,7 +1668,7 @@ void ReedSolomonEncode( // work <- work xor IFFT(data + i, m, m + i) - IFFT_DIT( + IFFT_DIT_Encoder( buffer_bytes, data, // data source m, @@ -1626,7 +1688,7 @@ void ReedSolomonEncode( // work <- work xor IFFT(data + i, m, m + i) - IFFT_DIT( + IFFT_DIT_Encoder( buffer_bytes, data, // data source last_count, @@ -1851,12 +1913,10 @@ void ReedSolomonDecode( // work <- IFFT(work, n, 0) - IFFT_DIT( + IFFT_DIT_Decoder( buffer_bytes, - nullptr, m + original_count, work, - nullptr, n, FFTSkew - 1); diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp index ae7c1c6..ed6e051 100644 --- a/tests/benchmark.cpp +++ b/tests/benchmark.cpp @@ -42,8 +42,8 @@ using namespace std; struct TestParameters { #ifdef LEO_HAS_FF16 - unsigned original_count = 100; // under 65536 - unsigned recovery_count = 20; // under 65536 - original_count + unsigned original_count = 1000; // under 65536 + unsigned recovery_count = 200; // under 65536 - original_count #else unsigned original_count = 100; // under 65536 unsigned recovery_count = 20; // under 65536 - original_count @@ -55,7 +55,7 @@ struct TestParameters }; static const unsigned kLargeTrialCount = 1; -static const unsigned kSmallTrialCount = 100; +static const unsigned kSmallTrialCount = 10; //------------------------------------------------------------------------------
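
Reviewer note: the SIMD bodies of the new IFFT_DIT2_xor routines are easiest to check against a scalar model. The sketch below is not code from this patch; the function name is hypothetical, the caller-supplied lut stands in for Multiply8LUT + log_m * 256 from the FF8 reference path (lut[v] = v multiplied by the field element whose log is log_m), and the 64-byte unrolling that the real loops depend on is dropped.

#include <cstddef>
#include <cstdint>

// Scalar model of IFFT_DIT2_xor: run the in-place 2-way IFFT butterfly on
// {x_in, y_in}, then XOR-accumulate the results into {x_out, y_out} instead
// of overwriting them. `lut` is a hypothetical stand-in for
// Multiply8LUT + log_m * 256 in the FF8 reference implementation.
static void ifft_dit2_xor_sketch(
    uint8_t* x_in, uint8_t* y_in,
    uint8_t* x_out, uint8_t* y_out,
    const uint8_t* lut, size_t bytes)
{
    for (size_t i = 0; i < bytes; ++i)
    {
        y_in[i] ^= x_in[i];       // y += x   (GF addition is XOR)
        x_in[i] ^= lut[y_in[i]];  // x += y * m
        x_out[i] ^= x_in[i];      // fold the results into the xor_result
        y_out[i] ^= y_in[i];      //   buffers as part of the butterfly
    }
}

The last two lines show why the _xor variants exist: folding the accumulation into the final butterfly layer is what lets IFFT_DIT_Encoder drop the trailing xor_mem(xor_result[i], work[i], bytes) loop that the old IFFT_DIT performed after the transform.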
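
The same reading applies to IFFT_DIT4_xor: its portable fallback is two layers of in-place 2-way butterflies followed by a fold of all four lanes into xor_out. A minimal scalar sketch under the same assumptions (hypothetical helper names and headers as above; a null lut stands for log_m == kModulus, where the multiply-add degenerates to a plain XOR):

// 2-way butterfly, in place; lut == nullptr models log_m == kModulus.
static void ifft_dit2_sketch(uint8_t* x, uint8_t* y, const uint8_t* lut, size_t bytes)
{
    for (size_t i = 0; i < bytes; ++i)
    {
        y[i] ^= x[i];
        if (lut)
            x[i] ^= lut[y[i]];
    }
}

// Scalar model of IFFT_DIT4_xor's fallback path: lanes correspond to
// work_in[0], work_in[dist], work_in[dist*2], work_in[dist*3].
static void ifft_dit4_xor_sketch(
    uint8_t* lane0, uint8_t* lane1, uint8_t* lane2, uint8_t* lane3,
    uint8_t* out0, uint8_t* out1, uint8_t* out2, uint8_t* out3,
    const uint8_t* lut01, const uint8_t* lut23, const uint8_t* lut02,
    size_t bytes)
{
    // First layer: pairs (0,1) and (2,3).
    ifft_dit2_sketch(lane0, lane1, lut01, bytes);
    ifft_dit2_sketch(lane2, lane3, lut23, bytes);

    // Second layer: pairs (0,2) and (1,3) share the same skew log_m02.
    ifft_dit2_sketch(lane0, lane2, lut02, bytes);
    ifft_dit2_sketch(lane1, lane3, lut02, bytes);

    // Fold the transformed lanes into the accumulator, replacing the
    // separate whole-array XOR pass the encoder used to run afterwards.
    for (size_t i = 0; i < bytes; ++i)
    {
        out0[i] ^= lane0[i];
        out1[i] ^= lane1[i];
        out2[i] ^= lane2[i];
        out3[i] ^= lane3[i];
    }
}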