From 08fed770cd3b839b559c7add93c7fb33d4022ddf Mon Sep 17 00:00:00 2001
From: Christopher Taylor <christopher.taylor@oculus.com>
Date: Fri, 2 Jun 2017 23:52:03 -0700
Subject: [PATCH] Use DIT FFT for decoder

---
 LeopardFF16.h       |  52 ---------
 LeopardFF8.cpp      | 250 ++++++++++++++------------------------------
 tests/benchmark.cpp |  10 +-
 3 files changed, 85 insertions(+), 227 deletions(-)

diff --git a/LeopardFF16.h b/LeopardFF16.h
index b36ad84..a335d0f 100644
--- a/LeopardFF16.h
+++ b/LeopardFF16.h
@@ -95,18 +95,6 @@ void fft_butterfly(
     void * LEO_RESTRICT x, void * LEO_RESTRICT y,
     ffe_t log_m, uint64_t bytes);
 
-#ifdef LEO_USE_VECTOR4_OPT
-
-// Unroll 4 rows at a time
-void fft_butterfly4(
-    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
-    void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
-    ffe_t log_m, uint64_t bytes);
-
-#endif // LEO_USE_VECTOR4_OPT
-
 
 //------------------------------------------------------------------------------
 // IFFT Operations
@@ -121,46 +109,6 @@ void ifft_butterfly(
     void * LEO_RESTRICT x, void * LEO_RESTRICT y,
     ffe_t log_m, uint64_t bytes);
 
-#ifdef LEO_USE_VECTOR4_OPT
-
-// Unroll 4 rows at a time
-void ifft_butterfly4(
-    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
-    void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
-    ffe_t log_m, uint64_t bytes);
-
-#endif // LEO_USE_VECTOR4_OPT
-
-
-//------------------------------------------------------------------------------
-// FFT
-
-/*
-    if (log_m != kModulus)
-        x[] ^= exp(log(y[]) + log_m)
-    y[] ^= x[]
-*/
-void VectorFFTButterfly(
-    const uint64_t bytes,
-    unsigned count,
-    void** x,
-    void** y,
-    const ffe_t log_m);
-
-/*
-    y[] ^= x[]
-    if (log_m != kModulus)
-        x[] ^= exp(log(y[]) + log_m)
-*/
-void VectorIFFTButterfly(
-    const uint64_t bytes,
-    unsigned count,
-    void** x,
-    void** y,
-    const ffe_t log_m);
-
 
 //------------------------------------------------------------------------------
 // Reed-Solomon Encode
diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp
index 263bdb8..026854c 100644
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@@ -228,31 +228,31 @@ struct {
 } static Multiply256LUT[kOrder];
 #endif // LEO_TRY_AVX2
 
-static ffe_t Multiply8LUT[256 * 256];
+// Stores the product of x * y at offset x + y * 256
+// Repeated accesses from the same y value are faster
+static ffe_t Multiply8LUT[256 * 256] = {};
 
 
 void InitializeMultiplyTables()
 {
+    // If we cannot use the PSHUFB instruction, generate Multiply8LUT:
     if (!CpuHasSSSE3)
     {
+        // For each left-multiplicand:
         for (unsigned x = 0; x < 256; ++x)
         {
             ffe_t* lut = Multiply8LUT + x;
 
+            // Note: Table is already zeroed out so we can skip the zeroes
             if (x == 0)
-            {
-                for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
-                    lut[log_y] = 0;
-            }
-            else
-            {
-                const ffe_t log_x = LogLUT[x];
+                continue;
 
-                for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
-                {
-                    const ffe_t prod = ExpLUT[AddMod(log_x, log_y)];
-                    *lut = prod;
-                }
+            const ffe_t log_x = LogLUT[x];
+
+            for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
+            {
+                const ffe_t prod = ExpLUT[AddMod(log_x, log_y)];
+                *lut = prod;
             }
         }
 
@@ -428,37 +428,6 @@ static void FFTInitialize()
     FWHT(LogWalsh, kOrder, kOrder);
 }
 
-void VectorFFTButterfly(
-    const uint64_t bytes,
-    unsigned count,
-    void** x,
-    void** y,
-    const ffe_t log_m)
-{
-    if (log_m == kModulus)
-    {
-        VectorXOR(bytes, count, y, x);
-        return;
-    }
-
-#ifdef LEO_USE_VECTOR4_OPT
-    while (count >= 4)
-    {
-        fft_butterfly4(
-            x[0], y[0],
-            x[1], y[1],
-            x[2], y[2],
-            x[3], y[3],
-            log_m, bytes);
-        x += 4, y += 4;
-        count -= 4;
-    }
-#endif // LEO_USE_VECTOR4_OPT
-
-    for (unsigned i = 0; i < count; ++i)
-        fft_butterfly(x[i], y[i], log_m, bytes);
-}
-
 /*
     Decimation in time IFFT:
 
@@ -1073,106 +1042,6 @@ void fft_butterfly(
 #endif
 }
 
-#ifdef LEO_USE_VECTOR4_OPT
-
-void fft_butterfly4(
-    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
-    void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
-    ffe_t log_m, uint64_t bytes)
-{
-#if defined(LEO_TRY_AVX2)
-    if (CpuHasAVX2)
-    {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
-
-        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
-
-        LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(x_0);
-        LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<LEO_M256 *>(y_0);
-        LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(x_1);
-        LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<LEO_M256 *>(y_1);
-        LEO_M256 * LEO_RESTRICT x32_2 = reinterpret_cast<LEO_M256 *>(x_2);
-        LEO_M256 * LEO_RESTRICT y32_2 = reinterpret_cast<LEO_M256 *>(y_2);
-        LEO_M256 * LEO_RESTRICT x32_3 = reinterpret_cast<LEO_M256 *>(x_3);
-        LEO_M256 * LEO_RESTRICT y32_3 = reinterpret_cast<LEO_M256 *>(y_3);
-
-        do
-        {
-            LEO_FFTB_256(x32_0 + 1, y32_0 + 1);
-            LEO_FFTB_256(x32_0, y32_0);
-            y32_0 += 2, x32_0 += 2;
-
-            LEO_FFTB_256(x32_1 + 1, y32_1 + 1);
-            LEO_FFTB_256(x32_1, y32_1);
-            y32_1 += 2, x32_1 += 2;
-
-            LEO_FFTB_256(x32_2 + 1, y32_2 + 1);
-            LEO_FFTB_256(x32_2, y32_2);
-            y32_2 += 2, x32_2 += 2;
-
-            LEO_FFTB_256(x32_3 + 1, y32_3 + 1);
-            LEO_FFTB_256(x32_3, y32_3);
-            y32_3 += 2, x32_3 += 2;
-
-            bytes -= 64;
-        } while (bytes > 0);
-
-        return;
-    }
-#endif // LEO_TRY_AVX2
-
-    if (CpuHasSSSE3)
-    {
-        const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
-        const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
-
-        const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
-
-        LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(x_0);
-        LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<LEO_M128 *>(y_0);
-        LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(x_1);
-        LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<LEO_M128 *>(y_1);
-        LEO_M128 * LEO_RESTRICT x16_2 = reinterpret_cast<LEO_M128 *>(x_2);
-        LEO_M128 * LEO_RESTRICT y16_2 = reinterpret_cast<LEO_M128 *>(y_2);
-        LEO_M128 * LEO_RESTRICT x16_3 = reinterpret_cast<LEO_M128 *>(x_3);
-        LEO_M128 * LEO_RESTRICT y16_3 = reinterpret_cast<LEO_M128 *>(y_3);
-
-        do
-        {
-            LEO_FFTB_128(x16_0 + 3, y16_0 + 3);
-            LEO_FFTB_128(x16_0 + 2, y16_0 + 2);
-            LEO_FFTB_128(x16_0 + 1, y16_0 + 1);
-            LEO_FFTB_128(x16_0, y16_0);
-            x16_0 += 4, y16_0 += 4;
-
-            LEO_FFTB_128(x16_1 + 3, y16_1 + 3);
-            LEO_FFTB_128(x16_1 + 2, y16_1 + 2);
-            LEO_FFTB_128(x16_1 + 1, y16_1 + 1);
-            LEO_FFTB_128(x16_1, y16_1);
-            x16_1 += 4, y16_1 += 4;
-
-            LEO_FFTB_128(x16_2 + 3, y16_2 + 3);
-            LEO_FFTB_128(x16_2 + 2, y16_2 + 2);
-            LEO_FFTB_128(x16_2 + 1, y16_2 + 1);
-            LEO_FFTB_128(x16_2, y16_2);
-            x16_2 += 4, y16_2 += 4;
-
-            LEO_FFTB_128(x16_3 + 3, y16_3 + 3);
-            LEO_FFTB_128(x16_3 + 2, y16_3 + 2);
-            LEO_FFTB_128(x16_3 + 1, y16_3 + 1);
-            LEO_FFTB_128(x16_3, y16_3);
-            x16_3 += 4, y16_3 += 4;
-
-            bytes -= 64;
-        } while (bytes > 0);
-    }
-}
-
-#endif // LEO_USE_VECTOR4_OPT
-
 static void FFT_DIT4(
     uint64_t bytes,
     void** work,
@@ -1540,6 +1409,66 @@ void ErrorBitfield::Prepare()
         Words[6][i] = Words[6][i + 1] = Words[5][i] | Words[5][i + 1];
 }
 
+
+static void FFT_DIT_ErrorBits(
+    const uint64_t bytes,
+    void** work,
+    const unsigned n_truncated,
+    const unsigned n,
+    const ffe_t* skewLUT,
+    const ErrorBitfield& error_bits)
+{
+    unsigned mip_level = LastNonzeroBit32(n);
+
+    // Decimation in time: Unroll 2 layers at a time
+    unsigned dist4 = n, dist = n >> 2;
+    for (; dist != 0; dist4 = dist, dist >>= 2, mip_level -=2)
+    {
+        // For each set of dist*4 elements:
+        for (unsigned r = 0; r < n_truncated; r += dist4)
+        {
+            if (!error_bits.IsNeeded(mip_level, r))
+                continue;
+
+            const ffe_t log_m01 = skewLUT[r + dist];
+            const ffe_t log_m23 = skewLUT[r + dist * 3];
+            const ffe_t log_m02 = skewLUT[r + dist * 2];
+
+            // For each set of dist elements:
+            for (unsigned i = r; i < r + dist; ++i)
+            {
+                FFT_DIT4(
+                    bytes,
+                    work + i,
+                    dist,
+                    log_m01,
+                    log_m23,
+                    log_m02);
+            }
+        }
+    }
+
+    // If there is one layer left:
+    if (dist4 == 2)
+    {
+        for (unsigned r = 0; r < n_truncated; r += 2)
+        {
+            const ffe_t log_m = skewLUT[r + 1];
+
+            if (log_m == kModulus)
+                xor_mem(work[r + 1], work[r], bytes);
+            else
+            {
+                fft_butterfly(
+                    work[r],
+                    work[r + 1],
+                    log_m,
+                    bytes);
+            }
+        }
+    }
+}
+
 #endif // LEO_ERROR_BITFIELD_OPT
 
 
@@ -1559,7 +1488,7 @@ void ReedSolomonDecode(
     // Fill in error locations
 
 #ifdef LEO_ERROR_BITFIELD_OPT
-    ErrorBitfield ErrorBits;
+    ErrorBitfield error_bits;
 #endif // LEO_ERROR_BITFIELD_OPT
 
     ffe_t ErrorLocations[kOrder] = {};
@@ -1574,13 +1503,13 @@ void ReedSolomonDecode(
         {
             ErrorLocations[i + m] = 1;
 #ifdef LEO_ERROR_BITFIELD_OPT
-            ErrorBits.Set(i + m);
+            error_bits.Set(i + m);
 #endif // LEO_ERROR_BITFIELD_OPT
         }
     }
 
 #ifdef LEO_ERROR_BITFIELD_OPT
-    ErrorBits.Prepare();
+    error_bits.Prepare();
 #endif // LEO_ERROR_BITFIELD_OPT
 
     // Evaluate error locator polynomial
@@ -1642,32 +1571,13 @@ void ReedSolomonDecode(
 
     // work <- FFT(work, n, 0) truncated to m + original_count
 
-    unsigned mip_level = LastNonzeroBit32(n);
     const unsigned output_count = m + original_count;
-    for (unsigned width = (n >> 1); width > 0; width >>= 1, --mip_level)
-    {
-        const ffe_t* skewLUT = FFTSkew + width - 1;
-        const unsigned range = width << 1;
 
-#ifdef LEO_SCHEDULE_OPT
-        for (unsigned j = (m < range) ? 0 : m; j < output_count; j += range)
-#else
-        for (unsigned j = 0; j < n; j += range)
-#endif
-        {
 #ifdef LEO_ERROR_BITFIELD_OPT
-            if (!ErrorBits.IsNeeded(mip_level, j))
-                continue;
-#endif // LEO_ERROR_BITFIELD_OPT
-
-            VectorFFTButterfly(
-                buffer_bytes,
-                width,
-                work + j,
-                work + j + width,
-                skewLUT[j]);
-        }
-    }
+    FFT_DIT_ErrorBits(buffer_bytes, work, output_count, n, FFTSkew - 1, error_bits);
+#else
+    FFT_DIT(buffer_bytes, work, output_count, n, FFTSkew - 1);
+#endif
 
     // Reveal erasures
 
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
index d059789..f8b3933 100644
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -42,13 +42,13 @@ using namespace std;
 struct TestParameters
 {
 #ifdef LEO_HAS_FF16
-    unsigned original_count = 100; // under 65536
-    unsigned recovery_count = 20; // under 65536 - original_count
+    unsigned original_count = 128; // under 65536
+    unsigned recovery_count = 128; // under 65536 - original_count
 #else
     unsigned original_count = 128; // under 65536
     unsigned recovery_count = 128; // under 65536 - original_count
 #endif
-    unsigned buffer_bytes = 64000; // multiple of 64 bytes
+    unsigned buffer_bytes = 64; // multiple of 64 bytes
     unsigned loss_count = 32768; // some fraction of original_count
     unsigned seed = 2;
     bool multithreaded = true;
@@ -399,7 +399,7 @@ static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
 
 static bool Benchmark(const TestParameters& params)
 {
-    const unsigned kTrials = params.original_count > 8000 ? 1 : 1;
+    const unsigned kTrials = params.original_count > 8000 ? 1 : 16;
 
     std::vector<uint8_t*> original_data(params.original_count);
 
@@ -594,7 +594,7 @@ int main(int argc, char **argv)
     if (!Benchmark(params))
         goto Failed;
 
-#if 1
+#if 0
     static const unsigned kMaxRandomData = 32768;
 
     prng.Seed(params.seed, 8);