Only calculate final FFT entries needed

2017-05-27 23:31:13 -07:00 · 2017-05-27 23:31:13 -07:00 · d42ea87e26
parent d66da163f9
commit d42ea87e26
2 changed files with 89 additions and 9 deletions
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@ -979,6 +979,71 @@ skip_body:
 }


+//------------------------------------------------------------------------------
+// ErrorBitfield
+
+// Used in decoding to decide which final FFT operations to perform
+class ErrorBitfield
+{
+    static const unsigned kWords = kOrder / 64;
+    uint64_t Words[7][kWords] = {};
+
+public:
+    LEO_FORCE_INLINE void Set(unsigned i)
+    {
+        Words[0][i / 64] |= (uint64_t)1 << (i % 64);
+    }
+
+    void Prepare();
+
+    LEO_FORCE_INLINE bool IsNeeded(unsigned mip_level, unsigned bit)
+    {
+        if (mip_level >= 8)
+            return true;
+        return 0 != (Words[mip_level - 1][bit / 64] & ((uint64_t)1 << (bit % 64)));
+    }
+};
+
+static const uint64_t kHiMasks[5] = {
+    0xAAAAAAAAAAAAAAAAULL,
+    0xCCCCCCCCCCCCCCCCULL,
+    0xF0F0F0F0F0F0F0F0ULL,
+    0xFF00FF00FF00FF00ULL,
+    0xFFFF0000FFFF0000ULL,
+};
+
+void ErrorBitfield::Prepare()
+{
+    // First mip level is for final layer of FFT: pairs of data
+    for (unsigned i = 0; i < kWords; ++i)
+    {
+        uint64_t w = Words[0][i];
+        const uint64_t hi2lo = w | ((w & kHiMasks[0]) >> 1);
+        const uint64_t lo2hi = ((w & (kHiMasks[0] >> 1)) << 1);
+        Words[0][i] = hi2lo | lo2hi;
+
+        for (unsigned j = 1, bits = 2; j < 5; ++j, bits <<= 1)
+        {
+            uint64_t w = Words[j - 1][i];
+            const uint64_t hi2lo = w | ((w & kHiMasks[j]) >> bits);
+            const uint64_t lo2hi = ((w & (kHiMasks[j] >> bits)) << bits);
+            Words[j][i] = hi2lo | lo2hi;
+        }
+    }
+
+    for (unsigned i = 0; i < kWords; ++i)
+    {
+        uint64_t w = Words[4][i];
+        w |= w >> 32;
+        w |= w << 32;
+        Words[5][i] = w;
+    }
+
+    for (unsigned i = 0; i < kWords; i += 2)
+        Words[6][i] = Words[6][i + 1] = Words[5][i] | Words[5][i + 1];
+}
+
+
 //------------------------------------------------------------------------------
 // Decode

@ -994,14 +1059,27 @@ void Decode(
 {
    // Fill in error locations

+    ErrorBitfield ErrorBits;
+
    ffe_t ErrorLocations[kOrder];
    for (unsigned i = 0; i < recovery_count; ++i)
        ErrorLocations[i] = recovery[i] ? 0 : 1;
    for (unsigned i = recovery_count; i < m; ++i)
        ErrorLocations[i] = 1;
+
+    // Clear the remainder in bulk
+    memset(ErrorLocations + m, 0, (n - m) * sizeof(ffe_t));
+
    for (unsigned i = 0; i < original_count; ++i)
-        ErrorLocations[i + m] = original[i] ? 0 : 1;
-    memset(ErrorLocations + m + original_count, 0, (n - original_count - m) * sizeof(ffe_t));
+    {
+        if (!original[i])
+        {
+            ErrorLocations[i + m] = 1;
+            ErrorBits.Set(i + m);
+        }
+    }
+
+    ErrorBits.Prepare();

    // Evaluate error locator polynomial

@ -1039,7 +1117,8 @@ void Decode(
    // work <- IFFT(work, n, 0)

    const unsigned input_count = m + original_count;
-    for (unsigned width = 1; width < n; width <<= 1)
+    unsigned mip_level = 0;
+    for (unsigned width = 1; width < n; width <<= 1, ++mip_level)
    {
        const unsigned range = width << 1;

@ -1070,15 +1149,16 @@ void Decode(
    // work <- FFT(work, n, 0) truncated to m + original_count

    const unsigned output_count = m + original_count;
-    for (unsigned width = (n >> 1); width > 0; width >>= 1)
+    for (unsigned width = (n >> 1); width > 0; width >>= 1, --mip_level)
    {
        const ffe_t* skewLUT = FFTSkew + width - 1;
        const unsigned range = width << 1;

-        // FIXME: Generate mipmaps here
-
        for (unsigned j = (m < range) ? 0 : m; j < output_count; j += range)
        {
+            if (!ErrorBits.IsNeeded(mip_level, j))
+                continue;
+
            VectorFFTButterfly(
                buffer_bytes,
                width,
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@ -389,11 +389,11 @@ struct TestParameters
    unsigned original_count = 1000; // under 65536
    unsigned recovery_count = 100; // under 65536 - original_count
 #else
-    unsigned original_count = 200; // under 65536
-    unsigned recovery_count = 20; // under 65536 - original_count
+    unsigned original_count = 100; // under 65536
+    unsigned recovery_count = 10; // under 65536 - original_count
 #endif
    unsigned buffer_bytes = 64000; // multiple of 64 bytes
-    unsigned loss_count = 20; // some fraction of original_count
+    unsigned loss_count = 10; // some fraction of original_count
    unsigned seed = 0;
    bool multithreaded = true;
 };