From d42ea87e26e752af3ad42b8a192ea882c4222051 Mon Sep 17 00:00:00 2001
From: Christopher Taylor
Date: Sat, 27 May 2017 23:31:13 -0700
Subject: [PATCH] Only calculate final FFT entries needed

---
 LeopardFF8.cpp      | 109 ++++++++++++++++++++++++++++++++++++++++++---
 tests/benchmark.cpp |   6 +--
 2 files changed, 106 insertions(+), 9 deletions(-)

diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp
index dd36778..b677734 100644
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@@ -979,6 +979,88 @@ skip_body:
 }
 
 
+//------------------------------------------------------------------------------
+// ErrorBitfield
+
+// Used in decoding to decide which final FFT operations to perform.
+//
+// Words[0] starts out as one bit per symbol position, written by Set().
+// Prepare() then turns it into a stack of "mip levels": in Words[k] a bit is
+// set when any Set() bit falls inside the aligned group of 2^(k+1) positions
+// that contains it.  IsNeeded(k + 1, bit) reads that stack.
+class ErrorBitfield
+{
+    // Number of mip levels stored in Words
+    static const unsigned kMipLevels = 7;
+    static const unsigned kWords = kOrder / 64;
+
+    // Prepare() merges 64-bit words in pairs for the last mip level
+    static_assert(kWords >= 2 && kWords % 2 == 0, "kOrder must be a multiple of 128");
+
+    uint64_t Words[kMipLevels][kWords] = {};
+
+public:
+    // Mark position i.  Must be called before Prepare(); i must be < kOrder.
+    LEO_FORCE_INLINE void Set(unsigned i)
+    {
+        Words[0][i / 64] |= (uint64_t)1 << (i % 64);
+    }
+
+    // Build every mip level from the bits recorded by Set()
+    void Prepare();
+
+    // Returns true if the group of 2^mip_level positions containing `bit`
+    // holds a marked position.  Levels with no stored bitmap (0, or above
+    // kMipLevels) conservatively report true.
+    LEO_FORCE_INLINE bool IsNeeded(unsigned mip_level, unsigned bit) const
+    {
+        if (mip_level == 0 || mip_level > kMipLevels)
+            return true;
+        return 0 != (Words[mip_level - 1][bit / 64] & ((uint64_t)1 << (bit % 64)));
+    }
+};
+
+// kHiMasks[j] selects the upper half of every aligned group of 2^(j+1) bits
+static const uint64_t kHiMasks[5] = {
+    0xAAAAAAAAAAAAAAAAULL,
+    0xCCCCCCCCCCCCCCCCULL,
+    0xF0F0F0F0F0F0F0F0ULL,
+    0xFF00FF00FF00FF00ULL,
+    0xFFFF0000FFFF0000ULL,
+};
+
+void ErrorBitfield::Prepare()
+{
+    // Levels 0..4 stay inside one 64-bit word: each level ORs the two halves
+    // of every group together and writes the result back to both halves.
+    // First mip level is for final layer of FFT: pairs of data
+    for (unsigned i = 0; i < kWords; ++i)
+    {
+        uint64_t w = Words[0][i];
+
+        for (unsigned j = 0, bits = 1; j < 5; ++j, bits <<= 1)
+        {
+            const uint64_t hi2lo = w | ((w & kHiMasks[j]) >> bits);
+            const uint64_t lo2hi = ((w & (kHiMasks[j] >> bits)) << bits);
+            Words[j][i] = w = hi2lo | lo2hi;
+        }
+    }
+
+    // Level 5: a whole 64-bit word is set if either 32-bit half has a bit
+    for (unsigned i = 0; i < kWords; ++i)
+    {
+        uint64_t w = Words[4][i];
+        w |= w >> 32;
+        w |= w << 32;
+        Words[5][i] = w;
+    }
+
+    // Level 6: adjacent pairs of words (128 positions) share their OR
+    for (unsigned i = 0; i < kWords; i += 2)
+        Words[6][i] = Words[6][i + 1] = Words[5][i] | Words[5][i + 1];
+}
+
+
 //------------------------------------------------------------------------------
 // Decode
 
@@ -994,14 +1076,27 @@ void Decode(
 {
     // Fill in error locations
 
+    ErrorBitfield ErrorBits;
+
     ffe_t ErrorLocations[kOrder];
     for (unsigned i = 0; i < recovery_count; ++i)
         ErrorLocations[i] = recovery[i] ? 0 : 1;
     for (unsigned i = recovery_count; i < m; ++i)
         ErrorLocations[i] = 1;
+
+    // Clear the remainder in bulk
+    memset(ErrorLocations + m, 0, (n - m) * sizeof(ffe_t));
+
     for (unsigned i = 0; i < original_count; ++i)
-        ErrorLocations[i + m] = original[i] ? 0 : 1;
-    memset(ErrorLocations + m + original_count, 0, (n - original_count - m) * sizeof(ffe_t));
+    {
+        if (!original[i])
+        {
+            ErrorLocations[i + m] = 1;
+            ErrorBits.Set(i + m);
+        }
+    }
+
+    ErrorBits.Prepare();
 
     // Evaluate error locator polynomial
 
@@ -1039,7 +1134,8 @@ void Decode(
     // work <- IFFT(work, n, 0)
 
     const unsigned input_count = m + original_count;
-    for (unsigned width = 1; width < n; width <<= 1)
+    unsigned mip_level = 0;
+    for (unsigned width = 1; width < n; width <<= 1, ++mip_level)
     {
         const unsigned range = width << 1;
 
@@ -1070,15 +1166,16 @@ void Decode(
     // work <- FFT(work, n, 0) truncated to m + original_count
 
     const unsigned output_count = m + original_count;
-    for (unsigned width = (n >> 1); width > 0; width >>= 1)
+    for (unsigned width = (n >> 1); width > 0; width >>= 1, --mip_level)
     {
         const ffe_t* skewLUT = FFTSkew + width - 1;
         const unsigned range = width << 1;
 
-        // FIXME: Generate mipmaps here
-
         for (unsigned j = (m < range) ? 0 : m; j < output_count; j += range)
         {
+            if (!ErrorBits.IsNeeded(mip_level, j))
+                continue;
+
             VectorFFTButterfly(
                 buffer_bytes,
                 width,
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
index 2f42b66..022a9f4 100644
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -389,11 +389,11 @@ struct TestParameters
     unsigned original_count = 1000; // under 65536
     unsigned recovery_count = 100; // under 65536 - original_count
 #else
-    unsigned original_count = 200; // under 65536
-    unsigned recovery_count = 20; // under 65536 - original_count
+    unsigned original_count = 100; // under 65536
+    unsigned recovery_count = 10; // under 65536 - original_count
 #endif
     unsigned buffer_bytes = 64000; // multiple of 64 bytes
-    unsigned loss_count = 20; // some fraction of original_count
+    unsigned loss_count = 10; // some fraction of original_count
     unsigned seed = 0;
     bool multithreaded = true;
 };