Cleanup and small opt

2017-06-03 00:24:18 -07:00 · 2017-06-03 00:24:18 -07:00 · 968c4f4f6a
parent 7c2be9f17b
commit 968c4f4f6a
5 changed files with 42 additions and 149 deletions
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@@ -32,7 +32,6 @@
    TODO:

    Short-term:
-    + FF8 decoder needs DIT FFT optimization
    + Port DIT FFT code to FF16
    + Unroll first/final butterflies to avoid extra copies/xors in encoder
    + Multithreading
--- a/LeopardFF16.h
+++ b/LeopardFF16.h
@@ -64,54 +64,10 @@ static const unsigned kPolynomial = 0x1002D;


 //------------------------------------------------------------------------------
-// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
+// API

-// Transform for a variable number of bits (up to kOrder)
-//void FWHT(ffe_t* data, const unsigned bits);
-
-// Transform specialized for the finite field order
-void FWHT(ffe_t data[kOrder]);
-
-
-//------------------------------------------------------------------------------
-// Multiplies
-
-// x[] = exp(log(y[]) + log_m)
-void mul_mem(
-    void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
-    ffe_t log_m, uint64_t bytes);
-
-
-//------------------------------------------------------------------------------
-// FFT Operations
-
-/*
-    Precondition: log_m != kModulus
-
-    x[] ^= exp(log(y[]) + log_m)
-    y[] ^= x[]
-*/
-void fft_butterfly(
-    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
-    ffe_t log_m, uint64_t bytes);
-
-
-//------------------------------------------------------------------------------
-// IFFT Operations
-
-/*
-    Precondition: log_m != kModulus
-
-    y[] ^= x[]
-    x[] ^= exp(log(y[]) + log_m)
-*/
-void ifft_butterfly(
-    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
-    ffe_t log_m, uint64_t bytes);
-
-
-//------------------------------------------------------------------------------
-// Reed-Solomon Encode
+// Returns false if the self-test fails
+bool Initialize();

 void ReedSolomonEncode(
    uint64_t buffer_bytes,
@@ -121,10 +77,6 @@ void ReedSolomonEncode(
    const void* const * const data,
    void** work); // Size of GetEncodeWorkCount()

-
-//------------------------------------------------------------------------------
-// Reed-Solomon Decode
-
 void ReedSolomonDecode(
    uint64_t buffer_bytes,
    unsigned original_count,
@@ -136,13 +88,6 @@ void ReedSolomonDecode(
    void** work); // n entries


-//------------------------------------------------------------------------------
-// API
-
-// Returns false if the self-test fails
-bool Initialize();
-
-
 }} // namespace leopard::ff16

 #endif // LEO_HAS_FF16
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@@ -134,7 +134,7 @@ static void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated)
 #else // LEO_FWHT_OPT

 // Reference implementation
-void FWHT(ffe_t* data, const unsigned bits)
+static void FWHT(ffe_t* data, const unsigned bits)
 {
    const unsigned size = (unsigned)(1UL << bits);
    for (unsigned width = 1; width < size; width <<= 1)
@@ -233,7 +233,7 @@ struct {
 static ffe_t Multiply8LUT[256 * 256] = {};


-void InitializeMultiplyTables()
+static void InitializeMultiplyTables()
 {
    // If we cannot use the PSHUFB instruction, generate Multiply8LUT:
    if (!CpuHasSSSE3)
@@ -288,7 +288,7 @@ void InitializeMultiplyTables()
 }


-void mul_mem(
+static void mul_mem(
    void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
    ffe_t log_m, uint64_t bytes)
 {
@@ -482,7 +482,7 @@ static void FFTInitialize()
        {1-5, 1'-5', 1-1', 5-5'},
 */

-void ifft_butterfly(
+static void ifft_butterfly(
    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
    ffe_t log_m, uint64_t bytes)
 {
@@ -781,7 +781,7 @@ static void IFFT_DIT4(
    }
 }

-void IFFT_DIT(
+static void IFFT_DIT(
    const uint64_t bytes,
    const void* const* data,
    const unsigned m_truncated,
@@ -815,7 +815,10 @@ void IFFT_DIT(
            const ffe_t log_m02 = skewLUT[r + dist * 2];

            // For each set of dist elements:
-            for (unsigned i = r; i < r + dist; ++i)
+            unsigned i_end = r + dist;
+            if (i_end >= m_truncated)
+                i_end = m_truncated;
+            for (unsigned i = r; i < i_end; ++i)
            {
                IFFT_DIT4(
                    bytes,
@@ -915,7 +918,7 @@ void IFFT_DIT(
        {4-6, 5-7, 4-5, 6-7},
 */

-void fft_butterfly(
+static void fft_butterfly(
    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
    ffe_t log_m, uint64_t bytes)
 {
@@ -1212,7 +1215,8 @@ static void FFT_DIT4(
        fft_butterfly(work[dist * 2], work[dist * 3], log_m23, bytes);
 }

-void FFT_DIT(
+
+static void FFT_DIT(
    const uint64_t bytes,
    void** work,
    const unsigned m_truncated,
@@ -1231,7 +1235,10 @@ void FFT_DIT(
            const ffe_t log_m02 = skewLUT[r + dist * 2];

            // For each set of dist elements:
-            for (unsigned i = r; i < r + dist; ++i)
+            unsigned i_end = r + dist;
+            if (i_end >= m_truncated)
+                i_end = m_truncated;
+            for (unsigned i = r; i < i_end; ++i)
            {
                FFT_DIT4(
                    bytes,
--- a/LeopardFF8.h
+++ b/LeopardFF8.h
@@ -64,94 +64,28 @@ static const unsigned kPolynomial = 0x11D;


 //------------------------------------------------------------------------------
-// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
+// API

-// Transform for a variable number of elements
-// m_truncated: Number of elements that are non-zero at the front of data
-//void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated);
-
-
-//------------------------------------------------------------------------------
-// Multiplies
-
-// x[] = exp(log(y[]) + log_m)
-// mul_mem
-void mul_mem(
-    void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
-    ffe_t log_m, uint64_t bytes);
-
-
-//------------------------------------------------------------------------------
-// FFT Operations
-
-/*
-    Precondition: log_m != kModulus
-
-    x[] ^= exp(log(y[]) + log_m)
-    y[] ^= x[]
-*/
-void fft_butterfly(
-    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
-    ffe_t log_m, uint64_t bytes);
-
-#ifdef LEO_USE_VECTOR4_OPT
-
-// Unroll 4 rows at a time
-void fft_butterfly4(
-    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
-    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
-    void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
-    void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
-    ffe_t log_m, uint64_t bytes);
-
-#endif // LEO_USE_VECTOR4_OPT
-
-
-//------------------------------------------------------------------------------
-// IFFT Operations
-
-/*
-    Precondition: log_m != kModulus
-
-    y[] ^= x[]
-    x[] ^= exp(log(y[]) + log_m)
-*/
-void ifft_butterfly(
-    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
-    ffe_t log_m, uint64_t bytes);
-
-
-//------------------------------------------------------------------------------
-// Reed-Solomon Encode
+// Returns false if the self-test fails
+bool Initialize();

 void ReedSolomonEncode(
    uint64_t buffer_bytes,
    unsigned original_count,
    unsigned recovery_count,
-    unsigned m, // = NextPow2(recovery_count) * 2 = work_count
+    unsigned m, // = NextPow2(recovery_count)
    const void* const * const data,
-    void** work); // Size of GetEncodeWorkCount()
-
-
-//------------------------------------------------------------------------------
-// Reed-Solomon Decode
+    void** work); // m * 2 elements

 void ReedSolomonDecode(
    uint64_t buffer_bytes,
    unsigned original_count,
    unsigned recovery_count,
    unsigned m, // = NextPow2(recovery_count)
-    unsigned n, // = NextPow2(m + original_count) = work_count
+    unsigned n, // = NextPow2(m + original_count)
    const void* const * const original, // original_count entries
    const void* const * const recovery, // recovery_count entries
-    void** work); // n entries
-
-
-//------------------------------------------------------------------------------
-// API
-
-// Returns false if the self-test fails
-bool Initialize();
+    void** work); // n elements


 }} // namespace leopard::ff8
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -48,7 +48,7 @@ struct TestParameters
    unsigned original_count = 128; // under 65536
    unsigned recovery_count = 128; // under 65536 - original_count
 #endif
-    unsigned buffer_bytes = 64; // multiple of 64 bytes
+    unsigned buffer_bytes = 64000; // multiple of 64 bytes
    unsigned loss_count = 32768; // some fraction of original_count
    unsigned seed = 2;
    bool multithreaded = true;
@@ -240,9 +240,15 @@ public:
    void EndCall()
    {
        LEO_DEBUG_ASSERT(t0 != 0);
-        uint64_t t1 = GetTimeUsec();
-        ++Invokations;
-        TotalUsec += t1 - t0;
+        const uint64_t t1 = GetTimeUsec();
+        const uint64_t delta = t1 - t0;
+        if (++Invokations == 1)
+            MaxCallUsec = MinCallUsec = delta;
+        else if (MaxCallUsec < delta)
+            MaxCallUsec = delta;
+        else if (MinCallUsec > delta)
+            MinCallUsec = delta;
+        TotalUsec += delta;
        t0 = 0;
    }
    void Reset()
@@ -260,6 +266,8 @@ public:
    uint64_t t0 = 0;
    uint64_t Invokations = 0;
    uint64_t TotalUsec = 0;
+    uint64_t MaxCallUsec = 0;
+    uint64_t MinCallUsec = 0;
    std::string FunctionName;
 };

@@ -542,10 +550,10 @@ static bool Benchmark(const TestParameters& params)
    t_mem_free.Print(kTrials);
 #endif

-    float encode_input_MBPS = total_bytes * kTrials / (float)(t_leo_encode.TotalUsec);
-    float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count * kTrials / (float)(t_leo_encode.TotalUsec);
-    float decode_input_MBPS = total_bytes * kTrials / (float)(t_leo_decode.TotalUsec);
-    float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count * kTrials / (float)(t_leo_decode.TotalUsec);
+    float encode_input_MBPS = total_bytes / (float)(t_leo_encode.MinCallUsec);
+    float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count / (float)(t_leo_encode.MinCallUsec);
+    float decode_input_MBPS = total_bytes / (float)(t_leo_decode.MinCallUsec);
+    float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count / (float)(t_leo_decode.MinCallUsec);

    cout << "Leopard Encoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << encode_input_MBPS << " MB/s, Output=" << encode_output_MBPS << " MB/s" << endl;
    cout << "Leopard Decoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << decode_input_MBPS << " MB/s, Output=" << decode_output_MBPS << " MB/s" << endl << endl;