Refactor multiply table code

2017-05-29 15:01:01 -07:00 · 2017-05-29 15:01:01 -07:00 · 49960e90f3
parent 283c1aac22
commit 49960e90f3
4 changed files with 790 additions and 317 deletions
--- a/LeopardFF16.cpp
+++ b/LeopardFF16.cpp
--- a/LeopardFF16.h
+++ b/LeopardFF16.h
@ -76,44 +76,90 @@ void FWHT(ffe_t data[kOrder]);
 //------------------------------------------------------------------------------
 // Multiplies

-// x[] = y[] * m
-void mul_mem_set(
+// x[] = exp(log(y[]) + log_m)
+void mul_mem(
    void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
-    ffe_t m, uint64_t bytes);
+    ffe_t log_m, uint64_t bytes);


 //------------------------------------------------------------------------------
 // FFT Operations

-// x[] ^= y[] * m, y[] ^= x[]
+/*
+    Precondition: log_m != kModulus
+
+    x[] ^= exp(log(y[]) + log_m)
+    y[] ^= x[]
+*/
 void fft_butterfly(
    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
-    ffe_t m, uint64_t bytes);
+    ffe_t log_m, uint64_t bytes);

-// For i = {0, 1, 2, 3}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
+#ifdef LEO_USE_VECTOR4_OPT
+
+// Unroll 4 rows at a time
 void fft_butterfly4(
    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
    void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
    void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
-    ffe_t m, uint64_t bytes);
+    ffe_t log_m, uint64_t bytes);
+
+#endif // LEO_USE_VECTOR4_OPT


 //------------------------------------------------------------------------------
 // IFFT Operations

-// y[] ^= x[], x[] ^= y[] * m
+/*
+    Precondition: log_m != kModulus
+
+    y[] ^= x[]
+    x[] ^= exp(log(y[]) + log_m)
+*/
 void ifft_butterfly(
    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
-    ffe_t m, uint64_t bytes);
+    ffe_t log_m, uint64_t bytes);

-// For i = {0, 1, 2, 3}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
+#ifdef LEO_USE_VECTOR4_OPT
+
+// Unroll 4 rows at a time
 void ifft_butterfly4(
    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
    void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
    void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
-    ffe_t m, uint64_t bytes);
+    ffe_t log_m, uint64_t bytes);
+
+#endif // LEO_USE_VECTOR4_OPT
+
+
+//------------------------------------------------------------------------------
+// FFT
+
+/*
+    if (log_m != kModulus)
+        x[] ^= exp(log(y[]) + log_m)
+    y[] ^= x[]
+*/
+void VectorFFTButterfly(
+    const uint64_t bytes,
+    unsigned count,
+    void** x,
+    void** y,
+    const ffe_t log_m);
+
+/*
+    y[] ^= x[]
+    if (log_m != kModulus)
+        x[] ^= exp(log(y[]) + log_m)
+*/
+void VectorIFFTButterfly(
+    const uint64_t bytes,
+    unsigned count,
+    void** x,
+    void** y,
+    const ffe_t log_m);


 //------------------------------------------------------------------------------
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@ -253,20 +253,17 @@ static void InitializeLogarithmTables()
    ExpLUT[kModulus] = ExpLUT[0];
 }

+
 //------------------------------------------------------------------------------
 // Multiplies

-// We require memory to be aligned since the SIMD instructions benefit from
-// or require aligned accesses to the table data.
 struct {
-    LEO_ALIGNED LEO_M128 Lo[256];
-    LEO_ALIGNED LEO_M128 Hi[256];
-} static Multiply128LUT;
+    LEO_ALIGNED LEO_M128 Value[kBits / 4];
+} static Multiply128LUT[kOrder];
 #if defined(LEO_TRY_AVX2)
 struct {
-    LEO_ALIGNED LEO_M256 Lo[256];
-    LEO_ALIGNED LEO_M256 Hi[256];
-} static Multiply256LUT;
+    LEO_ALIGNED LEO_M256 Value[kBits / 4];
+} static Multiply256LUT[kOrder];
 #endif // LEO_TRY_AVX2

 // Returns a * exp(log_b), i.e. a * b where log_b = Log(b)
@ -285,33 +282,33 @@ static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
    return ExpLUT[AddMod(LogLUT[a], log_b)];
 }

-
 void InitializeMultiplyTables()
 {
-    for (int log_y = 0; log_y < 256; ++log_y)
+    // For each value we could multiply by:
+    for (unsigned log_m = 0; log_m < kOrder; ++log_m)
    {
-        uint8_t lo[16], hi[16];
-        for (uint8_t x = 0; x < 16; ++x)
+        // For each 4-bit nibble of a field element (kBits / 4 nibbles in total):
+        for (unsigned i = 0, shift = 0; i < kBits / 4; ++i, shift += 4)
        {
-            lo[x] = MultiplyLog(x, static_cast<uint8_t>(log_y));
-            hi[x] = MultiplyLog(x << 4, static_cast<uint8_t>(log_y));
-        }
+            // Construct 16 entry LUT for PSHUFB
+            ffe_t lut[16];
+            for (uint8_t x = 0; x < 16; ++x)
+                lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));

-        const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo);
-        const LEO_M128 table_hi = _mm_loadu_si128((LEO_M128*)hi);
-
-        _mm_storeu_si128(Multiply128LUT.Lo + log_y, table_lo);
-        _mm_storeu_si128(Multiply128LUT.Hi + log_y, table_hi);
+            // Store in 128-bit wide table
+            const LEO_M128 *v_ptr = reinterpret_cast<const LEO_M128 *>(&lut[0]);
+            const LEO_M128 value = _mm_loadu_si128(v_ptr);
+            _mm_storeu_si128(&Multiply128LUT[log_m].Value[i], value);

+            // Store in 256-bit wide table
 #if defined(LEO_TRY_AVX2)
-        if (CpuHasAVX2)
-        {
-            _mm256_storeu_si256(Multiply256LUT.Lo + log_y,
-                _mm256_broadcastsi128_si256(table_lo));
-            _mm256_storeu_si256(Multiply256LUT.Hi + log_y,
-                _mm256_broadcastsi128_si256(table_hi));
-        }
+            if (CpuHasAVX2)
+            {
+                _mm256_storeu_si256(&Multiply256LUT[log_m].Value[i],
+                    _mm256_broadcastsi128_si256(value));
+            }
 #endif // LEO_TRY_AVX2
+        }
    }
 }

@ -323,8 +320,8 @@ void mul_mem(
 #if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
+        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
+        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);

        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);

@ -353,8 +350,8 @@ void mul_mem(
    }
 #endif // LEO_TRY_AVX2

-    const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
+    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
+    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);

    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);

@ -393,8 +390,8 @@ void fft_butterfly(
 #if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
+        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
+        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);

        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);

@ -427,8 +424,8 @@ void fft_butterfly(
    }
 #endif // LEO_TRY_AVX2

-    const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
+    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
+    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);

    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);

@ -472,8 +469,8 @@ void fft_butterfly4(
 #if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
+        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
+        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);

        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);

@ -511,8 +508,8 @@ void fft_butterfly4(
    }
 #endif // LEO_TRY_AVX2

-    const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
+    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
+    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);

    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);

@ -568,8 +565,8 @@ void ifft_butterfly(
 #if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
+        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
+        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);

        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);

@ -602,8 +599,8 @@ void ifft_butterfly(
    }
 #endif // LEO_TRY_AVX2

-    const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
+    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
+    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);

    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);

@ -647,8 +644,8 @@ void ifft_butterfly4(
 #if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + log_m);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + log_m);
+        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
+        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);

        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);

@ -686,8 +683,8 @@ void ifft_butterfly4(
    }
 #endif // LEO_TRY_AVX2

-    const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + log_m);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + log_m);
+    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
+    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);

    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);

--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@ -45,11 +45,11 @@ struct TestParameters
    unsigned original_count = 1000; // under 65536
    unsigned recovery_count = 100; // under 65536 - original_count
 #else
-    unsigned original_count = 200; // under 65536
-    unsigned recovery_count = 20; // under 65536 - original_count
+    unsigned original_count = 128; // under 65536
+    unsigned recovery_count = 128; // under 65536 - original_count
 #endif
    unsigned buffer_bytes = 64000; // multiple of 64 bytes
-    unsigned loss_count = 20; // some fraction of original_count
+    unsigned loss_count = 128; // some fraction of original_count
    unsigned seed = 0;
    bool multithreaded = true;
 };