Better non-SSE gf mul operations: Now only 5x slower.

2017-06-01 23:54:47 -07:00 · 2017-06-01 23:54:47 -07:00 · e6753965a1
parent 6237d3ddaf
commit e6753965a1
5 changed files with 108 additions and 26 deletions
--- a/LeopardCommon.cpp
+++ b/LeopardCommon.cpp
@ -175,6 +175,7 @@ void xor_mem(
        return;
    }
 #endif // LEO_TRY_AVX2
+
    LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
    const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
    do
@ -223,6 +224,7 @@ void xor_mem_2to1(
            x32 += 4, y32 += 4, z32 += 4;
            bytes -= 128;
        };
+
        if (bytes > 0)
        {
            LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32),     _mm256_loadu_si256(y32));
@ -232,9 +234,11 @@ void xor_mem_2to1(
            _mm256_storeu_si256(x32, x0);
            _mm256_storeu_si256(x32 + 1, x1);
        }
+
        return;
    }
 #endif // LEO_TRY_AVX2
+
    LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
    const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(y);
    const LEO_M128 * LEO_RESTRICT z16 = reinterpret_cast<const LEO_M128 *>(z);
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@ -159,6 +159,10 @@
 //------------------------------------------------------------------------------
 // Constants

+// Enable 8-bit or 16-bit fields
+#define LEO_HAS_FF8
+#define LEO_HAS_FF16
+
 // Define this to enable the optimized version of FWHT()
 #define LEO_FWHT_OPT

--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@ -228,11 +228,36 @@ struct {
 } static Multiply256LUT[kOrder];
 #endif // LEO_TRY_AVX2

+static ffe_t Multiply8LUT[256 * 256];
+

 void InitializeMultiplyTables()
 {
    if (!CpuHasSSSE3)
+    {
+        for (unsigned x = 0; x < 256; ++x) // x = right-hand value later used to index a 256-entry row
+        {
+            ffe_t* lut = Multiply8LUT + x; // start of column x; each += 256 below moves to the next log_y row

+            if (x == 0)
+            {
+                for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
+                    *lut = 0; // lut already advances one row per step: write through it, do not index by log_y again
+            }
+            else
+            {
+                const ffe_t log_x = LogLUT[x];

+                for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
+                {
+                    const ffe_t prod = ExpLUT[AddMod(log_x, log_y)]; // presumably x times the element whose log is log_y - confirm against LogLUT/ExpLUT
+                    *lut = prod; // stored at Multiply8LUT[log_y * 256 + x]
+                }
+            }
+        }
+
        return;
+    }

    // For each value we could multiply by:
    for (unsigned log_m = 0; log_m < kOrder; ++log_m)
@ -334,16 +359,16 @@ void mul_mem(
    }

    // Reference version:
+    const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256;
    ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
    const ffe_t * LEO_RESTRICT y1 = reinterpret_cast<const ffe_t *>(y);

    do
    {
        for (unsigned j = 0; j < 64; ++j)
-            x1[j] = MultiplyLog(y1[j], log_m);
+            x1[j] = lut[y1[j]];

-        x1 += 64;
-        y1 += 64;
+        x1 += 64, y1 += 64;
        bytes -= 64;
    } while (bytes > 0);
 }
@ -567,25 +592,47 @@ void ifft_butterfly(
    }

    // Reference version:
+    const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256;
+
+    xor_mem(y, x, bytes);
+
+#ifdef LEO_TARGET_MOBILE
    ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
    ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);

    do
    {
        for (unsigned j = 0; j < 64; ++j)
-        {
-            ffe_t x_0 = x1[j];
-            ffe_t y_0 = y1[j];
-            y_0 ^= x_0;
-            x_0 ^= MultiplyLog(y_0, log_m);
-            x1[j] = x_0;
-            y1[j] = y_0;
-        }
+            x1[j] ^= lut[y1[j]];

-        x1 += 64;
-        y1 += 64;
+        x1 += 64, y1 += 64;
        bytes -= 64;
    } while (bytes > 0);
+#else
+    uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x);
+    ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);
+
+    do
+    {
+        for (unsigned j = 0; j < 8; ++j)
+        {
+            uint64_t x_0 = x8[j];
+            x_0 ^= (uint64_t)lut[y1[0]];
+            x_0 ^= (uint64_t)lut[y1[1]] << 8;
+            x_0 ^= (uint64_t)lut[y1[2]] << 16;
+            x_0 ^= (uint64_t)lut[y1[3]] << 24;
+            x_0 ^= (uint64_t)lut[y1[4]] << 32;
+            x_0 ^= (uint64_t)lut[y1[5]] << 40;
+            x_0 ^= (uint64_t)lut[y1[6]] << 48;
+            x_0 ^= (uint64_t)lut[y1[7]] << 56;
+            x8[j] = x_0;
+            y1 += 8;
+        }
+
+        x8 += 8;
+        bytes -= 64;
+    } while (bytes > 0);
+#endif
 }

 // 4-way butterfly
@ -783,6 +830,10 @@ void IFFT_DIT(
            memset(work[i], 0, bytes);
    }

+    // I tried splitting up the first few layers into L3-cache sized blocks but
+    // found that it only provides about a 5% performance boost, which is not
+    // worth the extra complexity.
+
    // Decimation in time: Unroll 2 layers at a time
    unsigned dist = 1, dist4 = 4;
    for (; dist4 <= m; dist = dist4, dist4 <<= 2)
@ -974,6 +1025,9 @@ void fft_butterfly(
    }

    // Reference version:
+    const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256;
+
+#ifdef LEO_TARGET_MOBILE
    ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
    ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);

@ -983,15 +1037,40 @@ void fft_butterfly(
        {
            ffe_t x_0 = x1[j];
            ffe_t y_0 = y1[j];
-            x_0 ^= MultiplyLog(y_0, log_m);
+            x_0 ^= lut[y_0];
            x1[j] = x_0;
            y1[j] = y_0 ^ x_0;
        }

-        x1 += 64;
-        y1 += 64;
+        x1 += 64, y1 += 64;
        bytes -= 64;
    } while (bytes > 0);
+#else
+    uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x);
+    uint64_t * LEO_RESTRICT y8 = reinterpret_cast<uint64_t *>(y);
+    ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);
+
+    do
+    {
+        for (unsigned j = 0; j < 8; ++j)
+        {
+            uint64_t x_0 = x8[j], y_0 = y8[j];
+            x_0 ^= (uint64_t)lut[y1[0]];
+            x_0 ^= (uint64_t)lut[y1[1]] << 8;
+            x_0 ^= (uint64_t)lut[y1[2]] << 16;
+            x_0 ^= (uint64_t)lut[y1[3]] << 24;
+            x_0 ^= (uint64_t)lut[y1[4]] << 32;
+            x_0 ^= (uint64_t)lut[y1[5]] << 40;
+            x_0 ^= (uint64_t)lut[y1[6]] << 48;
+            x_0 ^= (uint64_t)lut[y1[7]] << 56;
+            x8[j] = x_0, y8[j] = y_0 ^ x_0;
+            y1 += 8;
+        }
+
+        x8 += 8, y8 += 8;
+        bytes -= 64;
+    } while (bytes > 0);
+#endif
 }

 #ifdef LEO_USE_VECTOR4_OPT
@ -1153,8 +1232,8 @@ static void FFT_DIT4(

            _mm256_storeu_si256(work0, work0_reg);
            _mm256_storeu_si256(work1, work1_reg);
+            work0++, work1++;

-            // First layer:
            if (log_m23 != kModulus)
            {
                LEO_FFTB4_256(work2_reg, work3_reg, t23_lo, t23_hi);
@ -1163,8 +1242,7 @@ static void FFT_DIT4(

            _mm256_storeu_si256(work2, work2_reg);
            _mm256_storeu_si256(work3, work3_reg);
-
-            work0++, work1++, work2++, work3++;
+            work2++, work3++;

            bytes -= 32;
        } while (bytes > 0);
@ -1221,8 +1299,8 @@ static void FFT_DIT4(

            _mm_storeu_si128(work0, work0_reg);
            _mm_storeu_si128(work1, work1_reg);
+            work0++, work1++;

-            // First layer:
            if (log_m23 != kModulus)
            {
                LEO_FFTB4_128(work2_reg, work3_reg, t23_lo, t23_hi);
@ -1231,8 +1309,7 @@ static void FFT_DIT4(

            _mm_storeu_si128(work2, work2_reg);
            _mm_storeu_si128(work3, work3_reg);
-
-            work0++, work1++, work2++, work3++;
+            work2++, work3++;

            bytes -= 16;
        } while (bytes > 0);
--- a/leopard.cpp
+++ b/leopard.cpp
@ -27,6 +27,7 @@
 */

 #include "leopard.h"
+#include "LeopardCommon.h"

 #ifdef LEO_HAS_FF8
    #include "LeopardFF8.h"
--- a/leopard.h
+++ b/leopard.h
@ -65,10 +65,6 @@
 // Library version
 #define LEO_VERSION 1

-// Enable 8-bit or 16-bit fields
-#define LEO_HAS_FF8
-#define LEO_HAS_FF16
-
 // Tweak if the functions are exported or statically linked
 //#define LEO_DLL /* Defined when building/linking as DLL */
 //#define LEO_BUILDING /* Defined by the library makefile */