mirror of https://github.com/status-im/leopard.git
Better non-SSE gf mul operations: now only 5x slower.
This commit is contained in:
parent
6237d3ddaf
commit
e6753965a1
|
@ -175,6 +175,7 @@ void xor_mem(
|
|||
return;
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
|
||||
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
|
||||
do
|
||||
|
@ -223,6 +224,7 @@ void xor_mem_2to1(
|
|||
x32 += 4, y32 += 4, z32 += 4;
|
||||
bytes -= 128;
|
||||
};
|
||||
|
||||
if (bytes > 0)
|
||||
{
|
||||
LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32));
|
||||
|
@ -232,9 +234,11 @@ void xor_mem_2to1(
|
|||
_mm256_storeu_si256(x32, x0);
|
||||
_mm256_storeu_si256(x32 + 1, x1);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
|
||||
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(y);
|
||||
const LEO_M128 * LEO_RESTRICT z16 = reinterpret_cast<const LEO_M128 *>(z);
|
||||
|
|
|
@ -159,6 +159,10 @@
|
|||
//------------------------------------------------------------------------------
|
||||
// Constants
|
||||
|
||||
// Enable 8-bit or 16-bit fields
|
||||
#define LEO_HAS_FF8
|
||||
#define LEO_HAS_FF16
|
||||
|
||||
// Define this to enable the optimized version of FWHT()
|
||||
#define LEO_FWHT_OPT
|
||||
|
||||
|
|
121
LeopardFF8.cpp
121
LeopardFF8.cpp
|
@ -228,11 +228,36 @@ struct {
|
|||
} static Multiply256LUT[kOrder];
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
static ffe_t Multiply8LUT[256 * 256];
|
||||
|
||||
|
||||
void InitializeMultiplyTables()
|
||||
{
|
||||
if (!CpuHasSSSE3)
|
||||
{
|
||||
for (unsigned x = 0; x < 256; ++x)
|
||||
{
|
||||
ffe_t* lut = Multiply8LUT + x;
|
||||
|
||||
if (x == 0)
|
||||
{
|
||||
for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
|
||||
lut[log_y] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
const ffe_t log_x = LogLUT[x];
|
||||
|
||||
for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
|
||||
{
|
||||
const ffe_t prod = ExpLUT[AddMod(log_x, log_y)];
|
||||
*lut = prod;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// For each value we could multiply by:
|
||||
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
|
||||
|
@ -334,16 +359,16 @@ void mul_mem(
|
|||
}
|
||||
|
||||
// Reference version:
|
||||
const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256;
|
||||
ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
|
||||
const ffe_t * LEO_RESTRICT y1 = reinterpret_cast<const ffe_t *>(y);
|
||||
|
||||
do
|
||||
{
|
||||
for (unsigned j = 0; j < 64; ++j)
|
||||
x1[j] = MultiplyLog(y1[j], log_m);
|
||||
x1[j] = lut[y1[j]];
|
||||
|
||||
x1 += 64;
|
||||
y1 += 64;
|
||||
x1 += 64, y1 += 64;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
}
|
||||
|
@ -567,25 +592,47 @@ void ifft_butterfly(
|
|||
}
|
||||
|
||||
// Reference version:
|
||||
const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256;
|
||||
|
||||
xor_mem(y, x, bytes);
|
||||
|
||||
#ifdef LEO_TARGET_MOBILE
|
||||
ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
|
||||
ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);
|
||||
|
||||
do
|
||||
{
|
||||
for (unsigned j = 0; j < 64; ++j)
|
||||
{
|
||||
ffe_t x_0 = x1[j];
|
||||
ffe_t y_0 = y1[j];
|
||||
y_0 ^= x_0;
|
||||
x_0 ^= MultiplyLog(y_0, log_m);
|
||||
x1[j] = x_0;
|
||||
y1[j] = y_0;
|
||||
}
|
||||
x1[j] ^= lut[y1[j]];
|
||||
|
||||
x1 += 64;
|
||||
y1 += 64;
|
||||
x1 += 64, y1 += 64;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
#else
|
||||
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x);
|
||||
ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);
|
||||
|
||||
do
|
||||
{
|
||||
for (unsigned j = 0; j < 8; ++j)
|
||||
{
|
||||
uint64_t x_0 = x8[j];
|
||||
x_0 ^= (uint64_t)lut[y1[0]];
|
||||
x_0 ^= (uint64_t)lut[y1[1]] << 8;
|
||||
x_0 ^= (uint64_t)lut[y1[2]] << 16;
|
||||
x_0 ^= (uint64_t)lut[y1[3]] << 24;
|
||||
x_0 ^= (uint64_t)lut[y1[4]] << 32;
|
||||
x_0 ^= (uint64_t)lut[y1[5]] << 40;
|
||||
x_0 ^= (uint64_t)lut[y1[6]] << 48;
|
||||
x_0 ^= (uint64_t)lut[y1[7]] << 56;
|
||||
x8[j] = x_0;
|
||||
y1 += 8;
|
||||
}
|
||||
|
||||
x8 += 8;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
// 4-way butterfly
|
||||
|
@ -783,6 +830,10 @@ void IFFT_DIT(
|
|||
memset(work[i], 0, bytes);
|
||||
}
|
||||
|
||||
// I tried splitting up the first few layers into L3-cache sized blocks but
|
||||
// found that it provides only about a 5% performance boost, which is not
|
||||
// worth the extra complexity.
|
||||
|
||||
// Decimation in time: Unroll 2 layers at a time
|
||||
unsigned dist = 1, dist4 = 4;
|
||||
for (; dist4 <= m; dist = dist4, dist4 <<= 2)
|
||||
|
@ -974,6 +1025,9 @@ void fft_butterfly(
|
|||
}
|
||||
|
||||
// Reference version:
|
||||
const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256;
|
||||
|
||||
#ifdef LEO_TARGET_MOBILE
|
||||
ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
|
||||
ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);
|
||||
|
||||
|
@ -983,15 +1037,40 @@ void fft_butterfly(
|
|||
{
|
||||
ffe_t x_0 = x1[j];
|
||||
ffe_t y_0 = y1[j];
|
||||
x_0 ^= MultiplyLog(y_0, log_m);
|
||||
x_0 ^= lut[y_0];
|
||||
x1[j] = x_0;
|
||||
y1[j] = y_0 ^ x_0;
|
||||
}
|
||||
|
||||
x1 += 64;
|
||||
y1 += 64;
|
||||
x1 += 64, y1 += 64;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
#else
|
||||
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x);
|
||||
uint64_t * LEO_RESTRICT y8 = reinterpret_cast<uint64_t *>(y);
|
||||
ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);
|
||||
|
||||
do
|
||||
{
|
||||
for (unsigned j = 0; j < 8; ++j)
|
||||
{
|
||||
uint64_t x_0 = x8[j], y_0 = y8[j];
|
||||
x_0 ^= (uint64_t)lut[y1[0]];
|
||||
x_0 ^= (uint64_t)lut[y1[1]] << 8;
|
||||
x_0 ^= (uint64_t)lut[y1[2]] << 16;
|
||||
x_0 ^= (uint64_t)lut[y1[3]] << 24;
|
||||
x_0 ^= (uint64_t)lut[y1[4]] << 32;
|
||||
x_0 ^= (uint64_t)lut[y1[5]] << 40;
|
||||
x_0 ^= (uint64_t)lut[y1[6]] << 48;
|
||||
x_0 ^= (uint64_t)lut[y1[7]] << 56;
|
||||
x8[j] = x_0, y8[j] = y_0 ^ x_0;
|
||||
y1 += 8;
|
||||
}
|
||||
|
||||
x8 += 8, y8 += 8;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef LEO_USE_VECTOR4_OPT
|
||||
|
@ -1153,8 +1232,8 @@ static void FFT_DIT4(
|
|||
|
||||
_mm256_storeu_si256(work0, work0_reg);
|
||||
_mm256_storeu_si256(work1, work1_reg);
|
||||
work0++, work1++;
|
||||
|
||||
// First layer:
|
||||
if (log_m23 != kModulus)
|
||||
{
|
||||
LEO_FFTB4_256(work2_reg, work3_reg, t23_lo, t23_hi);
|
||||
|
@ -1163,8 +1242,7 @@ static void FFT_DIT4(
|
|||
|
||||
_mm256_storeu_si256(work2, work2_reg);
|
||||
_mm256_storeu_si256(work3, work3_reg);
|
||||
|
||||
work0++, work1++, work2++, work3++;
|
||||
work2++, work3++;
|
||||
|
||||
bytes -= 32;
|
||||
} while (bytes > 0);
|
||||
|
@ -1221,8 +1299,8 @@ static void FFT_DIT4(
|
|||
|
||||
_mm_storeu_si128(work0, work0_reg);
|
||||
_mm_storeu_si128(work1, work1_reg);
|
||||
work0++, work1++;
|
||||
|
||||
// First layer:
|
||||
if (log_m23 != kModulus)
|
||||
{
|
||||
LEO_FFTB4_128(work2_reg, work3_reg, t23_lo, t23_hi);
|
||||
|
@ -1231,8 +1309,7 @@ static void FFT_DIT4(
|
|||
|
||||
_mm_storeu_si128(work2, work2_reg);
|
||||
_mm_storeu_si128(work3, work3_reg);
|
||||
|
||||
work0++, work1++, work2++, work3++;
|
||||
work2++, work3++;
|
||||
|
||||
bytes -= 16;
|
||||
} while (bytes > 0);
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
*/
|
||||
|
||||
#include "leopard.h"
|
||||
#include "LeopardCommon.h"
|
||||
|
||||
#ifdef LEO_HAS_FF8
|
||||
#include "LeopardFF8.h"
|
||||
|
|
|
@ -65,10 +65,6 @@
|
|||
// Library version
|
||||
#define LEO_VERSION 1
|
||||
|
||||
// Enable 8-bit or 16-bit fields
|
||||
#define LEO_HAS_FF8
|
||||
#define LEO_HAS_FF16
|
||||
|
||||
// Tweak if the functions are exported or statically linked
|
||||
//#define LEO_DLL /* Defined when building/linking as DLL */
|
||||
//#define LEO_BUILDING /* Defined by the library makefile */
|
||||
|
|
Loading…
Reference in New Issue