commit 574db36cbf
parent d4f4f94809
Author: Christopher Taylor
Date:   2017-05-28 01:23:03 -07:00

6 changed files with 88 additions and 38 deletions

View File

@@ -62,6 +62,7 @@ bool CpuHasNeon64 = false; // 64-bit
 #ifdef LEO_TRY_AVX2
 bool CpuHasAVX2 = false;
 #endif
 bool CpuHasSSSE3 = false;
+
 #define CPUID_EBX_AVX2 0x00000020
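The CpuHasSSSE3 flag sits alongside the existing AVX2 flag and gates the PSHUFB table-lookup paths. For context, a minimal sketch of how such flags are typically populated — not the library's exact code; it assumes a GCC/Clang toolchain recent enough to provide __get_cpuid_count in <cpuid.h>. SSSE3 is reported in CPUID leaf 1, ECX bit 9; AVX2 in leaf 7 subleaf 0, EBX bit 5, which matches the CPUID_EBX_AVX2 mask above:

#include <cpuid.h>

#define CPUID_ECX_SSSE3 0x00000200 // leaf 1, ECX bit 9
#define CPUID_EBX_AVX2  0x00000020 // leaf 7/0, EBX bit 5

static void DetectCpuFeatures(bool& hasSSSE3, bool& hasAVX2)
{
    unsigned eax, ebx, ecx, edx;

    // Leaf 1 reports the baseline feature bits; SSSE3 lives in ECX.
    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        hasSSSE3 = (ecx & CPUID_ECX_SSSE3) != 0;

    // Leaf 7, subleaf 0 reports extended features; AVX2 lives in EBX.
    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        hasAVX2 = (ebx & CPUID_EBX_AVX2) != 0;
}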
@@ -191,6 +192,8 @@ void xor_mem(
     } while (bytes > 0);
 }

+#ifdef LEO_USE_VECTOR4_OPT
+
 void xor_mem4(
     void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
     void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
@@ -321,12 +324,15 @@ void xor_mem4(
     } while (bytes > 0);
 }

+#endif // LEO_USE_VECTOR4_OPT
+
 void VectorXOR(
     const uint64_t bytes,
     unsigned count,
     void** x,
     void** y)
 {
+#ifdef LEO_USE_VECTOR4_OPT
     while (count >= 4)
     {
         xor_mem4(
@@ -338,9 +344,10 @@ void VectorXOR(
         x += 4, y += 4;
         count -= 4;
     }
+#endif // LEO_USE_VECTOR4_OPT

     for (unsigned i = 0; i < count; ++i)
-        xor_mem(y[i], x[i], bytes);
+        xor_mem(x[i], y[i], bytes);
 }
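The xor_mem argument swap in the last hunk is the functional fix in this file: xor_mem4 already treats the first pointer of each pair as the destination, so before this commit the unrolled path computed x[i] ^= y[i] while the scalar tail computed y[i] ^= x[i]. The change makes both paths write into x[i], and the call sites (see the butterfly functions below) pass (y, x) to keep the net effect y[i] ^= x[i]. Condensed from this diff, with the xor_mem/xor_mem4 bodies elided:

void VectorXOR(const uint64_t bytes, unsigned count, void** x, void** y)
{
#ifdef LEO_USE_VECTOR4_OPT
    // Unrolled path: four rows per call, each computing x_i[] ^= y_i[].
    while (count >= 4)
    {
        xor_mem4(x[0], y[0], x[1], y[1], x[2], y[2], x[3], y[3], bytes);
        x += 4, y += 4;
        count -= 4;
    }
#endif
    // Scalar tail: same direction as the unrolled path after this commit.
    for (unsigned i = 0; i < count; ++i)
        xor_mem(x[i], y[i], bytes);
}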

View File

@@ -51,6 +51,16 @@
 #include <stdint.h>

+//------------------------------------------------------------------------------
+// Constants
+
+// Unroll inner loops 4 times
+//#define LEO_USE_VECTOR4_OPT
+
+// Define this to enable the optimized version of FWHT()
+//#define LEO_FWHT_OPT
+
+
 //------------------------------------------------------------------------------
 // Debug
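Both switches ship commented out, so the scalar XOR loop and the reference FWHT remain the defaults; a build opts in by uncommenting them in this header or by defining the macros externally, since they must be visible when the library's own sources compile. A hypothetical invocation (file name illustrative only):

// Equivalent to uncommenting the two lines above:
//   g++ -O3 -DLEO_USE_VECTOR4_OPT -DLEO_FWHT_OPT -c <leopard source>.cpp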
@@ -152,6 +162,7 @@ namespace leopard {
 // Initialize CPU architecture flags
 void InitializeCPUArch();
+

 #if defined(LEO_TRY_NEON)
 # if defined(IOS) && defined(__ARM_NEON__)
 // Does device support NEON?
@@ -210,6 +221,8 @@ void xor_mem(
     void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
     uint64_t bytes);

+#ifdef LEO_USE_VECTOR4_OPT
+
 // For i = {0, 1, 2, 3}: x_i[] ^= y_i[]
 void xor_mem4(
     void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
@@ -218,6 +231,8 @@ void xor_mem4(
     void * LEO_RESTRICT x_3, const void * LEO_RESTRICT y_3,
     uint64_t bytes);

+#endif // LEO_USE_VECTOR4_OPT
+
 // x[] ^= y[]
 void VectorXOR(
     const uint64_t bytes,

View File

@@ -32,9 +32,6 @@
 #include <string.h>

-// Define this to enable the optimized version of FWHT()
-#define LEO_FF16_FWHT_OPTIMIZED
-
 namespace leopard { namespace ff16 {
@@ -84,7 +81,7 @@ static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
 //------------------------------------------------------------------------------
 // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)

-#if defined(LEO_FF16_FWHT_OPTIMIZED)
+#if defined(LEO_FWHT_OPT)

 // {a, b} = {a + b, a - b} (Mod Q)
 static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
@@ -285,7 +282,7 @@ static void FWHT(ffe_t* data, const unsigned ldn)
     }
 }

-#else // LEO_FF16_FWHT_OPTIMIZED
+#else // LEO_FWHT_OPT

 // Reference implementation
 void FWHT(ffe_t* data, const unsigned bits)
@@ -297,7 +294,7 @@ void FWHT(ffe_t* data, const unsigned bits)
         FWHT_2(data[j], data[j + width]);
 }

-#endif // LEO_FF16_FWHT_OPTIMIZED
+#endif // LEO_FWHT_OPT

 // Transform specialized for the finite field order
 void FWHT(ffe_t data[kOrder])
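Both field sizes now share the single LEO_FWHT_OPT switch, and the branch kept under #else is the textbook in-place transform. A self-contained sketch of that reference path for ff16 — AddMod/SubMod are shown as plain conditional-subtract stand-ins for the file's helpers (assumed to reduce mod kModulus with inputs already reduced; leopard's actual versions may differ):

#include <cstdint>

typedef uint16_t ffe_t;               // ff16 element; ff8 uses uint8_t
static const ffe_t kModulus = 65535;  // transform is taken mod kModulus

// Straightforward stand-ins for the file's modular helpers.
static inline ffe_t AddMod(ffe_t a, ffe_t b)
{
    const unsigned sum = (unsigned)a + b;
    return (ffe_t)(sum >= kModulus ? sum - kModulus : sum);
}
static inline ffe_t SubMod(ffe_t a, ffe_t b)
{
    return (ffe_t)(a >= b ? a - b : a + kModulus - b);
}

// {a, b} = {a + b, a - b} (mod kModulus), as in the diff above.
static inline void FWHT_2(ffe_t& a, ffe_t& b)
{
    const ffe_t sum = AddMod(a, b);
    const ffe_t dif = SubMod(a, b);
    a = sum;
    b = dif;
}

// Reference FWHT over 2^bits elements, matching the loop kept under #else.
void FWHT(ffe_t* data, const unsigned bits)
{
    const unsigned size = 1U << bits;
    for (unsigned width = 1; width < size; width <<= 1)
        for (unsigned i = 0; i < size; i += (width << 1))
            for (unsigned j = i; j < i + width; ++j)
                FWHT_2(data[j], data[j + width]);
}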

View File

@@ -32,9 +32,6 @@
 #include <string.h>

-// Define this to enable the optimized version of FWHT()
-#define LEO_FF8_FWHT_OPTIMIZED
-
 namespace leopard { namespace ff8 {
@@ -81,8 +78,6 @@ static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
 //------------------------------------------------------------------------------
 // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)

-#if defined(LEO_FF8_FWHT_OPTIMIZED)
-
 // {a, b} = {a + b, a - b} (Mod Q)
 static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
 {
@@ -92,6 +87,8 @@ static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
     b = dif;
 }

+#if defined(LEO_FWHT_OPT)
+
 static LEO_FORCE_INLINE void FWHT_4(ffe_t* data)
 {
     ffe_t t0 = data[0];
@@ -191,7 +188,7 @@ static void FWHT(ffe_t* data, const unsigned ldn)
     }
 }

-#else // LEO_FF8_FWHT_OPTIMIZED
+#else // LEO_FWHT_OPT

 // Reference implementation
 void FWHT(ffe_t* data, const unsigned bits)
@@ -203,7 +200,7 @@ void FWHT(ffe_t* data, const unsigned bits)
         FWHT_2(data[j], data[j + width]);
 }

-#endif // LEO_FF8_FWHT_OPTIMIZED
+#endif // LEO_FWHT_OPT

 // Transform specialized for the finite field order
 void FWHT(ffe_t data[kOrder])
@@ -272,7 +269,7 @@ struct {
 #endif // LEO_TRY_AVX2

 // Returns a * Log(b)
-static ffe_t FFEMultiplyLog(ffe_t a, ffe_t log_b)
+static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
 {
     if (a == 0)
         return 0;
@@ -285,10 +282,10 @@ void InitializeMultiplyTables()
     for (int log_y = 0; log_y < 256; ++log_y)
     {
         uint8_t lo[16], hi[16];
-        for (unsigned char x = 0; x < 16; ++x)
+        for (uint8_t x = 0; x < 16; ++x)
         {
-            lo[x] = FFEMultiplyLog(x, static_cast<uint8_t>(log_y));
-            hi[x] = FFEMultiplyLog(x << 4, static_cast<uint8_t>(log_y));
+            lo[x] = MultiplyLog(x, static_cast<uint8_t>(log_y));
+            hi[x] = MultiplyLog(x << 4, static_cast<uint8_t>(log_y));
         }

         const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo);
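These 16-entry lo/hi tables are the classic SSSE3 nibble-lookup trick: because GF(2^8) multiplication is linear over XOR, the product of a byte by a constant splits into a low-nibble lookup and a high-nibble lookup that XOR together, and _mm_shuffle_epi8 performs 16 such lookups at once. A sketch of how tables built this way are typically consumed — assumed usage for illustration, not lifted from this file (LEO_M128 is the library's alias for __m128i):

#include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8

// Multiply 16 bytes of `data` by the constant whose nibble products were
// packed into table_lo/table_hi by the loop above.
static inline __m128i MultiplyBytes(const __m128i data,
                                    const __m128i table_lo, const __m128i table_hi)
{
    const __m128i clr_mask = _mm_set1_epi8(0x0f);

    // Look up the product contribution of each low nibble.
    const __m128i lo = _mm_shuffle_epi8(table_lo, _mm_and_si128(data, clr_mask));

    // Shift the high nibbles down into 0..15 and look up their contribution.
    const __m128i hi = _mm_shuffle_epi8(table_hi,
        _mm_and_si128(_mm_srli_epi64(data, 4), clr_mask));

    // The field is linear over XOR, so the two halves combine with XOR.
    return _mm_xor_si128(lo, hi);
}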
@@ -454,6 +451,7 @@ void fft_butterfly(
     } while (bytes > 0);
 }

+#ifdef LEO_USE_VECTOR4_OPT

 void fft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
@@ -548,6 +546,8 @@ void fft_butterfly4(
     } while (bytes > 0);
 }

+#endif // LEO_USE_VECTOR4_OPT
+
 //------------------------------------------------------------------------------
 // IFFT Operations
@@ -626,6 +626,7 @@ void ifft_butterfly(
     } while (bytes > 0);
 }

+#ifdef LEO_USE_VECTOR4_OPT

 void ifft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
@@ -720,6 +721,8 @@ void ifft_butterfly4(
     } while (bytes > 0);
 }

+#endif // LEO_USE_VECTOR4_OPT
+
 //------------------------------------------------------------------------------
 // FFT
@@ -751,12 +754,12 @@ static void FFTInitialize()
             FFTSkew[j + s] = FFTSkew[j] ^ temp[i];
     }

-    temp[m] = kModulus - LogLUT[FFEMultiplyLog(temp[m], LogLUT[temp[m] ^ 1])];
+    temp[m] = kModulus - LogLUT[MultiplyLog(temp[m], LogLUT[temp[m] ^ 1])];

     for (unsigned i = m + 1; i < (kBits - 1); ++i)
     {
         const ffe_t sum = AddMod(LogLUT[temp[i] ^ 1], temp[m]);
-        temp[i] = FFEMultiplyLog(temp[i], sum);
+        temp[i] = MultiplyLog(temp[i], sum);
     }
 }
@@ -780,10 +783,11 @@ void VectorFFTButterfly(
 {
     if (skew == kModulus)
     {
-        VectorXOR(bytes, count, x, y);
+        VectorXOR(bytes, count, y, x);
         return;
     }

+#ifdef LEO_USE_VECTOR4_OPT
     while (count >= 4)
     {
         fft_butterfly4(
@@ -795,6 +799,7 @@ void VectorFFTButterfly(
         x += 4, y += 4;
         count -= 4;
     }
+#endif // LEO_USE_VECTOR4_OPT

     for (unsigned i = 0; i < count; ++i)
         fft_butterfly(x[i], y[i], skew, bytes);
@@ -809,10 +814,11 @@ void VectorIFFTButterfly(
 {
     if (skew == kModulus)
     {
-        VectorXOR(bytes, count, x, y);
+        VectorXOR(bytes, count, y, x);
         return;
     }

+#ifdef LEO_USE_VECTOR4_OPT
     while (count >= 4)
     {
         ifft_butterfly4(
@@ -824,6 +830,7 @@ void VectorIFFTButterfly(
         x += 4, y += 4;
         count -= 4;
     }
+#endif // LEO_USE_VECTOR4_OPT

     for (unsigned i = 0; i < count; ++i)
         ifft_butterfly(x[i], y[i], skew, bytes);
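Both vector butterfly wrappers now share the same three-tier shape: a degenerate case when skew == kModulus, where the weighted term drops out and the butterfly collapses to a plain XOR of the rows (hence the swapped (y, x) arguments, matching the fixed VectorXOR direction), then the optional 4-row unrolled loop, then a scalar tail. Condensed from this diff, with declarations as in the file:

void VectorFFTButterfly(
    const uint64_t bytes, unsigned count,
    void** x, void** y, const ffe_t skew)
{
    // Degenerate skew: the butterfly is a pure XOR, y[i] ^= x[i].
    if (skew == kModulus)
    {
        VectorXOR(bytes, count, y, x);
        return;
    }
#ifdef LEO_USE_VECTOR4_OPT
    // Unrolled path: four independent row pairs per call.
    while (count >= 4)
    {
        fft_butterfly4(x[0], y[0], x[1], y[1], x[2], y[2], x[3], y[3], skew, bytes);
        x += 4, y += 4;
        count -= 4;
    }
#endif
    // Scalar tail handles the remaining 0..3 rows.
    for (unsigned i = 0; i < count; ++i)
        fft_butterfly(x[i], y[i], skew, bytes);
}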

View File

@@ -86,6 +86,8 @@ void fft_butterfly(
     void * LEO_RESTRICT x, void * LEO_RESTRICT y,
     ffe_t log_m, uint64_t bytes);

+#ifdef LEO_USE_VECTOR4_OPT
+
 // Unroll 4 rows at a time
 void fft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
@@ -94,6 +96,8 @@ void fft_butterfly4(
     void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
     ffe_t log_m, uint64_t bytes);

+#endif // LEO_USE_VECTOR4_OPT
+
 //------------------------------------------------------------------------------
 // IFFT Operations
@@ -107,6 +111,8 @@ void ifft_butterfly(
     void * LEO_RESTRICT x, void * LEO_RESTRICT y,
     ffe_t log_m, uint64_t bytes);

+#ifdef LEO_USE_VECTOR4_OPT
+
 // Unroll 4 rows at a time
 void ifft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
@@ -115,6 +121,12 @@ void ifft_butterfly4(
     void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
     ffe_t log_m, uint64_t bytes);

+#endif // LEO_USE_VECTOR4_OPT
+
+
+//------------------------------------------------------------------------------
+// FFT
+
 void VectorFFTButterfly(
     const uint64_t bytes,
     unsigned count,

View File

@@ -552,6 +552,8 @@ static void BasicTest(const TestParameters& params)
 //------------------------------------------------------------------------------
 // Parallel XOR Benchmark

+#ifdef LEO_USE_VECTOR4_OPT
+
 // Demonstrate about 10% performance boost by doing parallel rows for XORs
 void ParallelXORBenchmark()
 {
@@ -605,12 +607,16 @@ void ParallelXORBenchmark()
     t_4.Print(iteration_count);
 }

+#endif // LEO_USE_VECTOR4_OPT
+

 //------------------------------------------------------------------------------
 // Parallel Butterfly8 Benchmark

 #ifdef LEO_HAS_FF8

+#ifdef LEO_USE_VECTOR4_OPT
+
 // Demonstrate performance boost by doing parallel rows for Butterfly8s
 void ParallelButterfly8Benchmark()
 {
@@ -670,6 +676,8 @@ void ParallelButterfly8Benchmark()
     t_4.Print(iteration_count);
 }

+#endif // LEO_USE_VECTOR4_OPT
+
 #endif // LEO_HAS_FF8
@@ -678,6 +686,8 @@ void ParallelButterfly8Benchmark()

 #ifdef LEO_HAS_FF16

+#ifdef LEO_USE_VECTOR4_OPT
+
 // Demonstrate performance boost by doing parallel rows for Butterfly16s
 void ParallelButterfly16Benchmark()
 {
@@ -737,6 +747,8 @@ void ParallelButterfly16Benchmark()
     t_4.Print(iteration_count);
 }

+#endif // LEO_USE_VECTOR4_OPT
+
 #endif // LEO_HAS_FF16
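The benchmarks above only compile when LEO_USE_VECTOR4_OPT is defined, since they exist to compare one unrolled 4-row call against four separate scalar calls. A minimal sketch of that comparison in the spirit of ParallelXORBenchmark — the harness below is illustrative, assuming x and y each point at 4 preallocated rows of `bytes` bytes:

#include <chrono>
#include <cstdint>

// Declarations from the library's common header (simplified; the real ones
// carry LEO_RESTRICT qualifiers).
void xor_mem(void* x, const void* y, uint64_t bytes);
void xor_mem4(void* x0, const void* y0, void* x1, const void* y1,
              void* x2, const void* y2, void* x3, const void* y3, uint64_t bytes);

// Returns seconds taken to XOR 4 rows `iterations` times, unrolled or not.
double TimeXOR(void** x, void** y, uint64_t bytes, unsigned iterations, bool unrolled)
{
    const auto t0 = std::chrono::high_resolution_clock::now();
    for (unsigned n = 0; n < iterations; ++n)
    {
        if (unrolled)
            xor_mem4(x[0], y[0], x[1], y[1], x[2], y[2], x[3], y[3], bytes);
        else
            for (unsigned i = 0; i < 4; ++i)
                xor_mem(x[i], y[i], bytes);
    }
    const auto t1 = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double>(t1 - t0).count();
}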