diff --git a/LeopardCommon.cpp b/LeopardCommon.cpp
index 141e03d..6f03b5b 100644
--- a/LeopardCommon.cpp
+++ b/LeopardCommon.cpp
@@ -43,11 +43,11 @@ namespace leopard {
 
 #if defined(LEO_TRY_NEON)
 # if defined(IOS) && defined(__ARM_NEON__)
-// Requires iPhone 5S or newer
+    // Requires iPhone 5S or newer
 # else
-// Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
-bool CpuHasNeon = false; // V6 / V7
-bool CpuHasNeon64 = false; // 64-bit
+    // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
+    bool CpuHasNeon = false; // V6 / V7
+    bool CpuHasNeon64 = false; // 64-bit
 # endif
 #endif
 
@@ -60,8 +60,9 @@ bool CpuHasNeon64 = false; // 64-bit
 #endif
 
 #ifdef LEO_TRY_AVX2
-bool CpuHasAVX2 = false;
+    bool CpuHasAVX2 = false;
 #endif
+    bool CpuHasSSSE3 = false;
 
 #define CPUID_EBX_AVX2 0x00000020
 
@@ -191,6 +192,8 @@ void xor_mem(
     } while (bytes > 0);
 }
 
+#ifdef LEO_USE_VECTOR4_OPT
+
 void xor_mem4(
     void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
     void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
@@ -321,12 +324,15 @@ void xor_mem4(
     } while (bytes > 0);
 }
 
+#endif // LEO_USE_VECTOR4_OPT
+
 void VectorXOR(
     const uint64_t bytes,
     unsigned count,
     void** x,
     void** y)
 {
+#ifdef LEO_USE_VECTOR4_OPT
     while (count >= 4)
     {
         xor_mem4(
@@ -338,9 +344,10 @@ void VectorXOR(
         x += 4, y += 4;
         count -= 4;
     }
+#endif // LEO_USE_VECTOR4_OPT
 
     for (unsigned i = 0; i < count; ++i)
-        xor_mem(y[i], x[i], bytes);
+        xor_mem(x[i], y[i], bytes);
 }
 
 
diff --git a/LeopardCommon.h b/LeopardCommon.h
index 7cc9764..7535854 100644
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@@ -51,6 +51,16 @@
 
 #include
 
+//------------------------------------------------------------------------------
+// Constants
+
+// Unroll inner loops 4 times
+//#define LEO_USE_VECTOR4_OPT
+
+// Define this to enable the optimized version of FWHT()
+//#define LEO_FWHT_OPT
+
+
 //------------------------------------------------------------------------------
 // Debug
 
@@ -152,26 +162,27 @@ namespace leopard {
 
 // Initialize CPU architecture flags
 void InitializeCPUArch();
 
+
 #if defined(LEO_TRY_NEON)
 # if defined(IOS) && defined(__ARM_NEON__)
-// Does device support NEON?
-static const bool CpuHasNeon = true;
-static const bool CpuHasNeon64 = true;
+    // Does device support NEON?
+    static const bool CpuHasNeon = true;
+    static const bool CpuHasNeon64 = true;
 # else
-// Does device support NEON?
-// Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
-extern bool CpuHasNeon; // V6 / V7
-extern bool CpuHasNeon64; // 64-bit
+    // Does device support NEON?
+    // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
+    extern bool CpuHasNeon; // V6 / V7
+    extern bool CpuHasNeon64; // 64-bit
 # endif
 #endif
 
 #if !defined(LEO_TARGET_MOBILE)
 # if defined(LEO_TRY_AVX2)
-// Does CPU support AVX2?
-extern bool CpuHasAVX2;
+    // Does CPU support AVX2?
+    extern bool CpuHasAVX2;
 # endif
-// Does CPU support SSSE3?
-extern bool CpuHasSSSE3;
+    // Does CPU support SSSE3?
+    extern bool CpuHasSSSE3;
 #endif // LEO_TARGET_MOBILE
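
A note on the VectorXOR fix in LeopardCommon.cpp above: xor_mem(x, y, bytes) computes x[] ^= y[], and xor_mem4 already treats the first pointer of each pair as the destination, so the old scalar tail xor_mem(y[i], x[i], bytes) XORed in the opposite direction from the unrolled xor_mem4 loop, and results depended on whether count was a multiple of 4. Together with the swapped VectorXOR(bytes, count, y, x) call sites later in this patch, both paths now perform y[i] ^= x[i] at those call sites while matching the documented convention. A minimal scalar sketch of that convention, useful when auditing the fix (scalar_xor_mem and scalar_vector_xor are hypothetical names, not library functions):

#include <cstdint>

// Reference model of xor_mem's contract: x[] ^= y[].
static void scalar_xor_mem(void* vx, const void* vy, uint64_t bytes)
{
    uint8_t* x = static_cast<uint8_t*>(vx);
    const uint8_t* y = static_cast<const uint8_t*>(vy);
    for (uint64_t i = 0; i < bytes; ++i)
        x[i] ^= y[i]; // the first argument is always the destination
}

// Shape of VectorXOR's tail loop after the fix: row x[i] accumulates y[i].
static void scalar_vector_xor(uint64_t bytes, unsigned count, void** x, void** y)
{
    for (unsigned i = 0; i < count; ++i)
        scalar_xor_mem(x[i], y[i], bytes);
}
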
@@ -210,6 +221,8 @@ void xor_mem(
     void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
     uint64_t bytes);
 
+#ifdef LEO_USE_VECTOR4_OPT
+
 // For i = {0, 1, 2, 3}: x_i[] ^= y_i[]
 void xor_mem4(
     void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
@@ -218,6 +231,8 @@ void xor_mem4(
     void * LEO_RESTRICT x_3, const void * LEO_RESTRICT y_3,
     uint64_t bytes);
 
+#endif // LEO_USE_VECTOR4_OPT
+
 // x[] ^= y[]
 void VectorXOR(
     const uint64_t bytes,
diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp
index 66ccf04..7cfb9b4 100644
--- a/LeopardFF16.cpp
+++ b/LeopardFF16.cpp
@@ -32,9 +32,6 @@
 
 #include
 
-// Define this to enable the optimized version of FWHT()
-#define LEO_FF16_FWHT_OPTIMIZED
-
 namespace leopard { namespace ff16 {
 
@@ -84,7 +81,7 @@ static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
 //------------------------------------------------------------------------------
 // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
 
-#if defined(LEO_FF16_FWHT_OPTIMIZED)
+#if defined(LEO_FWHT_OPT)
 
 // {a, b} = {a + b, a - b} (Mod Q)
 static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
@@ -285,7 +282,7 @@ static void FWHT(ffe_t* data, const unsigned ldn)
     }
 }
 
-#else // LEO_FF16_FWHT_OPTIMIZED
+#else // LEO_FWHT_OPT
 
 // Reference implementation
 void FWHT(ffe_t* data, const unsigned bits)
@@ -297,7 +294,7 @@ void FWHT(ffe_t* data, const unsigned bits)
             FWHT_2(data[j], data[j + width]);
 }
 
-#endif // LEO_FF16_FWHT_OPTIMIZED
+#endif // LEO_FWHT_OPT
 
 // Transform specialized for the finite field order
 void FWHT(ffe_t data[kOrder])
diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp
index ca8a05b..eccc5c4 100644
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@@ -32,9 +32,6 @@
 
 #include
 
-// Define this to enable the optimized version of FWHT()
-#define LEO_FF8_FWHT_OPTIMIZED
-
 namespace leopard { namespace ff8 {
 
@@ -81,8 +78,6 @@ static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
 //------------------------------------------------------------------------------
 // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
 
-#if defined(LEO_FF8_FWHT_OPTIMIZED)
-
 // {a, b} = {a + b, a - b} (Mod Q)
 static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
 {
@@ -92,6 +87,8 @@ static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b
     b = dif;
 }
 
+#if defined(LEO_FWHT_OPT)
+
 static LEO_FORCE_INLINE void FWHT_4(ffe_t* data)
 {
     ffe_t t0 = data[0];
@@ -191,7 +188,7 @@ static void FWHT(ffe_t* data, const unsigned ldn)
     }
 }
 
-#else // LEO_FF8_FWHT_OPTIMIZED
+#else // LEO_FWHT_OPT
 
 // Reference implementation
 void FWHT(ffe_t* data, const unsigned bits)
@@ -203,7 +200,7 @@ void FWHT(ffe_t* data, const unsigned bits)
             FWHT_2(data[j], data[j + width]);
 }
 
-#endif // LEO_FF8_FWHT_OPTIMIZED
+#endif // LEO_FWHT_OPT
 
 // Transform specialized for the finite field order
 void FWHT(ffe_t data[kOrder])
@@ -272,7 +269,7 @@ struct {
 #endif // LEO_TRY_AVX2
 
 // Returns a * Log(b)
-static ffe_t FFEMultiplyLog(ffe_t a, ffe_t log_b)
+static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
 {
     if (a == 0)
         return 0;
@@ -285,10 +282,10 @@ void InitializeMultiplyTables()
     for (int log_y = 0; log_y < 256; ++log_y)
     {
         uint8_t lo[16], hi[16];
-        for (unsigned char x = 0; x < 16; ++x)
+        for (uint8_t x = 0; x < 16; ++x)
         {
-            lo[x] = FFEMultiplyLog(x, static_cast<ffe_t>(log_y));
-            hi[x] = FFEMultiplyLog(x << 4, static_cast<ffe_t>(log_y));
+            lo[x] = MultiplyLog(x, static_cast<ffe_t>(log_y));
+            hi[x] = MultiplyLog(x << 4, static_cast<ffe_t>(log_y));
         }
 
         const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo);
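
For context on the InitializeMultiplyTables() hunk above: multiplication by a fixed y in GF(2^8) is linear over GF(2), so it distributes across the XOR of the two nibbles of x, which is why a pair of 16-entry tables per log_y can replace a full 256-entry row. A hedged scalar model of the lookup the SSSE3 PSHUFB path evaluates (the function name is hypothetical; lo and hi are the tables filled above):

#include <cstdint>

// Scalar model of the nibble-table multiply (illustrative only).
// Multiplication by a fixed y in GF(2^8) is GF(2)-linear, so:
//   x * y == ((x & 0x0F) * y) ^ ((x & 0xF0) * y)
// lo[n] holds n * y and hi[n] holds (n << 4) * y for n = 0..15,
// exactly as MultiplyLog() computes them in the hunk above.
static uint8_t MulViaNibbleTables(uint8_t x,
                                  const uint8_t lo[16],
                                  const uint8_t hi[16])
{
    return lo[x & 0x0F] ^ hi[x >> 4];
}

The SSSE3 kernel applies the same pair of lookups to 16 bytes per shuffle, which is why the tables are loaded straight into LEO_M128 registers after being filled.
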
@@ -454,6 +451,7 @@ void fft_butterfly(
     } while (bytes > 0);
 }
 
+#ifdef LEO_USE_VECTOR4_OPT
 
 void fft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
@@ -548,6 +546,8 @@ void fft_butterfly4(
     } while (bytes > 0);
 }
 
+#endif // LEO_USE_VECTOR4_OPT
+
 
 //------------------------------------------------------------------------------
 // IFFT Operations
 
@@ -626,6 +626,7 @@ void ifft_butterfly(
     } while (bytes > 0);
 }
 
+#ifdef LEO_USE_VECTOR4_OPT
 
 void ifft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
@@ -720,6 +721,8 @@ void ifft_butterfly4(
     } while (bytes > 0);
 }
 
+#endif // LEO_USE_VECTOR4_OPT
+
 
 //------------------------------------------------------------------------------
 // FFT
 
@@ -751,12 +754,12 @@ static void FFTInitialize()
             FFTSkew[j + s] = FFTSkew[j] ^ temp[i];
     }
 
-    temp[m] = kModulus - LogLUT[FFEMultiplyLog(temp[m], LogLUT[temp[m] ^ 1])];
+    temp[m] = kModulus - LogLUT[MultiplyLog(temp[m], LogLUT[temp[m] ^ 1])];
 
     for (unsigned i = m + 1; i < (kBits - 1); ++i)
     {
         const ffe_t sum = AddMod(LogLUT[temp[i] ^ 1], temp[m]);
-        temp[i] = FFEMultiplyLog(temp[i], sum);
+        temp[i] = MultiplyLog(temp[i], sum);
     }
 }
 
@@ -780,10 +783,11 @@ void VectorFFTButterfly(
 {
     if (skew == kModulus)
     {
-        VectorXOR(bytes, count, x, y);
+        VectorXOR(bytes, count, y, x);
         return;
     }
 
+#ifdef LEO_USE_VECTOR4_OPT
     while (count >= 4)
     {
         fft_butterfly4(
@@ -795,6 +799,7 @@ void VectorFFTButterfly(
         x += 4, y += 4;
         count -= 4;
     }
+#endif // LEO_USE_VECTOR4_OPT
 
     for (unsigned i = 0; i < count; ++i)
         fft_butterfly(x[i], y[i], skew, bytes);
@@ -809,10 +814,11 @@ void VectorIFFTButterfly(
 {
     if (skew == kModulus)
     {
-        VectorXOR(bytes, count, x, y);
+        VectorXOR(bytes, count, y, x);
         return;
     }
 
+#ifdef LEO_USE_VECTOR4_OPT
     while (count >= 4)
    {
         ifft_butterfly4(
@@ -824,6 +830,7 @@ void VectorIFFTButterfly(
         x += 4, y += 4;
         count -= 4;
     }
+#endif // LEO_USE_VECTOR4_OPT
 
     for (unsigned i = 0; i < count; ++i)
         ifft_butterfly(x[i], y[i], skew, bytes);
diff --git a/LeopardFF8.h b/LeopardFF8.h
index bcd0200..4bc7115 100644
--- a/LeopardFF8.h
+++ b/LeopardFF8.h
@@ -86,6 +86,8 @@ void fft_butterfly(
     void * LEO_RESTRICT x, void * LEO_RESTRICT y,
     ffe_t log_m, uint64_t bytes);
 
+#ifdef LEO_USE_VECTOR4_OPT
+
 // Unroll 4 rows at a time
 void fft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
@@ -94,6 +96,8 @@ void fft_butterfly4(
     void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
     ffe_t log_m, uint64_t bytes);
 
+#endif // LEO_USE_VECTOR4_OPT
+
 
 //------------------------------------------------------------------------------
 // IFFT Operations
 
@@ -107,6 +111,8 @@ void ifft_butterfly(
     void * LEO_RESTRICT x, void * LEO_RESTRICT y,
     ffe_t log_m, uint64_t bytes);
 
+#ifdef LEO_USE_VECTOR4_OPT
+
 // Unroll 4 rows at a time
 void ifft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
@@ -115,6 +121,12 @@ void ifft_butterfly4(
     void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
     ffe_t log_m, uint64_t bytes);
 
+#endif // LEO_USE_VECTOR4_OPT
+
+
+//------------------------------------------------------------------------------
+// FFT
+
 void VectorFFTButterfly(
     const uint64_t bytes,
     unsigned count,
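
The header guards above mirror the source files: the single-row kernels stay unconditional, the 4-row variants exist only under LEO_USE_VECTOR4_OPT, and every Vector* wrapper keeps the same shape, an optional unrolled main loop followed by an always-present scalar tail. A compilable sketch of that dispatch pattern, with hypothetical kernel names standing in for xor_mem/fft_butterfly and their 4-row variants:

#include <cstdint>

// Hypothetical single-row kernel standing in for xor_mem / fft_butterfly.
static void process_row(void* vx, const void* vy, uint64_t bytes)
{
    uint8_t* x = static_cast<uint8_t*>(vx);
    const uint8_t* y = static_cast<const uint8_t*>(vy);
    for (uint64_t i = 0; i < bytes; ++i)
        x[i] ^= y[i];
}

#ifdef LEO_USE_VECTOR4_OPT
// Hypothetical 4-row kernel standing in for xor_mem4 / fft_butterfly4.
static void process_row4(void** x, void** y, uint64_t bytes)
{
    for (unsigned i = 0; i < 4; ++i)
        process_row(x[i], y[i], bytes);
}
#endif

// Same shape as VectorXOR / VectorFFTButterfly after this patch: the
// unrolled loop disappears entirely when the define is off, and the
// scalar tail handles whatever rows remain either way.
static void process_rows(uint64_t bytes, unsigned count, void** x, void** y)
{
#ifdef LEO_USE_VECTOR4_OPT
    while (count >= 4)
    {
        process_row4(x, y, bytes);
        x += 4, y += 4;
        count -= 4;
    }
#endif
    for (unsigned i = 0; i < count; ++i)
        process_row(x[i], y[i], bytes);
}
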
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
index 0856089..5fcc1c1 100644
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -552,6 +552,8 @@ static void BasicTest(const TestParameters& params)
 //------------------------------------------------------------------------------
 // Parallel XOR Benchmark
 
+#ifdef LEO_USE_VECTOR4_OPT
+
 // Demonstrate about 10% performance boost by doing parallel rows for XORs
 void ParallelXORBenchmark()
 {
@@ -605,12 +607,16 @@ void ParallelXORBenchmark()
     t_4.Print(iteration_count);
 }
 
+#endif // LEO_USE_VECTOR4_OPT
+
 
 //------------------------------------------------------------------------------
 // Parallel Butterfly8 Benchmark
 
 #ifdef LEO_HAS_FF8
 
+#ifdef LEO_USE_VECTOR4_OPT
+
 // Demonstrate performance boost by doing parallel rows for Butterfly8s
 void ParallelButterfly8Benchmark()
 {
@@ -670,6 +676,8 @@ void ParallelButterfly8Benchmark()
     t_4.Print(iteration_count);
 }
 
+#endif // LEO_USE_VECTOR4_OPT
+
 #endif // LEO_HAS_FF8
 
 
@@ -678,6 +686,8 @@ void ParallelButterfly8Benchmark()
 
 #ifdef LEO_HAS_FF16
 
+#ifdef LEO_USE_VECTOR4_OPT
+
 // Demonstrate performance boost by doing parallel rows for Butterfly16s
 void ParallelButterfly16Benchmark()
 {
@@ -737,6 +747,8 @@ void ParallelButterfly16Benchmark()
     t_4.Print(iteration_count);
 }
 
+#endif // LEO_USE_VECTOR4_OPT
+
 #endif // LEO_HAS_FF16
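
One caveat: these hunks fence only the benchmark definitions. Any call sites in the test driver (not shown in this patch) need matching guards, or builds with LEO_USE_VECTOR4_OPT undefined will fail to compile. A hypothetical sketch of guarded declarations and calls, assumed wiring rather than part of the patch:

// Hypothetical call-site wiring (not part of this patch): declarations and
// calls must be fenced by the same defines as the definitions above.
#ifdef LEO_USE_VECTOR4_OPT
void ParallelXORBenchmark();
# ifdef LEO_HAS_FF8
void ParallelButterfly8Benchmark();
# endif
# ifdef LEO_HAS_FF16
void ParallelButterfly16Benchmark();
# endif
#endif // LEO_USE_VECTOR4_OPT

static void RunParallelRowBenchmarks()
{
#ifdef LEO_USE_VECTOR4_OPT
    ParallelXORBenchmark();
# ifdef LEO_HAS_FF8
    ParallelButterfly8Benchmark();
# endif
# ifdef LEO_HAS_FF16
    ParallelButterfly16Benchmark();
# endif
#endif // LEO_USE_VECTOR4_OPT
}
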