From c53b075edad6fc83ac5d27167001afb1649df067 Mon Sep 17 00:00:00 2001 From: Christopher Taylor Date: Tue, 30 May 2017 02:05:41 -0700 Subject: [PATCH] FF16 works --- LeopardCommon.h | 6 +++--- LeopardFF16.cpp | 33 +++------------------------------ leopard.h | 4 +++- tests/benchmark.cpp | 11 +++++++---- 4 files changed, 16 insertions(+), 38 deletions(-) diff --git a/LeopardCommon.h b/LeopardCommon.h index ee7b318..8684633 100644 --- a/LeopardCommon.h +++ b/LeopardCommon.h @@ -149,9 +149,6 @@ //------------------------------------------------------------------------------ // Constants -// Unroll inner loops 4 times -//#define LEO_USE_VECTOR4_OPT - // Define this to enable the optimized version of FWHT() #define LEO_FWHT_OPT @@ -164,6 +161,9 @@ // Optimize M=1 case #define LEO_M1_OPT +// Unroll inner loops 4 times +#define LEO_USE_VECTOR4_OPT + //------------------------------------------------------------------------------ // Debug diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp index 0898cb5..3028971 100644 --- a/LeopardFF16.cpp +++ b/LeopardFF16.cpp @@ -636,8 +636,7 @@ void fft_butterfly4( #if defined(LEO_TRY_AVX2) if (CpuHasAVX2) { - const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]); - const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]); + LEO_MUL_TABLES_256(); const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f); @@ -652,19 +651,15 @@ void fft_butterfly4( do { - LEO_FFTB_256(x32_0 + 1, y32_0 + 1); LEO_FFTB_256(x32_0, y32_0); y32_0 += 2, x32_0 += 2; - LEO_FFTB_256(x32_1 + 1, y32_1 + 1); LEO_FFTB_256(x32_1, y32_1); y32_1 += 2, x32_1 += 2; - LEO_FFTB_256(x32_2 + 1, y32_2 + 1); LEO_FFTB_256(x32_2, y32_2); y32_2 += 2, x32_2 += 2; - LEO_FFTB_256(x32_3 + 1, y32_3 + 1); LEO_FFTB_256(x32_3, y32_3); y32_3 += 2, x32_3 += 2; @@ -690,26 +685,18 @@ void fft_butterfly4( do { - LEO_FFTB_128(x16_0 + 3, y16_0 + 3); - LEO_FFTB_128(x16_0 + 2, y16_0 + 2); LEO_FFTB_128(x16_0 + 1, y16_0 + 1); LEO_FFTB_128(x16_0, y16_0); x16_0 += 4, y16_0 += 4; - LEO_FFTB_128(x16_1 + 3, y16_1 + 3); - LEO_FFTB_128(x16_1 + 2, y16_1 + 2); LEO_FFTB_128(x16_1 + 1, y16_1 + 1); LEO_FFTB_128(x16_1, y16_1); x16_1 += 4, y16_1 += 4; - LEO_FFTB_128(x16_2 + 3, y16_2 + 3); - LEO_FFTB_128(x16_2 + 2, y16_2 + 2); LEO_FFTB_128(x16_2 + 1, y16_2 + 1); LEO_FFTB_128(x16_2, y16_2); x16_2 += 4, y16_2 += 4; - LEO_FFTB_128(x16_3 + 3, y16_3 + 3); - LEO_FFTB_128(x16_3 + 2, y16_3 + 2); LEO_FFTB_128(x16_3 + 1, y16_3 + 1); LEO_FFTB_128(x16_3, y16_3); x16_3 += 4, y16_3 += 4; @@ -835,8 +822,7 @@ void ifft_butterfly4( #if defined(LEO_TRY_AVX2) if (CpuHasAVX2) { - const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]); - const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]); + LEO_MUL_TABLES_256(); const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f); @@ -851,19 +837,15 @@ void ifft_butterfly4( do { - LEO_IFFTB_256(x32_0 + 1, y32_0 + 1); LEO_IFFTB_256(x32_0, y32_0); y32_0 += 2, x32_0 += 2; - LEO_IFFTB_256(x32_1 + 1, y32_1 + 1); LEO_IFFTB_256(x32_1, y32_1); y32_1 += 2, x32_1 += 2; - LEO_IFFTB_256(x32_2 + 1, y32_2 + 1); LEO_IFFTB_256(x32_2, y32_2); y32_2 += 2, x32_2 += 2; - LEO_IFFTB_256(x32_3 + 1, y32_3 + 1); LEO_IFFTB_256(x32_3, y32_3); y32_3 += 2, x32_3 += 2; @@ -874,8 +856,7 @@ void ifft_butterfly4( } #endif // LEO_TRY_AVX2 - const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]); - const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]); + LEO_MUL_TABLES_128(); const LEO_M128 clr_mask = _mm_set1_epi8(0x0f); @@ -890,26 +871,18 @@ void ifft_butterfly4( do { - LEO_IFFTB_128(x16_0 + 3, y16_0 + 3); - LEO_IFFTB_128(x16_0 + 2, y16_0 + 2); LEO_IFFTB_128(x16_0 + 1, y16_0 + 1); LEO_IFFTB_128(x16_0, y16_0); x16_0 += 4, y16_0 += 4; - LEO_IFFTB_128(x16_1 + 3, y16_1 + 3); - LEO_IFFTB_128(x16_1 + 2, y16_1 + 2); LEO_IFFTB_128(x16_1 + 1, y16_1 + 1); LEO_IFFTB_128(x16_1, y16_1); x16_1 += 4, y16_1 += 4; - LEO_IFFTB_128(x16_2 + 3, y16_2 + 3); - LEO_IFFTB_128(x16_2 + 2, y16_2 + 2); LEO_IFFTB_128(x16_2 + 1, y16_2 + 1); LEO_IFFTB_128(x16_2, y16_2); x16_2 += 4, y16_2 += 4; - LEO_IFFTB_128(x16_3 + 3, y16_3 + 3); - LEO_IFFTB_128(x16_3 + 2, y16_3 + 2); LEO_IFFTB_128(x16_3 + 1, y16_3 + 1); LEO_IFFTB_128(x16_3, y16_3); x16_3 += 4, y16_3 += 4; diff --git a/leopard.h b/leopard.h index 07529df..24e29bd 100644 --- a/leopard.h +++ b/leopard.h @@ -64,13 +64,15 @@ /* TODO: - + Benchmarks for large data! + Add multi-threading to split up long parallelizable calculations + Final benchmarks! + Release version 1 + Finish up documentation TBD: + + Look into 12-bit fields as a performance optimization + + Look into shortening the FWHT() since it takes a lot of decoder runtime + + Unroll first/final butterflies to avoid extra copies/xors in encoder + Look into getting EncodeL working so we can support smaller data (Ask Lin) + Look into using FFT_m instead of FFT_n for decoder */ diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp index 40e030c..86c779c 100644 --- a/tests/benchmark.cpp +++ b/tests/benchmark.cpp @@ -42,14 +42,14 @@ using namespace std; struct TestParameters { #ifdef LEO_HAS_FF16 - unsigned original_count = 677; // under 65536 - unsigned recovery_count = 487; // under 65536 - original_count + unsigned original_count = 1000; // under 65536 + unsigned recovery_count = 200; // under 65536 - original_count #else unsigned original_count = 128; // under 65536 unsigned recovery_count = 128; // under 65536 - original_count #endif unsigned buffer_bytes = 2560; // multiple of 64 bytes - unsigned loss_count = 2; // some fraction of original_count + unsigned loss_count = 500; // some fraction of original_count unsigned seed = 2; bool multithreaded = true; }; @@ -807,10 +807,13 @@ int main(int argc, char **argv) if (!BasicTest(params)) goto Failed; + + static const unsigned kMaxRandomData = 32768; + prng.Seed(params.seed, 8); for (;; ++params.seed) { - params.original_count = prng.Next() % 32768; + params.original_count = prng.Next() % kMaxRandomData; params.recovery_count = prng.Next() % params.original_count + 1; params.loss_count = prng.Next() % params.recovery_count + 1;