From c53b075edad6fc83ac5d27167001afb1649df067 Mon Sep 17 00:00:00 2001
From: Christopher Taylor <christopher.taylor@oculus.com>
Date: Tue, 30 May 2017 02:05:41 -0700
Subject: [PATCH] FF16 works: enable LEO_USE_VECTOR4_OPT, trim butterfly4 loops

---
 LeopardCommon.h     |  6 +++---
 LeopardFF16.cpp     | 33 +++------------------------------
 leopard.h           |  4 +++-
 tests/benchmark.cpp | 11 +++++++----
 4 files changed, 16 insertions(+), 38 deletions(-)

diff --git a/LeopardCommon.h b/LeopardCommon.h
index ee7b318..8684633 100644
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@@ -149,9 +149,6 @@
 //------------------------------------------------------------------------------
 // Constants
 
-// Unroll inner loops 4 times
-//#define LEO_USE_VECTOR4_OPT
-
 // Define this to enable the optimized version of FWHT()
 #define LEO_FWHT_OPT
 
@@ -164,6 +161,9 @@
 // Optimize M=1 case
 #define LEO_M1_OPT
 
+// Unroll inner loops 4 times
+#define LEO_USE_VECTOR4_OPT
+
 
 //------------------------------------------------------------------------------
 // Debug
diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp
index 0898cb5..3028971 100644
--- a/LeopardFF16.cpp
+++ b/LeopardFF16.cpp
@@ -636,8 +636,7 @@ void fft_butterfly4(
 #if defined(LEO_TRY_AVX2)
     if (CpuHasAVX2)
     {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
+        LEO_MUL_TABLES_256();
 
         const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
 
@@ -652,19 +651,15 @@ void fft_butterfly4(
 
         do
         {
-            LEO_FFTB_256(x32_0 + 1, y32_0 + 1);
             LEO_FFTB_256(x32_0, y32_0);
             y32_0 += 2, x32_0 += 2;
 
-            LEO_FFTB_256(x32_1 + 1, y32_1 + 1);
             LEO_FFTB_256(x32_1, y32_1);
             y32_1 += 2, x32_1 += 2;
 
-            LEO_FFTB_256(x32_2 + 1, y32_2 + 1);
             LEO_FFTB_256(x32_2, y32_2);
             y32_2 += 2, x32_2 += 2;
 
-            LEO_FFTB_256(x32_3 + 1, y32_3 + 1);
             LEO_FFTB_256(x32_3, y32_3);
             y32_3 += 2, x32_3 += 2;
 
@@ -690,26 +685,18 @@ void fft_butterfly4(
 
     do
     {
-        LEO_FFTB_128(x16_0 + 3, y16_0 + 3);
-        LEO_FFTB_128(x16_0 + 2, y16_0 + 2);
         LEO_FFTB_128(x16_0 + 1, y16_0 + 1);
         LEO_FFTB_128(x16_0, y16_0);
         x16_0 += 4, y16_0 += 4;
 
-        LEO_FFTB_128(x16_1 + 3, y16_1 + 3);
-        LEO_FFTB_128(x16_1 + 2, y16_1 + 2);
         LEO_FFTB_128(x16_1 + 1, y16_1 + 1);
         LEO_FFTB_128(x16_1, y16_1);
         x16_1 += 4, y16_1 += 4;
 
-        LEO_FFTB_128(x16_2 + 3, y16_2 + 3);
-        LEO_FFTB_128(x16_2 + 2, y16_2 + 2);
         LEO_FFTB_128(x16_2 + 1, y16_2 + 1);
         LEO_FFTB_128(x16_2, y16_2);
         x16_2 += 4, y16_2 += 4;
 
-        LEO_FFTB_128(x16_3 + 3, y16_3 + 3);
-        LEO_FFTB_128(x16_3 + 2, y16_3 + 2);
         LEO_FFTB_128(x16_3 + 1, y16_3 + 1);
         LEO_FFTB_128(x16_3, y16_3);
         x16_3 += 4, y16_3 += 4;
@@ -835,8 +822,7 @@ void ifft_butterfly4(
 #if defined(LEO_TRY_AVX2)
     if (CpuHasAVX2)
     {
-        const LEO_M256 table_lo_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[0]);
-        const LEO_M256 table_hi_y = _mm256_loadu_si256(&Multiply256LUT[log_m].Value[1]);
+        LEO_MUL_TABLES_256();
 
         const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
 
@@ -851,19 +837,15 @@ void ifft_butterfly4(
 
         do
         {
-            LEO_IFFTB_256(x32_0 + 1, y32_0 + 1);
             LEO_IFFTB_256(x32_0, y32_0);
             y32_0 += 2, x32_0 += 2;
 
-            LEO_IFFTB_256(x32_1 + 1, y32_1 + 1);
             LEO_IFFTB_256(x32_1, y32_1);
             y32_1 += 2, x32_1 += 2;
 
-            LEO_IFFTB_256(x32_2 + 1, y32_2 + 1);
             LEO_IFFTB_256(x32_2, y32_2);
             y32_2 += 2, x32_2 += 2;
 
-            LEO_IFFTB_256(x32_3 + 1, y32_3 + 1);
             LEO_IFFTB_256(x32_3, y32_3);
             y32_3 += 2, x32_3 += 2;
 
@@ -874,8 +856,7 @@ void ifft_butterfly4(
     }
 #endif // LEO_TRY_AVX2
 
-    const LEO_M128 table_lo_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[0]);
-    const LEO_M128 table_hi_y = _mm_loadu_si128(&Multiply128LUT[log_m].Value[1]);
+    LEO_MUL_TABLES_128();
 
     const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
 
@@ -890,26 +871,18 @@ void ifft_butterfly4(
 
     do
     {
-        LEO_IFFTB_128(x16_0 + 3, y16_0 + 3);
-        LEO_IFFTB_128(x16_0 + 2, y16_0 + 2);
         LEO_IFFTB_128(x16_0 + 1, y16_0 + 1);
         LEO_IFFTB_128(x16_0, y16_0);
         x16_0 += 4, y16_0 += 4;
 
-        LEO_IFFTB_128(x16_1 + 3, y16_1 + 3);
-        LEO_IFFTB_128(x16_1 + 2, y16_1 + 2);
         LEO_IFFTB_128(x16_1 + 1, y16_1 + 1);
         LEO_IFFTB_128(x16_1, y16_1);
         x16_1 += 4, y16_1 += 4;
 
-        LEO_IFFTB_128(x16_2 + 3, y16_2 + 3);
-        LEO_IFFTB_128(x16_2 + 2, y16_2 + 2);
         LEO_IFFTB_128(x16_2 + 1, y16_2 + 1);
         LEO_IFFTB_128(x16_2, y16_2);
         x16_2 += 4, y16_2 += 4;
 
-        LEO_IFFTB_128(x16_3 + 3, y16_3 + 3);
-        LEO_IFFTB_128(x16_3 + 2, y16_3 + 2);
         LEO_IFFTB_128(x16_3 + 1, y16_3 + 1);
         LEO_IFFTB_128(x16_3, y16_3);
         x16_3 += 4, y16_3 += 4;
diff --git a/leopard.h b/leopard.h
index 07529df..24e29bd 100644
--- a/leopard.h
+++ b/leopard.h
@@ -64,13 +64,15 @@
 
 /*
     TODO:
-    + Benchmarks for large data!
     + Add multi-threading to split up long parallelizable calculations
         + Final benchmarks!
     + Release version 1
         + Finish up documentation
 
     TBD:
+    + Look into 12-bit fields as a performance optimization
+    + Look into shortening the FWHT() since it takes a lot of decoder runtime
+    + Unroll first/final butterflies to avoid extra copies/xors in encoder
     + Look into getting EncodeL working so we can support smaller data (Ask Lin)
     + Look into using FFT_m instead of FFT_n for decoder
 */
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
index 40e030c..86c779c 100644
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -42,14 +42,14 @@ using namespace std;
 struct TestParameters
 {
 #ifdef LEO_HAS_FF16
-    unsigned original_count = 677; // under 65536
-    unsigned recovery_count = 487; // under 65536 - original_count
+    unsigned original_count = 1000; // under 65536
+    unsigned recovery_count = 200; // under 65536 - original_count
 #else
     unsigned original_count = 128; // under 65536
     unsigned recovery_count = 128; // under 65536 - original_count
 #endif
     unsigned buffer_bytes = 2560; // multiple of 64 bytes
-    unsigned loss_count = 2; // some fraction of original_count
+    unsigned loss_count = 500; // some fraction of original_count
     unsigned seed = 2;
     bool multithreaded = true;
 };
@@ -807,10 +807,13 @@ int main(int argc, char **argv)
     if (!BasicTest(params))
         goto Failed;
 
+
+    static const unsigned kMaxRandomData = 32768;
+
     prng.Seed(params.seed, 8);
     for (;; ++params.seed)
     {
-        params.original_count = prng.Next() % 32768;
+        params.original_count = prng.Next() % kMaxRandomData;
         params.recovery_count = prng.Next() % params.original_count + 1;
         params.loss_count = prng.Next() % params.recovery_count + 1;