From bf5795fd17ef3d68dae48d0d3c4ce05cc60f1542 Mon Sep 17 00:00:00 2001
From: Christopher Taylor <christopher.taylor@oculus.com>
Date: Tue, 20 Jun 2017 22:47:41 -0700
Subject: [PATCH] Add CMakeLists and fix OS X compat

---
 CMakeLists.txt      | 37 +++++++++++++++++++++++++++++++++++++
 LeopardCommon.h     |  2 ++
 LeopardFF16.cpp     | 15 +++++++--------
 LeopardFF8.cpp      |  8 ++++++--
 tests/benchmark.cpp |  7 ++++++-
 5 files changed, 58 insertions(+), 11 deletions(-)
 create mode 100644 CMakeLists.txt
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..926ca66
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,37 @@
+# Build script for the Leopard library, its benchmark, and experiments.
+cmake_minimum_required(VERSION 3.7)
+project(leopard)
+
+# Offer only Debug and Release to multi-config generators.
+set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE)
+
+set(CMAKE_CXX_STANDARD 11)
+
+set(LIB_SOURCE_FILES
+        leopard.cpp leopard.h
+        LeopardCommon.cpp LeopardCommon.h
+        LeopardFF16.cpp LeopardFF16.h
+        LeopardFF8.cpp LeopardFF8.h)
+
+set(BENCH_SOURCE_FILES tests/benchmark.cpp)
+
+set(EXPERIMENT_SOURCE_FILES tests/experiments.cpp)
+
+# Default to an optimized build when the user did not pick a build type.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Append the warning flags instead of overwriting CMAKE_CXX_FLAGS, so flags
+# supplied via CXXFLAGS or -DCMAKE_CXX_FLAGS=... (e.g. -mssse3, -mavx2) survive.
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
+set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
+set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+
+# Static library plus the benchmark that links against it.
+add_library(libleopard STATIC ${LIB_SOURCE_FILES})
+
+add_executable(bench_leopard ${BENCH_SOURCE_FILES})
+target_link_libraries(bench_leopard libleopard)
+
+add_executable(experiment_leopard ${EXPERIMENT_SOURCE_FILES})
diff --git a/LeopardCommon.h b/LeopardCommon.h
index a399933..467f829 100644
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@@ -153,7 +153,9 @@
 #include "leopard.h"
 
 #include <stdint.h>
+#ifdef _WIN32
 #include <malloc.h>
+#endif //_WIN32
 #include <vector>
 #include <atomic>
 #include <memory>
diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp
index f8892d7..3241c66 100644
--- a/LeopardFF16.cpp
+++ b/LeopardFF16.cpp
@@ -381,7 +381,7 @@ static void InitializeMultiplyTables()
 
         // For each log_m multiplicand:
 #pragma omp parallel for
-        for (int log_m = 0; log_m < kOrder; ++log_m)
+        for (int log_m = 0; log_m < (int)kOrder; ++log_m)
         {
             const Product16Table& lut = Multiply16LUT[log_m];
 
@@ -400,14 +400,16 @@ static void InitializeMultiplyTables()
         return;
     }
 
+#if defined(LEO_TRY_AVX2)
     if (CpuHasAVX2)
         Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
     else
+#endif // LEO_TRY_AVX2
         Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
 
     // For each value we could multiply by:
 #pragma omp parallel for
-    for (int log_m = 0; log_m < kOrder; ++log_m)
+    for (int log_m = 0; log_m < (int)kOrder; ++log_m)
     {
         // For each 4 bits of the finite field width in bits:
         for (unsigned i = 0, shift = 0; i < 4; ++i, shift += 4)
@@ -425,7 +427,9 @@ static void InitializeMultiplyTables()
             const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi);
 
             // Store in 128-bit wide table
+#if defined(LEO_TRY_AVX2)
             if (!CpuHasAVX2)
+#endif // LEO_TRY_AVX2
             {
                 _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Lo[i], value_lo);
                 _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Hi[i], value_hi);
@@ -1341,9 +1345,6 @@ static void FFT_DIT(
     unsigned dist4 = m, dist = m >> 2;
     for (; dist != 0; dist4 = dist, dist >>= 2)
     {
-        const unsigned thread_u = m_truncated / dist4;
-        const unsigned thread_v = dist;
-
         // For each set of dist*4 elements:
 #pragma omp parallel for
         for (int r = 0; r < (int)m_truncated; r += dist4)
@@ -1439,8 +1440,6 @@ void ReedSolomonEncode(
     // Handle final partial set of m pieces:
     if (last_count != 0)
     {
-        const unsigned i = original_count - last_count;
-
         data += m;
         skewLUT += m;
 
@@ -1692,7 +1691,7 @@ void ReedSolomonDecode(
     FWHT(error_locations, kOrder, m + original_count);
 
 #pragma omp parallel for
-    for (int i = 0; i < kOrder; ++i)
+    for (int i = 0; i < (int)kOrder; ++i)
         error_locations[i] = ((unsigned)error_locations[i] * (unsigned)LogWalsh[i]) % kModulus;
 
     FWHT(error_locations, kOrder, kOrder);
diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp
index b87eda1..2f941f8 100644
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@@ -368,9 +368,11 @@ static void InitializeMultiplyTables()
         return;
     }
 
+#ifdef LEO_TRY_AVX2
     if (CpuHasAVX2)
         Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
     else
+#endif // LEO_TRY_AVX2
         Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
 
     // For each value we could multiply by:
@@ -388,7 +390,9 @@ static void InitializeMultiplyTables()
             const LEO_M128 value = _mm_loadu_si128(v_ptr);
 
             // Store in 128-bit wide table
+#if defined(LEO_TRY_AVX2)
             if (!CpuHasAVX2)
+#endif // LEO_TRY_AVX2
                 _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Value[i], value);
 
             // Store in 256-bit wide table
@@ -1397,6 +1401,7 @@ static void FFT_DIT4(
 {
 #ifdef LEO_INTERLEAVE_BUTTERFLY4_OPT
 
+#if defined(LEO_TRY_AVX2)
     if (CpuHasAVX2)
     {
         const LEO_M256 t01_lo = _mm256_loadu_si256(&Multiply256LUT[log_m01].Value[0]);
@@ -1451,6 +1456,7 @@ static void FFT_DIT4(
 
         return;
     }
+#endif // LEO_TRY_AVX2
 
     if (CpuHasSSSE3)
     {
@@ -1639,8 +1645,6 @@ void ReedSolomonEncode(
     // Handle final partial set of m pieces:
     if (last_count != 0)
     {
-        const unsigned i = original_count - last_count;
-
         data += m;
         skewLUT += m;
 
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
index b5b7ed6..f3f32d5 100644
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -91,7 +91,8 @@ static bool SetCurrentThreadPriority()
 #ifdef _WIN32
     return 0 != ::SetThreadPriority(::GetCurrentThread(), THREAD_PRIORITY_ABOVE_NORMAL);
 #else
-    return -1 != nice(2);
+    // Priority is left unchanged on non-Windows platforms (TODO: setpriority on Mac OS X)
+    return true;
 #endif
 }
 
@@ -99,6 +100,10 @@ static bool SetCurrentThreadPriority()
 //------------------------------------------------------------------------------
 // Timing
 
+#ifndef _WIN32
+#include <sys/time.h>
+#endif
+
 static uint64_t GetTimeUsec()
 {
 #ifdef _WIN32