From bf5795fd17ef3d68dae48d0d3c4ce05cc60f1542 Mon Sep 17 00:00:00 2001 From: Christopher Taylor Date: Tue, 20 Jun 2017 22:47:41 -0700 Subject: [PATCH] Add CMakeLists and fix OS X compat --- CMakeLists.txt | 37 +++++++++++++++++++++++++++++++++++++ LeopardCommon.h | 2 ++ LeopardFF16.cpp | 15 +++++++-------- LeopardFF8.cpp | 8 ++++++-- tests/benchmark.cpp | 7 ++++++- 5 files changed, 58 insertions(+), 11 deletions(-) create mode 100644 CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..926ca66 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,37 @@ +cmake_minimum_required(VERSION 3.7) +project(leopard) + +set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE) + +set(CMAKE_CXX_STANDARD 11) + +set(LIB_SOURCE_FILES + leopard.cpp + leopard.h + LeopardCommon.cpp + LeopardCommon.h + LeopardFF16.cpp + LeopardFF16.h + LeopardFF8.cpp + LeopardFF8.h) + +set(BENCH_SOURCE_FILES + tests/benchmark.cpp) + +set(EXPERIMENT_SOURCE_FILES + tests/experiments.cpp) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_FLAGS "-Wall -Wextra") +set(CMAKE_CXX_FLAGS_DEBUG "-g -O0") +set(CMAKE_CXX_FLAGS_RELEASE "-O3") + +add_library(libleopard STATIC ${LIB_SOURCE_FILES}) + +add_executable(bench_leopard ${BENCH_SOURCE_FILES}) +target_link_libraries(bench_leopard libleopard) + +add_executable(experiment_leopard ${EXPERIMENT_SOURCE_FILES}) diff --git a/LeopardCommon.h b/LeopardCommon.h index a399933..467f829 100644 --- a/LeopardCommon.h +++ b/LeopardCommon.h @@ -153,7 +153,9 @@ #include "leopard.h" #include +#ifdef _WIN32 #include +#endif //_WIN32 #include #include #include diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp index f8892d7..3241c66 100644 --- a/LeopardFF16.cpp +++ b/LeopardFF16.cpp @@ -381,7 +381,7 @@ static void InitializeMultiplyTables() // For each log_m multiplicand: #pragma omp parallel for - for (int log_m = 0; log_m < kOrder; ++log_m) + for (int log_m = 0; log_m < (int)kOrder; ++log_m) { const Product16Table& lut = Multiply16LUT[log_m]; @@ -400,14 +400,16 @@ static void InitializeMultiplyTables() return; } +#if defined(LEO_TRY_AVX2) if (CpuHasAVX2) Multiply256LUT = reinterpret_cast(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder)); else +#endif // LEO_TRY_AVX2 Multiply128LUT = reinterpret_cast(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder)); // For each value we could multiply by: #pragma omp parallel for - for (int log_m = 0; log_m < kOrder; ++log_m) + for (int log_m = 0; log_m < (int)kOrder; ++log_m) { // For each 4 bits of the finite field width in bits: for (unsigned i = 0, shift = 0; i < 4; ++i, shift += 4) @@ -425,7 +427,9 @@ static void InitializeMultiplyTables() const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi); // Store in 128-bit wide table +#if defined(LEO_TRY_AVX2) if (!CpuHasAVX2) +#endif // LEO_TRY_AVX2 { _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Lo[i], value_lo); _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Hi[i], value_hi); @@ -1341,9 +1345,6 @@ static void FFT_DIT( unsigned dist4 = m, dist = m >> 2; for (; dist != 0; dist4 = dist, dist >>= 2) { - const unsigned thread_u = m_truncated / dist4; - const unsigned thread_v = dist; - // For each set of dist*4 elements: #pragma omp parallel for for (int r = 0; r < (int)m_truncated; r += dist4) @@ -1439,8 +1440,6 @@ void ReedSolomonEncode( // Handle final partial set of m pieces: if (last_count != 0) { - const unsigned i = original_count - last_count; - data += m; skewLUT += m; @@ -1692,7 +1691,7 @@ void ReedSolomonDecode( FWHT(error_locations, kOrder, m + original_count); #pragma omp parallel for - for (int i = 0; i < kOrder; ++i) + for (int i = 0; i < (int)kOrder; ++i) error_locations[i] = ((unsigned)error_locations[i] * (unsigned)LogWalsh[i]) % kModulus; FWHT(error_locations, kOrder, kOrder); diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp index b87eda1..2f941f8 100644 --- a/LeopardFF8.cpp +++ b/LeopardFF8.cpp @@ -368,9 +368,11 @@ static void InitializeMultiplyTables() return; } +#ifdef LEO_TRY_AVX2 if (CpuHasAVX2) Multiply256LUT = reinterpret_cast(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder)); else +#endif // LEO_TRY_AVX2 Multiply128LUT = reinterpret_cast(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder)); // For each value we could multiply by: @@ -388,7 +390,9 @@ static void InitializeMultiplyTables() const LEO_M128 value = _mm_loadu_si128(v_ptr); // Store in 128-bit wide table +#if defined(LEO_TRY_AVX2) if (!CpuHasAVX2) +#endif // LEO_TRY_AVX2 _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Value[i], value); // Store in 256-bit wide table @@ -1397,6 +1401,7 @@ static void FFT_DIT4( { #ifdef LEO_INTERLEAVE_BUTTERFLY4_OPT +#if defined(LEO_TRY_AVX2) if (CpuHasAVX2) { const LEO_M256 t01_lo = _mm256_loadu_si256(&Multiply256LUT[log_m01].Value[0]); @@ -1451,6 +1456,7 @@ static void FFT_DIT4( return; } +#endif // LEO_TRY_AVX2 if (CpuHasSSSE3) { @@ -1639,8 +1645,6 @@ void ReedSolomonEncode( // Handle final partial set of m pieces: if (last_count != 0) { - const unsigned i = original_count - last_count; - data += m; skewLUT += m; diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp index b5b7ed6..f3f32d5 100644 --- a/tests/benchmark.cpp +++ b/tests/benchmark.cpp @@ -91,7 +91,8 @@ static bool SetCurrentThreadPriority() #ifdef _WIN32 return 0 != ::SetThreadPriority(::GetCurrentThread(), THREAD_PRIORITY_ABOVE_NORMAL); #else - return -1 != nice(2); + // setpriority on mac os x + return true; #endif } @@ -99,6 +100,10 @@ static bool SetCurrentThreadPriority() //------------------------------------------------------------------------------ // Timing +#ifndef _WIN32 +#include +#endif + static uint64_t GetTimeUsec() { #ifdef _WIN32