From 96bd047a2d823662b658d578a30fd89c078c7bde Mon Sep 17 00:00:00 2001
From: Christopher Taylor
Date: Sat, 3 Jun 2017 16:48:09 -0700
Subject: [PATCH] Only allocate memory for mul table used

---
 LeopardCommon.h     | 38 +++++++++++++++++++++++++++++
 LeopardFF16.cpp     | 45 +++++++++++++++++++++++++---------
 LeopardFF8.cpp      | 59 ++++++++++++++++++++++++++++++---------------
 tests/benchmark.cpp | 53 ++++++----------------------------------
 4 files changed, 119 insertions(+), 76 deletions(-)

diff --git a/LeopardCommon.h b/LeopardCommon.h
index bf855de..d1b15f5 100644
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@@ -157,6 +157,7 @@
 #include "leopard.h"

 #include <stdint.h>
+#include <stdlib.h>


 //------------------------------------------------------------------------------
@@ -421,4 +422,41 @@ protected:
 };


+//------------------------------------------------------------------------------
+// SIMD-Safe Aligned Memory Allocations
+
+static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
+
+LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
+{
+    return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
+}
+
+static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
+{
+    uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
+    if (!data)
+        return nullptr;
+    unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
+    data += kAlignmentBytes - offset;
+    data[-1] = (uint8_t)offset;
+    return data;
+}
+
+static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
+{
+    if (!ptr)
+        return;
+    uint8_t* data = (uint8_t*)ptr;
+    unsigned offset = data[-1];
+    if (offset >= kAlignmentBytes)
+    {
+        LEO_DEBUG_BREAK; // Should never happen
+        return;
+    }
+    data -= kAlignmentBytes - offset;
+    free(data);
+}
+
+
 } // namespace leopard
diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp
index 926f467..f1bbbea 100644
--- a/LeopardFF16.cpp
+++ b/LeopardFF16.cpp
@@ -199,20 +199,38 @@ static void InitializeLogarithmTables()
     The ALTMAP memory layout is used since there is no need to convert in/out.
 */

-struct {
-    LEO_ALIGNED LEO_M128 Lo[4];
-    LEO_ALIGNED LEO_M128 Hi[4];
-} static Multiply128LUT[kOrder];
+struct Multiply128LUT_t
+{
+    LEO_M128 Lo[4];
+    LEO_M128 Hi[4];
+};
+
+static const Multiply128LUT_t* Multiply128LUT = nullptr;
+

 #if defined(LEO_TRY_AVX2)
-struct {
-    LEO_ALIGNED LEO_M256 Lo[4];
-    LEO_ALIGNED LEO_M256 Hi[4];
-} static Multiply256LUT[kOrder];
+
+struct Multiply256LUT_t
+{
+    LEO_M256 Lo[4];
+    LEO_M256 Hi[4];
+};
+
+static const Multiply256LUT_t* Multiply256LUT = nullptr;
+
 #endif // LEO_TRY_AVX2

 void InitializeMultiplyTables()
 {
+    // If we cannot use the PSHUFB instruction, no multiply tables are needed:
+    if (!CpuHasSSSE3)
+        return;
+
+    if (CpuHasAVX2)
+        Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
+    else
+        Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
+
     // For each value we could multiply by:
     for (unsigned log_m = 0; log_m < kOrder; ++log_m)
     {
@@ -232,16 +250,19 @@ void InitializeMultiplyTables()
             const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi);

             // Store in 128-bit wide table
-            _mm_storeu_si128(&Multiply128LUT[log_m].Lo[i], value_lo);
-            _mm_storeu_si128(&Multiply128LUT[log_m].Hi[i], value_hi);
+            if (!CpuHasAVX2)
+            {
+                _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Lo[i], value_lo);
+                _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Hi[i], value_hi);
+            }

             // Store in 256-bit wide table
 #if defined(LEO_TRY_AVX2)
             if (CpuHasAVX2)
             {
-                _mm256_storeu_si256(&Multiply256LUT[log_m].Lo[i],
+                _mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Lo[i],
                     _mm256_broadcastsi128_si256(value_lo));
-                _mm256_storeu_si256(&Multiply256LUT[log_m].Hi[i],
+                _mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Hi[i],
                     _mm256_broadcastsi128_si256(value_hi));
             }
 #endif // LEO_TRY_AVX2
diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp
index d9eaaa9..b9c2ccc 100644
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@@ -202,19 +202,27 @@ static void InitializeLogarithmTables()
     Specifically section 6 outlines the algorithm used here for 8-bit fields.
 */

-struct {
-    LEO_ALIGNED LEO_M128 Value[2];
-} static Multiply128LUT[kOrder];
+struct Multiply128LUT_t
+{
+    LEO_M128 Value[2];
+};
+
+static const Multiply128LUT_t* Multiply128LUT = nullptr;

 #if defined(LEO_TRY_AVX2)
-struct {
-    LEO_ALIGNED LEO_M256 Value[2];
-} static Multiply256LUT[kOrder];
+
+struct Multiply256LUT_t
+{
+    LEO_M256 Value[2];
+};
+
+static const Multiply256LUT_t* Multiply256LUT = nullptr;
+
 #endif // LEO_TRY_AVX2

 // Stores the product of x * y at offset x + y * 256
 // Repeated accesses from the same y value are faster
-static ffe_t Multiply8LUT[256 * 256] = {};
+static const ffe_t* Multiply8LUT = nullptr;


 static void InitializeMultiplyTables()
@@ -222,27 +230,38 @@ static void InitializeMultiplyTables()
     // If we cannot use the PSHUFB instruction, generate Multiply8LUT:
     if (!CpuHasSSSE3)
     {
+        Multiply8LUT = new ffe_t[256 * 256];
+
         // For each left-multiplicand:
         for (unsigned x = 0; x < 256; ++x)
         {
-            ffe_t* lut = Multiply8LUT + x;
+            ffe_t* lut = (ffe_t*)Multiply8LUT + x;

-            // Note: Table is already zeroed out so we can skip the zeroes
             if (x == 0)
-                continue;
-
-            const ffe_t log_x = LogLUT[x];
-
-            for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
             {
-                const ffe_t prod = ExpLUT[AddMod(log_x, log_y)];
-                *lut = prod;
+                for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
+                    *lut = 0;
+            }
+            else
+            {
+                const ffe_t log_x = LogLUT[x];
+
+                for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
+                {
+                    const ffe_t prod = ExpLUT[AddMod(log_x, log_y)];
+                    *lut = prod;
+                }
             }
         }

         return;
     }

+    if (CpuHasAVX2)
+        Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
+    else
+        Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
+
     // For each value we could multiply by:
     for (unsigned log_m = 0; log_m < kOrder; ++log_m)
     {
@@ -254,16 +273,18 @@ static void InitializeMultiplyTables()
             for (ffe_t x = 0; x < 16; ++x)
                 lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));

-            // Store in 128-bit wide table
             const LEO_M128 *v_ptr = reinterpret_cast<const LEO_M128 *>(&lut[0]);
             const LEO_M128 value = _mm_loadu_si128(v_ptr);
-            _mm_storeu_si128(&Multiply128LUT[log_m].Value[i], value);
+
+            // Store in 128-bit wide table
+            if (!CpuHasAVX2)
+                _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Value[i], value);

             // Store in 256-bit wide table
 #if defined(LEO_TRY_AVX2)
             if (CpuHasAVX2)
             {
-                _mm256_storeu_si256(&Multiply256LUT[log_m].Value[i],
+                _mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Value[i],
                     _mm256_broadcastsi128_si256(value));
             }
 #endif // LEO_TRY_AVX2
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
index 1c491b1..8215883 100644
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -365,43 +365,6 @@ static void ShuffleDeck16(PCGRandom &prng, uint16_t * LEO_RESTRICT deck, uint32_t
 }


-//------------------------------------------------------------------------------
-// SIMD-Safe Aligned Memory Allocations
-
-static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
-
-LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
-{
-    return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
-}
-
-static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
-{
-    uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
-    if (!data)
-        return nullptr;
-    unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
-    data += kAlignmentBytes - offset;
-    data[-1] = (uint8_t)offset;
-    return data;
-}
-
-static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
-{
-    if (!ptr)
-        return;
-    uint8_t* data = (uint8_t*)ptr;
-    unsigned offset = data[-1];
-    if (offset >= kAlignmentBytes)
-    {
-        LEO_DEBUG_BREAK; // Should never happen
-        return;
-    }
-    data -= kAlignmentBytes - offset;
-    free(data);
-}
-
-
 //------------------------------------------------------------------------------
 // Benchmark

@@ -430,11 +393,11 @@ static bool Benchmark(const TestParameters& params)

         t_mem_alloc.BeginCall();
         for (unsigned i = 0, count = params.original_count; i < count; ++i)
-            original_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+            original_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
         for (unsigned i = 0, count = encode_work_count; i < count; ++i)
-            encode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+            encode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
         for (unsigned i = 0, count = decode_work_count; i < count; ++i)
-            decode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+            decode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
         t_mem_alloc.EndCall();

         // Generate data:
@@ -479,7 +442,7 @@ static bool Benchmark(const TestParameters& params)
         for (unsigned i = 0, count = params.loss_count; i < count; ++i)
         {
             const unsigned loss_index = original_losses[i];
-            SIMDSafeFree(original_data[loss_index]);
+            leopard::SIMDSafeFree(original_data[loss_index]);
             original_data[loss_index] = nullptr;
         }

@@ -493,7 +456,7 @@ static bool Benchmark(const TestParameters& params)
         for (unsigned i = 0, count = recovery_loss_count; i < count; ++i)
         {
             const unsigned loss_index = recovery_losses[i];
-            SIMDSafeFree(encode_work_data[loss_index]);
+            leopard::SIMDSafeFree(encode_work_data[loss_index]);
             encode_work_data[loss_index] = nullptr;
         }

@@ -535,11 +498,11 @@ static bool Benchmark(const TestParameters& params)

         t_mem_free.BeginCall();
         for (unsigned i = 0, count = params.original_count; i < count; ++i)
-            SIMDSafeFree(original_data[i]);
+            leopard::SIMDSafeFree(original_data[i]);
         for (unsigned i = 0, count = encode_work_count; i < count; ++i)
-            SIMDSafeFree(encode_work_data[i]);
+            leopard::SIMDSafeFree(encode_work_data[i]);
         for (unsigned i = 0, count = decode_work_count; i < count; ++i)
-            SIMDSafeFree(decode_work_data[i]);
+            leopard::SIMDSafeFree(decode_work_data[i]);
         t_mem_free.EndCall();
     }
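
Note on the allocator added to LeopardCommon.h: SIMDSafeAllocate over-allocates by kAlignmentBytes, bumps the returned pointer up to the next alignment boundary, and stashes the original misalignment in the byte immediately before the returned pointer; SIMDSafeFree reads that byte back to recover the pointer calloc actually returned. Below is a minimal standalone sketch of the same trick, for reference only (not part of the patch); kAlign = 32 is an assumed stand-in for LEO_ALIGN_BYTES, and AlignedAlloc/AlignedFree are hypothetical names:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static const unsigned kAlign = 32; // assumed stand-in for LEO_ALIGN_BYTES

    static uint8_t* AlignedAlloc(size_t size)
    {
        // Over-allocate so the pointer can be bumped to an aligned address
        uint8_t* raw = (uint8_t*)calloc(1, kAlign + size);
        if (!raw)
            return nullptr;
        // Distance past the previous alignment boundary
        const unsigned offset = (unsigned)((uintptr_t)raw % kAlign);
        uint8_t* aligned = raw + (kAlign - offset);
        aligned[-1] = (uint8_t)offset; // remember the offset for the free path
        return aligned;
    }

    static void AlignedFree(void* ptr)
    {
        if (!ptr)
            return;
        uint8_t* aligned = (uint8_t*)ptr;
        const unsigned offset = aligned[-1];
        free(aligned - (kAlign - offset)); // undo the bump to recover the raw pointer
    }

    int main()
    {
        uint8_t* p = AlignedAlloc(1000);
        printf("aligned to %u bytes: %s\n", kAlign, ((uintptr_t)p % kAlign) == 0 ? "yes" : "no");
        AlignedFree(p);
        return 0;
    }

Because the bump is always at least one byte, the stored offset byte never falls outside the allocation, and the aligned pointer plus the requested size stays within the kAlign + size bytes obtained from calloc.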