Only allocate memory for mul table used

Author: Christopher Taylor
Date: 2017-06-03 16:48:09 -07:00
Parent: 62f9f56555
Commit: 96bd047a2d

4 changed files with 119 additions and 76 deletions

File 1 of 4

@@ -157,6 +157,7 @@
 #include "leopard.h"
 #include <stdint.h>
+#include <malloc.h>

 //------------------------------------------------------------------------------
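One portability note on the new include: the C standard declares calloc and free in <stdlib.h>, while <malloc.h> is a non-standard header that is missing on some platforms (macOS, for instance, ships <malloc/malloc.h> instead). Assuming nothing else from <malloc.h> is needed, the portable spelling for the allocator added below would be:

    #include <stdlib.h> // calloc and free live here per the C standard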
@@ -421,4 +422,41 @@ protected:
 };
+
+
+//------------------------------------------------------------------------------
+// SIMD-Safe Aligned Memory Allocations
+
+static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
+
+LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
+{
+    return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
+}
+
+static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
+{
+    uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
+    if (!data)
+        return nullptr;
+    unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
+    data += kAlignmentBytes - offset;
+    data[-1] = (uint8_t)offset;
+    return data;
+}
+
+static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
+{
+    if (!ptr)
+        return;
+    uint8_t* data = (uint8_t*)ptr;
+    unsigned offset = data[-1];
+    if (offset >= kAlignmentBytes)
+    {
+        LEO_DEBUG_BREAK; // Should never happen
+        return;
+    }
+    data -= kAlignmentBytes - offset;
+    free(data);
+}
+
 } // namespace leopard
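The trick here is compact enough to deserve a note: SIMDSafeAllocate over-allocates by kAlignmentBytes, advances the returned pointer to the next alignment boundary, and stashes the original misalignment in the byte just before that pointer, which is how SIMDSafeFree later recovers the address calloc returned. NextAlignedOffset is the usual power-of-two round-up; for example, with 32-byte alignment, NextAlignedOffset(37) = (37 + 31) & ~31 = 64. A minimal usage sketch, assuming LEO_ALIGN_BYTES is a power of two such as 32:

    #include <cassert>
    #include <cstdint>

    void Example()
    {
        uint8_t* buffer = leopard::SIMDSafeAllocate(4096);
        assert(buffer != nullptr);
        // The returned pointer is safe for aligned SIMD loads/stores:
        assert(((uintptr_t)buffer % LEO_ALIGN_BYTES) == 0);
        // ... fill and process the buffer ...
        leopard::SIMDSafeFree(buffer); // recovers and frees the original block
    }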

File 2 of 4

@@ -199,20 +199,38 @@ static void InitializeLogarithmTables()
     The ALTMAP memory layout is used since there is no need to convert in/out.
 */

-struct {
-    LEO_ALIGNED LEO_M128 Lo[4];
-    LEO_ALIGNED LEO_M128 Hi[4];
-} static Multiply128LUT[kOrder];
+struct Multiply128LUT_t
+{
+    LEO_M128 Lo[4];
+    LEO_M128 Hi[4];
+};
+
+static const Multiply128LUT_t* Multiply128LUT = nullptr;

 #if defined(LEO_TRY_AVX2)
-struct {
-    LEO_ALIGNED LEO_M256 Lo[4];
-    LEO_ALIGNED LEO_M256 Hi[4];
-} static Multiply256LUT[kOrder];
+
+struct Multiply256LUT_t
+{
+    LEO_M256 Lo[4];
+    LEO_M256 Hi[4];
+};
+
+static const Multiply256LUT_t* Multiply256LUT = nullptr;
+
 #endif // LEO_TRY_AVX2


 void InitializeMultiplyTables()
 {
+    // If we cannot use the PSHUFB instruction, generate Multiply8LUT:
+    if (!CpuHasSSSE3)
+        return;
+
+    if (CpuHasAVX2)
+        Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
+    else
+        Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
+
     // For each value we could multiply by:
     for (unsigned log_m = 0; log_m < kOrder; ++log_m)
     {
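To put rough numbers on the savings, assuming kOrder = 65536 for this 16-bit field (an assumption, not stated in the diff): one Multiply128LUT_t holds 8 vectors of 16 bytes, so 128 bytes per entry and 65536 × 128 B = 8 MiB for the whole table; the 256-bit variant is twice that, 16 MiB. The old code reserved both static arrays unconditionally on AVX2-capable builds (24 MiB combined), while the new code heap-allocates exactly one of them, and nothing at all when SSSE3 is unavailable.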
@@ -232,16 +250,19 @@ void InitializeMultiplyTables()
     const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi);

     // Store in 128-bit wide table
-    _mm_storeu_si128(&Multiply128LUT[log_m].Lo[i], value_lo);
-    _mm_storeu_si128(&Multiply128LUT[log_m].Hi[i], value_hi);
+    if (!CpuHasAVX2)
+    {
+        _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Lo[i], value_lo);
+        _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Hi[i], value_hi);
+    }

     // Store in 256-bit wide table
 #if defined(LEO_TRY_AVX2)
     if (CpuHasAVX2)
     {
-        _mm256_storeu_si256(&Multiply256LUT[log_m].Lo[i],
+        _mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Lo[i],
             _mm256_broadcastsi128_si256(value_lo));
-        _mm256_storeu_si256(&Multiply256LUT[log_m].Hi[i],
+        _mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Hi[i],
             _mm256_broadcastsi128_si256(value_hi));
     }
 #endif // LEO_TRY_AVX2
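Why the 256-bit stores go through _mm256_broadcastsi128_si256: AVX2's _mm256_shuffle_epi8 performs two independent 16-entry lookups, one per 128-bit lane, so each lane must hold its own copy of the 16-byte table. A sketch of the lookup side under that assumption (the helper name and arguments are illustrative, not part of the commit):

    #include <immintrin.h>

    // Both lanes of 'table' carry the same 16 bytes, so the per-lane
    // shuffle behaves like a single table lookup across all 32 bytes.
    static inline __m256i LookupNibbles(__m128i lut16, __m256i nibbles)
    {
        const __m256i table = _mm256_broadcastsi128_si256(lut16); // [LUT | LUT]
        return _mm256_shuffle_epi8(table, nibbles);               // per-lane PSHUFB
    }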

File 3 of 4

@@ -202,19 +202,27 @@ static void InitializeLogarithmTables()
     Specifically section 6 outlines the algorithm used here for 8-bit fields.
 */

-struct {
-    LEO_ALIGNED LEO_M128 Value[2];
-} static Multiply128LUT[kOrder];
+struct Multiply128LUT_t
+{
+    LEO_M128 Value[2];
+};
+
+static const Multiply128LUT_t* Multiply128LUT = nullptr;

 #if defined(LEO_TRY_AVX2)
-struct {
-    LEO_ALIGNED LEO_M256 Value[2];
-} static Multiply256LUT[kOrder];
+
+struct Multiply256LUT_t
+{
+    LEO_M256 Value[2];
+};
+
+static const Multiply256LUT_t* Multiply256LUT = nullptr;
+
 #endif // LEO_TRY_AVX2

 // Stores the product of x * y at offset x + y * 256
 // Repeated accesses from the same y value are faster
-static ffe_t Multiply8LUT[256 * 256] = {};
+static const ffe_t* Multiply8LUT = nullptr;

 static void InitializeMultiplyTables()
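The layout comment above is worth unpacking: with the product of x * y stored at offset x + y * 256, all 256 products for a fixed right-multiplicand y sit in one contiguous 256-byte row, which is why repeated accesses with the same y are fast. A sketch of how such a table would be consumed (the helper name is hypothetical; ffe_t and Multiply8LUT are from this file):

    // Hypothetical helper: multiply every symbol of a region by the constant y.
    static void MultiplyRegionByY(ffe_t* data, unsigned bytes, ffe_t y)
    {
        const ffe_t* row = Multiply8LUT + (unsigned)y * 256; // one cache-resident row
        for (unsigned i = 0; i < bytes; ++i)
            data[i] = row[data[i]];
    }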
@@ -222,15 +230,20 @@ static void InitializeMultiplyTables()
     // If we cannot use the PSHUFB instruction, generate Multiply8LUT:
     if (!CpuHasSSSE3)
     {
+        Multiply8LUT = new ffe_t[256 * 256];
+
         // For each left-multiplicand:
         for (unsigned x = 0; x < 256; ++x)
         {
-            ffe_t* lut = Multiply8LUT + x;
+            ffe_t* lut = (ffe_t*)Multiply8LUT + x;

-            // Note: Table is already zeroed out so we can skip the zeroes
             if (x == 0)
-                continue;
+            {
+                for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
+                    *lut = 0;
+            }
+            else
+            {
                 const ffe_t log_x = LogLUT[x];

                 for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
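The new x == 0 branch follows directly from the allocation change: the old static ffe_t Multiply8LUT[256 * 256] = {} lived in zero-initialized storage, so the zero products came for free, whereas new ffe_t[256 * 256] returns uninitialized memory and the zero row must now be written by hand. A minimal illustration of the C++ distinction in play:

    #include <cstdint>

    typedef uint8_t ffe_t;

    void InitExample()
    {
        ffe_t* a = new ffe_t[256 * 256];   // default-initialized: contents indeterminate
        ffe_t* b = new ffe_t[256 * 256](); // value-initialized: all elements zero
        delete[] a;
        delete[] b;
    }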
@@ -239,10 +252,16 @@ static void InitializeMultiplyTables()
                     *lut = prod;
                 }
             }
+        }

         return;
     }

+    if (CpuHasAVX2)
+        Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
+    else
+        Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
+
     // For each value we could multiply by:
     for (unsigned log_m = 0; log_m < kOrder; ++log_m)
     {
@@ -254,16 +273,18 @@ static void InitializeMultiplyTables()
         for (ffe_t x = 0; x < 16; ++x)
             lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));

-        // Store in 128-bit wide table
         const LEO_M128 *v_ptr = reinterpret_cast<const LEO_M128 *>(&lut[0]);
         const LEO_M128 value = _mm_loadu_si128(v_ptr);
-        _mm_storeu_si128(&Multiply128LUT[log_m].Value[i], value);

+        // Store in 128-bit wide table
+        if (!CpuHasAVX2)
+            _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Value[i], value);
+
         // Store in 256-bit wide table
 #if defined(LEO_TRY_AVX2)
         if (CpuHasAVX2)
         {
-            _mm256_storeu_si256(&Multiply256LUT[log_m].Value[i],
+            _mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Value[i],
                 _mm256_broadcastsi128_si256(value));
         }
 #endif // LEO_TRY_AVX2
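For context on how the Value[2] pair gets used at multiply time: the loop above fills Value[0] with products of the low-nibble values (shift = 0) and Value[1] with products of the high-nibble values (shift = 4), and the classic PSHUFB technique then splits each input byte into two nibbles and XORs two lookups, since x = (hi << 4) ^ lo and multiplication distributes over GF addition (XOR). A sketch under those assumptions; the function name is illustrative, not from the commit:

    #include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8
    #include <cstdint>

    // Multiply 16 GF(2^8) symbols in 'x' by the constant whose nibble
    // product tables are in 'lut' (lut[0] = low nibble, lut[1] = high).
    static inline __m128i MultiplyBytes(__m128i x, const __m128i lut[2])
    {
        const __m128i mask = _mm_set1_epi8(0x0f);
        const __m128i lo = _mm_and_si128(x, mask);                    // x & 0x0f
        const __m128i hi = _mm_and_si128(_mm_srli_epi64(x, 4), mask); // x >> 4
        // m * x = m * (hi << 4) ^ m * lo, because GF addition is XOR
        return _mm_xor_si128(_mm_shuffle_epi8(lut[0], lo),
                             _mm_shuffle_epi8(lut[1], hi));
    }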

File 4 of 4

@@ -365,43 +365,6 @@ static void ShuffleDeck16(PCGRandom &prng, uint16_t * LEO_RESTRICT deck, uint32_
 }

-//------------------------------------------------------------------------------
-// SIMD-Safe Aligned Memory Allocations
-
-static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
-
-LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
-{
-    return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
-}
-
-static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
-{
-    uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
-    if (!data)
-        return nullptr;
-    unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
-    data += kAlignmentBytes - offset;
-    data[-1] = (uint8_t)offset;
-    return data;
-}
-
-static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
-{
-    if (!ptr)
-        return;
-    uint8_t* data = (uint8_t*)ptr;
-    unsigned offset = data[-1];
-    if (offset >= kAlignmentBytes)
-    {
-        LEO_DEBUG_BREAK; // Should never happen
-        return;
-    }
-    data -= kAlignmentBytes - offset;
-    free(data);
-}
-
 //------------------------------------------------------------------------------
 // Benchmark
@@ -430,11 +393,11 @@ static bool Benchmark(const TestParameters& params)
     t_mem_alloc.BeginCall();
     for (unsigned i = 0, count = params.original_count; i < count; ++i)
-        original_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+        original_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
     for (unsigned i = 0, count = encode_work_count; i < count; ++i)
-        encode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+        encode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
     for (unsigned i = 0, count = decode_work_count; i < count; ++i)
-        decode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+        decode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
     t_mem_alloc.EndCall();

     // Generate data:
@@ -479,7 +442,7 @@ static bool Benchmark(const TestParameters& params)
     for (unsigned i = 0, count = params.loss_count; i < count; ++i)
     {
         const unsigned loss_index = original_losses[i];
-        SIMDSafeFree(original_data[loss_index]);
+        leopard::SIMDSafeFree(original_data[loss_index]);
         original_data[loss_index] = nullptr;
     }
@@ -493,7 +456,7 @@ static bool Benchmark(const TestParameters& params)
     for (unsigned i = 0, count = recovery_loss_count; i < count; ++i)
     {
         const unsigned loss_index = recovery_losses[i];
-        SIMDSafeFree(encode_work_data[loss_index]);
+        leopard::SIMDSafeFree(encode_work_data[loss_index]);
         encode_work_data[loss_index] = nullptr;
     }
@@ -535,11 +498,11 @@ static bool Benchmark(const TestParameters& params)
     t_mem_free.BeginCall();
     for (unsigned i = 0, count = params.original_count; i < count; ++i)
-        SIMDSafeFree(original_data[i]);
+        leopard::SIMDSafeFree(original_data[i]);
     for (unsigned i = 0, count = encode_work_count; i < count; ++i)
-        SIMDSafeFree(encode_work_data[i]);
+        leopard::SIMDSafeFree(encode_work_data[i]);
     for (unsigned i = 0, count = decode_work_count; i < count; ++i)
-        SIMDSafeFree(decode_work_data[i]);
+        leopard::SIMDSafeFree(decode_work_data[i]);
     t_mem_free.EndCall();
 }