Only allocate memory for the multiplication lookup table variant that is actually used

This commit is contained in:
Christopher Taylor 2017-06-03 16:48:09 -07:00
parent 62f9f56555
commit 96bd047a2d
4 changed files with 119 additions and 76 deletions

View File

@ -157,6 +157,7 @@
#include "leopard.h"
#include <stdint.h>
#include <malloc.h>
//------------------------------------------------------------------------------
@ -421,4 +422,41 @@ protected:
};
//------------------------------------------------------------------------------
// SIMD-Safe Aligned Memory Allocations
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;

// Round `offset` up to the next multiple of kAlignmentBytes.
// Returns `offset` unchanged when it is already aligned.
// Assumes kAlignmentBytes is a power of two (mask arithmetic).
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
{
    const unsigned mask = kAlignmentBytes - 1;
    return (offset + mask) & ~mask;
}
// Allocate `size` zero-initialized bytes whose address is aligned to
// kAlignmentBytes. One extra alignment block is over-allocated and the
// byte immediately before the returned pointer records the original
// misalignment so SIMDSafeFree() can recover the calloc() pointer.
// Returns nullptr on allocation failure.
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
{
    uint8_t* const raw = (uint8_t*)calloc(1, kAlignmentBytes + size);
    if (!raw)
        return nullptr;

    // How far past an alignment boundary calloc() landed
    const unsigned misalign = (unsigned)((uintptr_t)raw % kAlignmentBytes);

    // Advance into the padding so the result is aligned, then stash the
    // misalignment in the byte just before the returned pointer.
    uint8_t* const aligned = raw + (kAlignmentBytes - misalign);
    aligned[-1] = (uint8_t)misalign;
    return aligned;
}
// Release memory returned by SIMDSafeAllocate(). Safe to call with
// nullptr. Reads the misalignment byte stored just before `ptr` to
// reconstruct the original calloc() pointer before calling free().
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
{
    if (!ptr)
        return;

    uint8_t* const aligned = (uint8_t*)ptr;
    const unsigned misalign = aligned[-1];

    // A stored value >= alignment means the header byte is corrupt or
    // `ptr` did not come from SIMDSafeAllocate() -- do not free.
    if (misalign >= kAlignmentBytes)
    {
        LEO_DEBUG_BREAK; // Should never happen
        return;
    }

    free(aligned - (kAlignmentBytes - misalign));
}
} // namespace leopard

View File

@ -199,20 +199,38 @@ static void InitializeLogarithmTables()
The ALTMAP memory layout is used since there is no need to convert in/out.
*/
struct {
LEO_ALIGNED LEO_M128 Lo[4];
LEO_ALIGNED LEO_M128 Hi[4];
} static Multiply128LUT[kOrder];
// Per-log_m SSSE3 multiply lookup tables, split by low/high input nibble.
// Formerly a static array of kOrder entries; now a pointer so the table is
// only allocated (via SIMDSafeAllocate in InitializeMultiplyTables) when
// the SSSE3 path is actually selected.
struct Multiply128LUT_t
{
LEO_M128 Lo[4];
LEO_M128 Hi[4];
};

// nullptr until InitializeMultiplyTables() allocates it (non-AVX2 path only).
static const Multiply128LUT_t* Multiply128LUT = nullptr;
#if defined(LEO_TRY_AVX2)
struct {
LEO_ALIGNED LEO_M256 Lo[4];
LEO_ALIGNED LEO_M256 Hi[4];
} static Multiply256LUT[kOrder];
// AVX2 (256-bit) variant of the multiply lookup table; each half of the
// 256-bit lane holds a broadcast copy of the 128-bit table entry.
struct Multiply256LUT_t
{
LEO_M256 Lo[4];
LEO_M256 Hi[4];
};

// nullptr until InitializeMultiplyTables() allocates it (AVX2 path only).
static const Multiply256LUT_t* Multiply256LUT = nullptr;
#endif // LEO_TRY_AVX2
void InitializeMultiplyTables()
{
// If we cannot use the PSHUFB instruction, generate Multiply8LUT:
if (!CpuHasSSSE3)
return;
if (CpuHasAVX2)
Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
else
Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
// For each value we could multiply by:
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
{
@ -232,16 +250,19 @@ void InitializeMultiplyTables()
const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi);
// Store in 128-bit wide table
_mm_storeu_si128(&Multiply128LUT[log_m].Lo[i], value_lo);
_mm_storeu_si128(&Multiply128LUT[log_m].Hi[i], value_hi);
if (!CpuHasAVX2)
{
_mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Lo[i], value_lo);
_mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Hi[i], value_hi);
}
// Store in 256-bit wide table
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
_mm256_storeu_si256(&Multiply256LUT[log_m].Lo[i],
_mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Lo[i],
_mm256_broadcastsi128_si256(value_lo));
_mm256_storeu_si256(&Multiply256LUT[log_m].Hi[i],
_mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Hi[i],
_mm256_broadcastsi128_si256(value_hi));
}
#endif // LEO_TRY_AVX2

View File

@ -202,19 +202,27 @@ static void InitializeLogarithmTables()
Specifically section 6 outlines the algorithm used here for 8-bit fields.
*/
struct {
LEO_ALIGNED LEO_M128 Value[2];
} static Multiply128LUT[kOrder];
// Per-log_m SSSE3 multiply lookup table for the 8-bit field.
// Allocated on demand in InitializeMultiplyTables() instead of being a
// static kOrder-sized array, so memory is only used when SSSE3 is chosen.
struct Multiply128LUT_t
{
LEO_M128 Value[2];
};

// nullptr until InitializeMultiplyTables() allocates it (non-AVX2 path only).
static const Multiply128LUT_t* Multiply128LUT = nullptr;
#if defined(LEO_TRY_AVX2)
struct {
LEO_ALIGNED LEO_M256 Value[2];
} static Multiply256LUT[kOrder];
// AVX2 (256-bit) variant for the 8-bit field; each 256-bit entry holds a
// broadcast copy of the corresponding 128-bit table value.
struct Multiply256LUT_t
{
LEO_M256 Value[2];
};

// nullptr until InitializeMultiplyTables() allocates it (AVX2 path only).
static const Multiply256LUT_t* Multiply256LUT = nullptr;
#endif // LEO_TRY_AVX2
// Stores the product of x * y at offset x + y * 256
// Repeated accesses from the same y value are faster
static ffe_t Multiply8LUT[256 * 256] = {};
static const ffe_t* Multiply8LUT = nullptr;
static void InitializeMultiplyTables()
@ -222,27 +230,38 @@ static void InitializeMultiplyTables()
// If we cannot use the PSHUFB instruction, generate Multiply8LUT:
if (!CpuHasSSSE3)
{
Multiply8LUT = new ffe_t[256 * 256];
// For each left-multiplicand:
for (unsigned x = 0; x < 256; ++x)
{
ffe_t* lut = Multiply8LUT + x;
ffe_t* lut = (ffe_t*)Multiply8LUT + x;
// Note: Table is already zeroed out so we can skip the zeroes
if (x == 0)
continue;
const ffe_t log_x = LogLUT[x];
for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
{
const ffe_t prod = ExpLUT[AddMod(log_x, log_y)];
*lut = prod;
for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
*lut = 0;
}
else
{
const ffe_t log_x = LogLUT[x];
for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
{
const ffe_t prod = ExpLUT[AddMod(log_x, log_y)];
*lut = prod;
}
}
}
return;
}
if (CpuHasAVX2)
Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
else
Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
// For each value we could multiply by:
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
{
@ -254,16 +273,18 @@ static void InitializeMultiplyTables()
for (ffe_t x = 0; x < 16; ++x)
lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));
// Store in 128-bit wide table
const LEO_M128 *v_ptr = reinterpret_cast<const LEO_M128 *>(&lut[0]);
const LEO_M128 value = _mm_loadu_si128(v_ptr);
_mm_storeu_si128(&Multiply128LUT[log_m].Value[i], value);
// Store in 128-bit wide table
if (!CpuHasAVX2)
_mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Value[i], value);
// Store in 256-bit wide table
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
_mm256_storeu_si256(&Multiply256LUT[log_m].Value[i],
_mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Value[i],
_mm256_broadcastsi128_si256(value));
}
#endif // LEO_TRY_AVX2

View File

@ -365,43 +365,6 @@ static void ShuffleDeck16(PCGRandom &prng, uint16_t * LEO_RESTRICT deck, uint32_
}
//------------------------------------------------------------------------------
// SIMD-Safe Aligned Memory Allocations
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
{
return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
}
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
{
uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
if (!data)
return nullptr;
unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
data += kAlignmentBytes - offset;
data[-1] = (uint8_t)offset;
return data;
}
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
{
if (!ptr)
return;
uint8_t* data = (uint8_t*)ptr;
unsigned offset = data[-1];
if (offset >= kAlignmentBytes)
{
LEO_DEBUG_BREAK; // Should never happen
return;
}
data -= kAlignmentBytes - offset;
free(data);
}
//------------------------------------------------------------------------------
// Benchmark
@ -430,11 +393,11 @@ static bool Benchmark(const TestParameters& params)
t_mem_alloc.BeginCall();
for (unsigned i = 0, count = params.original_count; i < count; ++i)
original_data[i] = SIMDSafeAllocate(params.buffer_bytes);
original_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
encode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
encode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
decode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
decode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
t_mem_alloc.EndCall();
// Generate data:
@ -479,7 +442,7 @@ static bool Benchmark(const TestParameters& params)
for (unsigned i = 0, count = params.loss_count; i < count; ++i)
{
const unsigned loss_index = original_losses[i];
SIMDSafeFree(original_data[loss_index]);
leopard::SIMDSafeFree(original_data[loss_index]);
original_data[loss_index] = nullptr;
}
@ -493,7 +456,7 @@ static bool Benchmark(const TestParameters& params)
for (unsigned i = 0, count = recovery_loss_count; i < count; ++i)
{
const unsigned loss_index = recovery_losses[i];
SIMDSafeFree(encode_work_data[loss_index]);
leopard::SIMDSafeFree(encode_work_data[loss_index]);
encode_work_data[loss_index] = nullptr;
}
@ -535,11 +498,11 @@ static bool Benchmark(const TestParameters& params)
t_mem_free.BeginCall();
for (unsigned i = 0, count = params.original_count; i < count; ++i)
SIMDSafeFree(original_data[i]);
leopard::SIMDSafeFree(original_data[i]);
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
SIMDSafeFree(encode_work_data[i]);
leopard::SIMDSafeFree(encode_work_data[i]);
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
SIMDSafeFree(decode_work_data[i]);
leopard::SIMDSafeFree(decode_work_data[i]);
t_mem_free.EndCall();
}