From 96bd047a2d823662b658d578a30fd89c078c7bde Mon Sep 17 00:00:00 2001
From: Christopher Taylor
Date: Sat, 3 Jun 2017 16:48:09 -0700
Subject: [PATCH] Only allocate memory for mul table used

---
 LeopardCommon.h     | 38 +++++++++++++++++++++++++++++
 LeopardFF16.cpp     | 45 +++++++++++++++++++++++++---------
 LeopardFF8.cpp      | 59 ++++++++++++++++++++++++++++++---------------
 tests/benchmark.cpp | 53 ++++++----------------------------------
 4 files changed, 119 insertions(+), 76 deletions(-)

diff --git a/LeopardCommon.h b/LeopardCommon.h
index bf855de..d1b15f5 100644
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@@ -157,6 +157,7 @@
 #include "leopard.h"

 #include <stdint.h>
+#include <stdlib.h>


 //------------------------------------------------------------------------------
@@ -421,4 +422,41 @@ protected:
 };


+//------------------------------------------------------------------------------
+// SIMD-Safe Aligned Memory Allocations
+
+static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
+
+LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
+{
+    return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
+}
+
+static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
+{
+    uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
+    if (!data)
+        return nullptr;
+    unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
+    data += kAlignmentBytes - offset;
+    data[-1] = (uint8_t)offset;
+    return data;
+}
+
+static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
+{
+    if (!ptr)
+        return;
+    uint8_t* data = (uint8_t*)ptr;
+    unsigned offset = data[-1];
+    if (offset >= kAlignmentBytes)
+    {
+        LEO_DEBUG_BREAK; // Should never happen
+        return;
+    }
+    data -= kAlignmentBytes - offset;
+    free(data);
+}
+
+
 } // namespace leopard
diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp
index 926f467..f1bbbea 100644
--- a/LeopardFF16.cpp
+++ b/LeopardFF16.cpp
@@ -199,20 +199,38 @@ static void InitializeLogarithmTables()
     The ALTMAP memory layout is used since there is no need to convert in/out.
 */

-struct {
-    LEO_ALIGNED LEO_M128 Lo[4];
-    LEO_ALIGNED LEO_M128 Hi[4];
-} static Multiply128LUT[kOrder];
+struct Multiply128LUT_t
+{
+    LEO_M128 Lo[4];
+    LEO_M128 Hi[4];
+};
+
+static const Multiply128LUT_t* Multiply128LUT = nullptr;
+

 #if defined(LEO_TRY_AVX2)
-struct {
-    LEO_ALIGNED LEO_M256 Lo[4];
-    LEO_ALIGNED LEO_M256 Hi[4];
-} static Multiply256LUT[kOrder];
+
+struct Multiply256LUT_t
+{
+    LEO_M256 Lo[4];
+    LEO_M256 Hi[4];
+};
+
+static const Multiply256LUT_t* Multiply256LUT = nullptr;
+
 #endif // LEO_TRY_AVX2

 void InitializeMultiplyTables()
 {
+    // If we cannot use the PSHUFB instruction, no multiply tables are needed:
+    if (!CpuHasSSSE3)
+        return;
+
+    if (CpuHasAVX2)
+        Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
+    else
+        Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
+
     // For each value we could multiply by:
     for (unsigned log_m = 0; log_m < kOrder; ++log_m)
     {
@@ -232,16 +250,19 @@ void InitializeMultiplyTables()
             const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi);

             // Store in 128-bit wide table
-            _mm_storeu_si128(&Multiply128LUT[log_m].Lo[i], value_lo);
-            _mm_storeu_si128(&Multiply128LUT[log_m].Hi[i], value_hi);
+            if (!CpuHasAVX2)
+            {
+                _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Lo[i], value_lo);
+                _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Hi[i], value_hi);
+            }

             // Store in 256-bit wide table
 #if defined(LEO_TRY_AVX2)
             if (CpuHasAVX2)
             {
-                _mm256_storeu_si256(&Multiply256LUT[log_m].Lo[i],
+                _mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Lo[i],
                     _mm256_broadcastsi128_si256(value_lo));
-                _mm256_storeu_si256(&Multiply256LUT[log_m].Hi[i],
+                _mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Hi[i],
                     _mm256_broadcastsi128_si256(value_hi));
             }
 #endif // LEO_TRY_AVX2
diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp
index d9eaaa9..b9c2ccc 100644
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@@ -202,19 +202,27 @@ static void InitializeLogarithmTables()
     Specifically section 6 outlines the algorithm used here for 8-bit fields.
 */

-struct {
-    LEO_ALIGNED LEO_M128 Value[2];
-} static Multiply128LUT[kOrder];
+struct Multiply128LUT_t
+{
+    LEO_M128 Value[2];
+};
+
+static const Multiply128LUT_t* Multiply128LUT = nullptr;

 #if defined(LEO_TRY_AVX2)
-struct {
-    LEO_ALIGNED LEO_M256 Value[2];
-} static Multiply256LUT[kOrder];
+
+struct Multiply256LUT_t
+{
+    LEO_M256 Value[2];
+};
+
+static const Multiply256LUT_t* Multiply256LUT = nullptr;
+
 #endif // LEO_TRY_AVX2

 // Stores the product of x * y at offset x + y * 256
 // Repeated accesses from the same y value are faster
-static ffe_t Multiply8LUT[256 * 256] = {};
+static const ffe_t* Multiply8LUT = nullptr;


 static void InitializeMultiplyTables()
@@ -222,27 +230,38 @@ static void InitializeMultiplyTables()
     // If we cannot use the PSHUFB instruction, generate Multiply8LUT:
     if (!CpuHasSSSE3)
     {
+        Multiply8LUT = new ffe_t[256 * 256];
+
         // For each left-multiplicand:
         for (unsigned x = 0; x < 256; ++x)
         {
-            ffe_t* lut = Multiply8LUT + x;
+            ffe_t* lut = (ffe_t*)Multiply8LUT + x;

-            // Note: Table is already zeroed out so we can skip the zeroes
             if (x == 0)
-                continue;
-
-            const ffe_t log_x = LogLUT[x];
-
-            for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
             {
-                const ffe_t prod = ExpLUT[AddMod(log_x, log_y)];
-                *lut = prod;
+                for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
+                    *lut = 0;
+            }
+            else
+            {
+                const ffe_t log_x = LogLUT[x];
+
+                for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
+                {
+                    const ffe_t prod = ExpLUT[AddMod(log_x, log_y)];
+                    *lut = prod;
+                }
             }
         }

         return;
     }

+    if (CpuHasAVX2)
+        Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
+    else
+        Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
+
     // For each value we could multiply by:
     for (unsigned log_m = 0; log_m < kOrder; ++log_m)
     {
@@ -254,16 +273,18 @@ static void InitializeMultiplyTables()
             for (ffe_t x = 0; x < 16; ++x)
                 lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));

-            // Store in 128-bit wide table
             const LEO_M128 *v_ptr = reinterpret_cast<const LEO_M128 *>(&lut[0]);
             const LEO_M128 value = _mm_loadu_si128(v_ptr);
-            _mm_storeu_si128(&Multiply128LUT[log_m].Value[i], value);
+
+            // Store in 128-bit wide table
+            if (!CpuHasAVX2)
+                _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Value[i], value);

             // Store in 256-bit wide table
 #if defined(LEO_TRY_AVX2)
             if (CpuHasAVX2)
             {
-                _mm256_storeu_si256(&Multiply256LUT[log_m].Value[i],
+                _mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Value[i],
                     _mm256_broadcastsi128_si256(value));
             }
 #endif // LEO_TRY_AVX2
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
index 1c491b1..8215883 100644
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -365,43 +365,6 @@ static void ShuffleDeck16(PCGRandom &prng, uint16_t * LEO_RESTRICT deck, uint32_t
 }


-//------------------------------------------------------------------------------
-// SIMD-Safe Aligned Memory Allocations
-
-static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
-
-LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
-{
-    return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
-}
-
-static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
-{
-    uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
-    if (!data)
-        return nullptr;
-    unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
-    data += kAlignmentBytes - offset;
-    data[-1] = (uint8_t)offset;
-    return data;
-}
-
-static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
-{
-    if (!ptr)
-        return;
-    uint8_t* data = (uint8_t*)ptr;
-    unsigned offset = data[-1];
-    if (offset >= kAlignmentBytes)
-    {
-        LEO_DEBUG_BREAK; // Should never happen
-        return;
-    }
-    data -= kAlignmentBytes - offset;
-    free(data);
-}
-
-
 //------------------------------------------------------------------------------
 // Benchmark

@@ -430,11 +393,11 @@ static bool Benchmark(const TestParameters& params)

         t_mem_alloc.BeginCall();
         for (unsigned i = 0, count = params.original_count; i < count; ++i)
-            original_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+            original_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
         for (unsigned i = 0, count = encode_work_count; i < count; ++i)
-            encode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+            encode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
         for (unsigned i = 0, count = decode_work_count; i < count; ++i)
-            decode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+            decode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
         t_mem_alloc.EndCall();

         // Generate data:
@@ -479,7 +442,7 @@ static bool Benchmark(const TestParameters& params)
         for (unsigned i = 0, count = params.loss_count; i < count; ++i)
         {
             const unsigned loss_index = original_losses[i];
-            SIMDSafeFree(original_data[loss_index]);
+            leopard::SIMDSafeFree(original_data[loss_index]);
             original_data[loss_index] = nullptr;
         }

@@ -493,7 +456,7 @@ static bool Benchmark(const TestParameters& params)
         for (unsigned i = 0, count = recovery_loss_count; i < count; ++i)
         {
             const unsigned loss_index = recovery_losses[i];
-            SIMDSafeFree(encode_work_data[loss_index]);
+            leopard::SIMDSafeFree(encode_work_data[loss_index]);
             encode_work_data[loss_index] = nullptr;
         }

@@ -535,11 +498,11 @@ static bool Benchmark(const TestParameters& params)

         t_mem_free.BeginCall();
         for (unsigned i = 0, count = params.original_count; i < count; ++i)
-            SIMDSafeFree(original_data[i]);
+            leopard::SIMDSafeFree(original_data[i]);
         for (unsigned i = 0, count = encode_work_count; i < count; ++i)
-            SIMDSafeFree(encode_work_data[i]);
+            leopard::SIMDSafeFree(encode_work_data[i]);
         for (unsigned i = 0, count = decode_work_count; i < count; ++i)
-            SIMDSafeFree(decode_work_data[i]);
+            leopard::SIMDSafeFree(decode_work_data[i]);
         t_mem_free.EndCall();
     }
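
Note on the allocator added to LeopardCommon.h: SIMDSafeAllocate over-allocates by kAlignmentBytes, bumps the returned pointer up to the next alignment boundary, and stashes the original misalignment in the byte immediately before the returned pointer; SIMDSafeFree reads that byte back to recover the pointer calloc actually returned. Below is a minimal standalone sketch of the same trick, for reference only (not part of the patch); kAlign = 32 is an assumed stand-in for LEO_ALIGN_BYTES, and AlignedAlloc/AlignedFree are hypothetical names:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static const unsigned kAlign = 32; // assumed stand-in for LEO_ALIGN_BYTES

    static uint8_t* AlignedAlloc(size_t size)
    {
        // Over-allocate so the pointer can be bumped to an aligned address
        uint8_t* raw = (uint8_t*)calloc(1, kAlign + size);
        if (!raw)
            return nullptr;
        // Distance past the previous alignment boundary
        const unsigned offset = (unsigned)((uintptr_t)raw % kAlign);
        uint8_t* aligned = raw + (kAlign - offset);
        aligned[-1] = (uint8_t)offset; // remember the offset for the free path
        return aligned;
    }

    static void AlignedFree(void* ptr)
    {
        if (!ptr)
            return;
        uint8_t* aligned = (uint8_t*)ptr;
        const unsigned offset = aligned[-1];
        free(aligned - (kAlign - offset)); // undo the bump to recover the raw pointer
    }

    int main()
    {
        uint8_t* p = AlignedAlloc(1000);
        printf("aligned to %u bytes: %s\n", kAlign, ((uintptr_t)p % kAlign) == 0 ? "yes" : "no");
        AlignedFree(p);
        return 0;
    }

Because the bump is always at least one byte, the stored offset byte never falls outside the allocation, and the aligned pointer plus the requested size stays within the kAlign + size bytes obtained from calloc.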