Only allocate memory for mul table used

Author: Christopher Taylor
Date: 2017-06-03 16:48:09 -07:00
Parent: 62f9f56555
Commit: 96bd047a2d

4 changed files with 119 additions and 76 deletions

File 1 of 4

@@ -157,6 +157,7 @@
 #include "leopard.h"
 #include <stdint.h>
+#include <malloc.h>

 //------------------------------------------------------------------------------
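One portability note on the new include: the C standard declares calloc and free in <stdlib.h>, while <malloc.h> is a non-standard header that is missing on some platforms (macOS, for instance, ships <malloc/malloc.h> instead). Assuming nothing else from <malloc.h> is needed, the portable spelling for the allocator added below would be:

    #include <stdlib.h> // calloc and free live here per the C standard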
@@ -421,4 +422,41 @@ protected:
 };
+
+
+//------------------------------------------------------------------------------
+// SIMD-Safe Aligned Memory Allocations
+
+static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
+
+LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
+{
+    return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
+}
+
+static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
+{
+    uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
+    if (!data)
+        return nullptr;
+    unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
+    data += kAlignmentBytes - offset;
+    data[-1] = (uint8_t)offset;
+    return data;
+}
+
+static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
+{
+    if (!ptr)
+        return;
+    uint8_t* data = (uint8_t*)ptr;
+    unsigned offset = data[-1];
+    if (offset >= kAlignmentBytes)
+    {
+        LEO_DEBUG_BREAK; // Should never happen
+        return;
+    }
+    data -= kAlignmentBytes - offset;
+    free(data);
+}
+
 } // namespace leopard
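The trick here is compact enough to deserve a note: SIMDSafeAllocate over-allocates by kAlignmentBytes, advances the returned pointer to the next alignment boundary, and stashes the original misalignment in the byte just before that pointer, which is how SIMDSafeFree later recovers the address calloc returned. NextAlignedOffset is the usual power-of-two round-up; for example, with 32-byte alignment, NextAlignedOffset(37) = (37 + 31) & ~31 = 64. A minimal usage sketch, assuming LEO_ALIGN_BYTES is a power of two such as 32:

    #include <cassert>
    #include <cstdint>

    void Example()
    {
        uint8_t* buffer = leopard::SIMDSafeAllocate(4096);
        assert(buffer != nullptr);
        // The returned pointer is safe for aligned SIMD loads/stores:
        assert(((uintptr_t)buffer % LEO_ALIGN_BYTES) == 0);
        // ... fill and process the buffer ...
        leopard::SIMDSafeFree(buffer); // recovers and frees the original block
    }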

File 2 of 4

@@ -199,20 +199,38 @@ static void InitializeLogarithmTables()
     The ALTMAP memory layout is used since there is no need to convert in/out.
 */

-struct {
-    LEO_ALIGNED LEO_M128 Lo[4];
-    LEO_ALIGNED LEO_M128 Hi[4];
-} static Multiply128LUT[kOrder];
+struct Multiply128LUT_t
+{
+    LEO_M128 Lo[4];
+    LEO_M128 Hi[4];
+};
+
+static const Multiply128LUT_t* Multiply128LUT = nullptr;

 #if defined(LEO_TRY_AVX2)
-struct {
-    LEO_ALIGNED LEO_M256 Lo[4];
-    LEO_ALIGNED LEO_M256 Hi[4];
-} static Multiply256LUT[kOrder];
+
+struct Multiply256LUT_t
+{
+    LEO_M256 Lo[4];
+    LEO_M256 Hi[4];
+};
+
+static const Multiply256LUT_t* Multiply256LUT = nullptr;
+
 #endif // LEO_TRY_AVX2


 void InitializeMultiplyTables()
 {
+    // If we cannot use the PSHUFB instruction, generate Multiply8LUT:
+    if (!CpuHasSSSE3)
+        return;
+
+    if (CpuHasAVX2)
+        Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
+    else
+        Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
+
     // For each value we could multiply by:
     for (unsigned log_m = 0; log_m < kOrder; ++log_m)
     {
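To put rough numbers on the savings, assuming kOrder = 65536 for this 16-bit field (an assumption, not stated in the diff): one Multiply128LUT_t holds 8 vectors of 16 bytes, so 128 bytes per entry and 65536 × 128 B = 8 MiB for the whole table; the 256-bit variant is twice that, 16 MiB. The old code reserved both static arrays unconditionally on AVX2-capable builds (24 MiB combined), while the new code heap-allocates exactly one of them, and nothing at all when SSSE3 is unavailable.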
@@ -232,16 +250,19 @@ void InitializeMultiplyTables()
     const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi);

     // Store in 128-bit wide table
-    _mm_storeu_si128(&Multiply128LUT[log_m].Lo[i], value_lo);
-    _mm_storeu_si128(&Multiply128LUT[log_m].Hi[i], value_hi);
+    if (!CpuHasAVX2)
+    {
+        _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Lo[i], value_lo);
+        _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Hi[i], value_hi);
+    }

     // Store in 256-bit wide table
 #if defined(LEO_TRY_AVX2)
     if (CpuHasAVX2)
     {
-        _mm256_storeu_si256(&Multiply256LUT[log_m].Lo[i],
+        _mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Lo[i],
             _mm256_broadcastsi128_si256(value_lo));
-        _mm256_storeu_si256(&Multiply256LUT[log_m].Hi[i],
+        _mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Hi[i],
             _mm256_broadcastsi128_si256(value_hi));
     }
 #endif // LEO_TRY_AVX2
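Why the 256-bit stores go through _mm256_broadcastsi128_si256: AVX2's _mm256_shuffle_epi8 performs two independent 16-entry lookups, one per 128-bit lane, so each lane must hold its own copy of the 16-byte table. A sketch of the lookup side under that assumption (the helper name and arguments are illustrative, not part of the commit):

    #include <immintrin.h>

    // Both lanes of 'table' carry the same 16 bytes, so the per-lane
    // shuffle behaves like a single table lookup across all 32 bytes.
    static inline __m256i LookupNibbles(__m128i lut16, __m256i nibbles)
    {
        const __m256i table = _mm256_broadcastsi128_si256(lut16); // [LUT | LUT]
        return _mm256_shuffle_epi8(table, nibbles);               // per-lane PSHUFB
    }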

File 3 of 4

@@ -202,19 +202,27 @@ static void InitializeLogarithmTables()
     Specifically section 6 outlines the algorithm used here for 8-bit fields.
 */

-struct {
-    LEO_ALIGNED LEO_M128 Value[2];
-} static Multiply128LUT[kOrder];
+struct Multiply128LUT_t
+{
+    LEO_M128 Value[2];
+};
+
+static const Multiply128LUT_t* Multiply128LUT = nullptr;

 #if defined(LEO_TRY_AVX2)
-struct {
-    LEO_ALIGNED LEO_M256 Value[2];
-} static Multiply256LUT[kOrder];
+
+struct Multiply256LUT_t
+{
+    LEO_M256 Value[2];
+};
+
+static const Multiply256LUT_t* Multiply256LUT = nullptr;
+
 #endif // LEO_TRY_AVX2

 // Stores the product of x * y at offset x + y * 256
 // Repeated accesses from the same y value are faster
-static ffe_t Multiply8LUT[256 * 256] = {};
+static const ffe_t* Multiply8LUT = nullptr;

 static void InitializeMultiplyTables()
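The layout comment above is worth unpacking: with the product of x * y stored at offset x + y * 256, all 256 products for a fixed right-multiplicand y sit in one contiguous 256-byte row, which is why repeated accesses with the same y are fast. A sketch of how such a table would be consumed (the helper name is hypothetical; ffe_t and Multiply8LUT are from this file):

    // Hypothetical helper: multiply every symbol of a region by the constant y.
    static void MultiplyRegionByY(ffe_t* data, unsigned bytes, ffe_t y)
    {
        const ffe_t* row = Multiply8LUT + (unsigned)y * 256; // one cache-resident row
        for (unsigned i = 0; i < bytes; ++i)
            data[i] = row[data[i]];
    }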
@@ -222,15 +230,20 @@ static void InitializeMultiplyTables()
     // If we cannot use the PSHUFB instruction, generate Multiply8LUT:
     if (!CpuHasSSSE3)
     {
+        Multiply8LUT = new ffe_t[256 * 256];
+
         // For each left-multiplicand:
         for (unsigned x = 0; x < 256; ++x)
         {
-            ffe_t* lut = Multiply8LUT + x;
+            ffe_t* lut = (ffe_t*)Multiply8LUT + x;

-            // Note: Table is already zeroed out so we can skip the zeroes
             if (x == 0)
-                continue;
+            {
+                for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
+                    *lut = 0;
+            }
+            else
+            {
                 const ffe_t log_x = LogLUT[x];

                 for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
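The new x == 0 branch follows directly from the allocation change: the old static ffe_t Multiply8LUT[256 * 256] = {} lived in zero-initialized storage, so the zero products came for free, whereas new ffe_t[256 * 256] returns uninitialized memory and the zero row must now be written by hand. A minimal illustration of the C++ distinction in play:

    #include <cstdint>

    typedef uint8_t ffe_t;

    void InitExample()
    {
        ffe_t* a = new ffe_t[256 * 256];   // default-initialized: contents indeterminate
        ffe_t* b = new ffe_t[256 * 256](); // value-initialized: all elements zero
        delete[] a;
        delete[] b;
    }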
@@ -239,10 +252,16 @@ static void InitializeMultiplyTables()
                     *lut = prod;
                 }
             }
+        }

         return;
     }

+    if (CpuHasAVX2)
+        Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
+    else
+        Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
+
     // For each value we could multiply by:
     for (unsigned log_m = 0; log_m < kOrder; ++log_m)
     {
@@ -254,16 +273,18 @@ static void InitializeMultiplyTables()
         for (ffe_t x = 0; x < 16; ++x)
             lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));

-        // Store in 128-bit wide table
         const LEO_M128 *v_ptr = reinterpret_cast<const LEO_M128 *>(&lut[0]);
         const LEO_M128 value = _mm_loadu_si128(v_ptr);
-        _mm_storeu_si128(&Multiply128LUT[log_m].Value[i], value);

+        // Store in 128-bit wide table
+        if (!CpuHasAVX2)
+            _mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Value[i], value);
+
         // Store in 256-bit wide table
 #if defined(LEO_TRY_AVX2)
         if (CpuHasAVX2)
         {
-            _mm256_storeu_si256(&Multiply256LUT[log_m].Value[i],
+            _mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Value[i],
                 _mm256_broadcastsi128_si256(value));
         }
 #endif // LEO_TRY_AVX2
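For context on how the Value[2] pair gets used at multiply time: the loop above fills Value[0] with products of the low-nibble values (shift = 0) and Value[1] with products of the high-nibble values (shift = 4), and the classic PSHUFB technique then splits each input byte into two nibbles and XORs two lookups, since x = (hi << 4) ^ lo and multiplication distributes over GF addition (XOR). A sketch under those assumptions; the function name is illustrative, not from the commit:

    #include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8
    #include <cstdint>

    // Multiply 16 GF(2^8) symbols in 'x' by the constant whose nibble
    // product tables are in 'lut' (lut[0] = low nibble, lut[1] = high).
    static inline __m128i MultiplyBytes(__m128i x, const __m128i lut[2])
    {
        const __m128i mask = _mm_set1_epi8(0x0f);
        const __m128i lo = _mm_and_si128(x, mask);                    // x & 0x0f
        const __m128i hi = _mm_and_si128(_mm_srli_epi64(x, 4), mask); // x >> 4
        // m * x = m * (hi << 4) ^ m * lo, because GF addition is XOR
        return _mm_xor_si128(_mm_shuffle_epi8(lut[0], lo),
                             _mm_shuffle_epi8(lut[1], hi));
    }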

File 4 of 4

@@ -365,43 +365,6 @@ static void ShuffleDeck16(PCGRandom &prng, uint16_t * LEO_RESTRICT deck, uint32_
 }

-//------------------------------------------------------------------------------
-// SIMD-Safe Aligned Memory Allocations
-
-static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
-
-LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
-{
-    return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
-}
-
-static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
-{
-    uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
-    if (!data)
-        return nullptr;
-    unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
-    data += kAlignmentBytes - offset;
-    data[-1] = (uint8_t)offset;
-    return data;
-}
-
-static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
-{
-    if (!ptr)
-        return;
-    uint8_t* data = (uint8_t*)ptr;
-    unsigned offset = data[-1];
-    if (offset >= kAlignmentBytes)
-    {
-        LEO_DEBUG_BREAK; // Should never happen
-        return;
-    }
-    data -= kAlignmentBytes - offset;
-    free(data);
-}
-
 //------------------------------------------------------------------------------
 // Benchmark
@@ -430,11 +393,11 @@ static bool Benchmark(const TestParameters& params)
     t_mem_alloc.BeginCall();
     for (unsigned i = 0, count = params.original_count; i < count; ++i)
-        original_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+        original_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
     for (unsigned i = 0, count = encode_work_count; i < count; ++i)
-        encode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+        encode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
     for (unsigned i = 0, count = decode_work_count; i < count; ++i)
-        decode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+        decode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
     t_mem_alloc.EndCall();

     // Generate data:
@@ -479,7 +442,7 @@ static bool Benchmark(const TestParameters& params)
     for (unsigned i = 0, count = params.loss_count; i < count; ++i)
     {
         const unsigned loss_index = original_losses[i];
-        SIMDSafeFree(original_data[loss_index]);
+        leopard::SIMDSafeFree(original_data[loss_index]);
         original_data[loss_index] = nullptr;
     }
@@ -493,7 +456,7 @@ static bool Benchmark(const TestParameters& params)
     for (unsigned i = 0, count = recovery_loss_count; i < count; ++i)
     {
         const unsigned loss_index = recovery_losses[i];
-        SIMDSafeFree(encode_work_data[loss_index]);
+        leopard::SIMDSafeFree(encode_work_data[loss_index]);
         encode_work_data[loss_index] = nullptr;
     }
@@ -535,11 +498,11 @@ static bool Benchmark(const TestParameters& params)
     t_mem_free.BeginCall();
     for (unsigned i = 0, count = params.original_count; i < count; ++i)
-        SIMDSafeFree(original_data[i]);
+        leopard::SIMDSafeFree(original_data[i]);
     for (unsigned i = 0, count = encode_work_count; i < count; ++i)
-        SIMDSafeFree(encode_work_data[i]);
+        leopard::SIMDSafeFree(encode_work_data[i]);
     for (unsigned i = 0, count = decode_work_count; i < count; ++i)
-        SIMDSafeFree(decode_work_data[i]);
+        leopard::SIMDSafeFree(decode_work_data[i]);
     t_mem_free.EndCall();
 }