Only allocate memory for the multiplication lookup table variant that is actually used

This commit is contained in:
Christopher Taylor 2017-06-03 16:48:09 -07:00
parent 62f9f56555
commit 96bd047a2d
4 changed files with 119 additions and 76 deletions

View File

@ -157,6 +157,7 @@
#include "leopard.h"
#include <stdint.h>
#include <malloc.h>
//------------------------------------------------------------------------------
@ -421,4 +422,41 @@ protected:
};
//------------------------------------------------------------------------------
// SIMD-Safe Aligned Memory Allocations
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;

// Round `offset` up to the next multiple of kAlignmentBytes.
// Returns `offset` unchanged when it is already aligned.
// Assumes kAlignmentBytes is a power of two (mask arithmetic).
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
{
    const unsigned mask = kAlignmentBytes - 1;
    return (offset + mask) & ~mask;
}
// Allocate `size` zero-initialized bytes whose address is aligned to
// kAlignmentBytes. One extra alignment block is over-allocated and the
// byte immediately before the returned pointer records the original
// misalignment so SIMDSafeFree() can recover the calloc() pointer.
// Returns nullptr on allocation failure.
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
{
    uint8_t* const raw = (uint8_t*)calloc(1, kAlignmentBytes + size);
    if (!raw)
        return nullptr;

    // How far past an alignment boundary calloc() landed
    const unsigned misalign = (unsigned)((uintptr_t)raw % kAlignmentBytes);

    // Advance into the padding so the result is aligned, then stash the
    // misalignment in the byte just before the returned pointer.
    uint8_t* const aligned = raw + (kAlignmentBytes - misalign);
    aligned[-1] = (uint8_t)misalign;
    return aligned;
}
// Release memory returned by SIMDSafeAllocate(). Safe to call with
// nullptr. Reads the misalignment byte stored just before `ptr` to
// reconstruct the original calloc() pointer before calling free().
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
{
    if (!ptr)
        return;

    uint8_t* const aligned = (uint8_t*)ptr;
    const unsigned misalign = aligned[-1];

    // A stored value >= alignment means the header byte is corrupt or
    // `ptr` did not come from SIMDSafeAllocate() -- do not free.
    if (misalign >= kAlignmentBytes)
    {
        LEO_DEBUG_BREAK; // Should never happen
        return;
    }

    free(aligned - (kAlignmentBytes - misalign));
}
} // namespace leopard

View File

@ -199,20 +199,38 @@ static void InitializeLogarithmTables()
The ALTMAP memory layout is used since there is no need to convert in/out.
*/
struct {
LEO_ALIGNED LEO_M128 Lo[4];
LEO_ALIGNED LEO_M128 Hi[4];
} static Multiply128LUT[kOrder];
// Per-log_m SSSE3 multiply lookup tables, split by low/high input nibble.
// Formerly a static array of kOrder entries; now a pointer so the table is
// only allocated (via SIMDSafeAllocate in InitializeMultiplyTables) when
// the SSSE3 path is actually selected.
struct Multiply128LUT_t
{
LEO_M128 Lo[4];
LEO_M128 Hi[4];
};

// nullptr until InitializeMultiplyTables() allocates it (non-AVX2 path only).
static const Multiply128LUT_t* Multiply128LUT = nullptr;
#if defined(LEO_TRY_AVX2)
struct {
LEO_ALIGNED LEO_M256 Lo[4];
LEO_ALIGNED LEO_M256 Hi[4];
} static Multiply256LUT[kOrder];
// AVX2 (256-bit) variant of the multiply lookup table; each half of the
// 256-bit lane holds a broadcast copy of the 128-bit table entry.
struct Multiply256LUT_t
{
LEO_M256 Lo[4];
LEO_M256 Hi[4];
};

// nullptr until InitializeMultiplyTables() allocates it (AVX2 path only).
static const Multiply256LUT_t* Multiply256LUT = nullptr;
#endif // LEO_TRY_AVX2
void InitializeMultiplyTables()
{
// If we cannot use the PSHUFB instruction, generate Multiply8LUT:
if (!CpuHasSSSE3)
return;
if (CpuHasAVX2)
Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
else
Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
// For each value we could multiply by:
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
{
@ -232,16 +250,19 @@ void InitializeMultiplyTables()
const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi);
// Store in 128-bit wide table
_mm_storeu_si128(&Multiply128LUT[log_m].Lo[i], value_lo);
_mm_storeu_si128(&Multiply128LUT[log_m].Hi[i], value_hi);
if (!CpuHasAVX2)
{
_mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Lo[i], value_lo);
_mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Hi[i], value_hi);
}
// Store in 256-bit wide table
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
_mm256_storeu_si256(&Multiply256LUT[log_m].Lo[i],
_mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Lo[i],
_mm256_broadcastsi128_si256(value_lo));
_mm256_storeu_si256(&Multiply256LUT[log_m].Hi[i],
_mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Hi[i],
_mm256_broadcastsi128_si256(value_hi));
}
#endif // LEO_TRY_AVX2

View File

@ -202,19 +202,27 @@ static void InitializeLogarithmTables()
Specifically section 6 outlines the algorithm used here for 8-bit fields.
*/
struct {
LEO_ALIGNED LEO_M128 Value[2];
} static Multiply128LUT[kOrder];
// Per-log_m SSSE3 multiply lookup table for the 8-bit field.
// Allocated on demand in InitializeMultiplyTables() instead of being a
// static kOrder-sized array, so memory is only used when SSSE3 is chosen.
struct Multiply128LUT_t
{
LEO_M128 Value[2];
};

// nullptr until InitializeMultiplyTables() allocates it (non-AVX2 path only).
static const Multiply128LUT_t* Multiply128LUT = nullptr;
#if defined(LEO_TRY_AVX2)
struct {
LEO_ALIGNED LEO_M256 Value[2];
} static Multiply256LUT[kOrder];
// AVX2 (256-bit) variant for the 8-bit field; each 256-bit entry holds a
// broadcast copy of the corresponding 128-bit table value.
struct Multiply256LUT_t
{
LEO_M256 Value[2];
};

// nullptr until InitializeMultiplyTables() allocates it (AVX2 path only).
static const Multiply256LUT_t* Multiply256LUT = nullptr;
#endif // LEO_TRY_AVX2
// Stores the product of x * y at offset x + y * 256
// Repeated accesses from the same y value are faster
static ffe_t Multiply8LUT[256 * 256] = {};
static const ffe_t* Multiply8LUT = nullptr;
static void InitializeMultiplyTables()
@ -222,27 +230,38 @@ static void InitializeMultiplyTables()
// If we cannot use the PSHUFB instruction, generate Multiply8LUT:
if (!CpuHasSSSE3)
{
Multiply8LUT = new ffe_t[256 * 256];
// For each left-multiplicand:
for (unsigned x = 0; x < 256; ++x)
{
ffe_t* lut = Multiply8LUT + x;
ffe_t* lut = (ffe_t*)Multiply8LUT + x;
// Note: Table is already zeroed out so we can skip the zeroes
if (x == 0)
continue;
const ffe_t log_x = LogLUT[x];
for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
{
const ffe_t prod = ExpLUT[AddMod(log_x, log_y)];
*lut = prod;
for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
*lut = 0;
}
else
{
const ffe_t log_x = LogLUT[x];
for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
{
const ffe_t prod = ExpLUT[AddMod(log_x, log_y)];
*lut = prod;
}
}
}
return;
}
if (CpuHasAVX2)
Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
else
Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
// For each value we could multiply by:
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
{
@ -254,16 +273,18 @@ static void InitializeMultiplyTables()
for (ffe_t x = 0; x < 16; ++x)
lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));
// Store in 128-bit wide table
const LEO_M128 *v_ptr = reinterpret_cast<const LEO_M128 *>(&lut[0]);
const LEO_M128 value = _mm_loadu_si128(v_ptr);
_mm_storeu_si128(&Multiply128LUT[log_m].Value[i], value);
// Store in 128-bit wide table
if (!CpuHasAVX2)
_mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Value[i], value);
// Store in 256-bit wide table
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
_mm256_storeu_si256(&Multiply256LUT[log_m].Value[i],
_mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Value[i],
_mm256_broadcastsi128_si256(value));
}
#endif // LEO_TRY_AVX2

View File

@ -365,43 +365,6 @@ static void ShuffleDeck16(PCGRandom &prng, uint16_t * LEO_RESTRICT deck, uint32_
}
//------------------------------------------------------------------------------
// SIMD-Safe Aligned Memory Allocations
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
{
return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
}
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
{
uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
if (!data)
return nullptr;
unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
data += kAlignmentBytes - offset;
data[-1] = (uint8_t)offset;
return data;
}
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
{
if (!ptr)
return;
uint8_t* data = (uint8_t*)ptr;
unsigned offset = data[-1];
if (offset >= kAlignmentBytes)
{
LEO_DEBUG_BREAK; // Should never happen
return;
}
data -= kAlignmentBytes - offset;
free(data);
}
//------------------------------------------------------------------------------
// Benchmark
@ -430,11 +393,11 @@ static bool Benchmark(const TestParameters& params)
t_mem_alloc.BeginCall();
for (unsigned i = 0, count = params.original_count; i < count; ++i)
original_data[i] = SIMDSafeAllocate(params.buffer_bytes);
original_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
encode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
encode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
decode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
decode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
t_mem_alloc.EndCall();
// Generate data:
@ -479,7 +442,7 @@ static bool Benchmark(const TestParameters& params)
for (unsigned i = 0, count = params.loss_count; i < count; ++i)
{
const unsigned loss_index = original_losses[i];
SIMDSafeFree(original_data[loss_index]);
leopard::SIMDSafeFree(original_data[loss_index]);
original_data[loss_index] = nullptr;
}
@ -493,7 +456,7 @@ static bool Benchmark(const TestParameters& params)
for (unsigned i = 0, count = recovery_loss_count; i < count; ++i)
{
const unsigned loss_index = recovery_losses[i];
SIMDSafeFree(encode_work_data[loss_index]);
leopard::SIMDSafeFree(encode_work_data[loss_index]);
encode_work_data[loss_index] = nullptr;
}
@ -535,11 +498,11 @@ static bool Benchmark(const TestParameters& params)
t_mem_free.BeginCall();
for (unsigned i = 0, count = params.original_count; i < count; ++i)
SIMDSafeFree(original_data[i]);
leopard::SIMDSafeFree(original_data[i]);
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
SIMDSafeFree(encode_work_data[i]);
leopard::SIMDSafeFree(encode_work_data[i]);
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
SIMDSafeFree(decode_work_data[i]);
leopard::SIMDSafeFree(decode_work_data[i]);
t_mem_free.EndCall();
}