mirror of
https://github.com/status-im/leopard.git
synced 2025-02-20 09:48:23 +00:00
Only allocate memory for mul table used
This commit is contained in:
parent
62f9f56555
commit
96bd047a2d
@ -157,6 +157,7 @@
|
||||
#include "leopard.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <malloc.h>
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
@ -421,4 +422,41 @@ protected:
|
||||
};
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// SIMD-Safe Aligned Memory Allocations
|
||||
|
||||
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
|
||||
|
||||
// Rounds `offset` up to the nearest multiple of kAlignmentBytes.
// The mask arithmetic only produces a multiple when kAlignmentBytes is a
// power of two.
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
{
    const unsigned alignMask = kAlignmentBytes - 1;
    return (offset + alignMask) & ~alignMask;
}
|
||||
|
||||
// Allocates `size` zero-initialized bytes starting at an address that is a
// multiple of kAlignmentBytes.
// Returns nullptr when the allocation fails or when `size` is so large that
// the padded request would not fit in size_t.
// The returned pointer must be released with SIMDSafeFree(), not free().
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
{
    // kAlignmentBytes of slack are needed for the alignment shift below.
    // Reject requests where adding the slack wraps around, because calloc()
    // would then hand back a block smaller than the caller asked for.
    const size_t paddedSize = size + kAlignmentBytes;
    if (paddedSize < size)
        return nullptr;

    uint8_t* data = (uint8_t*)calloc(1, paddedSize);
    if (!data)
        return nullptr;

    // Advance by 1..kAlignmentBytes bytes: even an already-aligned block is
    // shifted so that one byte in front of the result is always available.
    unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
    data += kAlignmentBytes - offset;

    // Record the original misalignment so SIMDSafeFree() can recover the
    // pointer that calloc() returned.
    data[-1] = (uint8_t)offset;
    return data;
}
|
||||
|
||||
// Releases a buffer obtained from SIMDSafeAllocate().
// Passing nullptr is a no-op.
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
{
    if (ptr)
    {
        uint8_t* aligned = (uint8_t*)ptr;

        // The byte in front of the buffer holds the misalignment value that
        // SIMDSafeAllocate() stored there.
        const unsigned misalignment = aligned[-1];

        if (misalignment < kAlignmentBytes)
        {
            // Undo the shift applied at allocation time and free the
            // original block.
            free(aligned - (kAlignmentBytes - misalignment));
        }
        else
        {
            LEO_DEBUG_BREAK; // Should never happen
        }
    }
}
|
||||
|
||||
|
||||
} // namespace leopard
|
||||
|
@ -199,20 +199,38 @@ static void InitializeLogarithmTables()
|
||||
The ALTMAP memory layout is used since there is no need to convert in/out.
|
||||
*/
|
||||
|
||||
// Statically allocated 128-bit multiply table with kOrder entries.
// Each entry carries four LEO_M128 "Lo" vectors and four LEO_M128 "Hi" vectors.
struct {
|
||||
LEO_ALIGNED LEO_M128 Lo[4];
|
||||
LEO_ALIGNED LEO_M128 Hi[4];
|
||||
} static Multiply128LUT[kOrder];
|
||||
// One entry of the 128-bit multiply lookup table.
// InitializeMultiplyTables() indexes the table by log_m and stores value_lo
// into Lo[i] and value_hi into Hi[i].
struct Multiply128LUT_t
|
||||
{
|
||||
LEO_M128 Lo[4];
|
||||
LEO_M128 Hi[4];
|
||||
};
|
||||
|
||||
// Table of kOrder entries. InitializeMultiplyTables() allocates it with
// SIMDSafeAllocate() only when SSSE3 is available and AVX2 is not; otherwise
// it stays nullptr.
static const Multiply128LUT_t* Multiply128LUT = nullptr;
|
||||
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
// Statically allocated 256-bit multiply table with kOrder entries.
// Each entry carries four LEO_M256 "Lo" vectors and four LEO_M256 "Hi" vectors.
// Only compiled when LEO_TRY_AVX2 is defined.
struct {
|
||||
LEO_ALIGNED LEO_M256 Lo[4];
|
||||
LEO_ALIGNED LEO_M256 Hi[4];
|
||||
} static Multiply256LUT[kOrder];
|
||||
|
||||
// One entry of the 256-bit multiply lookup table (LEO_TRY_AVX2 builds only).
// InitializeMultiplyTables() indexes the table by log_m and stores
// _mm256_broadcastsi128_si256(value_lo) into Lo[i] and
// _mm256_broadcastsi128_si256(value_hi) into Hi[i].
struct Multiply256LUT_t
|
||||
{
|
||||
LEO_M256 Lo[4];
|
||||
LEO_M256 Hi[4];
|
||||
};
|
||||
|
||||
// Table of kOrder entries. InitializeMultiplyTables() allocates it with
// SIMDSafeAllocate() only when CpuHasAVX2 is set; otherwise it stays nullptr.
static const Multiply256LUT_t* Multiply256LUT = nullptr;
|
||||
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
|
||||
void InitializeMultiplyTables()
|
||||
{
|
||||
// If we cannot use the PSHUFB instruction, there are no SIMD tables to build:
|
||||
if (!CpuHasSSSE3)
|
||||
return;
|
||||
|
||||
if (CpuHasAVX2)
|
||||
Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
|
||||
else
|
||||
Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
|
||||
|
||||
// For each value we could multiply by:
|
||||
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
|
||||
{
|
||||
@ -232,16 +250,19 @@ void InitializeMultiplyTables()
|
||||
const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi);
|
||||
|
||||
// Store in 128-bit wide table
|
||||
_mm_storeu_si128(&Multiply128LUT[log_m].Lo[i], value_lo);
|
||||
_mm_storeu_si128(&Multiply128LUT[log_m].Hi[i], value_hi);
|
||||
if (!CpuHasAVX2)
|
||||
{
|
||||
_mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Lo[i], value_lo);
|
||||
_mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Hi[i], value_hi);
|
||||
}
|
||||
|
||||
// Store in 256-bit wide table
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
_mm256_storeu_si256(&Multiply256LUT[log_m].Lo[i],
|
||||
_mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Lo[i],
|
||||
_mm256_broadcastsi128_si256(value_lo));
|
||||
_mm256_storeu_si256(&Multiply256LUT[log_m].Hi[i],
|
||||
_mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Hi[i],
|
||||
_mm256_broadcastsi128_si256(value_hi));
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
@ -202,19 +202,27 @@ static void InitializeLogarithmTables()
|
||||
Specifically section 6 outlines the algorithm used here for 8-bit fields.
|
||||
*/
|
||||
|
||||
// Statically allocated 128-bit multiply table with kOrder entries.
// Each entry carries two LEO_M128 vectors.
struct {
|
||||
LEO_ALIGNED LEO_M128 Value[2];
|
||||
} static Multiply128LUT[kOrder];
|
||||
// One entry of the 128-bit multiply lookup table.
// InitializeMultiplyTables() indexes the table by log_m and stores into
// Value[i] the vector loaded from its local lut[] array, whose elements are
// MultiplyLog(x << shift, log_m) for x in [0, 16).
struct Multiply128LUT_t
|
||||
{
|
||||
LEO_M128 Value[2];
|
||||
};
|
||||
|
||||
// Table of kOrder entries. InitializeMultiplyTables() allocates it with
// SIMDSafeAllocate() only when SSSE3 is available and AVX2 is not; otherwise
// it stays nullptr.
static const Multiply128LUT_t* Multiply128LUT = nullptr;
|
||||
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
// Statically allocated 256-bit multiply table with kOrder entries.
// Each entry carries two LEO_M256 vectors.
// Only compiled when LEO_TRY_AVX2 is defined.
struct {
|
||||
LEO_ALIGNED LEO_M256 Value[2];
|
||||
} static Multiply256LUT[kOrder];
|
||||
|
||||
// One entry of the 256-bit multiply lookup table (LEO_TRY_AVX2 builds only).
// InitializeMultiplyTables() indexes the table by log_m and stores
// _mm256_broadcastsi128_si256(value) into Value[i].
struct Multiply256LUT_t
|
||||
{
|
||||
LEO_M256 Value[2];
|
||||
};
|
||||
|
||||
// Table of kOrder entries. InitializeMultiplyTables() allocates it with
// SIMDSafeAllocate() only when CpuHasAVX2 is set; otherwise it stays nullptr.
static const Multiply256LUT_t* Multiply256LUT = nullptr;
|
||||
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
// Stores the product of x * y at offset x + y * 256
|
||||
// Repeated accesses from the same y value are faster
|
||||
// Statically allocated form: 256 * 256 zero-initialized entries.
static ffe_t Multiply8LUT[256 * 256] = {};
|
||||
// Pointer form: InitializeMultiplyTables() allocates 256 * 256 entries with
// new[] only when CpuHasSSSE3 is false; otherwise it stays nullptr.
static const ffe_t* Multiply8LUT = nullptr;
|
||||
|
||||
|
||||
static void InitializeMultiplyTables()
|
||||
@ -222,27 +230,38 @@ static void InitializeMultiplyTables()
|
||||
// If we cannot use the PSHUFB instruction, generate Multiply8LUT:
|
||||
if (!CpuHasSSSE3)
|
||||
{
|
||||
Multiply8LUT = new ffe_t[256 * 256];
|
||||
|
||||
// For each left-multiplicand:
|
||||
for (unsigned x = 0; x < 256; ++x)
|
||||
{
|
||||
ffe_t* lut = Multiply8LUT + x;
|
||||
ffe_t* lut = (ffe_t*)Multiply8LUT + x;
|
||||
|
||||
// Note: Table is already zeroed out so we can skip the zeroes
|
||||
if (x == 0)
|
||||
continue;
|
||||
|
||||
const ffe_t log_x = LogLUT[x];
|
||||
|
||||
for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
|
||||
{
|
||||
const ffe_t prod = ExpLUT[AddMod(log_x, log_y)];
|
||||
*lut = prod;
|
||||
for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
|
||||
*lut = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
const ffe_t log_x = LogLUT[x];
|
||||
|
||||
for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
|
||||
{
|
||||
const ffe_t prod = ExpLUT[AddMod(log_x, log_y)];
|
||||
*lut = prod;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (CpuHasAVX2)
|
||||
Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
|
||||
else
|
||||
Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
|
||||
|
||||
// For each value we could multiply by:
|
||||
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
|
||||
{
|
||||
@ -254,16 +273,18 @@ static void InitializeMultiplyTables()
|
||||
for (ffe_t x = 0; x < 16; ++x)
|
||||
lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));
|
||||
|
||||
// Store in 128-bit wide table
|
||||
const LEO_M128 *v_ptr = reinterpret_cast<const LEO_M128 *>(&lut[0]);
|
||||
const LEO_M128 value = _mm_loadu_si128(v_ptr);
|
||||
_mm_storeu_si128(&Multiply128LUT[log_m].Value[i], value);
|
||||
|
||||
// Store in 128-bit wide table
|
||||
if (!CpuHasAVX2)
|
||||
_mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Value[i], value);
|
||||
|
||||
// Store in 256-bit wide table
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
_mm256_storeu_si256(&Multiply256LUT[log_m].Value[i],
|
||||
_mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Value[i],
|
||||
_mm256_broadcastsi128_si256(value));
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
@ -365,43 +365,6 @@ static void ShuffleDeck16(PCGRandom &prng, uint16_t * LEO_RESTRICT deck, uint32_
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// SIMD-Safe Aligned Memory Allocations
|
||||
|
||||
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
|
||||
|
||||
// Rounds `offset` up to the nearest multiple of kAlignmentBytes.
// The mask arithmetic only produces a multiple when kAlignmentBytes is a
// power of two.
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
{
    const unsigned alignMask = kAlignmentBytes - 1;
    return (offset + alignMask) & ~alignMask;
}
|
||||
|
||||
// Allocates `size` zero-initialized bytes starting at an address that is a
// multiple of kAlignmentBytes.
// Returns nullptr when the allocation fails or when `size` is so large that
// the padded request would not fit in size_t.
// The returned pointer must be released with SIMDSafeFree(), not free().
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
{
    // kAlignmentBytes of slack are needed for the alignment shift below.
    // Reject requests where adding the slack wraps around, because calloc()
    // would then hand back a block smaller than the caller asked for.
    const size_t paddedSize = size + kAlignmentBytes;
    if (paddedSize < size)
        return nullptr;

    uint8_t* data = (uint8_t*)calloc(1, paddedSize);
    if (!data)
        return nullptr;

    // Advance by 1..kAlignmentBytes bytes: even an already-aligned block is
    // shifted so that one byte in front of the result is always available.
    unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
    data += kAlignmentBytes - offset;

    // Record the original misalignment so SIMDSafeFree() can recover the
    // pointer that calloc() returned.
    data[-1] = (uint8_t)offset;
    return data;
}
|
||||
|
||||
// Releases a buffer obtained from SIMDSafeAllocate().
// Passing nullptr is a no-op.
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
{
    if (ptr)
    {
        uint8_t* aligned = (uint8_t*)ptr;

        // The byte in front of the buffer holds the misalignment value that
        // SIMDSafeAllocate() stored there.
        const unsigned misalignment = aligned[-1];

        if (misalignment < kAlignmentBytes)
        {
            // Undo the shift applied at allocation time and free the
            // original block.
            free(aligned - (kAlignmentBytes - misalignment));
        }
        else
        {
            LEO_DEBUG_BREAK; // Should never happen
        }
    }
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Benchmark
|
||||
|
||||
@ -430,11 +393,11 @@ static bool Benchmark(const TestParameters& params)
|
||||
|
||||
t_mem_alloc.BeginCall();
|
||||
for (unsigned i = 0, count = params.original_count; i < count; ++i)
|
||||
original_data[i] = SIMDSafeAllocate(params.buffer_bytes);
|
||||
original_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
|
||||
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
|
||||
encode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
|
||||
encode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
|
||||
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
|
||||
decode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
|
||||
decode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
|
||||
t_mem_alloc.EndCall();
|
||||
|
||||
// Generate data:
|
||||
@ -479,7 +442,7 @@ static bool Benchmark(const TestParameters& params)
|
||||
for (unsigned i = 0, count = params.loss_count; i < count; ++i)
|
||||
{
|
||||
const unsigned loss_index = original_losses[i];
|
||||
SIMDSafeFree(original_data[loss_index]);
|
||||
leopard::SIMDSafeFree(original_data[loss_index]);
|
||||
original_data[loss_index] = nullptr;
|
||||
}
|
||||
|
||||
@ -493,7 +456,7 @@ static bool Benchmark(const TestParameters& params)
|
||||
for (unsigned i = 0, count = recovery_loss_count; i < count; ++i)
|
||||
{
|
||||
const unsigned loss_index = recovery_losses[i];
|
||||
SIMDSafeFree(encode_work_data[loss_index]);
|
||||
leopard::SIMDSafeFree(encode_work_data[loss_index]);
|
||||
encode_work_data[loss_index] = nullptr;
|
||||
}
|
||||
|
||||
@ -535,11 +498,11 @@ static bool Benchmark(const TestParameters& params)
|
||||
|
||||
t_mem_free.BeginCall();
|
||||
for (unsigned i = 0, count = params.original_count; i < count; ++i)
|
||||
SIMDSafeFree(original_data[i]);
|
||||
leopard::SIMDSafeFree(original_data[i]);
|
||||
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
|
||||
SIMDSafeFree(encode_work_data[i]);
|
||||
leopard::SIMDSafeFree(encode_work_data[i]);
|
||||
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
|
||||
SIMDSafeFree(decode_work_data[i]);
|
||||
leopard::SIMDSafeFree(decode_work_data[i]);
|
||||
t_mem_free.EndCall();
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user