mirror of
https://github.com/status-im/leopard.git
synced 2025-02-22 10:48:18 +00:00
Only allocate memory for mul table used
This commit is contained in:
parent
62f9f56555
commit
96bd047a2d
@ -157,6 +157,7 @@
|
|||||||
#include "leopard.h"
|
#include "leopard.h"
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
#include <malloc.h>
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
@ -421,4 +422,41 @@ protected:
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
// SIMD-Safe Aligned Memory Allocations
|
||||||
|
|
||||||
|
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
|
||||||
|
|
||||||
|
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
|
||||||
|
{
|
||||||
|
return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
|
||||||
|
{
|
||||||
|
uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
|
||||||
|
if (!data)
|
||||||
|
return nullptr;
|
||||||
|
unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
|
||||||
|
data += kAlignmentBytes - offset;
|
||||||
|
data[-1] = (uint8_t)offset;
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
|
||||||
|
{
|
||||||
|
if (!ptr)
|
||||||
|
return;
|
||||||
|
uint8_t* data = (uint8_t*)ptr;
|
||||||
|
unsigned offset = data[-1];
|
||||||
|
if (offset >= kAlignmentBytes)
|
||||||
|
{
|
||||||
|
LEO_DEBUG_BREAK; // Should never happen
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
data -= kAlignmentBytes - offset;
|
||||||
|
free(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
} // namespace leopard
|
} // namespace leopard
|
||||||
|
@ -199,20 +199,38 @@ static void InitializeLogarithmTables()
|
|||||||
The ALTMAP memory layout is used since there is no need to convert in/out.
|
The ALTMAP memory layout is used since there is no need to convert in/out.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
struct {
|
struct Multiply128LUT_t
|
||||||
LEO_ALIGNED LEO_M128 Lo[4];
|
{
|
||||||
LEO_ALIGNED LEO_M128 Hi[4];
|
LEO_M128 Lo[4];
|
||||||
} static Multiply128LUT[kOrder];
|
LEO_M128 Hi[4];
|
||||||
|
};
|
||||||
|
|
||||||
|
static const Multiply128LUT_t* Multiply128LUT = nullptr;
|
||||||
|
|
||||||
#if defined(LEO_TRY_AVX2)
|
#if defined(LEO_TRY_AVX2)
|
||||||
struct {
|
|
||||||
LEO_ALIGNED LEO_M256 Lo[4];
|
struct Multiply256LUT_t
|
||||||
LEO_ALIGNED LEO_M256 Hi[4];
|
{
|
||||||
} static Multiply256LUT[kOrder];
|
LEO_M256 Lo[4];
|
||||||
|
LEO_M256 Hi[4];
|
||||||
|
};
|
||||||
|
|
||||||
|
static const Multiply256LUT_t* Multiply256LUT = nullptr;
|
||||||
|
|
||||||
#endif // LEO_TRY_AVX2
|
#endif // LEO_TRY_AVX2
|
||||||
|
|
||||||
|
|
||||||
void InitializeMultiplyTables()
|
void InitializeMultiplyTables()
|
||||||
{
|
{
|
||||||
|
// If we cannot use the PSHUFB instruction, generate Multiply8LUT:
|
||||||
|
if (!CpuHasSSSE3)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (CpuHasAVX2)
|
||||||
|
Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
|
||||||
|
else
|
||||||
|
Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
|
||||||
|
|
||||||
// For each value we could multiply by:
|
// For each value we could multiply by:
|
||||||
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
|
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
|
||||||
{
|
{
|
||||||
@ -232,16 +250,19 @@ void InitializeMultiplyTables()
|
|||||||
const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi);
|
const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi);
|
||||||
|
|
||||||
// Store in 128-bit wide table
|
// Store in 128-bit wide table
|
||||||
_mm_storeu_si128(&Multiply128LUT[log_m].Lo[i], value_lo);
|
if (!CpuHasAVX2)
|
||||||
_mm_storeu_si128(&Multiply128LUT[log_m].Hi[i], value_hi);
|
{
|
||||||
|
_mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Lo[i], value_lo);
|
||||||
|
_mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Hi[i], value_hi);
|
||||||
|
}
|
||||||
|
|
||||||
// Store in 256-bit wide table
|
// Store in 256-bit wide table
|
||||||
#if defined(LEO_TRY_AVX2)
|
#if defined(LEO_TRY_AVX2)
|
||||||
if (CpuHasAVX2)
|
if (CpuHasAVX2)
|
||||||
{
|
{
|
||||||
_mm256_storeu_si256(&Multiply256LUT[log_m].Lo[i],
|
_mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Lo[i],
|
||||||
_mm256_broadcastsi128_si256(value_lo));
|
_mm256_broadcastsi128_si256(value_lo));
|
||||||
_mm256_storeu_si256(&Multiply256LUT[log_m].Hi[i],
|
_mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Hi[i],
|
||||||
_mm256_broadcastsi128_si256(value_hi));
|
_mm256_broadcastsi128_si256(value_hi));
|
||||||
}
|
}
|
||||||
#endif // LEO_TRY_AVX2
|
#endif // LEO_TRY_AVX2
|
||||||
|
@ -202,19 +202,27 @@ static void InitializeLogarithmTables()
|
|||||||
Specifically section 6 outlines the algorithm used here for 8-bit fields.
|
Specifically section 6 outlines the algorithm used here for 8-bit fields.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
struct {
|
struct Multiply128LUT_t
|
||||||
LEO_ALIGNED LEO_M128 Value[2];
|
{
|
||||||
} static Multiply128LUT[kOrder];
|
LEO_M128 Value[2];
|
||||||
|
};
|
||||||
|
|
||||||
|
static const Multiply128LUT_t* Multiply128LUT = nullptr;
|
||||||
|
|
||||||
#if defined(LEO_TRY_AVX2)
|
#if defined(LEO_TRY_AVX2)
|
||||||
struct {
|
|
||||||
LEO_ALIGNED LEO_M256 Value[2];
|
struct Multiply256LUT_t
|
||||||
} static Multiply256LUT[kOrder];
|
{
|
||||||
|
LEO_M256 Value[2];
|
||||||
|
};
|
||||||
|
|
||||||
|
static const Multiply256LUT_t* Multiply256LUT = nullptr;
|
||||||
|
|
||||||
#endif // LEO_TRY_AVX2
|
#endif // LEO_TRY_AVX2
|
||||||
|
|
||||||
// Stores the product of x * y at offset x + y * 256
|
// Stores the product of x * y at offset x + y * 256
|
||||||
// Repeated accesses from the same y value are faster
|
// Repeated accesses from the same y value are faster
|
||||||
static ffe_t Multiply8LUT[256 * 256] = {};
|
static const ffe_t* Multiply8LUT = nullptr;
|
||||||
|
|
||||||
|
|
||||||
static void InitializeMultiplyTables()
|
static void InitializeMultiplyTables()
|
||||||
@ -222,15 +230,20 @@ static void InitializeMultiplyTables()
|
|||||||
// If we cannot use the PSHUFB instruction, generate Multiply8LUT:
|
// If we cannot use the PSHUFB instruction, generate Multiply8LUT:
|
||||||
if (!CpuHasSSSE3)
|
if (!CpuHasSSSE3)
|
||||||
{
|
{
|
||||||
|
Multiply8LUT = new ffe_t[256 * 256];
|
||||||
|
|
||||||
// For each left-multiplicand:
|
// For each left-multiplicand:
|
||||||
for (unsigned x = 0; x < 256; ++x)
|
for (unsigned x = 0; x < 256; ++x)
|
||||||
{
|
{
|
||||||
ffe_t* lut = Multiply8LUT + x;
|
ffe_t* lut = (ffe_t*)Multiply8LUT + x;
|
||||||
|
|
||||||
// Note: Table is already zeroed out so we can skip the zeroes
|
|
||||||
if (x == 0)
|
if (x == 0)
|
||||||
continue;
|
{
|
||||||
|
for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
|
||||||
|
*lut = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
const ffe_t log_x = LogLUT[x];
|
const ffe_t log_x = LogLUT[x];
|
||||||
|
|
||||||
for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
|
for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
|
||||||
@ -239,10 +252,16 @@ static void InitializeMultiplyTables()
|
|||||||
*lut = prod;
|
*lut = prod;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (CpuHasAVX2)
|
||||||
|
Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
|
||||||
|
else
|
||||||
|
Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
|
||||||
|
|
||||||
// For each value we could multiply by:
|
// For each value we could multiply by:
|
||||||
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
|
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
|
||||||
{
|
{
|
||||||
@ -254,16 +273,18 @@ static void InitializeMultiplyTables()
|
|||||||
for (ffe_t x = 0; x < 16; ++x)
|
for (ffe_t x = 0; x < 16; ++x)
|
||||||
lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));
|
lut[x] = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));
|
||||||
|
|
||||||
// Store in 128-bit wide table
|
|
||||||
const LEO_M128 *v_ptr = reinterpret_cast<const LEO_M128 *>(&lut[0]);
|
const LEO_M128 *v_ptr = reinterpret_cast<const LEO_M128 *>(&lut[0]);
|
||||||
const LEO_M128 value = _mm_loadu_si128(v_ptr);
|
const LEO_M128 value = _mm_loadu_si128(v_ptr);
|
||||||
_mm_storeu_si128(&Multiply128LUT[log_m].Value[i], value);
|
|
||||||
|
// Store in 128-bit wide table
|
||||||
|
if (!CpuHasAVX2)
|
||||||
|
_mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Value[i], value);
|
||||||
|
|
||||||
// Store in 256-bit wide table
|
// Store in 256-bit wide table
|
||||||
#if defined(LEO_TRY_AVX2)
|
#if defined(LEO_TRY_AVX2)
|
||||||
if (CpuHasAVX2)
|
if (CpuHasAVX2)
|
||||||
{
|
{
|
||||||
_mm256_storeu_si256(&Multiply256LUT[log_m].Value[i],
|
_mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Value[i],
|
||||||
_mm256_broadcastsi128_si256(value));
|
_mm256_broadcastsi128_si256(value));
|
||||||
}
|
}
|
||||||
#endif // LEO_TRY_AVX2
|
#endif // LEO_TRY_AVX2
|
||||||
|
@ -365,43 +365,6 @@ static void ShuffleDeck16(PCGRandom &prng, uint16_t * LEO_RESTRICT deck, uint32_
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
|
||||||
// SIMD-Safe Aligned Memory Allocations
|
|
||||||
|
|
||||||
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
|
|
||||||
|
|
||||||
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
|
|
||||||
{
|
|
||||||
return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
|
|
||||||
{
|
|
||||||
uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
|
|
||||||
if (!data)
|
|
||||||
return nullptr;
|
|
||||||
unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
|
|
||||||
data += kAlignmentBytes - offset;
|
|
||||||
data[-1] = (uint8_t)offset;
|
|
||||||
return data;
|
|
||||||
}
|
|
||||||
|
|
||||||
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
|
|
||||||
{
|
|
||||||
if (!ptr)
|
|
||||||
return;
|
|
||||||
uint8_t* data = (uint8_t*)ptr;
|
|
||||||
unsigned offset = data[-1];
|
|
||||||
if (offset >= kAlignmentBytes)
|
|
||||||
{
|
|
||||||
LEO_DEBUG_BREAK; // Should never happen
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
data -= kAlignmentBytes - offset;
|
|
||||||
free(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
// Benchmark
|
// Benchmark
|
||||||
|
|
||||||
@ -430,11 +393,11 @@ static bool Benchmark(const TestParameters& params)
|
|||||||
|
|
||||||
t_mem_alloc.BeginCall();
|
t_mem_alloc.BeginCall();
|
||||||
for (unsigned i = 0, count = params.original_count; i < count; ++i)
|
for (unsigned i = 0, count = params.original_count; i < count; ++i)
|
||||||
original_data[i] = SIMDSafeAllocate(params.buffer_bytes);
|
original_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
|
||||||
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
|
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
|
||||||
encode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
|
encode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
|
||||||
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
|
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
|
||||||
decode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
|
decode_work_data[i] = leopard::SIMDSafeAllocate(params.buffer_bytes);
|
||||||
t_mem_alloc.EndCall();
|
t_mem_alloc.EndCall();
|
||||||
|
|
||||||
// Generate data:
|
// Generate data:
|
||||||
@ -479,7 +442,7 @@ static bool Benchmark(const TestParameters& params)
|
|||||||
for (unsigned i = 0, count = params.loss_count; i < count; ++i)
|
for (unsigned i = 0, count = params.loss_count; i < count; ++i)
|
||||||
{
|
{
|
||||||
const unsigned loss_index = original_losses[i];
|
const unsigned loss_index = original_losses[i];
|
||||||
SIMDSafeFree(original_data[loss_index]);
|
leopard::SIMDSafeFree(original_data[loss_index]);
|
||||||
original_data[loss_index] = nullptr;
|
original_data[loss_index] = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -493,7 +456,7 @@ static bool Benchmark(const TestParameters& params)
|
|||||||
for (unsigned i = 0, count = recovery_loss_count; i < count; ++i)
|
for (unsigned i = 0, count = recovery_loss_count; i < count; ++i)
|
||||||
{
|
{
|
||||||
const unsigned loss_index = recovery_losses[i];
|
const unsigned loss_index = recovery_losses[i];
|
||||||
SIMDSafeFree(encode_work_data[loss_index]);
|
leopard::SIMDSafeFree(encode_work_data[loss_index]);
|
||||||
encode_work_data[loss_index] = nullptr;
|
encode_work_data[loss_index] = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -535,11 +498,11 @@ static bool Benchmark(const TestParameters& params)
|
|||||||
|
|
||||||
t_mem_free.BeginCall();
|
t_mem_free.BeginCall();
|
||||||
for (unsigned i = 0, count = params.original_count; i < count; ++i)
|
for (unsigned i = 0, count = params.original_count; i < count; ++i)
|
||||||
SIMDSafeFree(original_data[i]);
|
leopard::SIMDSafeFree(original_data[i]);
|
||||||
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
|
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
|
||||||
SIMDSafeFree(encode_work_data[i]);
|
leopard::SIMDSafeFree(encode_work_data[i]);
|
||||||
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
|
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
|
||||||
SIMDSafeFree(decode_work_data[i]);
|
leopard::SIMDSafeFree(decode_work_data[i]);
|
||||||
t_mem_free.EndCall();
|
t_mem_free.EndCall();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user