mirror of https://github.com/status-im/leopard.git
Cleanup and small opt
This commit is contained in:
parent
7c2be9f17b
commit
968c4f4f6a
|
@ -32,7 +32,6 @@
|
|||
TODO:
|
||||
|
||||
Short-term:
|
||||
+ FF8 decoder needs DIT FFT optimization
|
||||
+ Port DIT FFT code to FF16
|
||||
+ Unroll first/final butterflies to avoid extra copies/xors in encoder
|
||||
+ Multithreading
|
||||
|
|
|
@ -64,54 +64,10 @@ static const unsigned kPolynomial = 0x1002D;
|
|||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
|
||||
// API
|
||||
|
||||
// Transform for a variable number of bits (up to kOrder)
|
||||
//void FWHT(ffe_t* data, const unsigned bits);
|
||||
|
||||
// Transform specialized for the finite field order
|
||||
void FWHT(ffe_t data[kOrder]);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Multiplies
|
||||
|
||||
// x[] = exp(log(y[]) + log_m)
|
||||
void mul_mem(
|
||||
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// FFT Operations
|
||||
|
||||
/*
|
||||
Precondition: log_m != kModulus
|
||||
|
||||
x[] ^= exp(log(y[]) + log_m)
|
||||
y[] ^= x[]
|
||||
*/
|
||||
void fft_butterfly(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// IFFT Operations
|
||||
|
||||
/*
|
||||
Precondition: log_m != kModulus
|
||||
|
||||
y[] ^= x[]
|
||||
x[] ^= exp(log(y[]) + log_m)
|
||||
*/
|
||||
void ifft_butterfly(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Reed-Solomon Encode
|
||||
// Returns false if the self-test fails
|
||||
bool Initialize();
|
||||
|
||||
void ReedSolomonEncode(
|
||||
uint64_t buffer_bytes,
|
||||
|
@ -121,10 +77,6 @@ void ReedSolomonEncode(
|
|||
const void* const * const data,
|
||||
void** work); // Size of GetEncodeWorkCount()
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Reed-Solomon Decode
|
||||
|
||||
void ReedSolomonDecode(
|
||||
uint64_t buffer_bytes,
|
||||
unsigned original_count,
|
||||
|
@ -136,13 +88,6 @@ void ReedSolomonDecode(
|
|||
void** work); // n entries
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// API
|
||||
|
||||
// Returns false if the self-test fails
|
||||
bool Initialize();
|
||||
|
||||
|
||||
}} // namespace leopard::ff16
|
||||
|
||||
#endif // LEO_HAS_FF16
|
||||
|
|
|
@ -134,7 +134,7 @@ static void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated)
|
|||
#else // LEO_FWHT_OPT
|
||||
|
||||
// Reference implementation
|
||||
void FWHT(ffe_t* data, const unsigned bits)
|
||||
static void FWHT(ffe_t* data, const unsigned bits)
|
||||
{
|
||||
const unsigned size = (unsigned)(1UL << bits);
|
||||
for (unsigned width = 1; width < size; width <<= 1)
|
||||
|
@ -233,7 +233,7 @@ struct {
|
|||
static ffe_t Multiply8LUT[256 * 256] = {};
|
||||
|
||||
|
||||
void InitializeMultiplyTables()
|
||||
static void InitializeMultiplyTables()
|
||||
{
|
||||
// If we cannot use the PSHUFB instruction, generate Multiply8LUT:
|
||||
if (!CpuHasSSSE3)
|
||||
|
@ -288,7 +288,7 @@ void InitializeMultiplyTables()
|
|||
}
|
||||
|
||||
|
||||
void mul_mem(
|
||||
static void mul_mem(
|
||||
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
|
||||
ffe_t log_m, uint64_t bytes)
|
||||
{
|
||||
|
@ -482,7 +482,7 @@ static void FFTInitialize()
|
|||
{1-5, 1'-5', 1-1', 5-5'},
|
||||
*/
|
||||
|
||||
void ifft_butterfly(
|
||||
static void ifft_butterfly(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t log_m, uint64_t bytes)
|
||||
{
|
||||
|
@ -781,7 +781,7 @@ static void IFFT_DIT4(
|
|||
}
|
||||
}
|
||||
|
||||
void IFFT_DIT(
|
||||
static void IFFT_DIT(
|
||||
const uint64_t bytes,
|
||||
const void* const* data,
|
||||
const unsigned m_truncated,
|
||||
|
@ -815,7 +815,10 @@ void IFFT_DIT(
|
|||
const ffe_t log_m02 = skewLUT[r + dist * 2];
|
||||
|
||||
// For each set of dist elements:
|
||||
for (unsigned i = r; i < r + dist; ++i)
|
||||
unsigned i_end = r + dist;
|
||||
if (i_end >= m_truncated)
|
||||
i_end = m_truncated;
|
||||
for (unsigned i = r; i < i_end; ++i)
|
||||
{
|
||||
IFFT_DIT4(
|
||||
bytes,
|
||||
|
@ -915,7 +918,7 @@ void IFFT_DIT(
|
|||
{4-6, 5-7, 4-5, 6-7},
|
||||
*/
|
||||
|
||||
void fft_butterfly(
|
||||
static void fft_butterfly(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t log_m, uint64_t bytes)
|
||||
{
|
||||
|
@ -1212,7 +1215,8 @@ static void FFT_DIT4(
|
|||
fft_butterfly(work[dist * 2], work[dist * 3], log_m23, bytes);
|
||||
}
|
||||
|
||||
void FFT_DIT(
|
||||
|
||||
static void FFT_DIT(
|
||||
const uint64_t bytes,
|
||||
void** work,
|
||||
const unsigned m_truncated,
|
||||
|
@ -1231,7 +1235,10 @@ void FFT_DIT(
|
|||
const ffe_t log_m02 = skewLUT[r + dist * 2];
|
||||
|
||||
// For each set of dist elements:
|
||||
for (unsigned i = r; i < r + dist; ++i)
|
||||
unsigned i_end = r + dist;
|
||||
if (i_end >= m_truncated)
|
||||
i_end = m_truncated;
|
||||
for (unsigned i = r; i < i_end; ++i)
|
||||
{
|
||||
FFT_DIT4(
|
||||
bytes,
|
||||
|
|
80
LeopardFF8.h
80
LeopardFF8.h
|
@ -64,94 +64,28 @@ static const unsigned kPolynomial = 0x11D;
|
|||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
|
||||
// API
|
||||
|
||||
// Transform for a variable number of elements
|
||||
// m_truncated: Number of elements that are non-zero at the front of data
|
||||
//void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Multiplies
|
||||
|
||||
// x[] = exp(log(y[]) + log_m)
|
||||
// mul_mem
|
||||
void mul_mem(
|
||||
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// FFT Operations
|
||||
|
||||
/*
|
||||
Precondition: log_m != kModulus
|
||||
|
||||
x[] ^= exp(log(y[]) + log_m)
|
||||
y[] ^= x[]
|
||||
*/
|
||||
void fft_butterfly(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
|
||||
#ifdef LEO_USE_VECTOR4_OPT
|
||||
|
||||
// Unroll 4 rows at a time
|
||||
void fft_butterfly4(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
||||
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
|
||||
#endif // LEO_USE_VECTOR4_OPT
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// IFFT Operations
|
||||
|
||||
/*
|
||||
Precondition: log_m != kModulus
|
||||
|
||||
y[] ^= x[]
|
||||
x[] ^= exp(log(y[]) + log_m)
|
||||
*/
|
||||
void ifft_butterfly(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Reed-Solomon Encode
|
||||
// Returns false if the self-test fails
|
||||
bool Initialize();
|
||||
|
||||
void ReedSolomonEncode(
|
||||
uint64_t buffer_bytes,
|
||||
unsigned original_count,
|
||||
unsigned recovery_count,
|
||||
unsigned m, // = NextPow2(recovery_count) * 2 = work_count
|
||||
unsigned m, // = NextPow2(recovery_count)
|
||||
const void* const * const data,
|
||||
void** work); // Size of GetEncodeWorkCount()
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Reed-Solomon Decode
|
||||
void** work); // m * 2 elements
|
||||
|
||||
void ReedSolomonDecode(
|
||||
uint64_t buffer_bytes,
|
||||
unsigned original_count,
|
||||
unsigned recovery_count,
|
||||
unsigned m, // = NextPow2(recovery_count)
|
||||
unsigned n, // = NextPow2(m + original_count) = work_count
|
||||
unsigned n, // = NextPow2(m + original_count)
|
||||
const void* const * const original, // original_count entries
|
||||
const void* const * const recovery, // recovery_count entries
|
||||
void** work); // n entries
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// API
|
||||
|
||||
// Returns false if the self-test fails
|
||||
bool Initialize();
|
||||
void** work); // n elements
|
||||
|
||||
|
||||
}} // namespace leopard::ff8
|
||||
|
|
|
@ -48,7 +48,7 @@ struct TestParameters
|
|||
unsigned original_count = 128; // under 65536
|
||||
unsigned recovery_count = 128; // under 65536 - original_count
|
||||
#endif
|
||||
unsigned buffer_bytes = 64; // multiple of 64 bytes
|
||||
unsigned buffer_bytes = 64000; // multiple of 64 bytes
|
||||
unsigned loss_count = 32768; // some fraction of original_count
|
||||
unsigned seed = 2;
|
||||
bool multithreaded = true;
|
||||
|
@ -240,9 +240,15 @@ public:
|
|||
void EndCall()
|
||||
{
|
||||
LEO_DEBUG_ASSERT(t0 != 0);
|
||||
uint64_t t1 = GetTimeUsec();
|
||||
++Invokations;
|
||||
TotalUsec += t1 - t0;
|
||||
const uint64_t t1 = GetTimeUsec();
|
||||
const uint64_t delta = t1 - t0;
|
||||
if (++Invokations == 1)
|
||||
MaxCallUsec = MinCallUsec = delta;
|
||||
else if (MaxCallUsec < delta)
|
||||
MaxCallUsec = delta;
|
||||
else if (MinCallUsec > delta)
|
||||
MinCallUsec = delta;
|
||||
TotalUsec += delta;
|
||||
t0 = 0;
|
||||
}
|
||||
void Reset()
|
||||
|
@ -260,6 +266,8 @@ public:
|
|||
uint64_t t0 = 0;
|
||||
uint64_t Invokations = 0;
|
||||
uint64_t TotalUsec = 0;
|
||||
uint64_t MaxCallUsec = 0;
|
||||
uint64_t MinCallUsec = 0;
|
||||
std::string FunctionName;
|
||||
};
|
||||
|
||||
|
@ -542,10 +550,10 @@ static bool Benchmark(const TestParameters& params)
|
|||
t_mem_free.Print(kTrials);
|
||||
#endif
|
||||
|
||||
float encode_input_MBPS = total_bytes * kTrials / (float)(t_leo_encode.TotalUsec);
|
||||
float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count * kTrials / (float)(t_leo_encode.TotalUsec);
|
||||
float decode_input_MBPS = total_bytes * kTrials / (float)(t_leo_decode.TotalUsec);
|
||||
float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count * kTrials / (float)(t_leo_decode.TotalUsec);
|
||||
float encode_input_MBPS = total_bytes / (float)(t_leo_encode.MinCallUsec);
|
||||
float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count / (float)(t_leo_encode.MinCallUsec);
|
||||
float decode_input_MBPS = total_bytes / (float)(t_leo_decode.MinCallUsec);
|
||||
float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count / (float)(t_leo_decode.MinCallUsec);
|
||||
|
||||
cout << "Leopard Encoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << encode_input_MBPS << " MB/s, Output=" << encode_output_MBPS << " MB/s" << endl;
|
||||
cout << "Leopard Decoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << decode_input_MBPS << " MB/s, Output=" << decode_output_MBPS << " MB/s" << endl << endl;
|
||||
|
|
Loading…
Reference in New Issue