mirror of https://github.com/status-im/leopard.git
Cleanup and small opt
This commit is contained in:
parent
7c2be9f17b
commit
968c4f4f6a
|
@ -32,7 +32,6 @@
|
||||||
TODO:
|
TODO:
|
||||||
|
|
||||||
Short-term:
|
Short-term:
|
||||||
+ FF8 decoder needs DIT FFT optimization
|
|
||||||
+ Port DIT FFT code to FF16
|
+ Port DIT FFT code to FF16
|
||||||
+ Unroll first/final butterflies to avoid extra copies/xors in encoder
|
+ Unroll first/final butterflies to avoid extra copies/xors in encoder
|
||||||
+ Multithreading
|
+ Multithreading
|
||||||
|
|
|
@ -64,54 +64,10 @@ static const unsigned kPolynomial = 0x1002D;
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
|
// API
|
||||||
|
|
||||||
// Transform for a variable number of bits (up to kOrder)
|
// Returns false if the self-test fails
|
||||||
//void FWHT(ffe_t* data, const unsigned bits);
|
bool Initialize();
|
||||||
|
|
||||||
// Transform specialized for the finite field order
|
|
||||||
void FWHT(ffe_t data[kOrder]);
|
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
|
||||||
// Multiplies
|
|
||||||
|
|
||||||
// x[] = exp(log(y[]) + log_m)
|
|
||||||
void mul_mem(
|
|
||||||
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
|
|
||||||
ffe_t log_m, uint64_t bytes);
|
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
|
||||||
// FFT Operations
|
|
||||||
|
|
||||||
/*
|
|
||||||
Precondition: log_m != kModulus
|
|
||||||
|
|
||||||
x[] ^= exp(log(y[]) + log_m)
|
|
||||||
y[] ^= x[]
|
|
||||||
*/
|
|
||||||
void fft_butterfly(
|
|
||||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
|
||||||
ffe_t log_m, uint64_t bytes);
|
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
|
||||||
// IFFT Operations
|
|
||||||
|
|
||||||
/*
|
|
||||||
Precondition: log_m != kModulus
|
|
||||||
|
|
||||||
y[] ^= x[]
|
|
||||||
x[] ^= exp(log(y[]) + log_m)
|
|
||||||
*/
|
|
||||||
void ifft_butterfly(
|
|
||||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
|
||||||
ffe_t log_m, uint64_t bytes);
|
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
|
||||||
// Reed-Solomon Encode
|
|
||||||
|
|
||||||
void ReedSolomonEncode(
|
void ReedSolomonEncode(
|
||||||
uint64_t buffer_bytes,
|
uint64_t buffer_bytes,
|
||||||
|
@ -121,10 +77,6 @@ void ReedSolomonEncode(
|
||||||
const void* const * const data,
|
const void* const * const data,
|
||||||
void** work); // Size of GetEncodeWorkCount()
|
void** work); // Size of GetEncodeWorkCount()
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
|
||||||
// Reed-Solomon Decode
|
|
||||||
|
|
||||||
void ReedSolomonDecode(
|
void ReedSolomonDecode(
|
||||||
uint64_t buffer_bytes,
|
uint64_t buffer_bytes,
|
||||||
unsigned original_count,
|
unsigned original_count,
|
||||||
|
@ -136,13 +88,6 @@ void ReedSolomonDecode(
|
||||||
void** work); // n entries
|
void** work); // n entries
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
|
||||||
// API
|
|
||||||
|
|
||||||
// Returns false if the self-test fails
|
|
||||||
bool Initialize();
|
|
||||||
|
|
||||||
|
|
||||||
}} // namespace leopard::ff16
|
}} // namespace leopard::ff16
|
||||||
|
|
||||||
#endif // LEO_HAS_FF16
|
#endif // LEO_HAS_FF16
|
||||||
|
|
|
@ -134,7 +134,7 @@ static void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated)
|
||||||
#else // LEO_FWHT_OPT
|
#else // LEO_FWHT_OPT
|
||||||
|
|
||||||
// Reference implementation
|
// Reference implementation
|
||||||
void FWHT(ffe_t* data, const unsigned bits)
|
static void FWHT(ffe_t* data, const unsigned bits)
|
||||||
{
|
{
|
||||||
const unsigned size = (unsigned)(1UL << bits);
|
const unsigned size = (unsigned)(1UL << bits);
|
||||||
for (unsigned width = 1; width < size; width <<= 1)
|
for (unsigned width = 1; width < size; width <<= 1)
|
||||||
|
@ -233,7 +233,7 @@ struct {
|
||||||
static ffe_t Multiply8LUT[256 * 256] = {};
|
static ffe_t Multiply8LUT[256 * 256] = {};
|
||||||
|
|
||||||
|
|
||||||
void InitializeMultiplyTables()
|
static void InitializeMultiplyTables()
|
||||||
{
|
{
|
||||||
// If we cannot use the PSHUFB instruction, generate Multiply8LUT:
|
// If we cannot use the PSHUFB instruction, generate Multiply8LUT:
|
||||||
if (!CpuHasSSSE3)
|
if (!CpuHasSSSE3)
|
||||||
|
@ -288,7 +288,7 @@ void InitializeMultiplyTables()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void mul_mem(
|
static void mul_mem(
|
||||||
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
|
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
|
||||||
ffe_t log_m, uint64_t bytes)
|
ffe_t log_m, uint64_t bytes)
|
||||||
{
|
{
|
||||||
|
@ -482,7 +482,7 @@ static void FFTInitialize()
|
||||||
{1-5, 1'-5', 1-1', 5-5'},
|
{1-5, 1'-5', 1-1', 5-5'},
|
||||||
*/
|
*/
|
||||||
|
|
||||||
void ifft_butterfly(
|
static void ifft_butterfly(
|
||||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||||
ffe_t log_m, uint64_t bytes)
|
ffe_t log_m, uint64_t bytes)
|
||||||
{
|
{
|
||||||
|
@ -781,7 +781,7 @@ static void IFFT_DIT4(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void IFFT_DIT(
|
static void IFFT_DIT(
|
||||||
const uint64_t bytes,
|
const uint64_t bytes,
|
||||||
const void* const* data,
|
const void* const* data,
|
||||||
const unsigned m_truncated,
|
const unsigned m_truncated,
|
||||||
|
@ -815,7 +815,10 @@ void IFFT_DIT(
|
||||||
const ffe_t log_m02 = skewLUT[r + dist * 2];
|
const ffe_t log_m02 = skewLUT[r + dist * 2];
|
||||||
|
|
||||||
// For each set of dist elements:
|
// For each set of dist elements:
|
||||||
for (unsigned i = r; i < r + dist; ++i)
|
unsigned i_end = r + dist;
|
||||||
|
if (i_end >= m_truncated)
|
||||||
|
i_end = m_truncated;
|
||||||
|
for (unsigned i = r; i < i_end; ++i)
|
||||||
{
|
{
|
||||||
IFFT_DIT4(
|
IFFT_DIT4(
|
||||||
bytes,
|
bytes,
|
||||||
|
@ -915,7 +918,7 @@ void IFFT_DIT(
|
||||||
{4-6, 5-7, 4-5, 6-7},
|
{4-6, 5-7, 4-5, 6-7},
|
||||||
*/
|
*/
|
||||||
|
|
||||||
void fft_butterfly(
|
static void fft_butterfly(
|
||||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||||
ffe_t log_m, uint64_t bytes)
|
ffe_t log_m, uint64_t bytes)
|
||||||
{
|
{
|
||||||
|
@ -1212,7 +1215,8 @@ static void FFT_DIT4(
|
||||||
fft_butterfly(work[dist * 2], work[dist * 3], log_m23, bytes);
|
fft_butterfly(work[dist * 2], work[dist * 3], log_m23, bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
void FFT_DIT(
|
|
||||||
|
static void FFT_DIT(
|
||||||
const uint64_t bytes,
|
const uint64_t bytes,
|
||||||
void** work,
|
void** work,
|
||||||
const unsigned m_truncated,
|
const unsigned m_truncated,
|
||||||
|
@ -1231,7 +1235,10 @@ void FFT_DIT(
|
||||||
const ffe_t log_m02 = skewLUT[r + dist * 2];
|
const ffe_t log_m02 = skewLUT[r + dist * 2];
|
||||||
|
|
||||||
// For each set of dist elements:
|
// For each set of dist elements:
|
||||||
for (unsigned i = r; i < r + dist; ++i)
|
unsigned i_end = r + dist;
|
||||||
|
if (i_end >= m_truncated)
|
||||||
|
i_end = m_truncated;
|
||||||
|
for (unsigned i = r; i < i_end; ++i)
|
||||||
{
|
{
|
||||||
FFT_DIT4(
|
FFT_DIT4(
|
||||||
bytes,
|
bytes,
|
||||||
|
|
80
LeopardFF8.h
80
LeopardFF8.h
|
@ -64,94 +64,28 @@ static const unsigned kPolynomial = 0x11D;
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
|
// API
|
||||||
|
|
||||||
// Transform for a variable number of elements
|
// Returns false if the self-test fails
|
||||||
// m_truncated: Number of elements that are non-zero at the front of data
|
bool Initialize();
|
||||||
//void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated);
|
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
|
||||||
// Multiplies
|
|
||||||
|
|
||||||
// x[] = exp(log(y[]) + log_m)
|
|
||||||
// mul_mem
|
|
||||||
void mul_mem(
|
|
||||||
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
|
|
||||||
ffe_t log_m, uint64_t bytes);
|
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
|
||||||
// FFT Operations
|
|
||||||
|
|
||||||
/*
|
|
||||||
Precondition: log_m != kModulus
|
|
||||||
|
|
||||||
x[] ^= exp(log(y[]) + log_m)
|
|
||||||
y[] ^= x[]
|
|
||||||
*/
|
|
||||||
void fft_butterfly(
|
|
||||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
|
||||||
ffe_t log_m, uint64_t bytes);
|
|
||||||
|
|
||||||
#ifdef LEO_USE_VECTOR4_OPT
|
|
||||||
|
|
||||||
// Unroll 4 rows at a time
|
|
||||||
void fft_butterfly4(
|
|
||||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
|
||||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
|
||||||
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
|
||||||
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
|
|
||||||
ffe_t log_m, uint64_t bytes);
|
|
||||||
|
|
||||||
#endif // LEO_USE_VECTOR4_OPT
|
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
|
||||||
// IFFT Operations
|
|
||||||
|
|
||||||
/*
|
|
||||||
Precondition: log_m != kModulus
|
|
||||||
|
|
||||||
y[] ^= x[]
|
|
||||||
x[] ^= exp(log(y[]) + log_m)
|
|
||||||
*/
|
|
||||||
void ifft_butterfly(
|
|
||||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
|
||||||
ffe_t log_m, uint64_t bytes);
|
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
|
||||||
// Reed-Solomon Encode
|
|
||||||
|
|
||||||
void ReedSolomonEncode(
|
void ReedSolomonEncode(
|
||||||
uint64_t buffer_bytes,
|
uint64_t buffer_bytes,
|
||||||
unsigned original_count,
|
unsigned original_count,
|
||||||
unsigned recovery_count,
|
unsigned recovery_count,
|
||||||
unsigned m, // = NextPow2(recovery_count) * 2 = work_count
|
unsigned m, // = NextPow2(recovery_count)
|
||||||
const void* const * const data,
|
const void* const * const data,
|
||||||
void** work); // Size of GetEncodeWorkCount()
|
void** work); // m * 2 elements
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
|
||||||
// Reed-Solomon Decode
|
|
||||||
|
|
||||||
void ReedSolomonDecode(
|
void ReedSolomonDecode(
|
||||||
uint64_t buffer_bytes,
|
uint64_t buffer_bytes,
|
||||||
unsigned original_count,
|
unsigned original_count,
|
||||||
unsigned recovery_count,
|
unsigned recovery_count,
|
||||||
unsigned m, // = NextPow2(recovery_count)
|
unsigned m, // = NextPow2(recovery_count)
|
||||||
unsigned n, // = NextPow2(m + original_count) = work_count
|
unsigned n, // = NextPow2(m + original_count)
|
||||||
const void* const * const original, // original_count entries
|
const void* const * const original, // original_count entries
|
||||||
const void* const * const recovery, // recovery_count entries
|
const void* const * const recovery, // recovery_count entries
|
||||||
void** work); // n entries
|
void** work); // n elements
|
||||||
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
|
||||||
// API
|
|
||||||
|
|
||||||
// Returns false if the self-test fails
|
|
||||||
bool Initialize();
|
|
||||||
|
|
||||||
|
|
||||||
}} // namespace leopard::ff8
|
}} // namespace leopard::ff8
|
||||||
|
|
|
@ -48,7 +48,7 @@ struct TestParameters
|
||||||
unsigned original_count = 128; // under 65536
|
unsigned original_count = 128; // under 65536
|
||||||
unsigned recovery_count = 128; // under 65536 - original_count
|
unsigned recovery_count = 128; // under 65536 - original_count
|
||||||
#endif
|
#endif
|
||||||
unsigned buffer_bytes = 64; // multiple of 64 bytes
|
unsigned buffer_bytes = 64000; // multiple of 64 bytes
|
||||||
unsigned loss_count = 32768; // some fraction of original_count
|
unsigned loss_count = 32768; // some fraction of original_count
|
||||||
unsigned seed = 2;
|
unsigned seed = 2;
|
||||||
bool multithreaded = true;
|
bool multithreaded = true;
|
||||||
|
@ -240,9 +240,15 @@ public:
|
||||||
void EndCall()
|
void EndCall()
|
||||||
{
|
{
|
||||||
LEO_DEBUG_ASSERT(t0 != 0);
|
LEO_DEBUG_ASSERT(t0 != 0);
|
||||||
uint64_t t1 = GetTimeUsec();
|
const uint64_t t1 = GetTimeUsec();
|
||||||
++Invokations;
|
const uint64_t delta = t1 - t0;
|
||||||
TotalUsec += t1 - t0;
|
if (++Invokations == 1)
|
||||||
|
MaxCallUsec = MinCallUsec = delta;
|
||||||
|
else if (MaxCallUsec < delta)
|
||||||
|
MaxCallUsec = delta;
|
||||||
|
else if (MinCallUsec > delta)
|
||||||
|
MinCallUsec = delta;
|
||||||
|
TotalUsec += delta;
|
||||||
t0 = 0;
|
t0 = 0;
|
||||||
}
|
}
|
||||||
void Reset()
|
void Reset()
|
||||||
|
@ -260,6 +266,8 @@ public:
|
||||||
uint64_t t0 = 0;
|
uint64_t t0 = 0;
|
||||||
uint64_t Invokations = 0;
|
uint64_t Invokations = 0;
|
||||||
uint64_t TotalUsec = 0;
|
uint64_t TotalUsec = 0;
|
||||||
|
uint64_t MaxCallUsec = 0;
|
||||||
|
uint64_t MinCallUsec = 0;
|
||||||
std::string FunctionName;
|
std::string FunctionName;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -542,10 +550,10 @@ static bool Benchmark(const TestParameters& params)
|
||||||
t_mem_free.Print(kTrials);
|
t_mem_free.Print(kTrials);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
float encode_input_MBPS = total_bytes * kTrials / (float)(t_leo_encode.TotalUsec);
|
float encode_input_MBPS = total_bytes / (float)(t_leo_encode.MinCallUsec);
|
||||||
float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count * kTrials / (float)(t_leo_encode.TotalUsec);
|
float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count / (float)(t_leo_encode.MinCallUsec);
|
||||||
float decode_input_MBPS = total_bytes * kTrials / (float)(t_leo_decode.TotalUsec);
|
float decode_input_MBPS = total_bytes / (float)(t_leo_decode.MinCallUsec);
|
||||||
float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count * kTrials / (float)(t_leo_decode.TotalUsec);
|
float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count / (float)(t_leo_decode.MinCallUsec);
|
||||||
|
|
||||||
cout << "Leopard Encoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << encode_input_MBPS << " MB/s, Output=" << encode_output_MBPS << " MB/s" << endl;
|
cout << "Leopard Encoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << encode_input_MBPS << " MB/s, Output=" << encode_output_MBPS << " MB/s" << endl;
|
||||||
cout << "Leopard Decoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << decode_input_MBPS << " MB/s, Output=" << decode_output_MBPS << " MB/s" << endl << endl;
|
cout << "Leopard Decoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << decode_input_MBPS << " MB/s, Output=" << decode_output_MBPS << " MB/s" << endl << endl;
|
||||||
|
|
Loading…
Reference in New Issue