Cleanup and small optimization

This commit is contained in:
Christopher Taylor 2017-06-03 00:24:18 -07:00
parent 7c2be9f17b
commit 968c4f4f6a
5 changed files with 42 additions and 149 deletions

View File

@ -32,7 +32,6 @@
TODO:
Short-term:
+ FF8 decoder needs DIT FFT optimization
+ Port DIT FFT code to FF16
+ Unroll first/final butterflies to avoid extra copies/xors in encoder
+ Multithreading

View File

@ -64,54 +64,10 @@ static const unsigned kPolynomial = 0x1002D;
//------------------------------------------------------------------------------
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
// API
// Transform for a variable number of bits (up to kOrder)
//void FWHT(ffe_t* data, const unsigned bits);
// Transform specialized for the finite field order
void FWHT(ffe_t data[kOrder]);
//------------------------------------------------------------------------------
// Multiplies
// x[] = exp(log(y[]) + log_m)
void mul_mem(
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes);
//------------------------------------------------------------------------------
// FFT Operations
/*
Precondition: log_m != kModulus
x[] ^= exp(log(y[]) + log_m)
y[] ^= x[]
*/
void fft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes);
//------------------------------------------------------------------------------
// IFFT Operations
/*
Precondition: log_m != kModulus
y[] ^= x[]
x[] ^= exp(log(y[]) + log_m)
*/
void ifft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes);
//------------------------------------------------------------------------------
// Reed-Solomon Encode
// Returns false if the self-test fails
bool Initialize();
void ReedSolomonEncode(
uint64_t buffer_bytes,
@ -121,10 +77,6 @@ void ReedSolomonEncode(
const void* const * const data,
void** work); // Size of GetEncodeWorkCount()
//------------------------------------------------------------------------------
// Reed-Solomon Decode
void ReedSolomonDecode(
uint64_t buffer_bytes,
unsigned original_count,
@ -136,13 +88,6 @@ void ReedSolomonDecode(
void** work); // n entries
//------------------------------------------------------------------------------
// API
// Returns false if the self-test fails
bool Initialize();
}} // namespace leopard::ff16
#endif // LEO_HAS_FF16

View File

@ -134,7 +134,7 @@ static void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated)
#else // LEO_FWHT_OPT
// Reference implementation
void FWHT(ffe_t* data, const unsigned bits)
static void FWHT(ffe_t* data, const unsigned bits)
{
const unsigned size = (unsigned)(1UL << bits);
for (unsigned width = 1; width < size; width <<= 1)
@ -233,7 +233,7 @@ struct {
static ffe_t Multiply8LUT[256 * 256] = {};
void InitializeMultiplyTables()
static void InitializeMultiplyTables()
{
// If we cannot use the PSHUFB instruction, generate Multiply8LUT:
if (!CpuHasSSSE3)
@ -288,7 +288,7 @@ void InitializeMultiplyTables()
}
void mul_mem(
static void mul_mem(
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes)
{
@ -482,7 +482,7 @@ static void FFTInitialize()
{1-5, 1'-5', 1-1', 5-5'},
*/
void ifft_butterfly(
static void ifft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes)
{
@ -781,7 +781,7 @@ static void IFFT_DIT4(
}
}
void IFFT_DIT(
static void IFFT_DIT(
const uint64_t bytes,
const void* const* data,
const unsigned m_truncated,
@ -815,7 +815,10 @@ void IFFT_DIT(
const ffe_t log_m02 = skewLUT[r + dist * 2];
// For each set of dist elements:
for (unsigned i = r; i < r + dist; ++i)
unsigned i_end = r + dist;
if (i_end >= m_truncated)
i_end = m_truncated;
for (unsigned i = r; i < i_end; ++i)
{
IFFT_DIT4(
bytes,
@ -915,7 +918,7 @@ void IFFT_DIT(
{4-6, 5-7, 4-5, 6-7},
*/
void fft_butterfly(
static void fft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes)
{
@ -1212,7 +1215,8 @@ static void FFT_DIT4(
fft_butterfly(work[dist * 2], work[dist * 3], log_m23, bytes);
}
void FFT_DIT(
static void FFT_DIT(
const uint64_t bytes,
void** work,
const unsigned m_truncated,
@ -1231,7 +1235,10 @@ void FFT_DIT(
const ffe_t log_m02 = skewLUT[r + dist * 2];
// For each set of dist elements:
for (unsigned i = r; i < r + dist; ++i)
unsigned i_end = r + dist;
if (i_end >= m_truncated)
i_end = m_truncated;
for (unsigned i = r; i < i_end; ++i)
{
FFT_DIT4(
bytes,

View File

@ -64,94 +64,28 @@ static const unsigned kPolynomial = 0x11D;
//------------------------------------------------------------------------------
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
// API
// Transform for a variable number of elements
// m_truncated: Number of elements that are non-zero at the front of data
//void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated);
//------------------------------------------------------------------------------
// Multiplies
// x[] = exp(log(y[]) + log_m)
// mul_mem
void mul_mem(
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes);
//------------------------------------------------------------------------------
// FFT Operations
/*
Precondition: log_m != kModulus
x[] ^= exp(log(y[]) + log_m)
y[] ^= x[]
*/
void fft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes);
#ifdef LEO_USE_VECTOR4_OPT
// Unroll 4 rows at a time
void fft_butterfly4(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
ffe_t log_m, uint64_t bytes);
#endif // LEO_USE_VECTOR4_OPT
//------------------------------------------------------------------------------
// IFFT Operations
/*
Precondition: log_m != kModulus
y[] ^= x[]
x[] ^= exp(log(y[]) + log_m)
*/
void ifft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes);
//------------------------------------------------------------------------------
// Reed-Solomon Encode
// Returns false if the self-test fails
bool Initialize();
void ReedSolomonEncode(
uint64_t buffer_bytes,
unsigned original_count,
unsigned recovery_count,
unsigned m, // = NextPow2(recovery_count) * 2 = work_count
unsigned m, // = NextPow2(recovery_count)
const void* const * const data,
void** work); // Size of GetEncodeWorkCount()
//------------------------------------------------------------------------------
// Reed-Solomon Decode
void** work); // m * 2 elements
void ReedSolomonDecode(
uint64_t buffer_bytes,
unsigned original_count,
unsigned recovery_count,
unsigned m, // = NextPow2(recovery_count)
unsigned n, // = NextPow2(m + original_count) = work_count
unsigned n, // = NextPow2(m + original_count)
const void* const * const original, // original_count entries
const void* const * const recovery, // recovery_count entries
void** work); // n entries
//------------------------------------------------------------------------------
// API
// Returns false if the self-test fails
bool Initialize();
void** work); // n elements
}} // namespace leopard::ff8

View File

@ -48,7 +48,7 @@ struct TestParameters
unsigned original_count = 128; // under 65536
unsigned recovery_count = 128; // under 65536 - original_count
#endif
unsigned buffer_bytes = 64; // multiple of 64 bytes
unsigned buffer_bytes = 64000; // multiple of 64 bytes
unsigned loss_count = 32768; // some fraction of original_count
unsigned seed = 2;
bool multithreaded = true;
@ -240,9 +240,15 @@ public:
void EndCall()
{
LEO_DEBUG_ASSERT(t0 != 0);
uint64_t t1 = GetTimeUsec();
++Invokations;
TotalUsec += t1 - t0;
const uint64_t t1 = GetTimeUsec();
const uint64_t delta = t1 - t0;
if (++Invokations == 1)
MaxCallUsec = MinCallUsec = delta;
else if (MaxCallUsec < delta)
MaxCallUsec = delta;
else if (MinCallUsec > delta)
MinCallUsec = delta;
TotalUsec += delta;
t0 = 0;
}
void Reset()
@ -260,6 +266,8 @@ public:
uint64_t t0 = 0;
uint64_t Invokations = 0;
uint64_t TotalUsec = 0;
uint64_t MaxCallUsec = 0;
uint64_t MinCallUsec = 0;
std::string FunctionName;
};
@ -542,10 +550,10 @@ static bool Benchmark(const TestParameters& params)
t_mem_free.Print(kTrials);
#endif
float encode_input_MBPS = total_bytes * kTrials / (float)(t_leo_encode.TotalUsec);
float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count * kTrials / (float)(t_leo_encode.TotalUsec);
float decode_input_MBPS = total_bytes * kTrials / (float)(t_leo_decode.TotalUsec);
float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count * kTrials / (float)(t_leo_decode.TotalUsec);
float encode_input_MBPS = total_bytes / (float)(t_leo_encode.MinCallUsec);
float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count / (float)(t_leo_encode.MinCallUsec);
float decode_input_MBPS = total_bytes / (float)(t_leo_decode.MinCallUsec);
float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count / (float)(t_leo_decode.MinCallUsec);
cout << "Leopard Encoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << encode_input_MBPS << " MB/s, Output=" << encode_output_MBPS << " MB/s" << endl;
cout << "Leopard Decoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << decode_input_MBPS << " MB/s, Output=" << decode_output_MBPS << " MB/s" << endl << endl;