Cleanup and small optimization

This commit is contained in:
Christopher Taylor 2017-06-03 00:24:18 -07:00
parent 7c2be9f17b
commit 968c4f4f6a
5 changed files with 42 additions and 149 deletions

View File

@ -32,7 +32,6 @@
TODO: TODO:
Short-term: Short-term:
+ FF8 decoder needs DIT FFT optimization
+ Port DIT FFT code to FF16 + Port DIT FFT code to FF16
+ Unroll first/final butterflies to avoid extra copies/xors in encoder + Unroll first/final butterflies to avoid extra copies/xors in encoder
+ Multithreading + Multithreading

View File

@ -64,54 +64,10 @@ static const unsigned kPolynomial = 0x1002D;
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus) // API
// Transform for a variable number of bits (up to kOrder) // Returns false if the self-test fails
//void FWHT(ffe_t* data, const unsigned bits); bool Initialize();
// Transform specialized for the finite field order
void FWHT(ffe_t data[kOrder]);
//------------------------------------------------------------------------------
// Multiplies
// x[] = exp(log(y[]) + log_m)
void mul_mem(
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes);
//------------------------------------------------------------------------------
// FFT Operations
/*
Precondition: log_m != kModulus
x[] ^= exp(log(y[]) + log_m)
y[] ^= x[]
*/
void fft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes);
//------------------------------------------------------------------------------
// IFFT Operations
/*
Precondition: log_m != kModulus
y[] ^= x[]
x[] ^= exp(log(y[]) + log_m)
*/
void ifft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes);
//------------------------------------------------------------------------------
// Reed-Solomon Encode
void ReedSolomonEncode( void ReedSolomonEncode(
uint64_t buffer_bytes, uint64_t buffer_bytes,
@ -121,10 +77,6 @@ void ReedSolomonEncode(
const void* const * const data, const void* const * const data,
void** work); // Size of GetEncodeWorkCount() void** work); // Size of GetEncodeWorkCount()
//------------------------------------------------------------------------------
// Reed-Solomon Decode
void ReedSolomonDecode( void ReedSolomonDecode(
uint64_t buffer_bytes, uint64_t buffer_bytes,
unsigned original_count, unsigned original_count,
@ -136,13 +88,6 @@ void ReedSolomonDecode(
void** work); // n entries void** work); // n entries
//------------------------------------------------------------------------------
// API
// Returns false if the self-test fails
bool Initialize();
}} // namespace leopard::ff16 }} // namespace leopard::ff16
#endif // LEO_HAS_FF16 #endif // LEO_HAS_FF16

View File

@ -134,7 +134,7 @@ static void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated)
#else // LEO_FWHT_OPT #else // LEO_FWHT_OPT
// Reference implementation // Reference implementation
void FWHT(ffe_t* data, const unsigned bits) static void FWHT(ffe_t* data, const unsigned bits)
{ {
const unsigned size = (unsigned)(1UL << bits); const unsigned size = (unsigned)(1UL << bits);
for (unsigned width = 1; width < size; width <<= 1) for (unsigned width = 1; width < size; width <<= 1)
@ -233,7 +233,7 @@ struct {
static ffe_t Multiply8LUT[256 * 256] = {}; static ffe_t Multiply8LUT[256 * 256] = {};
void InitializeMultiplyTables() static void InitializeMultiplyTables()
{ {
// If we cannot use the PSHUFB instruction, generate Multiply8LUT: // If we cannot use the PSHUFB instruction, generate Multiply8LUT:
if (!CpuHasSSSE3) if (!CpuHasSSSE3)
@ -288,7 +288,7 @@ void InitializeMultiplyTables()
} }
void mul_mem( static void mul_mem(
void * LEO_RESTRICT x, const void * LEO_RESTRICT y, void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes) ffe_t log_m, uint64_t bytes)
{ {
@ -482,7 +482,7 @@ static void FFTInitialize()
{1-5, 1'-5', 1-1', 5-5'}, {1-5, 1'-5', 1-1', 5-5'},
*/ */
void ifft_butterfly( static void ifft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y, void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes) ffe_t log_m, uint64_t bytes)
{ {
@ -781,7 +781,7 @@ static void IFFT_DIT4(
} }
} }
void IFFT_DIT( static void IFFT_DIT(
const uint64_t bytes, const uint64_t bytes,
const void* const* data, const void* const* data,
const unsigned m_truncated, const unsigned m_truncated,
@ -815,7 +815,10 @@ void IFFT_DIT(
const ffe_t log_m02 = skewLUT[r + dist * 2]; const ffe_t log_m02 = skewLUT[r + dist * 2];
// For each set of dist elements: // For each set of dist elements:
for (unsigned i = r; i < r + dist; ++i) unsigned i_end = r + dist;
if (i_end >= m_truncated)
i_end = m_truncated;
for (unsigned i = r; i < i_end; ++i)
{ {
IFFT_DIT4( IFFT_DIT4(
bytes, bytes,
@ -915,7 +918,7 @@ void IFFT_DIT(
{4-6, 5-7, 4-5, 6-7}, {4-6, 5-7, 4-5, 6-7},
*/ */
void fft_butterfly( static void fft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y, void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes) ffe_t log_m, uint64_t bytes)
{ {
@ -1212,7 +1215,8 @@ static void FFT_DIT4(
fft_butterfly(work[dist * 2], work[dist * 3], log_m23, bytes); fft_butterfly(work[dist * 2], work[dist * 3], log_m23, bytes);
} }
void FFT_DIT(
static void FFT_DIT(
const uint64_t bytes, const uint64_t bytes,
void** work, void** work,
const unsigned m_truncated, const unsigned m_truncated,
@ -1231,7 +1235,10 @@ void FFT_DIT(
const ffe_t log_m02 = skewLUT[r + dist * 2]; const ffe_t log_m02 = skewLUT[r + dist * 2];
// For each set of dist elements: // For each set of dist elements:
for (unsigned i = r; i < r + dist; ++i) unsigned i_end = r + dist;
if (i_end >= m_truncated)
i_end = m_truncated;
for (unsigned i = r; i < i_end; ++i)
{ {
FFT_DIT4( FFT_DIT4(
bytes, bytes,

View File

@ -64,94 +64,28 @@ static const unsigned kPolynomial = 0x11D;
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus) // API
// Transform for a variable number of elements // Returns false if the self-test fails
// m_truncated: Number of elements that are non-zero at the front of data bool Initialize();
//void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated);
//------------------------------------------------------------------------------
// Multiplies
// x[] = exp(log(y[]) + log_m)
// mul_mem
void mul_mem(
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes);
//------------------------------------------------------------------------------
// FFT Operations
/*
Precondition: log_m != kModulus
x[] ^= exp(log(y[]) + log_m)
y[] ^= x[]
*/
void fft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes);
#ifdef LEO_USE_VECTOR4_OPT
// Unroll 4 rows at a time
void fft_butterfly4(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
ffe_t log_m, uint64_t bytes);
#endif // LEO_USE_VECTOR4_OPT
//------------------------------------------------------------------------------
// IFFT Operations
/*
Precondition: log_m != kModulus
y[] ^= x[]
x[] ^= exp(log(y[]) + log_m)
*/
void ifft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t log_m, uint64_t bytes);
//------------------------------------------------------------------------------
// Reed-Solomon Encode
void ReedSolomonEncode( void ReedSolomonEncode(
uint64_t buffer_bytes, uint64_t buffer_bytes,
unsigned original_count, unsigned original_count,
unsigned recovery_count, unsigned recovery_count,
unsigned m, // = NextPow2(recovery_count) * 2 = work_count unsigned m, // = NextPow2(recovery_count)
const void* const * const data, const void* const * const data,
void** work); // Size of GetEncodeWorkCount() void** work); // m * 2 elements
//------------------------------------------------------------------------------
// Reed-Solomon Decode
void ReedSolomonDecode( void ReedSolomonDecode(
uint64_t buffer_bytes, uint64_t buffer_bytes,
unsigned original_count, unsigned original_count,
unsigned recovery_count, unsigned recovery_count,
unsigned m, // = NextPow2(recovery_count) unsigned m, // = NextPow2(recovery_count)
unsigned n, // = NextPow2(m + original_count) = work_count unsigned n, // = NextPow2(m + original_count)
const void* const * const original, // original_count entries const void* const * const original, // original_count entries
const void* const * const recovery, // recovery_count entries const void* const * const recovery, // recovery_count entries
void** work); // n entries void** work); // n elements
//------------------------------------------------------------------------------
// API
// Returns false if the self-test fails
bool Initialize();
}} // namespace leopard::ff8 }} // namespace leopard::ff8

View File

@ -48,7 +48,7 @@ struct TestParameters
unsigned original_count = 128; // under 65536 unsigned original_count = 128; // under 65536
unsigned recovery_count = 128; // under 65536 - original_count unsigned recovery_count = 128; // under 65536 - original_count
#endif #endif
unsigned buffer_bytes = 64; // multiple of 64 bytes unsigned buffer_bytes = 64000; // multiple of 64 bytes
unsigned loss_count = 32768; // some fraction of original_count unsigned loss_count = 32768; // some fraction of original_count
unsigned seed = 2; unsigned seed = 2;
bool multithreaded = true; bool multithreaded = true;
@ -240,9 +240,15 @@ public:
void EndCall() void EndCall()
{ {
LEO_DEBUG_ASSERT(t0 != 0); LEO_DEBUG_ASSERT(t0 != 0);
uint64_t t1 = GetTimeUsec(); const uint64_t t1 = GetTimeUsec();
++Invokations; const uint64_t delta = t1 - t0;
TotalUsec += t1 - t0; if (++Invokations == 1)
MaxCallUsec = MinCallUsec = delta;
else if (MaxCallUsec < delta)
MaxCallUsec = delta;
else if (MinCallUsec > delta)
MinCallUsec = delta;
TotalUsec += delta;
t0 = 0; t0 = 0;
} }
void Reset() void Reset()
@ -260,6 +266,8 @@ public:
uint64_t t0 = 0; uint64_t t0 = 0;
uint64_t Invokations = 0; uint64_t Invokations = 0;
uint64_t TotalUsec = 0; uint64_t TotalUsec = 0;
uint64_t MaxCallUsec = 0;
uint64_t MinCallUsec = 0;
std::string FunctionName; std::string FunctionName;
}; };
@ -542,10 +550,10 @@ static bool Benchmark(const TestParameters& params)
t_mem_free.Print(kTrials); t_mem_free.Print(kTrials);
#endif #endif
float encode_input_MBPS = total_bytes * kTrials / (float)(t_leo_encode.TotalUsec); float encode_input_MBPS = total_bytes / (float)(t_leo_encode.MinCallUsec);
float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count * kTrials / (float)(t_leo_encode.TotalUsec); float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count / (float)(t_leo_encode.MinCallUsec);
float decode_input_MBPS = total_bytes * kTrials / (float)(t_leo_decode.TotalUsec); float decode_input_MBPS = total_bytes / (float)(t_leo_decode.MinCallUsec);
float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count * kTrials / (float)(t_leo_decode.TotalUsec); float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count / (float)(t_leo_decode.MinCallUsec);
cout << "Leopard Encoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << encode_input_MBPS << " MB/s, Output=" << encode_output_MBPS << " MB/s" << endl; cout << "Leopard Encoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << encode_input_MBPS << " MB/s, Output=" << encode_output_MBPS << " MB/s" << endl;
cout << "Leopard Decoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << decode_input_MBPS << " MB/s, Output=" << decode_output_MBPS << " MB/s" << endl << endl; cout << "Leopard Decoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << decode_input_MBPS << " MB/s, Output=" << decode_output_MBPS << " MB/s" << endl << endl;