mirror of
https://github.com/status-im/leopard.git
synced 2025-02-16 16:07:36 +00:00
Implement DIT FFT and some reference fallbacks
This commit is contained in:
parent
8c35c8d4de
commit
c7f0085948
@ -171,6 +171,9 @@
|
||||
// Unroll inner loops 4 times
|
||||
#define LEO_USE_VECTOR4_OPT
|
||||
|
||||
// Interleave butterfly operations between layer pairs in FFT
|
||||
#define LEO_INTERLEAVE_BUTTERFLY4_OPT
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Debug
|
||||
|
1189
LeopardFF8.cpp
1189
LeopardFF8.cpp
File diff suppressed because it is too large
Load Diff
21
LeopardFF8.h
21
LeopardFF8.h
@ -66,17 +66,16 @@ static const unsigned kPolynomial = 0x11D;
|
||||
//------------------------------------------------------------------------------
|
||||
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
|
||||
|
||||
// Transform for a variable number of bits (up to kOrder)
|
||||
void FWHT(ffe_t* data, const unsigned bits);
|
||||
|
||||
// Transform specialized for the finite field order
|
||||
void FWHT(ffe_t data[kOrder]);
|
||||
// Transform for a variable number of elements
|
||||
// m_truncated: Number of elements that are non-zero at the front of data
|
||||
void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Multiplies
|
||||
|
||||
// x[] = exp(log(y[]) + log_m)
|
||||
// mul_mem
|
||||
void mul_mem(
|
||||
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
@ -121,18 +120,6 @@ void ifft_butterfly(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
|
||||
#ifdef LEO_USE_VECTOR4_OPT
|
||||
|
||||
// Unroll 4 rows at a time
|
||||
void ifft_butterfly4(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
||||
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
|
||||
ffe_t log_m, uint64_t bytes);
|
||||
|
||||
#endif // LEO_USE_VECTOR4_OPT
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Reed-Solomon Encode
|
||||
|
@ -42,8 +42,8 @@ using namespace std;
|
||||
struct TestParameters
|
||||
{
|
||||
#ifdef LEO_HAS_FF16
|
||||
unsigned original_count = 100; // under 65536
|
||||
unsigned recovery_count = 20; // under 65536 - original_count
|
||||
unsigned original_count = 1000; // under 65536
|
||||
unsigned recovery_count = 200; // under 65536 - original_count
|
||||
#else
|
||||
unsigned original_count = 128; // under 65536
|
||||
unsigned recovery_count = 128; // under 65536 - original_count
|
||||
@ -395,11 +395,11 @@ static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Tests
|
||||
// Benchmark
|
||||
|
||||
static bool BasicTest(const TestParameters& params)
|
||||
static bool Benchmark(const TestParameters& params)
|
||||
{
|
||||
const unsigned kTrials = params.original_count > 8000 ? 1 : 100;
|
||||
const unsigned kTrials = params.original_count > 8000 ? 1 : 1;
|
||||
|
||||
std::vector<uint8_t*> original_data(params.original_count);
|
||||
|
||||
@ -554,209 +554,6 @@ static bool BasicTest(const TestParameters& params)
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Parallel XOR Benchmark
|
||||
|
||||
#ifdef LEO_USE_VECTOR4_OPT
|
||||
|
||||
// Demonstrate about 10% performance boost by doing parallel rows for XORs
|
||||
void ParallelXORBenchmark()
|
||||
{
|
||||
FunctionTimer t_1("xor_mem");
|
||||
FunctionTimer t_4("xor_mem4");
|
||||
|
||||
static const unsigned buffer_bytes = 4096;
|
||||
static const unsigned buffer_count = 1024;
|
||||
|
||||
uint8_t* buffers_x[buffer_count] = {};
|
||||
uint8_t* buffers_y[buffer_count] = {};
|
||||
|
||||
for (unsigned i = 0; i < buffer_count; ++i)
|
||||
{
|
||||
buffers_x[i] = SIMDSafeAllocate(buffer_bytes);
|
||||
buffers_y[i] = SIMDSafeAllocate(buffer_bytes);
|
||||
}
|
||||
|
||||
static const unsigned iteration_count = 1000;
|
||||
|
||||
for (unsigned i = 0; i < iteration_count; ++i)
|
||||
{
|
||||
t_1.BeginCall();
|
||||
for (unsigned j = 0; j < buffer_count; ++j)
|
||||
leopard::xor_mem(
|
||||
buffers_x[j], buffers_y[j],
|
||||
buffer_bytes);
|
||||
t_1.EndCall();
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < iteration_count; ++i)
|
||||
{
|
||||
t_4.BeginCall();
|
||||
for (unsigned j = 0; j < buffer_count; j += 4)
|
||||
leopard::xor_mem4(
|
||||
buffers_x[j], buffers_y[j],
|
||||
buffers_x[j + 1], buffers_y[j + 1],
|
||||
buffers_x[j + 2], buffers_y[j + 2],
|
||||
buffers_x[j + 3], buffers_y[j + 3],
|
||||
buffer_bytes);
|
||||
t_4.EndCall();
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < buffer_count; ++i)
|
||||
{
|
||||
SIMDSafeFree(buffers_x[i]);
|
||||
SIMDSafeFree(buffers_y[i]);
|
||||
}
|
||||
|
||||
t_1.Print(iteration_count);
|
||||
t_4.Print(iteration_count);
|
||||
}
|
||||
|
||||
#endif // LEO_USE_VECTOR4_OPT
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Parallel Butterfly8 Benchmark
|
||||
|
||||
#ifdef LEO_HAS_FF8
|
||||
|
||||
#ifdef LEO_USE_VECTOR4_OPT
|
||||
|
||||
// Demonstrate performance boost by doing parallel rows for Butterfly8s
|
||||
void ParallelButterfly8Benchmark()
|
||||
{
|
||||
FunctionTimer t_1("8-bit fft_butterfly");
|
||||
FunctionTimer t_4("8-bit fft_butterfly4");
|
||||
|
||||
static const unsigned buffer_bytes = 4096;
|
||||
static const unsigned buffer_count = 1024;
|
||||
|
||||
uint8_t* buffers_x[buffer_count] = {};
|
||||
uint8_t* buffers_y[buffer_count] = {};
|
||||
|
||||
for (unsigned i = 0; i < buffer_count; ++i)
|
||||
{
|
||||
buffers_x[i] = SIMDSafeAllocate(buffer_bytes);
|
||||
buffers_y[i] = SIMDSafeAllocate(buffer_bytes);
|
||||
}
|
||||
|
||||
static const unsigned iteration_count = 1000;
|
||||
|
||||
for (unsigned i = 0; i < iteration_count; ++i)
|
||||
{
|
||||
leopard::ff8::ffe_t m = (leopard::ff8::ffe_t)(i + 2);
|
||||
|
||||
t_1.BeginCall();
|
||||
for (unsigned j = 0; j < buffer_count; ++j)
|
||||
leopard::ff8::fft_butterfly(
|
||||
buffers_x[j], buffers_y[j],
|
||||
m,
|
||||
buffer_bytes);
|
||||
t_1.EndCall();
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < iteration_count; ++i)
|
||||
{
|
||||
leopard::ff8::ffe_t m = (leopard::ff8::ffe_t)(i + 2);
|
||||
|
||||
t_4.BeginCall();
|
||||
for (unsigned j = 0; j < buffer_count; j += 4)
|
||||
leopard::ff8::fft_butterfly4(
|
||||
buffers_x[j], buffers_y[j],
|
||||
buffers_x[j + 1], buffers_y[j + 1],
|
||||
buffers_x[j + 2], buffers_y[j + 2],
|
||||
buffers_x[j + 3], buffers_y[j + 3],
|
||||
m,
|
||||
buffer_bytes);
|
||||
t_4.EndCall();
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < buffer_count; ++i)
|
||||
{
|
||||
SIMDSafeFree(buffers_x[i]);
|
||||
SIMDSafeFree(buffers_y[i]);
|
||||
}
|
||||
|
||||
t_1.Print(iteration_count);
|
||||
t_4.Print(iteration_count);
|
||||
}
|
||||
|
||||
#endif // LEO_USE_VECTOR4_OPT
|
||||
|
||||
#endif // LEO_HAS_FF8
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Parallel Butterfly16 Benchmark
|
||||
|
||||
#ifdef LEO_HAS_FF16
|
||||
|
||||
#ifdef LEO_USE_VECTOR4_OPT
|
||||
|
||||
// Demonstrate performance boost by doing parallel rows for Butterfly16s
|
||||
void ParallelButterfly16Benchmark()
|
||||
{
|
||||
FunctionTimer t_1("16-bit fft_butterfly");
|
||||
FunctionTimer t_4("16-bit fft_butterfly4");
|
||||
|
||||
static const unsigned buffer_bytes = 4096;
|
||||
static const unsigned buffer_count = 1024;
|
||||
|
||||
uint8_t* buffers_x[buffer_count] = {};
|
||||
uint8_t* buffers_y[buffer_count] = {};
|
||||
|
||||
for (unsigned i = 0; i < buffer_count; ++i)
|
||||
{
|
||||
buffers_x[i] = SIMDSafeAllocate(buffer_bytes);
|
||||
buffers_y[i] = SIMDSafeAllocate(buffer_bytes);
|
||||
}
|
||||
|
||||
static const unsigned iteration_count = 100;
|
||||
|
||||
for (unsigned i = 0; i < iteration_count; ++i)
|
||||
{
|
||||
leopard::ff16::ffe_t m = (leopard::ff16::ffe_t)(i + 2);
|
||||
|
||||
t_1.BeginCall();
|
||||
for (unsigned j = 0; j < buffer_count; ++j)
|
||||
leopard::ff16::fft_butterfly(
|
||||
buffers_x[j], buffers_y[j],
|
||||
m,
|
||||
buffer_bytes);
|
||||
t_1.EndCall();
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < iteration_count; ++i)
|
||||
{
|
||||
leopard::ff16::ffe_t m = (leopard::ff16::ffe_t)(i + 2);
|
||||
|
||||
t_4.BeginCall();
|
||||
for (unsigned j = 0; j < buffer_count; j += 4)
|
||||
leopard::ff16::fft_butterfly4(
|
||||
buffers_x[j], buffers_y[j],
|
||||
buffers_x[j + 1], buffers_y[j + 1],
|
||||
buffers_x[j + 2], buffers_y[j + 2],
|
||||
buffers_x[j + 3], buffers_y[j + 3],
|
||||
m,
|
||||
buffer_bytes);
|
||||
t_4.EndCall();
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < buffer_count; ++i)
|
||||
{
|
||||
SIMDSafeFree(buffers_x[i]);
|
||||
SIMDSafeFree(buffers_y[i]);
|
||||
}
|
||||
|
||||
t_1.Print(iteration_count);
|
||||
t_4.Print(iteration_count);
|
||||
}
|
||||
|
||||
#endif // LEO_USE_VECTOR4_OPT
|
||||
|
||||
#endif // LEO_HAS_FF16
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Entrypoint
|
||||
|
||||
@ -775,16 +572,6 @@ int main(int argc, char **argv)
|
||||
t_leo_init.EndCall();
|
||||
t_leo_init.Print(1);
|
||||
|
||||
#if 0
|
||||
ParallelXORBenchmark();
|
||||
#ifdef LEO_HAS_FF8
|
||||
ParallelButterfly8Benchmark();
|
||||
#endif // LEO_HAS_FF8
|
||||
#ifdef LEO_HAS_FF16
|
||||
ParallelButterfly16Benchmark();
|
||||
#endif // LEO_HAS_FF16
|
||||
#endif
|
||||
|
||||
TestParameters params;
|
||||
PCGRandom prng;
|
||||
|
||||
@ -804,11 +591,11 @@ int main(int argc, char **argv)
|
||||
|
||||
cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;
|
||||
|
||||
if (!BasicTest(params))
|
||||
if (!Benchmark(params))
|
||||
goto Failed;
|
||||
|
||||
#if 0
|
||||
static const unsigned kMaxRandomData = 128;
|
||||
#if 1
|
||||
static const unsigned kMaxRandomData = 32768;
|
||||
|
||||
prng.Seed(params.seed, 8);
|
||||
for (;; ++params.seed)
|
||||
@ -819,7 +606,7 @@ int main(int argc, char **argv)
|
||||
|
||||
cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;
|
||||
|
||||
if (!BasicTest(params))
|
||||
if (!Benchmark(params))
|
||||
goto Failed;
|
||||
}
|
||||
#endif
|
||||
@ -835,7 +622,7 @@ int main(int argc, char **argv)
|
||||
|
||||
cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;
|
||||
|
||||
if (!BasicTest(params))
|
||||
if (!Benchmark(params))
|
||||
goto Failed;
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user