Add benchmark tests for 4-way ops

This commit is contained in:
Christopher Taylor 2017-05-27 01:15:24 -07:00
parent f3003488da
commit 5b9cab04b6
6 changed files with 294 additions and 44 deletions

View File

@ -151,7 +151,7 @@ void xor_mem(
{
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(vx);
const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(vy);
do
while (bytes >= 128)
{
const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32));
const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1));
@ -161,8 +161,9 @@ void xor_mem(
_mm256_storeu_si256(x32 + 1, x1);
_mm256_storeu_si256(x32 + 2, x2);
_mm256_storeu_si256(x32 + 3, x3);
bytes -= 128, x32 += 4, y32 += 4;
} while (bytes >= 128);
x32 += 4, y32 += 4;
bytes -= 128;
};
if (bytes > 0)
{
const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32));
@ -185,7 +186,8 @@ void xor_mem(
_mm_storeu_si128(x16 + 1, x1);
_mm_storeu_si128(x16 + 2, x2);
_mm_storeu_si128(x16 + 3, x3);
bytes -= 64, x16 += 4, y16 += 4;
x16 += 4, y16 += 4;
bytes -= 64;
} while (bytes > 0);
}
@ -196,8 +198,6 @@ void xor_mem4(
void * LEO_RESTRICT vx_3, const void * LEO_RESTRICT vy_3,
uint64_t bytes)
{
// FIXME: Add args
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
@ -207,51 +207,66 @@ void xor_mem4(
const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1);
LEO_M256 * LEO_RESTRICT x32_2 = reinterpret_cast<LEO_M256 *> (vx_2);
const LEO_M256 * LEO_RESTRICT y32_2 = reinterpret_cast<const LEO_M256 *>(vy_2);
do
LEO_M256 * LEO_RESTRICT x32_3 = reinterpret_cast<LEO_M256 *> (vx_3);
const LEO_M256 * LEO_RESTRICT y32_3 = reinterpret_cast<const LEO_M256 *>(vy_3);
while (bytes >= 128)
{
const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2));
const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3));
_mm256_storeu_si256(x32_0, x0_0);
_mm256_storeu_si256(x32_0 + 1, x1_0);
_mm256_storeu_si256(x32_0 + 2, x2_0);
_mm256_storeu_si256(x32_0 + 3, x3_0);
x32_0 += 4, y32_0 += 4;
const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2));
const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3));
_mm256_storeu_si256(x32_1, x0_1);
_mm256_storeu_si256(x32_1 + 1, x1_1);
_mm256_storeu_si256(x32_1 + 2, x2_1);
_mm256_storeu_si256(x32_1 + 3, x3_1);
x32_1 += 4, y32_1 += 4;
const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2));
const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1));
const LEO_M256 x2_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 2), _mm256_loadu_si256(y32_2 + 2));
const LEO_M256 x3_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 3), _mm256_loadu_si256(y32_2 + 3));
_mm256_storeu_si256(x32_0, x0_0);
_mm256_storeu_si256(x32_0 + 1, x1_0);
_mm256_storeu_si256(x32_0 + 2, x2_0);
_mm256_storeu_si256(x32_0 + 3, x3_0);
_mm256_storeu_si256(x32_1, x0_1);
_mm256_storeu_si256(x32_1 + 1, x1_1);
_mm256_storeu_si256(x32_1 + 2, x2_1);
_mm256_storeu_si256(x32_1 + 3, x3_1);
_mm256_storeu_si256(x32_2, x0_2);
_mm256_storeu_si256(x32_2, x0_2);
_mm256_storeu_si256(x32_2 + 1, x1_2);
_mm256_storeu_si256(x32_2 + 2, x2_2);
_mm256_storeu_si256(x32_2 + 3, x3_2);
x32_0 += 4, y32_0 += 4;
x32_1 += 4, y32_1 += 4;
x32_2 += 4, y32_2 += 4;
const LEO_M256 x0_3 = _mm256_xor_si256(_mm256_loadu_si256(x32_3), _mm256_loadu_si256(y32_3));
const LEO_M256 x1_3 = _mm256_xor_si256(_mm256_loadu_si256(x32_3 + 1), _mm256_loadu_si256(y32_3 + 1));
const LEO_M256 x2_3 = _mm256_xor_si256(_mm256_loadu_si256(x32_3 + 2), _mm256_loadu_si256(y32_3 + 2));
const LEO_M256 x3_3 = _mm256_xor_si256(_mm256_loadu_si256(x32_3 + 3), _mm256_loadu_si256(y32_3 + 3));
_mm256_storeu_si256(x32_3, x0_3);
_mm256_storeu_si256(x32_3 + 1, x1_3);
_mm256_storeu_si256(x32_3 + 2, x2_3);
_mm256_storeu_si256(x32_3 + 3, x3_3);
x32_3 += 4, y32_3 += 4;
bytes -= 128;
} while (bytes >= 128);
}
if (bytes > 0)
{
const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
_mm256_storeu_si256(x32_0, x0_0);
_mm256_storeu_si256(x32_0 + 1, x1_0);
_mm256_storeu_si256(x32_1, x0_1);
_mm256_storeu_si256(x32_1 + 1, x1_1);
const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2));
const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1));
_mm256_storeu_si256(x32_0, x0_0);
_mm256_storeu_si256(x32_0 + 1, x1_0);
_mm256_storeu_si256(x32_1, x0_1);
_mm256_storeu_si256(x32_1 + 1, x1_1);
const LEO_M256 x0_3 = _mm256_xor_si256(_mm256_loadu_si256(x32_3), _mm256_loadu_si256(y32_3));
const LEO_M256 x1_3 = _mm256_xor_si256(_mm256_loadu_si256(x32_3 + 1), _mm256_loadu_si256(y32_3 + 1));
_mm256_storeu_si256(x32_2, x0_2);
_mm256_storeu_si256(x32_2 + 1, x1_2);
_mm256_storeu_si256(x32_3, x0_3);
_mm256_storeu_si256(x32_3 + 1, x1_3);
}
return;
}
@ -262,35 +277,46 @@ void xor_mem4(
const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1);
LEO_M128 * LEO_RESTRICT x16_2 = reinterpret_cast<LEO_M128 *> (vx_2);
const LEO_M128 * LEO_RESTRICT y16_2 = reinterpret_cast<const LEO_M128 *>(vy_2);
LEO_M128 * LEO_RESTRICT x16_3 = reinterpret_cast<LEO_M128 *> (vx_3);
const LEO_M128 * LEO_RESTRICT y16_3 = reinterpret_cast<const LEO_M128 *>(vy_3);
do
{
const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0));
const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1));
const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2));
const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3));
_mm_storeu_si128(x16_0, x0_0);
_mm_storeu_si128(x16_0 + 1, x1_0);
_mm_storeu_si128(x16_0 + 2, x2_0);
_mm_storeu_si128(x16_0 + 3, x3_0);
x16_0 += 4, y16_0 += 4;
const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1));
const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1));
const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2));
const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3));
_mm_storeu_si128(x16_1, x0_1);
_mm_storeu_si128(x16_1 + 1, x1_1);
_mm_storeu_si128(x16_1 + 2, x2_1);
_mm_storeu_si128(x16_1 + 3, x3_1);
x16_1 += 4, y16_1 += 4;
const LEO_M128 x0_2 = _mm_xor_si128(_mm_loadu_si128(x16_2), _mm_loadu_si128(y16_2));
const LEO_M128 x1_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 1), _mm_loadu_si128(y16_2 + 1));
const LEO_M128 x2_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 2), _mm_loadu_si128(y16_2 + 2));
const LEO_M128 x3_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 3), _mm_loadu_si128(y16_2 + 3));
_mm_storeu_si128(x16_0, x0_0);
_mm_storeu_si128(x16_0 + 1, x1_0);
_mm_storeu_si128(x16_0 + 2, x2_0);
_mm_storeu_si128(x16_0 + 3, x3_0);
_mm_storeu_si128(x16_1, x0_1);
_mm_storeu_si128(x16_1 + 1, x1_1);
_mm_storeu_si128(x16_1 + 2, x2_1);
_mm_storeu_si128(x16_1 + 3, x3_1);
_mm_storeu_si128(x16_2, x0_2);
_mm_storeu_si128(x16_2, x0_2);
_mm_storeu_si128(x16_2 + 1, x1_2);
_mm_storeu_si128(x16_2 + 2, x2_2);
_mm_storeu_si128(x16_2 + 3, x3_2);
x16_0 += 4, y16_0 += 4;
x16_1 += 4, y16_1 += 4;
x16_2 += 4, y16_2 += 4;
const LEO_M128 x0_3 = _mm_xor_si128(_mm_loadu_si128(x16_3), _mm_loadu_si128(y16_3));
const LEO_M128 x1_3 = _mm_xor_si128(_mm_loadu_si128(x16_3 + 1), _mm_loadu_si128(y16_3 + 1));
const LEO_M128 x2_3 = _mm_xor_si128(_mm_loadu_si128(x16_3 + 2), _mm_loadu_si128(y16_3 + 2));
const LEO_M128 x3_3 = _mm_xor_si128(_mm_loadu_si128(x16_3 + 3), _mm_loadu_si128(y16_3 + 3));
_mm_storeu_si128(x16_3, x0_3);
_mm_storeu_si128(x16_3 + 1, x1_3);
_mm_storeu_si128(x16_3 + 2, x2_3);
_mm_storeu_si128(x16_3 + 3, x3_3);
x16_3 += 4, y16_3 += 4;
bytes -= 64;
} while (bytes > 0);
}

View File

@ -55,6 +55,9 @@ static const ffe_t kBasis[kBits] = {
0xFDB8, 0xFB34, 0xFF38, 0x991E
};
// Using the Cantor basis here enables us to avoid a lot of extra calculations
// when applying the formal derivative in decoding.
//------------------------------------------------------------------------------
// Field Operations

View File

@ -50,9 +50,11 @@ static const unsigned kPolynomial = 0x11D;
// Basis used for generating logarithm tables
static const ffe_t kBasis[kBits] = {
1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis
// 1, 2, 4, 8, 16, 32, 64, 128 // Monomial basis
};
// Using the Cantor basis here enables us to avoid a lot of extra calculations
// when applying the formal derivative in decoding.
//------------------------------------------------------------------------------
// Field Operations

View File

@ -48,6 +48,8 @@ LEO_EXPORT int leo_init_(int version)
if (version != LEO_VERSION)
return Leopard_InvalidInput;
leopard::InitializeCPUArch();
#ifdef LEO_HAS_FF8
if (!leopard::ff8::Initialize())
return Leopard_Platform;

View File

@ -27,6 +27,8 @@
*/
#include "../LeopardCommon.h"
#include "../LeopardFF8.h"
#include "../LeopardFF16.h"
#include "../leopard.h"
#include <memory>
@ -238,7 +240,7 @@ public:
}
void Print(unsigned trials)
{
cout << FunctionName << " called " << Invokations / (float)trials << " times per trial (avg). " << TotalUsec / (double)Invokations << " usec avg for all invokations. " << TotalUsec / (float)trials << " usec (avg) of " << trials << " trials" << endl;
cout << FunctionName << " called " << Invokations / (float)trials << " times per trial. " << TotalUsec / (double)Invokations << " usec avg. " << TotalUsec / (float)trials << " usec for each of " << trials << " trials" << endl;
}
uint64_t t0 = 0;
@ -526,6 +528,197 @@ static void BasicTest(const TestParameters& params)
}
//------------------------------------------------------------------------------
// Parallel XOR Benchmark
// Demonstrate about 10% performance boost by doing parallel rows for XORs
// Compares single-row leopard::xor_mem against the 4-row leopard::xor_mem4
// over the same total amount of data, printing per-trial timings for each.
// NOTE(review): buffers are deliberately left uninitialized — XOR throughput
// does not depend on their contents.
void ParallelXORBenchmark()
{
    FunctionTimer t_1("xor_mem");
    FunctionTimer t_4("xor_mem4");

    static const unsigned buffer_bytes = 4096;
    static const unsigned buffer_count = 1024;

    uint8_t* buffers_x[buffer_count] = {};
    uint8_t* buffers_y[buffer_count] = {};

    for (unsigned slot = 0; slot < buffer_count; ++slot)
    {
        buffers_x[slot] = SIMDSafeAllocate(buffer_bytes);
        buffers_y[slot] = SIMDSafeAllocate(buffer_bytes);
    }

    static const unsigned iteration_count = 1000;

    // Single-row path: one xor_mem call per buffer pair.
    for (unsigned trial = 0; trial < iteration_count; ++trial)
    {
        t_1.BeginCall();
        for (unsigned slot = 0; slot < buffer_count; ++slot)
            leopard::xor_mem(
                buffers_x[slot], buffers_y[slot],
                buffer_bytes);
        t_1.EndCall();
    }

    // 4-row path: one xor_mem4 call per group of four buffer pairs.
    for (unsigned trial = 0; trial < iteration_count; ++trial)
    {
        t_4.BeginCall();
        for (unsigned slot = 0; slot < buffer_count; slot += 4)
            leopard::xor_mem4(
                buffers_x[slot], buffers_y[slot],
                buffers_x[slot + 1], buffers_y[slot + 1],
                buffers_x[slot + 2], buffers_y[slot + 2],
                buffers_x[slot + 3], buffers_y[slot + 3],
                buffer_bytes);
        t_4.EndCall();
    }

    for (unsigned slot = 0; slot < buffer_count; ++slot)
    {
        SIMDSafeFree(buffers_x[slot]);
        SIMDSafeFree(buffers_y[slot]);
    }

    t_1.Print(iteration_count);
    t_4.Print(iteration_count);
}
//------------------------------------------------------------------------------
// Parallel Butterfly8 Benchmark
#ifdef LEO_HAS_FF8
// Demonstrate performance boost by doing parallel rows for Butterfly8s
void ParallelButterfly8Benchmark()
{
FunctionTimer t_1("8-bit fft_butterfly");
FunctionTimer t_4("8-bit fft_butterfly4");
static const unsigned buffer_bytes = 4096;
static const unsigned buffer_count = 1024;
uint8_t* buffers_x[buffer_count] = {};
uint8_t* buffers_y[buffer_count] = {};
for (unsigned i = 0; i < buffer_count; ++i)
{
buffers_x[i] = SIMDSafeAllocate(buffer_bytes);
buffers_y[i] = SIMDSafeAllocate(buffer_bytes);
}
static const unsigned iteration_count = 1000;
for (unsigned i = 0; i < iteration_count; ++i)
{
leopard::ff8::ffe_t m = (leopard::ff8::ffe_t)(i + 2);
t_1.BeginCall();
for (unsigned j = 0; j < buffer_count; ++j)
leopard::ff8::fft_butterfly(
buffers_x[j], buffers_y[j],
m,
buffer_bytes);
t_1.EndCall();
}
for (unsigned i = 0; i < iteration_count; ++i)
{
leopard::ff8::ffe_t m = (leopard::ff8::ffe_t)(i + 2);
t_4.BeginCall();
for (unsigned j = 0; j < buffer_count; j += 4)
leopard::ff8::fft_butterfly4(
buffers_x[j], buffers_y[j],
buffers_x[j + 1], buffers_y[j + 1],
buffers_x[j + 2], buffers_y[j + 2],
buffers_x[j + 3], buffers_y[j + 3],
m,
buffer_bytes);
t_4.EndCall();
}
for (unsigned i = 0; i < buffer_count; ++i)
{
SIMDSafeFree(buffers_x[i]);
SIMDSafeFree(buffers_y[i]);
}
t_1.Print(iteration_count);
t_4.Print(iteration_count);
}
#endif // LEO_HAS_FF8
//------------------------------------------------------------------------------
// Parallel Butterfly16 Benchmark
#ifdef LEO_HAS_FF16
// Demonstrate performance boost by doing parallel rows for Butterfly16s
void ParallelButterfly16Benchmark()
{
FunctionTimer t_1("16-bit fft_butterfly");
FunctionTimer t_4("16-bit fft_butterfly4");
static const unsigned buffer_bytes = 4096;
static const unsigned buffer_count = 1024;
uint8_t* buffers_x[buffer_count] = {};
uint8_t* buffers_y[buffer_count] = {};
for (unsigned i = 0; i < buffer_count; ++i)
{
buffers_x[i] = SIMDSafeAllocate(buffer_bytes);
buffers_y[i] = SIMDSafeAllocate(buffer_bytes);
}
static const unsigned iteration_count = 100;
for (unsigned i = 0; i < iteration_count; ++i)
{
leopard::ff16::ffe_t m = (leopard::ff16::ffe_t)(i + 2);
t_1.BeginCall();
for (unsigned j = 0; j < buffer_count; ++j)
leopard::ff16::fft_butterfly(
buffers_x[j], buffers_y[j],
m,
buffer_bytes);
t_1.EndCall();
}
for (unsigned i = 0; i < iteration_count; ++i)
{
leopard::ff16::ffe_t m = (leopard::ff16::ffe_t)(i + 2);
t_4.BeginCall();
for (unsigned j = 0; j < buffer_count; j += 4)
leopard::ff16::fft_butterfly4(
buffers_x[j], buffers_y[j],
buffers_x[j + 1], buffers_y[j + 1],
buffers_x[j + 2], buffers_y[j + 2],
buffers_x[j + 3], buffers_y[j + 3],
m,
buffer_bytes);
t_4.EndCall();
}
for (unsigned i = 0; i < buffer_count; ++i)
{
SIMDSafeFree(buffers_x[i]);
SIMDSafeFree(buffers_y[i]);
}
t_1.Print(iteration_count);
t_4.Print(iteration_count);
}
#endif // LEO_HAS_FF16
//------------------------------------------------------------------------------
// Entrypoint
@ -544,6 +737,14 @@ int main(int argc, char **argv)
t_leo_init.EndCall();
t_leo_init.Print(1);
ParallelXORBenchmark();
#ifdef LEO_HAS_FF8
ParallelButterfly8Benchmark();
#endif // LEO_HAS_FF8
#ifdef LEO_HAS_FF16
ParallelButterfly16Benchmark();
#endif // LEO_HAS_FF16
TestParameters params;
if (argc >= 2)

View File

@ -33,6 +33,11 @@
#include <stdlib.h>
//#define LEO_SHORT_FIELD
//#define LEO_EXPERIMENT_EXTRA_XOR
//#define LEO_EXPERIMENT_EXTRA_MULS
#define LEO_EXPERIMENT_CANTOR_BASIS
//------------------------------------------------------------------------------
// Debug
@ -70,24 +75,33 @@
//------------------------------------------------------------------------------
// Field
//#define LEO_SHORT_FIELD
#ifdef LEO_SHORT_FIELD
typedef uint8_t ffe_t;
static const unsigned kGFBits = 8;
static const unsigned kGFPolynomial = 0x11D;
ffe_t kGFBasis[kGFBits] = {
#ifdef LEO_EXPERIMENT_CANTOR_BASIS
1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis
#else
1, 2, 4, 8, 16, 32, 64, 128 // Monomial basis
#endif
};
#else
typedef uint16_t ffe_t;
static const unsigned kGFBits = 16;
static const unsigned kGFPolynomial = 0x1002D;
ffe_t kGFBasis[kGFBits] = {
#ifdef LEO_EXPERIMENT_CANTOR_BASIS
0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis
0xC582, 0xED2E, 0x914C, 0x4012,
0x6C98, 0x10D8, 0x6A72, 0xB900,
0xFDB8, 0xFB34, 0xFF38, 0x991E
#else
1, 2, 4, 8, // Monomial basis
16, 32, 64, 128,
256, 512, 1024, 2048,
4096, 8192, 16384, 32768
#endif
};
#endif
@ -223,7 +237,7 @@ static void formal_derivative(ffe_t* cos, const unsigned size)
}
// Doesn't seem to be needed
#if 0
#ifdef LEO_EXPERIMENT_EXTRA_XOR
/*
Same here - Zeroes on the right are preserved
*/
@ -305,7 +319,9 @@ static void FLT(ffe_t* data, const unsigned size, const unsigned skewIndex, cons
//------------------------------------------------------------------------------
// FFT Initialization
//static ffe_t B[kFieldSize >> 1]; // factors used in formal derivative
#ifdef LEO_EXPERIMENT_EXTRA_MULS
static ffe_t B[kFieldSize >> 1]; // factors used in formal derivative
#endif
static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial
// Initialize skewVec[], B[], log_walsh[]
@ -339,7 +355,7 @@ static void InitFieldOperations()
for (unsigned i = 0; i < kFieldSize; ++i)
skewVec[i] = GFLog[skewVec[i]];
#if 0
#ifdef LEO_EXPERIMENT_EXTRA_MULS
temp[0] = kModulus - temp[0];
for (unsigned i = 1; i < (kGFBits - 1); ++i)
@ -444,7 +460,7 @@ static void decode(ffe_t* codeword, const unsigned m, const unsigned original_co
IFLT(codeword, n, 0);
// Note: This is not needed to recover successfully...
#if 0
#ifdef LEO_EXPERIMENT_EXTRA_MULS
// formal derivative
// Note: Preserves zeroes on the right
for (unsigned i = 0; i < m + original_count; i += 2)
@ -456,7 +472,7 @@ static void decode(ffe_t* codeword, const unsigned m, const unsigned original_co
formal_derivative(codeword, n);
#if 0
#ifdef LEO_EXPERIMENT_EXTRA_MULS
// Note: Preserves zeroes on the right
for (unsigned i = 0; i < m + original_count; i += 2)
{
@ -598,7 +614,7 @@ int main(int argc, char **argv)
{
#ifdef LEO_SHORT_FIELD
const unsigned input_count = 100;
const unsigned recovery_count = 20;
const unsigned recovery_count = 10;
#else // LEO_SHORT_FIELD
const unsigned input_count = 10000;
const unsigned recovery_count = 2000;