mirror of
https://github.com/status-im/leopard.git
synced 2025-02-19 17:34:19 +00:00
Implement DIT FFT for Encoder
This commit is contained in:
parent
9090619d73
commit
ac68c62d28
@ -34,7 +34,6 @@
|
||||
+ Look into 12-bit fields as a performance optimization
|
||||
+ Unroll first/final butterflies to avoid extra copies/xors in encoder
|
||||
+ Skip a lot of the initial FWHT() layers that are just operating on zeroes
|
||||
+ Skip a lot of the final FWHT() layers that are not needed for calculation
|
||||
+ For the actual FFT(), I should be unrolling the bottom two layers
|
||||
and performing them in a specialized function that does 2 <=> 2 and
|
||||
then 1<=>1, 1<=>1 operations in local registers/cache
|
||||
@ -160,7 +159,7 @@
|
||||
// Define this to enable the optimized version of FWHT()
|
||||
#define LEO_FWHT_OPT
|
||||
|
||||
// Avoid scheduling reduced FFT operations that are unneeded
|
||||
// Avoid scheduling FFT operations that are unused
|
||||
#define LEO_SCHEDULE_OPT
|
||||
|
||||
// Avoid calculating final FFT values in decoder using bitfield
|
||||
|
@ -364,7 +364,7 @@ static void InitializeLogarithmTables()
|
||||
/*
|
||||
The multiplication algorithm used follows the approach outlined in {4}.
|
||||
Specifically section 7 outlines the algorithm used here for 16-bit fields.
|
||||
I use the ALTMAP memory layout since I do not need to convert in/out of it.
|
||||
The ALTMAP memory layout is used since there is no need to convert in/out.
|
||||
*/
|
||||
|
||||
struct {
|
||||
@ -903,6 +903,7 @@ static ffe_t FFTSkew[kModulus];
|
||||
// Factors used in the evaluation of the error locator polynomial
|
||||
static ffe_t LogWalsh[kOrder];
|
||||
|
||||
|
||||
static void FFTInitialize()
|
||||
{
|
||||
ffe_t temp[kBits - 1];
|
||||
@ -1328,12 +1329,12 @@ void ReedSolomonDecode(
|
||||
|
||||
// Evaluate error locator polynomial
|
||||
|
||||
FWHT(ErrorLocations, kBits);
|
||||
FWHT(ErrorLocations);
|
||||
|
||||
for (unsigned i = 0; i < kOrder; ++i)
|
||||
ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kModulus;
|
||||
|
||||
FWHT(ErrorLocations, kBits);
|
||||
FWHT(ErrorLocations);
|
||||
|
||||
// work <- recovery data
|
||||
|
||||
|
635
LeopardFF8.cpp
635
LeopardFF8.cpp
@ -121,38 +121,6 @@ static LEO_FORCE_INLINE void FWHT_4(ffe_t* data, unsigned s)
|
||||
data[y] = t3;
|
||||
}
|
||||
|
||||
static void FWHT_8(ffe_t* data)
|
||||
{
|
||||
ffe_t t0 = data[0];
|
||||
ffe_t t1 = data[1];
|
||||
ffe_t t2 = data[2];
|
||||
ffe_t t3 = data[3];
|
||||
ffe_t t4 = data[4];
|
||||
ffe_t t5 = data[5];
|
||||
ffe_t t6 = data[6];
|
||||
ffe_t t7 = data[7];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t4, t5);
|
||||
FWHT_2(t6, t7);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
FWHT_2(t4, t6);
|
||||
FWHT_2(t5, t7);
|
||||
FWHT_2(t0, t4);
|
||||
FWHT_2(t1, t5);
|
||||
FWHT_2(t2, t6);
|
||||
FWHT_2(t3, t7);
|
||||
data[0] = t0;
|
||||
data[1] = t1;
|
||||
data[2] = t2;
|
||||
data[3] = t3;
|
||||
data[4] = t4;
|
||||
data[5] = t5;
|
||||
data[6] = t6;
|
||||
data[7] = t7;
|
||||
}
|
||||
|
||||
// Decimation in time (DIT) version
|
||||
static void FWHT(ffe_t* data, const unsigned bits)
|
||||
{
|
||||
@ -174,16 +142,8 @@ static void FWHT(ffe_t* data, const unsigned bits)
|
||||
FWHT_4(data + j + r, m4);
|
||||
}
|
||||
|
||||
if (bits & 1)
|
||||
{
|
||||
for (unsigned i0 = 0; i0 < n; i0 += 8)
|
||||
FWHT_8(data + i0);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (unsigned i0 = 0; i0 < n; i0 += 4)
|
||||
FWHT_4(data + i0);
|
||||
}
|
||||
for (unsigned i0 = 0; i0 < n; i0 += 4)
|
||||
FWHT_4(data + i0);
|
||||
}
|
||||
|
||||
#else // LEO_FWHT_OPT
|
||||
@ -790,230 +750,393 @@ static void FFTInitialize()
|
||||
FWHT(LogWalsh, kBits);
|
||||
}
|
||||
|
||||
void VectorFFTButterfly(
|
||||
const uint64_t bytes,
|
||||
unsigned count,
|
||||
void** x,
|
||||
void** y,
|
||||
const ffe_t log_m)
|
||||
{
|
||||
if (log_m == kModulus)
|
||||
{
|
||||
VectorXOR(bytes, count, y, x);
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef LEO_USE_VECTOR4_OPT
|
||||
while (count >= 4)
|
||||
{
|
||||
fft_butterfly4(
|
||||
x[0], y[0],
|
||||
x[1], y[1],
|
||||
x[2], y[2],
|
||||
x[3], y[3],
|
||||
log_m, bytes);
|
||||
x += 4, y += 4;
|
||||
count -= 4;
|
||||
}
|
||||
#endif // LEO_USE_VECTOR4_OPT
|
||||
|
||||
for (unsigned i = 0; i < count; ++i)
|
||||
fft_butterfly(x[i], y[i], log_m, bytes);
|
||||
}
|
||||
|
||||
void VectorIFFTButterfly(
|
||||
const uint64_t bytes,
|
||||
unsigned count,
|
||||
void** x,
|
||||
void** y,
|
||||
const ffe_t log_m)
|
||||
{
|
||||
if (log_m == kModulus)
|
||||
{
|
||||
VectorXOR(bytes, count, y, x);
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef LEO_USE_VECTOR4_OPT
|
||||
while (count >= 4)
|
||||
{
|
||||
ifft_butterfly4(
|
||||
x[0], y[0],
|
||||
x[1], y[1],
|
||||
x[2], y[2],
|
||||
x[3], y[3],
|
||||
log_m, bytes);
|
||||
x += 4, y += 4;
|
||||
count -= 4;
|
||||
}
|
||||
#endif // LEO_USE_VECTOR4_OPT
|
||||
|
||||
for (unsigned i = 0; i < count; ++i)
|
||||
ifft_butterfly(x[i], y[i], log_m, bytes);
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Reed-Solomon Encode
|
||||
|
||||
/*
|
||||
Decimation in time IFFT:
|
||||
|
||||
The decimation in time IFFT algorithm allows us to unroll 2 layers at a time,
|
||||
performing calculations on local registers and faster cache memory.
|
||||
|
||||
Each ^___^ below indicates a butterfly between the associated indices.
|
||||
|
||||
The ifft_butterfly(x, y) operation:
|
||||
|
||||
if (log_m != kModulus)
|
||||
x[] ^= exp(log(y[]) + log_m)
|
||||
y[] ^= x[]
|
||||
|
||||
Layer 0:
|
||||
0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
|
||||
^_^ ^_^ ^_^ ^_^ ^_^ ^_^ ^_^ ^_^
|
||||
|
||||
Layer 1:
|
||||
0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
|
||||
^___^ ^___^ ^___^ ^___^
|
||||
^___^ ^___^ ^___^ ^___^
|
||||
|
||||
Layer 2:
|
||||
0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
|
||||
^_______^ ^_______^
|
||||
^_______^ ^_______^
|
||||
^_______^ ^_______^
|
||||
^_______^ ^_______^
|
||||
|
||||
Layer 3:
|
||||
0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
|
||||
^_______________^
|
||||
^_______________^
|
||||
^_______________^
|
||||
^_______________^
|
||||
^_______________^
|
||||
^_______________^
|
||||
^_______________^
|
||||
^_______________^
|
||||
|
||||
DIT layer 0-1 operations, grouped 4 at a time:
|
||||
{0-1, 2-3, 0-2, 1-3},
|
||||
{4-5, 6-7, 4-6, 5-7},
|
||||
|
||||
DIT layer 1-2 operations, grouped 4 at a time:
|
||||
{0-2, 4-6, 0-4, 2-6},
|
||||
{1-3, 5-7, 1-5, 3-7},
|
||||
|
||||
DIT layer 2-3 operations, grouped 4 at a time:
|
||||
{0-4, 0'-4', 0-0', 4-4'},
|
||||
{1-5, 1'-5', 1-1', 5-5'},
|
||||
*/
|
||||
|
||||
// 4-way butterfly
|
||||
static void IFFT_DIT4(
|
||||
const uint64_t bytes,
|
||||
void** work,
|
||||
unsigned dist,
|
||||
const ffe_t log_m01,
|
||||
const ffe_t log_m23,
|
||||
const ffe_t log_m02)
|
||||
{
|
||||
// FIXME: Interleave these
|
||||
|
||||
// First layer:
|
||||
if (log_m01 == kModulus)
|
||||
xor_mem(work[dist], work[0], bytes);
|
||||
else
|
||||
ifft_butterfly(work[0], work[dist], log_m01, bytes);
|
||||
|
||||
if (log_m23 == kModulus)
|
||||
xor_mem(work[dist * 3], work[dist * 2], bytes);
|
||||
else
|
||||
ifft_butterfly(work[dist * 2], work[dist * 3], log_m23, bytes);
|
||||
|
||||
// Second layer:
|
||||
if (log_m02 == kModulus)
|
||||
{
|
||||
xor_mem(work[dist * 2], work[0], bytes);
|
||||
xor_mem(work[dist * 3], work[dist], bytes);
|
||||
}
|
||||
else
|
||||
{
|
||||
ifft_butterfly(work[0], work[dist * 2], log_m02, bytes);
|
||||
ifft_butterfly(work[dist], work[dist * 3], log_m02, bytes);
|
||||
}
|
||||
}
|
||||
|
||||
void IFFT_DIT(
|
||||
const uint64_t bytes,
|
||||
void* const* data,
|
||||
const unsigned m_truncated,
|
||||
void** work,
|
||||
void** xor_result,
|
||||
const unsigned m,
|
||||
const ffe_t* skewLUT)
|
||||
{
|
||||
// FIXME: Roll into first layer
|
||||
if (data)
|
||||
{
|
||||
for (unsigned i = 0; i < m_truncated; ++i)
|
||||
memcpy(work[i], data[i], bytes);
|
||||
for (unsigned i = m_truncated; i < m; ++i)
|
||||
memset(work[i], 0, bytes);
|
||||
}
|
||||
|
||||
// Decimation in time: Unroll 2 layers at a time
|
||||
unsigned dist = 1, dist4 = 4;
|
||||
for (; dist4 <= m; dist = dist4, dist4 <<= 2)
|
||||
{
|
||||
// FIXME: Walk this in reverse order every other pair of layers for better cache locality
|
||||
// FIXME: m_truncated
|
||||
|
||||
// For each set of dist*4 elements:
|
||||
for (unsigned r = 0; r < m_truncated; r += dist4)
|
||||
{
|
||||
const ffe_t log_m01 = skewLUT[r + dist];
|
||||
const ffe_t log_m23 = skewLUT[r + dist * 3];
|
||||
const ffe_t log_m02 = skewLUT[r + dist * 2];
|
||||
|
||||
// For each set of dist elements:
|
||||
for (unsigned i = r; i < r + dist; ++i)
|
||||
{
|
||||
IFFT_DIT4(
|
||||
bytes,
|
||||
work + i,
|
||||
dist,
|
||||
log_m01,
|
||||
log_m23,
|
||||
log_m02);
|
||||
}
|
||||
}
|
||||
|
||||
data = nullptr;
|
||||
}
|
||||
|
||||
// If there is one layer left:
|
||||
if (dist < m)
|
||||
{
|
||||
const ffe_t log_m = skewLUT[dist];
|
||||
|
||||
if (log_m == kModulus)
|
||||
{
|
||||
for (unsigned i = 0; i < dist; ++i)
|
||||
VectorXOR(bytes, dist, work + dist, work);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (unsigned i = 0; i < dist; ++i)
|
||||
{
|
||||
ifft_butterfly(
|
||||
work[i],
|
||||
work[i + dist],
|
||||
log_m,
|
||||
bytes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: Roll into last layer
|
||||
if (xor_result)
|
||||
for (unsigned i = 0; i < m; ++i)
|
||||
xor_mem(xor_result[i], work[i], bytes);
|
||||
}
|
||||
|
||||
/*
|
||||
Decimation in time FFT:
|
||||
|
||||
The decimation in time FFT algorithm allows us to unroll 2 layers at a time,
|
||||
performing calculations on local registers and faster cache memory.
|
||||
|
||||
Each ^___^ below indicates a butterfly between the associated indices.
|
||||
|
||||
The fft_butterfly(x, y) operation:
|
||||
|
||||
y[] ^= x[]
|
||||
if (log_m != kModulus)
|
||||
x[] ^= exp(log(y[]) + log_m)
|
||||
|
||||
Layer 0:
|
||||
0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
|
||||
^_______________^
|
||||
^_______________^
|
||||
^_______________^
|
||||
^_______________^
|
||||
^_______________^
|
||||
^_______________^
|
||||
^_______________^
|
||||
^_______________^
|
||||
|
||||
Layer 1:
|
||||
0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
|
||||
^_______^ ^_______^
|
||||
^_______^ ^_______^
|
||||
^_______^ ^_______^
|
||||
^_______^ ^_______^
|
||||
|
||||
Layer 2:
|
||||
0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
|
||||
^___^ ^___^ ^___^ ^___^
|
||||
^___^ ^___^ ^___^ ^___^
|
||||
|
||||
Layer 3:
|
||||
0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
|
||||
^_^ ^_^ ^_^ ^_^ ^_^ ^_^ ^_^ ^_^
|
||||
|
||||
DIT layer 0-1 operations, grouped 4 at a time:
|
||||
{0-0', 4-4', 0-4, 0'-4'},
|
||||
{1-1', 5-5', 1-5, 1'-5'},
|
||||
|
||||
DIT layer 1-2 operations, grouped 4 at a time:
|
||||
{0-4, 2-6, 0-2, 4-6},
|
||||
{1-5, 3-7, 1-3, 5-7},
|
||||
|
||||
DIT layer 2-3 operations, grouped 4 at a time:
|
||||
{0-2, 1-3, 0-1, 2-3},
|
||||
{4-6, 5-7, 4-5, 6-7},
|
||||
*/
|
||||
|
||||
static void FFT_DIT4(
|
||||
const uint64_t bytes,
|
||||
void** work,
|
||||
const unsigned dist,
|
||||
const ffe_t log_m01,
|
||||
const ffe_t log_m23,
|
||||
const ffe_t log_m02)
|
||||
{
|
||||
// FIXME: Interleave
|
||||
|
||||
// First layer:
|
||||
if (log_m02 == kModulus)
|
||||
{
|
||||
xor_mem(work[dist * 2], work[0], bytes);
|
||||
xor_mem(work[dist * 3], work[dist], bytes);
|
||||
}
|
||||
else
|
||||
{
|
||||
fft_butterfly(work[0], work[dist * 2], log_m02, bytes);
|
||||
fft_butterfly(work[dist], work[dist * 3], log_m02, bytes);
|
||||
}
|
||||
|
||||
// Second layer:
|
||||
if (log_m01 == kModulus)
|
||||
xor_mem(work[dist], work[0], bytes);
|
||||
else
|
||||
fft_butterfly(work[0], work[dist], log_m01, bytes);
|
||||
|
||||
if (log_m23 == kModulus)
|
||||
xor_mem(work[dist * 3], work[dist * 2], bytes);
|
||||
else
|
||||
fft_butterfly(work[dist * 2], work[dist * 3], log_m23, bytes);
|
||||
}
|
||||
|
||||
void FFT_DIT(
|
||||
const uint64_t bytes,
|
||||
void** work,
|
||||
const unsigned m_truncated,
|
||||
const unsigned m,
|
||||
const ffe_t* skewLUT)
|
||||
{
|
||||
// Decimation in time: Unroll 2 layers at a time
|
||||
unsigned dist4 = m, dist = m >> 2;
|
||||
for (; dist != 0; dist4 = dist, dist >>= 2)
|
||||
{
|
||||
// FIXME: Walk this in reverse order every other pair of layers for better cache locality
|
||||
// FIXME: m_truncated
|
||||
|
||||
// For each set of dist*4 elements:
|
||||
for (unsigned r = 0; r < m_truncated; r += dist4)
|
||||
{
|
||||
const ffe_t log_m01 = skewLUT[r + dist];
|
||||
const ffe_t log_m23 = skewLUT[r + dist * 3];
|
||||
const ffe_t log_m02 = skewLUT[r + dist * 2];
|
||||
|
||||
// For each set of dist elements:
|
||||
for (unsigned i = r; i < r + dist; ++i)
|
||||
{
|
||||
FFT_DIT4(
|
||||
bytes,
|
||||
work + i,
|
||||
dist,
|
||||
log_m01,
|
||||
log_m23,
|
||||
log_m02);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If there is one layer left:
|
||||
if (dist4 == 2)
|
||||
{
|
||||
for (unsigned r = 0; r < m_truncated; r += 2)
|
||||
{
|
||||
const ffe_t log_m = skewLUT[r + 1];
|
||||
|
||||
if (log_m == kModulus)
|
||||
xor_mem(work[r + 1], work[r], bytes);
|
||||
else
|
||||
{
|
||||
fft_butterfly(
|
||||
work[r],
|
||||
work[r + 1],
|
||||
log_m,
|
||||
bytes);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ReedSolomonEncode(
|
||||
uint64_t buffer_bytes,
|
||||
unsigned original_count,
|
||||
unsigned recovery_count,
|
||||
unsigned m,
|
||||
void* const * data,
|
||||
void* const* data,
|
||||
void** work)
|
||||
{
|
||||
// work <- data
|
||||
|
||||
// TBD: Unroll first loop to eliminate this
|
||||
unsigned first_end = m;
|
||||
if (original_count < m)
|
||||
{
|
||||
first_end = original_count;
|
||||
for (unsigned i = original_count; i < m; ++i)
|
||||
memset(work[i], 0, buffer_bytes);
|
||||
}
|
||||
for (unsigned i = 0; i < first_end; ++i)
|
||||
memcpy(work[i], data[i], buffer_bytes);
|
||||
|
||||
// work <- IFFT(data, m, m)
|
||||
|
||||
for (unsigned width = 1; width < m; width <<= 1)
|
||||
{
|
||||
const unsigned range = width << 1;
|
||||
const ffe_t* skewLUT = FFTSkew + width + m - 1;
|
||||
const ffe_t* skewLUT = FFTSkew + m - 1;
|
||||
|
||||
#ifdef LEO_SCHEDULE_OPT
|
||||
for (unsigned j = 0; j < first_end; j += range)
|
||||
#else
|
||||
for (unsigned j = 0; j < m; j += range)
|
||||
#endif
|
||||
{
|
||||
VectorIFFTButterfly(
|
||||
buffer_bytes,
|
||||
width,
|
||||
work + j,
|
||||
work + j + width,
|
||||
skewLUT[j]);
|
||||
}
|
||||
}
|
||||
IFFT_DIT(
|
||||
buffer_bytes,
|
||||
data,
|
||||
original_count < m ? original_count : m,
|
||||
work,
|
||||
nullptr, // No xor output
|
||||
m,
|
||||
skewLUT);
|
||||
|
||||
if (m >= original_count)
|
||||
goto skip_body;
|
||||
|
||||
// For sets of m data pieces:
|
||||
for (unsigned i = m; i + m <= original_count; i += m)
|
||||
{
|
||||
// temp <- data + i
|
||||
|
||||
data += m;
|
||||
void** temp = work + m;
|
||||
skewLUT += m;
|
||||
|
||||
// TBD: Unroll first loop to eliminate this
|
||||
for (unsigned j = 0; j < m; ++j)
|
||||
memcpy(temp[j], data[j], buffer_bytes);
|
||||
// work <- work xor IFFT(data + i, m, m + i)
|
||||
|
||||
// temp <- IFFT(temp, m, m + i)
|
||||
|
||||
const ffe_t* skewLUT = FFTSkew + m + i - 1;
|
||||
|
||||
for (unsigned width = 1; width < m; width <<= 1)
|
||||
{
|
||||
const unsigned range = width << 1;
|
||||
|
||||
for (unsigned j = width; j < m; j += range)
|
||||
{
|
||||
VectorIFFTButterfly(
|
||||
buffer_bytes,
|
||||
width,
|
||||
temp + j - width,
|
||||
temp + j,
|
||||
skewLUT[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// work <- work XOR temp
|
||||
|
||||
// TBD: Unroll last loop to eliminate this
|
||||
VectorXOR(
|
||||
IFFT_DIT(
|
||||
buffer_bytes,
|
||||
data, // data source
|
||||
m,
|
||||
work,
|
||||
temp);
|
||||
work + m, // temporary workspace
|
||||
work, // xor destination
|
||||
m,
|
||||
skewLUT);
|
||||
}
|
||||
|
||||
// Handle final partial set of m pieces:
|
||||
const unsigned last_count = original_count % m;
|
||||
if (last_count != 0)
|
||||
{
|
||||
const unsigned i = original_count - last_count;
|
||||
|
||||
// temp <- data + i
|
||||
|
||||
data += m;
|
||||
void** temp = work + m;
|
||||
skewLUT += m;
|
||||
|
||||
for (unsigned j = 0; j < last_count; ++j)
|
||||
memcpy(temp[j], data[j], buffer_bytes);
|
||||
for (unsigned j = last_count; j < m; ++j)
|
||||
memset(temp[j], 0, buffer_bytes);
|
||||
// work <- work xor IFFT(data + i, m, m + i)
|
||||
|
||||
// temp <- IFFT(temp, m, m + i)
|
||||
|
||||
for (unsigned width = 1, shift = 1; width < m; width <<= 1, ++shift)
|
||||
{
|
||||
const unsigned range = width << 1;
|
||||
const ffe_t* skewLUT = FFTSkew + width + m + i - 1;
|
||||
|
||||
#ifdef LEO_SCHEDULE_OPT
|
||||
// Calculate stop considering that the right is all zeroes
|
||||
const unsigned stop = ((last_count + range - 1) >> shift) << shift;
|
||||
for (unsigned j = 0; j < stop; j += range)
|
||||
#else
|
||||
for (unsigned j = 0; j < m; j += range)
|
||||
#endif
|
||||
{
|
||||
VectorIFFTButterfly(
|
||||
buffer_bytes,
|
||||
width,
|
||||
temp + j,
|
||||
temp + j + width,
|
||||
skewLUT[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// work <- work XOR temp
|
||||
|
||||
// TBD: Unroll last loop to eliminate this
|
||||
VectorXOR(
|
||||
IFFT_DIT(
|
||||
buffer_bytes,
|
||||
data, // data source
|
||||
last_count,
|
||||
work + m, // temporary workspace
|
||||
work, // xor destination
|
||||
m,
|
||||
work,
|
||||
temp);
|
||||
skewLUT);
|
||||
}
|
||||
|
||||
skip_body:
|
||||
|
||||
// work <- FFT(work, m, 0)
|
||||
|
||||
for (unsigned width = (m >> 1); width > 0; width >>= 1)
|
||||
{
|
||||
const ffe_t* skewLUT = FFTSkew + width - 1;
|
||||
const unsigned range = width << 1;
|
||||
|
||||
#ifdef LEO_SCHEDULE_OPT
|
||||
for (unsigned j = 0; j < recovery_count; j += range)
|
||||
#else
|
||||
for (unsigned j = 0; j < m; j += range)
|
||||
#endif
|
||||
{
|
||||
VectorFFTButterfly(
|
||||
buffer_bytes,
|
||||
width,
|
||||
work + j,
|
||||
work + j + width,
|
||||
skewLUT[j]);
|
||||
}
|
||||
}
|
||||
FFT_DIT(
|
||||
buffer_bytes,
|
||||
work,
|
||||
recovery_count,
|
||||
m,
|
||||
FFTSkew - 1);
|
||||
}
|
||||
|
||||
|
||||
@ -1088,6 +1211,68 @@ void ErrorBitfield::Prepare()
|
||||
//------------------------------------------------------------------------------
|
||||
// Reed-Solomon Decode
|
||||
|
||||
void VectorFFTButterfly(
|
||||
const uint64_t bytes,
|
||||
unsigned count,
|
||||
void** x,
|
||||
void** y,
|
||||
const ffe_t log_m)
|
||||
{
|
||||
if (log_m == kModulus)
|
||||
{
|
||||
VectorXOR(bytes, count, y, x);
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef LEO_USE_VECTOR4_OPT
|
||||
while (count >= 4)
|
||||
{
|
||||
fft_butterfly4(
|
||||
x[0], y[0],
|
||||
x[1], y[1],
|
||||
x[2], y[2],
|
||||
x[3], y[3],
|
||||
log_m, bytes);
|
||||
x += 4, y += 4;
|
||||
count -= 4;
|
||||
}
|
||||
#endif // LEO_USE_VECTOR4_OPT
|
||||
|
||||
for (unsigned i = 0; i < count; ++i)
|
||||
fft_butterfly(x[i], y[i], log_m, bytes);
|
||||
}
|
||||
|
||||
void VectorIFFTButterfly(
|
||||
const uint64_t bytes,
|
||||
unsigned count,
|
||||
void** x,
|
||||
void** y,
|
||||
const ffe_t log_m)
|
||||
{
|
||||
if (log_m == kModulus)
|
||||
{
|
||||
VectorXOR(bytes, count, y, x);
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef LEO_USE_VECTOR4_OPT
|
||||
while (count >= 4)
|
||||
{
|
||||
ifft_butterfly4(
|
||||
x[0], y[0],
|
||||
x[1], y[1],
|
||||
x[2], y[2],
|
||||
x[3], y[3],
|
||||
log_m, bytes);
|
||||
x += 4, y += 4;
|
||||
count -= 4;
|
||||
}
|
||||
#endif // LEO_USE_VECTOR4_OPT
|
||||
|
||||
for (unsigned i = 0; i < count; ++i)
|
||||
ifft_butterfly(x[i], y[i], log_m, bytes);
|
||||
}
|
||||
|
||||
void ReedSolomonDecode(
|
||||
uint64_t buffer_bytes,
|
||||
unsigned original_count,
|
||||
@ -1127,12 +1312,12 @@ void ReedSolomonDecode(
|
||||
|
||||
// Evaluate error locator polynomial
|
||||
|
||||
FWHT(ErrorLocations, kBits);
|
||||
FWHT(ErrorLocations);
|
||||
|
||||
for (unsigned i = 0; i < kOrder; ++i)
|
||||
ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kModulus;
|
||||
|
||||
FWHT(ErrorLocations, kBits);
|
||||
FWHT(ErrorLocations);
|
||||
|
||||
// work <- recovery data
|
||||
|
||||
|
28
LeopardFF8.h
28
LeopardFF8.h
@ -134,34 +134,6 @@ void ifft_butterfly4(
|
||||
#endif // LEO_USE_VECTOR4_OPT
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// FFT
|
||||
|
||||
/*
|
||||
if (log_m != kModulus)
|
||||
x[] ^= exp(log(y[]) + log_m)
|
||||
y[] ^= x[]
|
||||
*/
|
||||
void VectorFFTButterfly(
|
||||
const uint64_t bytes,
|
||||
unsigned count,
|
||||
void** x,
|
||||
void** y,
|
||||
const ffe_t log_m);
|
||||
|
||||
/*
|
||||
y[] ^= x[]
|
||||
if (log_m != kModulus)
|
||||
x[] ^= exp(log(y[]) + log_m)
|
||||
*/
|
||||
void VectorIFFTButterfly(
|
||||
const uint64_t bytes,
|
||||
unsigned count,
|
||||
void** x,
|
||||
void** y,
|
||||
const ffe_t log_m);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Reed-Solomon Encode
|
||||
|
||||
|
@ -42,14 +42,14 @@ using namespace std;
|
||||
struct TestParameters
|
||||
{
|
||||
#ifdef LEO_HAS_FF16
|
||||
unsigned original_count = 1000; // under 65536
|
||||
unsigned recovery_count = 200; // under 65536 - original_count
|
||||
unsigned original_count = 100; // under 65536
|
||||
unsigned recovery_count = 20; // under 65536 - original_count
|
||||
#else
|
||||
unsigned original_count = 128; // under 65536
|
||||
unsigned recovery_count = 128; // under 65536 - original_count
|
||||
#endif
|
||||
unsigned buffer_bytes = 2560; // multiple of 64 bytes
|
||||
unsigned loss_count = 500; // some fraction of original_count
|
||||
unsigned buffer_bytes = 64; // multiple of 64 bytes
|
||||
unsigned loss_count = 32768; // some fraction of original_count
|
||||
unsigned seed = 2;
|
||||
bool multithreaded = true;
|
||||
};
|
||||
@ -399,7 +399,7 @@ static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
|
||||
|
||||
static bool BasicTest(const TestParameters& params)
|
||||
{
|
||||
const unsigned kTrials = params.original_count > 8000 ? 1 : 10;
|
||||
const unsigned kTrials = params.original_count > 8000 ? 1 : 100000;
|
||||
|
||||
std::vector<uint8_t*> original_data(params.original_count);
|
||||
|
||||
@ -807,8 +807,8 @@ int main(int argc, char **argv)
|
||||
if (!BasicTest(params))
|
||||
goto Failed;
|
||||
|
||||
|
||||
static const unsigned kMaxRandomData = 32768;
|
||||
#if 0
|
||||
static const unsigned kMaxRandomData = 128;
|
||||
|
||||
prng.Seed(params.seed, 8);
|
||||
for (;; ++params.seed)
|
||||
@ -822,8 +822,9 @@ int main(int argc, char **argv)
|
||||
if (!BasicTest(params))
|
||||
goto Failed;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef LEO_BENCH_ALL_256_PARAMS
|
||||
#if 1
|
||||
for (unsigned original_count = 1; original_count <= 256; ++original_count)
|
||||
{
|
||||
for (unsigned recovery_count = 1; recovery_count <= original_count; ++recovery_count)
|
||||
|
Loading…
x
Reference in New Issue
Block a user