Fix build errors

Christopher Taylor 2017-05-26 20:30:48 -07:00
parent b51a7219bc
commit f3003488da
7 changed files with 105 additions and 484 deletions

View File

@ -144,7 +144,7 @@ void InitializeCPUArch()
void xor_mem(
void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy,
unsigned bytes)
uint64_t bytes)
{
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
@ -189,88 +189,15 @@ void xor_mem(
} while (bytes > 0);
}
void xor_mem2(
void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
unsigned bytes)
{
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *> (vx_0);
const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<const LEO_M256 *>(vy_0);
LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *> (vx_1);
const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1);
do
{
const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2));
const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3));
const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2));
const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3));
_mm256_storeu_si256(x32_0, x0_0);
_mm256_storeu_si256(x32_0 + 1, x1_0);
_mm256_storeu_si256(x32_0 + 2, x2_0);
_mm256_storeu_si256(x32_0 + 3, x3_0);
_mm256_storeu_si256(x32_1, x0_1);
_mm256_storeu_si256(x32_1 + 1, x1_1);
_mm256_storeu_si256(x32_1 + 2, x2_1);
_mm256_storeu_si256(x32_1 + 3, x3_1);
x32_0 += 4, y32_0 += 4;
x32_1 += 4, y32_1 += 4;
bytes -= 128;
} while (bytes >= 128);
if (bytes > 0)
{
const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
_mm256_storeu_si256(x32_0, x0_0);
_mm256_storeu_si256(x32_0 + 1, x1_0);
_mm256_storeu_si256(x32_1, x0_1);
_mm256_storeu_si256(x32_1 + 1, x1_1);
}
return;
}
#endif // LEO_TRY_AVX2
LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *> (vx_0);
const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<const LEO_M128 *>(vy_0);
LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *> (vx_1);
const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1);
do
{
const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0));
const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1));
const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2));
const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3));
const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1));
const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1));
const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2));
const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3));
_mm_storeu_si128(x16_0, x0_0);
_mm_storeu_si128(x16_0 + 1, x1_0);
_mm_storeu_si128(x16_0 + 2, x2_0);
_mm_storeu_si128(x16_0 + 3, x3_0);
_mm_storeu_si128(x16_1, x0_1);
_mm_storeu_si128(x16_1 + 1, x1_1);
_mm_storeu_si128(x16_1 + 2, x2_1);
_mm_storeu_si128(x16_1 + 3, x3_1);
x16_0 += 4, y16_0 += 4;
x16_1 += 4, y16_1 += 4;
bytes -= 64;
} while (bytes > 0);
}
void xor_mem3(
void xor_mem4(
void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
void * LEO_RESTRICT vx_2, const void * LEO_RESTRICT vy_2,
unsigned bytes)
void * LEO_RESTRICT vx_3, const void * LEO_RESTRICT vy_3,
uint64_t bytes)
{
// FIXME: Add args
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{

View File

@ -208,20 +208,15 @@ LEO_FORCE_INLINE unsigned NextPow2(unsigned n)
// x[] ^= y[]
void xor_mem(
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
unsigned bytes);
uint64_t bytes);
// For i = {0, 1}: x_i[] ^= x_i[]
void xor_mem2(
void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
unsigned bytes);
// For i = {0, 1, 2}: x_i[] ^= x_i[]
void xor_mem3(
// For i = {0, 1, 2, 3}: x_i[] ^= y_i[]
void xor_mem4(
void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, const void * LEO_RESTRICT y_2,
unsigned bytes);
void * LEO_RESTRICT x_3, const void * LEO_RESTRICT y_3,
uint64_t bytes);
} // namespace leopard
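For reference, a minimal scalar sketch of the contract these declarations describe (an illustration, not library code; the *_reference names are hypothetical, and the SIMD kernels additionally require bytes to be a positive multiple of 64):
#include <cstdint>
// x[] ^= y[], one byte at a time.
static void xor_mem_reference(void* vx, const void* vy, uint64_t bytes)
{
    uint8_t* x = static_cast<uint8_t*>(vx);
    const uint8_t* y = static_cast<const uint8_t*>(vy);
    for (uint64_t i = 0; i < bytes; ++i)
        x[i] ^= y[i];
}
// For i = {0, 1, 2, 3}: x_i[] ^= y_i[] -- four independent streams.
static void xor_mem4_reference(
    void* x_0, const void* y_0, void* x_1, const void* y_1,
    void* x_2, const void* y_2, void* x_3, const void* y_3,
    uint64_t bytes)
{
    xor_mem_reference(x_0, y_0, bytes);
    xor_mem_reference(x_1, y_1, bytes);
    xor_mem_reference(x_2, y_2, bytes);
    xor_mem_reference(x_3, y_3, bytes);
}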

View File

@ -523,131 +523,6 @@ void mul_mem_set(
} while (bytes > 0);
}
// vx0[] *= m, vx1[] *= m
void mul_mem2_inplace(
void * LEO_RESTRICT vx_0,
void * LEO_RESTRICT vx_1,
ffe_t m, uint64_t bytes)
{
if (m <= 1)
{
if (m == 0)
{
memset(vx_0, 0, bytes);
memset(vx_1, 0, bytes);
}
return;
}
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m);
const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m);
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(vx_0);
LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(vx_1);
do
{
LEO_M256 x0_0 = _mm256_loadu_si256(x32_0 + 1);
LEO_M256 l0_0 = _mm256_and_si256(x0_0, clr_mask);
x0_0 = _mm256_srli_epi64(x0_0, 4);
LEO_M256 h0_0 = _mm256_and_si256(x0_0, clr_mask);
l0_0 = _mm256_shuffle_epi8(table_lo_y, l0_0);
h0_0 = _mm256_shuffle_epi8(table_hi_y, h0_0);
l0_0 = _mm256_xor_si256(l0_0, h0_0);
LEO_M256 x1_0 = _mm256_loadu_si256(x32_0);
LEO_M256 l1_0 = _mm256_and_si256(x1_0, clr_mask);
x1_0 = _mm256_srli_epi64(x1_0, 4);
LEO_M256 h1_0 = _mm256_and_si256(x1_0, clr_mask);
l1_0 = _mm256_shuffle_epi8(table_lo_y, l1_0);
h1_0 = _mm256_shuffle_epi8(table_hi_y, h1_0);
l1_0 = _mm256_xor_si256(l1_0, h1_0);
LEO_M256 x0_1 = _mm256_loadu_si256(x32_1 + 1);
LEO_M256 l0_1 = _mm256_and_si256(x0_1, clr_mask);
x0_1 = _mm256_srli_epi64(x0_1, 4);
LEO_M256 h0_1 = _mm256_and_si256(x0_1, clr_mask);
l0_1 = _mm256_shuffle_epi8(table_lo_y, l0_1);
h0_1 = _mm256_shuffle_epi8(table_hi_y, h0_1);
l0_1 = _mm256_xor_si256(l0_1, h0_1);
LEO_M256 x1_1 = _mm256_loadu_si256(x32_1);
LEO_M256 l1_1 = _mm256_and_si256(x1_1, clr_mask);
x1_1 = _mm256_srli_epi64(x1_1, 4);
LEO_M256 h1_1 = _mm256_and_si256(x1_1, clr_mask);
l1_1 = _mm256_shuffle_epi8(table_lo_y, l1_1);
h1_1 = _mm256_shuffle_epi8(table_hi_y, h1_1);
l1_1 = _mm256_xor_si256(l1_1, h1_1);
_mm256_storeu_si256(x32_0 + 1, l0_0);
_mm256_storeu_si256(x32_0, l1_0);
_mm256_storeu_si256(x32_1 + 1, l0_1);
_mm256_storeu_si256(x32_1, l1_1);
x32_0 += 2;
x32_1 += 2;
bytes -= 64;
} while (bytes > 0);
return;
}
#endif // LEO_TRY_AVX2
const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m);
const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m);
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(vx_0);
LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(vx_1);
do
{
LEO_M128 x3 = _mm_loadu_si128(x16_0 + 3);
LEO_M128 l3 = _mm_and_si128(x3, clr_mask);
x3 = _mm_srli_epi64(x3, 4);
LEO_M128 h3 = _mm_and_si128(x3, clr_mask);
l3 = _mm_shuffle_epi8(table_lo_y, l3);
h3 = _mm_shuffle_epi8(table_hi_y, h3);
LEO_M128 x2 = _mm_loadu_si128(x16_0 + 2);
LEO_M128 l2 = _mm_and_si128(x2, clr_mask);
x2 = _mm_srli_epi64(x2, 4);
LEO_M128 h2 = _mm_and_si128(x2, clr_mask);
l2 = _mm_shuffle_epi8(table_lo_y, l2);
h2 = _mm_shuffle_epi8(table_hi_y, h2);
LEO_M128 x1 = _mm_loadu_si128(x16_0 + 1);
LEO_M128 l1 = _mm_and_si128(x1, clr_mask);
x1 = _mm_srli_epi64(x1, 4);
LEO_M128 h1 = _mm_and_si128(x1, clr_mask);
l1 = _mm_shuffle_epi8(table_lo_y, l1);
h1 = _mm_shuffle_epi8(table_hi_y, h1);
LEO_M128 x0 = _mm_loadu_si128(x16_0);
LEO_M128 l0 = _mm_and_si128(x0, clr_mask);
x0 = _mm_srli_epi64(x0, 4);
LEO_M128 h0 = _mm_and_si128(x0, clr_mask);
l0 = _mm_shuffle_epi8(table_lo_y, l0);
h0 = _mm_shuffle_epi8(table_hi_y, h0);
_mm_storeu_si128(x16_0 + 3, _mm_xor_si128(l3, h3));
_mm_storeu_si128(x16_0 + 2, _mm_xor_si128(l2, h2));
_mm_storeu_si128(x16_0 + 1, _mm_xor_si128(l1, h1));
_mm_storeu_si128(x16_0, _mm_xor_si128(l0, h0));
// FIXME: Add second one here
x16_0 += 4;
x16_1 += 4;
bytes -= 64;
} while (bytes > 0);
}
//------------------------------------------------------------------------------
// FFT Operations
@ -660,20 +535,12 @@ void fft_butterfly(
}
// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
void fft_butterfly2(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
ffe_t m, uint64_t bytes)
{
}
// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
void fft_butterfly3(
// For i = {0, 1, 2, 3}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
void fft_butterfly4(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
ffe_t m, uint64_t bytes)
{
@ -691,20 +558,12 @@ void ifft_butterfly(
}
// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
void ifft_butterfly2(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
ffe_t m, uint64_t bytes)
{
}
// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
void ifft_butterfly3(
// For i = {0, 1, 2, 3}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
void ifft_butterfly4(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
ffe_t m, uint64_t bytes)
{
@ -714,7 +573,7 @@ void ifft_butterfly3(
//------------------------------------------------------------------------------
// FFT
static ffe_t FFTSkew[kFieldModulus]; // twisted factors used in FFT
static ffe_t FFTSkew[kModulus]; // twisted factors used in FFT
static ffe_t LogWalsh[kOrder]; // factors used in the evaluation of the error locator polynomial
void FFTInitialize()
@ -739,19 +598,19 @@ void FFTInitialize()
}
// TBD: This can be cleaned up
temp[m] = kFieldModulus - LogLUT[FFEMultiply(temp[m], temp[m] ^ 1)];
temp[m] = kModulus - LogLUT[FFEMultiply(temp[m], temp[m] ^ 1)];
for (unsigned i = m + 1; i < (kBits - 1); ++i)
temp[i] = FFEMultiplyLog(temp[i], (LogLUT[temp[i] ^ 1] + temp[m]) % kFieldModulus);
temp[i] = FFEMultiplyLog(temp[i], (LogLUT[temp[i] ^ 1] + temp[m]) % kModulus);
}
for (unsigned i = 0; i < kOrder; ++i)
FFTSkew[i] = LogLUT[FFTSkew[i]];
temp[0] = kFieldModulus - temp[0];
temp[0] = kModulus - temp[0];
for (unsigned i = 1; i < (kBits - 1); ++i)
temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus;
temp[i] = (kModulus - temp[i] + temp[i - 1]) % kModulus;
for (unsigned i = 0; i < kOrder; ++i)
LogWalsh[i] = LogLUT[i];
@ -787,7 +646,7 @@ void Encode(
{
const ffe_t skew = FFTSkew[j + m - 1];
if (skew != kFieldModulus)
if (skew != kModulus)
{
for (unsigned i = j - width; i < j; ++i)
ifft_butterfly(work[i], work[i + width], skew, buffer_bytes);
@ -818,7 +677,7 @@ void Encode(
{
const ffe_t skew = FFTSkew[j + m + i - 1];
if (skew != kFieldModulus)
if (skew != kModulus)
{
for (unsigned k = j - width; k < j; ++k)
ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes);
@ -863,7 +722,7 @@ void Encode(
{
const ffe_t skew = FFTSkew[j + m + i - 1];
if (skew != kFieldModulus)
if (skew != kModulus)
{
for (unsigned k = j - width; k < j; ++k)
ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes);
@ -894,7 +753,7 @@ void Encode(
{
const ffe_t skew = skewLUT[j];
if (skew != kFieldModulus)
if (skew != kModulus)
{
for (unsigned k = j, count = j + width; k < count; ++k)
fft_butterfly(data[k], data[k + width], skew, buffer_bytes);
@ -938,7 +797,7 @@ void Decode(
FWHT(ErrorLocations, kBits);
for (unsigned i = 0; i < kOrder; ++i)
ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kFieldModulus;
ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kModulus;
FWHT(ErrorLocations, kBits);
@ -974,7 +833,7 @@ void Decode(
{
const ffe_t skew = FFTSkew[j - 1];
if (skew != kFieldModulus)
if (skew != kModulus)
{
for (unsigned i = j - width; i < j; ++i)
ifft_butterfly(work[i], work[i + width], skew, buffer_bytes);
@ -1010,7 +869,7 @@ void Decode(
{
const ffe_t skew = skewLUT[j];
if (skew != kFieldModulus)
if (skew != kModulus)
{
for (unsigned i = j; i < j + width; ++i)
fft_butterfly(work[i], work[i + width], skew, buffer_bytes);
@ -1027,7 +886,7 @@ void Decode(
for (unsigned i = 0; i < original_count; ++i)
if (!original[i])
mul_mem_set(work[i], work[i + m], kFieldModulus - ErrorLocations[i], buffer_bytes);
mul_mem_set(work[i], work[i + m], kModulus - ErrorLocations[i], buffer_bytes);
}

View File

@ -73,12 +73,6 @@ void mul_mem_set(
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
ffe_t m, uint64_t bytes);
// For i = {0, 1}: x_i[] *= m
void mul_mem2_inplace(
void * LEO_RESTRICT x_0,
void * LEO_RESTRICT x_1,
ffe_t m, uint64_t bytes);
//------------------------------------------------------------------------------
// FFT Operations
@ -88,17 +82,12 @@ void fft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t m, uint64_t bytes);
// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
void fft_butterfly2(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
ffe_t m, uint64_t bytes);
// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
void fft_butterfly3(
// For i = {0, 1, 2, 3}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
void fft_butterfly4(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
ffe_t m, uint64_t bytes);
@ -110,17 +99,12 @@ void ifft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t m, uint64_t bytes);
// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
void ifft_butterfly2(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
ffe_t m, uint64_t bytes);
// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
void ifft_butterfly3(
// For i = {0, 1, 2, 3}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
void ifft_butterfly4(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
ffe_t m, uint64_t bytes);
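Taken together, these declarations keep one invariant: each 4-wide kernel applies the same butterfly update independently to every (x_i, y_i) pair. A minimal per-element sketch of that update, using the library's ffe_t and a hypothetical gf_mul stand-in for the per-byte field multiply that mul_mem performs:
// Sketch only: gf_mul is an assumed helper, not part of this header.
static inline void fft_butterfly_reference(
    ffe_t& x, ffe_t& y, ffe_t m, ffe_t (*gf_mul)(ffe_t, ffe_t))
{
    x ^= gf_mul(y, m);   // x[] ^= y[] * m
    y ^= x;              // y[] ^= x[]
}
static inline void ifft_butterfly_reference(
    ffe_t& x, ffe_t& y, ffe_t m, ffe_t (*gf_mul)(ffe_t, ffe_t))
{
    y ^= x;              // y[] ^= x[]
    x ^= gf_mul(y, m);   // x[] ^= y[] * m
}
fft_butterfly4 and ifft_butterfly4 are then expected to behave like four independent applications of these updates across the paired buffers.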

View File

@ -282,7 +282,7 @@ static ffe_t FFEMultiplyLog(ffe_t a, ffe_t log_b)
{
if (a == 0)
return 0;
return ExpLUT[AddMod(LogLUT[a], b)];
return ExpLUT[AddMod(LogLUT[a], log_b)];
}
bool InitializeMultiplyTables()
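The one-character change above fixes a genuine compile error: the parameter is named log_b, so the old body referenced an undeclared b. A hypothetical self-check sketch of the identity the fix restores, assuming FFEMultiply is the plain table-based product used elsewhere in this file and that the LUTs are initialized:
// Sketch, not library code: FFEMultiply(a, b) == FFEMultiplyLog(a, LogLUT[b]) for b != 0.
static bool CheckMultiplyLogIdentity(ffe_t a)
{
    for (unsigned b = 1; b < kOrder; ++b)
        if (FFEMultiply(a, static_cast<ffe_t>(b)) != FFEMultiplyLog(a, LogLUT[b]))
            return false;
    return true;
}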
@ -341,25 +341,29 @@ void mul_mem_set(
LEO_M256 * LEO_RESTRICT z32 = reinterpret_cast<LEO_M256 *>(vx);
const LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<const LEO_M256 *>(vy);
const unsigned count = bytes / 64;
for (unsigned i = 0; i < count; ++i)
do
{
LEO_M256 x0 = _mm256_loadu_si256(x32 + i * 2);
LEO_M256 x0 = _mm256_loadu_si256(x32);
LEO_M256 l0 = _mm256_and_si256(x0, clr_mask);
x0 = _mm256_srli_epi64(x0, 4);
LEO_M256 h0 = _mm256_and_si256(x0, clr_mask);
l0 = _mm256_shuffle_epi8(table_lo_y, l0);
h0 = _mm256_shuffle_epi8(table_hi_y, h0);
_mm256_storeu_si256(z32 + i * 2, _mm256_xor_si256(l0, h0));
_mm256_storeu_si256(z32, _mm256_xor_si256(l0, h0));
LEO_M256 x1 = _mm256_loadu_si256(x32 + i * 2 + 1);
LEO_M256 x1 = _mm256_loadu_si256(x32 + 1);
LEO_M256 l1 = _mm256_and_si256(x1, clr_mask);
x1 = _mm256_srli_epi64(x1, 4);
LEO_M256 h1 = _mm256_and_si256(x1, clr_mask);
l1 = _mm256_shuffle_epi8(table_lo_y, l1);
h1 = _mm256_shuffle_epi8(table_hi_y, h1);
_mm256_storeu_si256(z32 + i * 2 + 1, _mm256_xor_si256(l1, h1));
}
_mm256_storeu_si256(z32 + 1, _mm256_xor_si256(l1, h1));
x32 += 2;
z32 += 2;
bytes -= 64;
} while (bytes > 0);
return;
}
#endif // LEO_TRY_AVX2
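The rewritten loop above changes only how the pointers advance; the per-byte math is unchanged: each output byte is the XOR of two 16-entry lookups keyed by the input byte's low and high nibbles. A scalar sketch of that product (lo4/hi4 are hypothetical 16-entry slices of the Multiply256LUT tables for the chosen m):
// Sketch only: z[] = x[] * m via nibble lookup tables.
static void mul_mem_set_reference(
    uint8_t* z, const uint8_t* x,
    const uint8_t lo4[16], const uint8_t hi4[16],
    uint64_t bytes)
{
    for (uint64_t i = 0; i < bytes; ++i)
        z[i] = lo4[x[i] & 0x0f] ^ hi4[x[i] >> 4];
}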
@ -412,131 +416,6 @@ void mul_mem_set(
} while (bytes > 0);
}
// vx0[] *= m, vx1[] *= m
void mul_mem2_inplace(
void * LEO_RESTRICT vx_0,
void * LEO_RESTRICT vx_1,
ffe_t m, uint64_t bytes)
{
if (m <= 1)
{
if (m == 0)
{
memset(vx_0, 0, bytes);
memset(vx_1, 0, bytes);
}
return;
}
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m);
const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m);
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(vx_0);
LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(vx_1);
do
{
LEO_M256 x0_0 = _mm256_loadu_si256(x32_0 + 1);
LEO_M256 l0_0 = _mm256_and_si256(x0_0, clr_mask);
x0_0 = _mm256_srli_epi64(x0_0, 4);
LEO_M256 h0_0 = _mm256_and_si256(x0_0, clr_mask);
l0_0 = _mm256_shuffle_epi8(table_lo_y, l0_0);
h0_0 = _mm256_shuffle_epi8(table_hi_y, h0_0);
l0_0 = _mm256_xor_si256(l0_0, h0_0);
LEO_M256 x1_0 = _mm256_loadu_si256(x32_0);
LEO_M256 l1_0 = _mm256_and_si256(x1_0, clr_mask);
x1_0 = _mm256_srli_epi64(x1_0, 4);
LEO_M256 h1_0 = _mm256_and_si256(x1_0, clr_mask);
l1_0 = _mm256_shuffle_epi8(table_lo_y, l1_0);
h1_0 = _mm256_shuffle_epi8(table_hi_y, h1_0);
l1_0 = _mm256_xor_si256(l1_0, h1_0);
LEO_M256 x0_1 = _mm256_loadu_si256(x32_1 + 1);
LEO_M256 l0_1 = _mm256_and_si256(x0_1, clr_mask);
x0_1 = _mm256_srli_epi64(x0_1, 4);
LEO_M256 h0_1 = _mm256_and_si256(x0_1, clr_mask);
l0_1 = _mm256_shuffle_epi8(table_lo_y, l0_1);
h0_1 = _mm256_shuffle_epi8(table_hi_y, h0_1);
l0_1 = _mm256_xor_si256(l0_1, h0_1);
LEO_M256 x1_1 = _mm256_loadu_si256(x32_1);
LEO_M256 l1_1 = _mm256_and_si256(x1_1, clr_mask);
x1_1 = _mm256_srli_epi64(x1_1, 4);
LEO_M256 h1_1 = _mm256_and_si256(x1_1, clr_mask);
l1_1 = _mm256_shuffle_epi8(table_lo_y, l1_1);
h1_1 = _mm256_shuffle_epi8(table_hi_y, h1_1);
l1_1 = _mm256_xor_si256(l1_1, h1_1);
_mm256_storeu_si256(x32_0 + 1, l0_0);
_mm256_storeu_si256(x32_0, l1_0);
_mm256_storeu_si256(x32_1 + 1, l0_1);
_mm256_storeu_si256(x32_1, l1_1);
x32_0 += 2;
x32_1 += 2;
bytes -= 64;
} while (bytes > 0);
return;
}
#endif // LEO_TRY_AVX2
const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m);
const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m);
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(vx_0);
LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(vx_1);
do
{
LEO_M128 x3 = _mm_loadu_si128(x16_0 + 3);
LEO_M128 l3 = _mm_and_si128(x3, clr_mask);
x3 = _mm_srli_epi64(x3, 4);
LEO_M128 h3 = _mm_and_si128(x3, clr_mask);
l3 = _mm_shuffle_epi8(table_lo_y, l3);
h3 = _mm_shuffle_epi8(table_hi_y, h3);
LEO_M128 x2 = _mm_loadu_si128(x16_0 + 2);
LEO_M128 l2 = _mm_and_si128(x2, clr_mask);
x2 = _mm_srli_epi64(x2, 4);
LEO_M128 h2 = _mm_and_si128(x2, clr_mask);
l2 = _mm_shuffle_epi8(table_lo_y, l2);
h2 = _mm_shuffle_epi8(table_hi_y, h2);
LEO_M128 x1 = _mm_loadu_si128(x16_0 + 1);
LEO_M128 l1 = _mm_and_si128(x1, clr_mask);
x1 = _mm_srli_epi64(x1, 4);
LEO_M128 h1 = _mm_and_si128(x1, clr_mask);
l1 = _mm_shuffle_epi8(table_lo_y, l1);
h1 = _mm_shuffle_epi8(table_hi_y, h1);
LEO_M128 x0 = _mm_loadu_si128(x16_0);
LEO_M128 l0 = _mm_and_si128(x0, clr_mask);
x0 = _mm_srli_epi64(x0, 4);
LEO_M128 h0 = _mm_and_si128(x0, clr_mask);
l0 = _mm_shuffle_epi8(table_lo_y, l0);
h0 = _mm_shuffle_epi8(table_hi_y, h0);
_mm_storeu_si128(x16_0 + 3, _mm_xor_si128(l3, h3));
_mm_storeu_si128(x16_0 + 2, _mm_xor_si128(l2, h2));
_mm_storeu_si128(x16_0 + 1, _mm_xor_si128(l1, h1));
_mm_storeu_si128(x16_0, _mm_xor_si128(l0, h0));
// FIXME: Add second one here
x16_0 += 4;
x16_1 += 4;
bytes -= 64;
} while (bytes > 0);
}
//------------------------------------------------------------------------------
// FFT Operations
@ -549,20 +428,12 @@ void fft_butterfly(
}
// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
void fft_butterfly2(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
ffe_t m, uint64_t bytes)
{
}
// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
void fft_butterfly3(
// For i = {0, 1, 2, 3}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
void fft_butterfly4(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
ffe_t m, uint64_t bytes)
{
@ -580,20 +451,12 @@ void ifft_butterfly(
}
// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
void ifft_butterfly2(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
ffe_t m, uint64_t bytes)
{
}
// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
void ifft_butterfly3(
// For i = {0, 1, 2, 3}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
void ifft_butterfly4(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
ffe_t m, uint64_t bytes)
{
@ -603,7 +466,7 @@ void ifft_butterfly3(
//------------------------------------------------------------------------------
// FFT
static ffe_t FFTSkew[kFieldModulus]; // twisted factors used in FFT
static ffe_t FFTSkew[kModulus]; // twisted factors used in FFT
static ffe_t LogWalsh[kOrder]; // factors used in the evaluation of the error locator polynomial
void FFTInitialize()
@ -628,10 +491,10 @@ void FFTInitialize()
}
// TBD: This can be cleaned up
temp[m] = kFieldModulus - LogLUT[FFEMultiply(temp[m], temp[m] ^ 1)];
temp[m] = kModulus - LogLUT[FFEMultiply(temp[m], temp[m] ^ 1)];
for (unsigned i = m + 1; i < (kBits - 1); ++i)
temp[i] = FFEMultiplyLog(temp[i], (LogLUT[temp[i] ^ 1] + temp[m]) % kFieldModulus);
temp[i] = FFEMultiplyLog(temp[i], (LogLUT[temp[i] ^ 1] + temp[m]) % kModulus);
}
for (unsigned i = 0; i < kOrder; ++i)
@ -667,11 +530,14 @@ void Encode(
for (unsigned width = 1; width < m; width <<= 1)
{
for (unsigned j = width; j < m; j += (width << 1))
{
const ffe_t skew = FFTSkew[j + m - 1];
const unsigned range = width << 1;
const ffe_t* skewLUT = FFTSkew + m - 1;
if (skew != kFieldModulus)
for (unsigned j = width; j < m; j += range)
{
const ffe_t skew = skewLUT[j];
if (skew != kModulus)
{
for (unsigned i = j - width; i < j; ++i)
ifft_butterfly(work[i], work[i + width], skew, buffer_bytes);
@ -696,13 +562,15 @@ void Encode(
// temp <- IFFT(temp, m, m + i)
const ffe_t* skewLUT = FFTSkew + m + i - 1;
for (unsigned width = 1; width < m; width <<= 1)
{
for (unsigned j = width; j < m; j += (width << 1))
{
const ffe_t skew = FFTSkew[j + m + i - 1];
const ffe_t skew = skewLUT[j];
if (skew != kFieldModulus)
if (skew != kModulus)
{
for (unsigned k = j - width; k < j; ++k)
ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes);
@ -742,12 +610,14 @@ void Encode(
{
// Calculate stop considering that the right is all zeroes
const unsigned stop = ((last_count + width - 1) >> shift) << shift;
const unsigned range = width << 1;
const ffe_t* skewLUT = FFTSkew + m + i - 1;
for (unsigned j = width; j < stop; j += (width << 1))
for (unsigned j = width; j < stop; j += range)
{
const ffe_t skew = FFTSkew[j + m + i - 1];
const ffe_t skew = skewLUT[j];
if (skew != kFieldModulus)
if (skew != kModulus)
{
for (unsigned k = j - width; k < j; ++k)
ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes);
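The hoisted skew pointer in the Encode hunks above is a pure re-indexing; it reads the same table entry as before and only lifts the constant offset out of the inner loop:
// const ffe_t* skewLUT = FFTSkew + m + i - 1;
// skewLUT[j] == FFTSkew[(m + i - 1) + j] == FFTSkew[j + m + i - 1]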
@ -778,7 +648,7 @@ void Encode(
{
const ffe_t skew = skewLUT[j];
if (skew != kFieldModulus)
if (skew != kModulus)
{
for (unsigned k = j, count = j + width; k < count; ++k)
fft_butterfly(data[k], data[k + width], skew, buffer_bytes);
@ -822,7 +692,7 @@ void Decode(
FWHT(ErrorLocations, kBits);
for (unsigned i = 0; i < kOrder; ++i)
ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kFieldModulus;
ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kModulus;
FWHT(ErrorLocations, kBits);
@ -854,11 +724,13 @@ void Decode(
for (unsigned width = 1; width < n; width <<= 1)
{
for (unsigned j = width; j < n; j += (width << 1))
const unsigned range = width << 1;
for (unsigned j = width; j < n; j += range)
{
const ffe_t skew = FFTSkew[j - 1];
if (skew != kFieldModulus)
if (skew != kModulus)
{
for (unsigned i = j - width; i < j; ++i)
ifft_butterfly(work[i], work[i + width], skew, buffer_bytes);
@ -894,14 +766,14 @@ void Decode(
{
const ffe_t skew = skewLUT[j];
if (skew != kFieldModulus)
if (skew != kModulus)
{
for (unsigned i = j; i < j + width; ++i)
for (unsigned i = j, count = j + width; i < count; ++i)
fft_butterfly(work[i], work[i + width], skew, buffer_bytes);
}
else
{
for (unsigned i = j; i < j + width; ++i)
for (unsigned i = j, count = j + width; i < count; ++i)
xor_mem(work[i + width], work[i], buffer_bytes);
}
}
@ -911,7 +783,7 @@ void Decode(
for (unsigned i = 0; i < original_count; ++i)
if (!original[i])
mul_mem_set(work[i], work[i + m], kFieldModulus - ErrorLocations[i], buffer_bytes);
mul_mem_set(work[i], work[i + m], kModulus - ErrorLocations[i], buffer_bytes);
}

View File

@ -73,12 +73,6 @@ void mul_mem_set(
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
ffe_t m, uint64_t bytes);
// For i = {0, 1}: x_i[] *= m
void mul_mem2_inplace(
void * LEO_RESTRICT x_0,
void * LEO_RESTRICT x_1,
ffe_t m, uint64_t bytes);
//------------------------------------------------------------------------------
// FFT Operations
@ -88,17 +82,12 @@ void fft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t m, uint64_t bytes);
// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
void fft_butterfly2(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
ffe_t m, uint64_t bytes);
// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
void fft_butterfly3(
// For i = {0, 1, 2, 3}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
void fft_butterfly4(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
ffe_t m, uint64_t bytes);
@ -110,17 +99,12 @@ void ifft_butterfly(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t m, uint64_t bytes);
// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
void ifft_butterfly2(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
ffe_t m, uint64_t bytes);
// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
void ifft_butterfly3(
// For i = {0, 1, 2, 3}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
void ifft_butterfly4(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
ffe_t m, uint64_t bytes);

View File

@ -98,7 +98,7 @@ ffe_t kGFBasis[kGFBits] = {
*/
static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size
static const unsigned kFieldModulus = kFieldSize - 1;
static const unsigned kModulus = kFieldSize - 1;
static ffe_t GFLog[kFieldSize];
static ffe_t GFExp[kFieldSize];
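For context on the rename applied throughout this commit: kModulus is the order of the field's multiplicative group, which is why the log-domain sums below are reduced modulo kModulus rather than kFieldSize (an explanatory note, not source text):
// kModulus = kFieldSize - 1 = (1u << kGFBits) - 1
// e.g. kGFBits = 8 would give kFieldSize = 256 and kModulus = 255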
@ -107,14 +107,14 @@ static ffe_t GFExp[kFieldSize];
static void InitField()
{
unsigned state = 1;
for (unsigned i = 0; i < kFieldModulus; ++i)
for (unsigned i = 0; i < kModulus; ++i)
{
GFExp[state] = static_cast<ffe_t>(i);
state <<= 1;
if (state >= kFieldSize)
state ^= kGFPolynomial;
}
GFExp[0] = kFieldModulus;
GFExp[0] = kModulus;
// Conversion to chosen basis:
@ -134,7 +134,7 @@ static void InitField()
for (unsigned i = 0; i < kFieldSize; ++i)
GFExp[GFLog[i]] = i;
GFExp[kFieldModulus] = GFExp[0];
GFExp[kModulus] = GFExp[0];
}
@ -239,7 +239,7 @@ static void formal_derivative(ffe_t* cos, const unsigned size)
//------------------------------------------------------------------------------
// Fast Fourier Transform
static ffe_t skewVec[kFieldModulus]; // twisted factors used in FFT
static ffe_t skewVec[kModulus]; // twisted factors used in FFT
static LEO_FORCE_INLINE void ifft_butterfly(ffe_t& a, ffe_t& b, ffe_t skew)
{
@ -256,7 +256,7 @@ static void IFLT(ffe_t* data, const unsigned size, const unsigned index)
{
const ffe_t skew = skewVec[j + index - 1];
if (skew != kFieldModulus)
if (skew != kModulus)
{
for (unsigned i = j - width; i < j; ++i)
ifft_butterfly(data[i], data[i + width], skew);
@ -287,7 +287,7 @@ static void FLT(ffe_t* data, const unsigned size, const unsigned skewIndex, cons
{
const ffe_t skew = skewLUT[j];
if (skew != kFieldModulus)
if (skew != kModulus)
{
for (unsigned i = j; i < j + width; ++i)
fft_butterfly(data[i], data[i + width], skew);
@ -330,20 +330,20 @@ static void InitFieldOperations()
skewVec[j + s] = skewVec[j] ^ temp[i];
}
temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])];
temp[m] = kModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])];
for (unsigned i = m + 1; i < (kGFBits - 1); ++i)
temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus);
temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kModulus);
}
for (unsigned i = 0; i < kFieldSize; ++i)
skewVec[i] = GFLog[skewVec[i]];
#if 0
temp[0] = kFieldModulus - temp[0];
temp[0] = kModulus - temp[0];
for (unsigned i = 1; i < (kGFBits - 1); ++i)
temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus;
temp[i] = (kModulus - temp[i] + temp[i - 1]) % kModulus;
B[0] = 0;
for (unsigned i = 0; i < (kGFBits - 1); ++i)
@ -351,7 +351,7 @@ static void InitFieldOperations()
const unsigned depart = ((unsigned)1 << i);
for (unsigned j = 0; j < depart; ++j)
B[j + depart] = (B[j] + temp[i]) % kFieldModulus;
B[j + depart] = (B[j] + temp[i]) % kModulus;
}
#endif
@ -419,7 +419,7 @@ static void decode(ffe_t* codeword, const unsigned m, const unsigned original_co
FWHT(log_walsh2, kGFBits);
for (unsigned i = 0; i < kFieldSize; ++i)
log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus;
log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kModulus;
FWHT(log_walsh2, kGFBits);
@ -449,8 +449,8 @@ static void decode(ffe_t* codeword, const unsigned m, const unsigned original_co
// Note: Preserves zeroes on the right
for (unsigned i = 0; i < m + original_count; i += 2)
{
codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]);
codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]);
codeword[i] = mulE(codeword[i], kModulus - B[i >> 1]);
codeword[i + 1] = mulE(codeword[i + 1], kModulus - B[i >> 1]);
}
#endif
@ -471,7 +471,7 @@ static void decode(ffe_t* codeword, const unsigned m, const unsigned original_co
{
if (erasure[i])
{
codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]);
codeword[i] = mulE(codeword[i], kModulus - log_walsh2[i]);
}
}
}