2017-05-18 03:06:13 +00:00
|
|
|
/*
|
2017-05-25 09:24:15 +00:00
|
|
|
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
|
|
|
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
|
|
modification, are permitted provided that the following conditions are met:
|
|
|
|
|
|
|
|
* Redistributions of source code must retain the above copyright notice,
|
|
|
|
this list of conditions and the following disclaimer.
|
|
|
|
* Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
this list of conditions and the following disclaimer in the documentation
|
|
|
|
and/or other materials provided with the distribution.
|
2017-05-27 02:51:30 +00:00
|
|
|
* Neither the name of Leopard-RS nor the names of its contributors may be
|
2017-05-25 09:24:15 +00:00
|
|
|
used to endorse or promote products derived from this software without
|
|
|
|
specific prior written permission.
|
|
|
|
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
POSSIBILITY OF SUCH DAMAGE.
|
2017-05-18 03:06:13 +00:00
|
|
|
*/
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
#include "LeopardFF16.h"
|
2017-05-27 03:10:53 +00:00
|
|
|
|
|
|
|
#ifdef LEO_HAS_FF16
|
|
|
|
|
2017-05-18 03:06:13 +00:00
|
|
|
#include <string.h>
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
namespace leopard { namespace ff16 {
|
2017-05-18 03:06:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------------
|
2017-05-27 02:51:30 +00:00
|
|
|
// Datatypes and Constants
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// Basis used for generating logarithm tables
|
2017-05-28 01:44:06 +00:00
|
|
|
static const ffe_t kCantorBasis[kBits] = {
|
|
|
|
0x0001, 0xACCA, 0x3C0E, 0x163E,
|
2017-05-18 03:06:13 +00:00
|
|
|
0xC582, 0xED2E, 0x914C, 0x4012,
|
|
|
|
0x6C98, 0x10D8, 0x6A72, 0xB900,
|
|
|
|
0xFDB8, 0xFB34, 0xFF38, 0x991E
|
|
|
|
};
|
|
|
|
|
2017-05-27 08:15:24 +00:00
|
|
|
// Using the Cantor basis here enables us to avoid a lot of extra calculations
|
|
|
|
// when applying the formal derivative in decoding.
|
|
|
|
|
2017-05-18 03:06:13 +00:00
|
|
|
|
|
|
|
//------------------------------------------------------------------------------
|
2017-05-27 02:51:30 +00:00
|
|
|
// Field Operations
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// z = x + y (mod kModulus)
|
|
|
|
static inline ffe_t AddMod(const ffe_t a, const ffe_t b)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
|
|
|
const unsigned sum = (unsigned)a + b;
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// Partial reduction step, allowing for kModulus to be returned
|
|
|
|
return static_cast<ffe_t>(sum + (sum >> kBits));
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// z = x - y (mod kModulus)
|
|
|
|
static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
|
|
|
const unsigned dif = (unsigned)a - b;
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// Partial reduction step, allowing for kModulus to be returned
|
|
|
|
return static_cast<ffe_t>(dif + (dif >> kBits));
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
|
2017-05-24 08:23:19 +00:00
|
|
|
|
2017-05-18 03:06:13 +00:00
|
|
|
//------------------------------------------------------------------------------
|
2017-05-27 02:51:30 +00:00
|
|
|
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-24 08:23:19 +00:00
|
|
|
// {a, b} = {a + b, a - b} (Mod Q)
|
2017-05-27 02:51:30 +00:00
|
|
|
static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
|
2017-05-24 08:23:19 +00:00
|
|
|
{
|
2017-05-27 02:51:30 +00:00
|
|
|
const ffe_t sum = AddMod(a, b);
|
|
|
|
const ffe_t dif = SubMod(a, b);
|
2017-05-24 08:23:19 +00:00
|
|
|
a = sum;
|
|
|
|
b = dif;
|
|
|
|
}
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
static LEO_FORCE_INLINE void FWHT_4(ffe_t* data, unsigned s)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-06-03 23:23:49 +00:00
|
|
|
const unsigned s2 = s << 1;
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
ffe_t t0 = data[0];
|
2017-06-03 23:23:49 +00:00
|
|
|
ffe_t t1 = data[s];
|
|
|
|
ffe_t t2 = data[s2];
|
|
|
|
ffe_t t3 = data[s2 + s];
|
2017-05-18 03:06:13 +00:00
|
|
|
|
|
|
|
FWHT_2(t0, t1);
|
|
|
|
FWHT_2(t2, t3);
|
|
|
|
FWHT_2(t0, t2);
|
|
|
|
FWHT_2(t1, t3);
|
|
|
|
|
2017-06-03 23:23:49 +00:00
|
|
|
data[0] = t0;
|
|
|
|
data[s] = t1;
|
|
|
|
data[s2] = t2;
|
|
|
|
data[s2 + s] = t3;
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
|
2017-06-03 23:23:49 +00:00
|
|
|
// Decimation in time (DIT) Fast Walsh-Hadamard Transform
|
|
|
|
// Unrolls pairs of layers to perform cross-layer operations in registers
|
|
|
|
// m_truncated: Number of elements that are non-zero at the front of data
|
|
|
|
static void FWHT(ffe_t* data, const unsigned m, const unsigned m_truncated)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-06-03 23:23:49 +00:00
|
|
|
// Decimation in time: Unroll 2 layers at a time
|
|
|
|
unsigned dist = 1, dist4 = 4;
|
|
|
|
for (; dist4 <= m; dist = dist4, dist4 <<= 2)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-06-03 23:23:49 +00:00
|
|
|
// For each set of dist*4 elements:
|
|
|
|
for (unsigned r = 0; r < m_truncated; r += dist4)
|
|
|
|
{
|
|
|
|
// For each set of dist elements:
|
|
|
|
for (unsigned i = r; i < r + dist; ++i)
|
|
|
|
FWHT_4(data + i, dist);
|
|
|
|
}
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
|
2017-06-03 23:23:49 +00:00
|
|
|
// If there is one layer left:
|
|
|
|
if (dist < m)
|
|
|
|
for (unsigned i = 0; i < dist; ++i)
|
|
|
|
FWHT_2(data[i], data[i + dist]);
|
2017-05-27 02:51:30 +00:00
|
|
|
}
|
2017-05-18 03:06:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------------
|
2017-05-27 02:51:30 +00:00
|
|
|
// Logarithm Tables
|
|
|
|
|
|
|
|
static ffe_t LogLUT[kOrder];
|
|
|
|
static ffe_t ExpLUT[kOrder];
|
|
|
|
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
// Returns a * Log(b)
|
|
|
|
static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
Note that this operation is not a normal multiplication in a finite
|
|
|
|
field because the right operand is already a logarithm. This is done
|
|
|
|
because it moves K table lookups from the Decode() method into the
|
|
|
|
initialization step that is less performance critical. The LogWalsh[]
|
|
|
|
table below contains precalculated logarithms so it is easier to do
|
|
|
|
all the other multiplies in that form as well.
|
|
|
|
*/
|
|
|
|
if (a == 0)
|
|
|
|
return 0;
|
|
|
|
return ExpLUT[AddMod(LogLUT[a], log_b)];
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// Initialize LogLUT[], ExpLUT[]
|
|
|
|
static void InitializeLogarithmTables()
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-27 02:51:30 +00:00
|
|
|
// LFSR table generation:
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
unsigned state = 1;
|
|
|
|
for (unsigned i = 0; i < kModulus; ++i)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-27 02:51:30 +00:00
|
|
|
ExpLUT[state] = static_cast<ffe_t>(i);
|
|
|
|
state <<= 1;
|
|
|
|
if (state >= kOrder)
|
|
|
|
state ^= kPolynomial;
|
|
|
|
}
|
|
|
|
ExpLUT[0] = kModulus;
|
|
|
|
|
2017-05-28 01:44:06 +00:00
|
|
|
// Conversion to Cantor basis:
|
2017-05-27 02:51:30 +00:00
|
|
|
|
|
|
|
LogLUT[0] = 0;
|
|
|
|
for (unsigned i = 0; i < kBits; ++i)
|
|
|
|
{
|
2017-05-28 01:44:06 +00:00
|
|
|
const ffe_t basis = kCantorBasis[i];
|
2017-05-27 02:51:30 +00:00
|
|
|
const unsigned width = static_cast<unsigned>(1UL << i);
|
|
|
|
|
|
|
|
for (unsigned j = 0; j < width; ++j)
|
|
|
|
LogLUT[j + width] = LogLUT[j] ^ basis;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < kOrder; ++i)
|
|
|
|
LogLUT[i] = ExpLUT[LogLUT[i]];
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < kOrder; ++i)
|
|
|
|
ExpLUT[LogLUT[i]] = i;
|
|
|
|
|
|
|
|
ExpLUT[kModulus] = ExpLUT[0];
|
|
|
|
}
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
// Multiplies
|
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
/*
|
|
|
|
The multiplication algorithm used follows the approach outlined in {4}.
|
|
|
|
Specifically section 7 outlines the algorithm used here for 16-bit fields.
|
2017-05-31 08:11:20 +00:00
|
|
|
The ALTMAP memory layout is used since there is no need to convert in/out.
|
2017-05-30 08:37:27 +00:00
|
|
|
*/
|
|
|
|
|
2017-06-03 23:48:09 +00:00
|
|
|
struct Multiply128LUT_t
|
|
|
|
{
|
|
|
|
LEO_M128 Lo[4];
|
|
|
|
LEO_M128 Hi[4];
|
|
|
|
};
|
|
|
|
|
|
|
|
static const Multiply128LUT_t* Multiply128LUT = nullptr;
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
#if defined(LEO_TRY_AVX2)
|
2017-06-03 23:48:09 +00:00
|
|
|
|
|
|
|
struct Multiply256LUT_t
|
|
|
|
{
|
|
|
|
LEO_M256 Lo[4];
|
|
|
|
LEO_M256 Hi[4];
|
|
|
|
};
|
|
|
|
|
|
|
|
static const Multiply256LUT_t* Multiply256LUT = nullptr;
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
#endif // LEO_TRY_AVX2
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
|
|
|
|
void InitializeMultiplyTables()
|
2017-05-27 02:51:30 +00:00
|
|
|
{
|
2017-06-03 23:48:09 +00:00
|
|
|
// If we cannot use the PSHUFB instruction, generate Multiply8LUT:
|
|
|
|
if (!CpuHasSSSE3)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (CpuHasAVX2)
|
|
|
|
Multiply256LUT = reinterpret_cast<const Multiply256LUT_t*>(SIMDSafeAllocate(sizeof(Multiply256LUT_t) * kOrder));
|
|
|
|
else
|
|
|
|
Multiply128LUT = reinterpret_cast<const Multiply128LUT_t*>(SIMDSafeAllocate(sizeof(Multiply128LUT_t) * kOrder));
|
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
// For each value we could multiply by:
|
2017-05-29 22:01:01 +00:00
|
|
|
for (unsigned log_m = 0; log_m < kOrder; ++log_m)
|
2017-05-27 02:51:30 +00:00
|
|
|
{
|
2017-05-30 08:37:27 +00:00
|
|
|
// For each 4 bits of the finite field width in bits:
|
|
|
|
for (unsigned i = 0, shift = 0; i < 4; ++i, shift += 4)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-30 08:37:27 +00:00
|
|
|
// Construct 16 entry LUT for PSHUFB
|
|
|
|
uint8_t prod_lo[16], prod_hi[16];
|
|
|
|
for (ffe_t x = 0; x < 16; ++x)
|
|
|
|
{
|
|
|
|
const ffe_t prod = MultiplyLog(x << shift, static_cast<ffe_t>(log_m));
|
|
|
|
prod_lo[x] = static_cast<uint8_t>(prod);
|
|
|
|
prod_hi[x] = static_cast<uint8_t>(prod >> 8);
|
|
|
|
}
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
const LEO_M128 value_lo = _mm_loadu_si128((LEO_M128*)prod_lo);
|
|
|
|
const LEO_M128 value_hi = _mm_loadu_si128((LEO_M128*)prod_hi);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
// Store in 128-bit wide table
|
2017-06-03 23:48:09 +00:00
|
|
|
if (!CpuHasAVX2)
|
|
|
|
{
|
|
|
|
_mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Lo[i], value_lo);
|
|
|
|
_mm_storeu_si128((LEO_M128*)&Multiply128LUT[log_m].Hi[i], value_hi);
|
|
|
|
}
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
// Store in 256-bit wide table
|
2017-05-27 02:51:30 +00:00
|
|
|
#if defined(LEO_TRY_AVX2)
|
2017-05-30 08:37:27 +00:00
|
|
|
if (CpuHasAVX2)
|
|
|
|
{
|
2017-06-03 23:48:09 +00:00
|
|
|
_mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Lo[i],
|
2017-05-30 08:37:27 +00:00
|
|
|
_mm256_broadcastsi128_si256(value_lo));
|
2017-06-03 23:48:09 +00:00
|
|
|
_mm256_storeu_si256((LEO_M256*)&Multiply256LUT[log_m].Hi[i],
|
2017-05-30 08:37:27 +00:00
|
|
|
_mm256_broadcastsi128_si256(value_hi));
|
|
|
|
}
|
2017-05-27 02:51:30 +00:00
|
|
|
#endif // LEO_TRY_AVX2
|
2017-05-30 08:37:27 +00:00
|
|
|
}
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
2017-05-27 02:51:30 +00:00
|
|
|
}
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
void mul_mem(
|
|
|
|
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
|
|
|
|
ffe_t log_m, uint64_t bytes)
|
|
|
|
{
|
2017-05-27 02:51:30 +00:00
|
|
|
#if defined(LEO_TRY_AVX2)
|
2017-05-18 03:06:13 +00:00
|
|
|
if (CpuHasAVX2)
|
|
|
|
{
|
2017-05-30 08:37:27 +00:00
|
|
|
#define LEO_MUL_TABLES_256() \
|
|
|
|
const LEO_M256 T0_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[0]); \
|
|
|
|
const LEO_M256 T1_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[1]); \
|
|
|
|
const LEO_M256 T2_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[2]); \
|
|
|
|
const LEO_M256 T3_lo = _mm256_loadu_si256(&Multiply256LUT[log_m].Lo[3]); \
|
|
|
|
const LEO_M256 T0_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[0]); \
|
|
|
|
const LEO_M256 T1_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[1]); \
|
|
|
|
const LEO_M256 T2_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[2]); \
|
|
|
|
const LEO_M256 T3_hi = _mm256_loadu_si256(&Multiply256LUT[log_m].Hi[3]);
|
|
|
|
|
|
|
|
LEO_MUL_TABLES_256();
|
2017-05-27 02:51:30 +00:00
|
|
|
|
|
|
|
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(x);
|
|
|
|
const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(y);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
do
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
#define LEO_MUL_256(x_ptr, y_ptr) { \
|
2017-05-30 09:23:33 +00:00
|
|
|
const LEO_M256 A_lo = _mm256_loadu_si256(y_ptr); \
|
2017-05-30 08:37:27 +00:00
|
|
|
const LEO_M256 A_hi = _mm256_loadu_si256(y_ptr + 1); \
|
|
|
|
LEO_M256 data_0 = _mm256_and_si256(A_lo, clr_mask); \
|
|
|
|
LEO_M256 data_1 = _mm256_srli_epi64(A_lo, 4); \
|
|
|
|
data_1 = _mm256_and_si256(data_1, clr_mask); \
|
|
|
|
LEO_M256 data_2 = _mm256_and_si256(A_hi, clr_mask); \
|
|
|
|
LEO_M256 data_3 = _mm256_srli_epi64(A_hi, 4); \
|
|
|
|
data_3 = _mm256_and_si256(data_3, clr_mask); \
|
|
|
|
LEO_M256 output_lo = _mm256_shuffle_epi8(T0_lo, data_0); \
|
|
|
|
output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \
|
|
|
|
output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T2_lo, data_2)); \
|
|
|
|
output_lo = _mm256_xor_si256(output_lo, _mm256_shuffle_epi8(T3_lo, data_3)); \
|
|
|
|
LEO_M256 output_hi = _mm256_shuffle_epi8(T0_hi, data_0); \
|
|
|
|
output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \
|
|
|
|
output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T2_hi, data_2)); \
|
|
|
|
output_hi = _mm256_xor_si256(output_hi, _mm256_shuffle_epi8(T3_hi, data_3)); \
|
|
|
|
_mm256_storeu_si256(x_ptr, output_lo); \
|
|
|
|
_mm256_storeu_si256(x_ptr + 1, output_hi); }
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
LEO_MUL_256(x32, y32);
|
|
|
|
y32 += 2, x32 += 2;
|
|
|
|
|
|
|
|
bytes -= 64;
|
|
|
|
} while (bytes > 0);
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // LEO_TRY_AVX2
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
#define LEO_MUL_TABLES_128() \
|
|
|
|
const LEO_M128 T0_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[0]); \
|
|
|
|
const LEO_M128 T1_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[1]); \
|
|
|
|
const LEO_M128 T2_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[2]); \
|
|
|
|
const LEO_M128 T3_lo = _mm_loadu_si128(&Multiply128LUT[log_m].Lo[3]); \
|
|
|
|
const LEO_M128 T0_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[0]); \
|
|
|
|
const LEO_M128 T1_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[1]); \
|
|
|
|
const LEO_M128 T2_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[2]); \
|
|
|
|
const LEO_M128 T3_hi = _mm_loadu_si128(&Multiply128LUT[log_m].Hi[3]);
|
|
|
|
|
|
|
|
LEO_MUL_TABLES_128();
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
|
|
|
|
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(y);
|
2017-05-27 02:51:30 +00:00
|
|
|
|
|
|
|
do
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
#define LEO_MUL_128(x_ptr, y_ptr) { \
|
2017-05-30 09:23:33 +00:00
|
|
|
const LEO_M128 A_lo = _mm_loadu_si128(y_ptr); \
|
2017-05-30 08:37:27 +00:00
|
|
|
const LEO_M128 A_hi = _mm_loadu_si128(y_ptr + 2); \
|
|
|
|
LEO_M128 data_0 = _mm_and_si128(A_lo, clr_mask); \
|
|
|
|
LEO_M128 data_1 = _mm_srli_epi64(A_lo, 4); \
|
|
|
|
data_1 = _mm_and_si128(data_1, clr_mask); \
|
|
|
|
LEO_M128 data_2 = _mm_and_si128(A_hi, clr_mask); \
|
|
|
|
LEO_M128 data_3 = _mm_srli_epi64(A_hi, 4); \
|
|
|
|
data_3 = _mm_and_si128(data_3, clr_mask); \
|
|
|
|
LEO_M128 output_lo = _mm_shuffle_epi8(T0_lo, data_0); \
|
|
|
|
output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T1_lo, data_1)); \
|
|
|
|
output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T2_lo, data_2)); \
|
|
|
|
output_lo = _mm_xor_si128(output_lo, _mm_shuffle_epi8(T3_lo, data_3)); \
|
|
|
|
LEO_M128 output_hi = _mm_shuffle_epi8(T0_hi, data_0); \
|
|
|
|
output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T1_hi, data_1)); \
|
|
|
|
output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T2_hi, data_2)); \
|
|
|
|
output_hi = _mm_xor_si128(output_hi, _mm_shuffle_epi8(T3_hi, data_3)); \
|
|
|
|
_mm_storeu_si128(x_ptr, output_lo); \
|
|
|
|
_mm_storeu_si128(x_ptr + 2, output_hi); }
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
LEO_MUL_128(x16 + 1, y16 + 1);
|
|
|
|
LEO_MUL_128(x16, y16);
|
2017-05-27 02:51:30 +00:00
|
|
|
x16 += 4, y16 += 4;
|
2017-05-29 22:01:01 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
bytes -= 64;
|
|
|
|
} while (bytes > 0);
|
|
|
|
}
|
|
|
|
|
2017-05-18 03:06:13 +00:00
|
|
|
|
|
|
|
//------------------------------------------------------------------------------
|
2017-05-27 02:51:30 +00:00
|
|
|
// FFT Operations
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
void fft_butterfly(
|
|
|
|
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
2017-05-29 22:01:01 +00:00
|
|
|
ffe_t log_m, uint64_t bytes)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
#if defined(LEO_TRY_AVX2)
|
|
|
|
if (CpuHasAVX2)
|
|
|
|
{
|
2017-05-30 08:37:27 +00:00
|
|
|
LEO_MUL_TABLES_256();
|
2017-05-29 22:01:01 +00:00
|
|
|
|
|
|
|
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
|
|
|
|
|
|
|
|
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(x);
|
|
|
|
LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<LEO_M256 *>(y);
|
|
|
|
|
|
|
|
do
|
|
|
|
{
|
|
|
|
#define LEO_FFTB_256(x_ptr, y_ptr) { \
|
2017-05-30 09:23:33 +00:00
|
|
|
LEO_M256 y_lo = _mm256_loadu_si256(y_ptr); \
|
2017-05-30 08:37:27 +00:00
|
|
|
LEO_M256 data_0 = _mm256_and_si256(y_lo, clr_mask); \
|
|
|
|
LEO_M256 data_1 = _mm256_srli_epi64(y_lo, 4); \
|
|
|
|
data_1 = _mm256_and_si256(data_1, clr_mask); \
|
|
|
|
LEO_M256 prod_lo = _mm256_shuffle_epi8(T0_lo, data_0); \
|
|
|
|
prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \
|
|
|
|
LEO_M256 prod_hi = _mm256_shuffle_epi8(T0_hi, data_0); \
|
|
|
|
prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \
|
|
|
|
LEO_M256 y_hi = _mm256_loadu_si256(y_ptr + 1); \
|
|
|
|
data_0 = _mm256_and_si256(y_hi, clr_mask); \
|
|
|
|
data_1 = _mm256_srli_epi64(y_hi, 4); \
|
|
|
|
data_1 = _mm256_and_si256(data_1, clr_mask); \
|
|
|
|
prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T2_lo, data_0)); \
|
|
|
|
prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T3_lo, data_1)); \
|
|
|
|
prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T2_hi, data_0)); \
|
|
|
|
prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T3_hi, data_1)); \
|
2017-05-30 09:23:33 +00:00
|
|
|
LEO_M256 x_lo = _mm256_loadu_si256(x_ptr); \
|
2017-05-30 08:37:27 +00:00
|
|
|
LEO_M256 x_hi = _mm256_loadu_si256(x_ptr + 1); \
|
|
|
|
x_lo = _mm256_xor_si256(prod_lo, x_lo); \
|
|
|
|
_mm256_storeu_si256(x_ptr, x_lo); \
|
|
|
|
x_hi = _mm256_xor_si256(prod_hi, x_hi); \
|
|
|
|
_mm256_storeu_si256(x_ptr + 1, x_hi); \
|
|
|
|
y_lo = _mm256_xor_si256(y_lo, x_lo); \
|
|
|
|
_mm256_storeu_si256(y_ptr, y_lo); \
|
|
|
|
y_hi = _mm256_xor_si256(y_hi, x_hi); \
|
|
|
|
_mm256_storeu_si256(y_ptr + 1, y_hi); }
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
LEO_FFTB_256(x32, y32);
|
|
|
|
y32 += 2, x32 += 2;
|
|
|
|
|
|
|
|
bytes -= 64;
|
|
|
|
} while (bytes > 0);
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // LEO_TRY_AVX2
|
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
LEO_MUL_TABLES_128();
|
2017-05-29 22:01:01 +00:00
|
|
|
|
|
|
|
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
|
|
|
|
|
|
|
|
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
|
|
|
|
LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<LEO_M128 *>(y);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
do
|
|
|
|
{
|
|
|
|
#define LEO_FFTB_128(x_ptr, y_ptr) { \
|
2017-05-30 09:23:33 +00:00
|
|
|
LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
|
2017-05-30 08:37:27 +00:00
|
|
|
LEO_M128 data_0 = _mm_and_si128(y_lo, clr_mask); \
|
|
|
|
LEO_M128 data_1 = _mm_srli_epi64(y_lo, 4); \
|
|
|
|
data_1 = _mm_and_si128(data_1, clr_mask); \
|
|
|
|
LEO_M128 prod_lo = _mm_shuffle_epi8(T0_lo, data_0); \
|
|
|
|
prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T1_lo, data_1)); \
|
|
|
|
LEO_M128 prod_hi = _mm_shuffle_epi8(T0_hi, data_0); \
|
|
|
|
prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T1_hi, data_1)); \
|
|
|
|
LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \
|
|
|
|
data_0 = _mm_and_si128(y_hi, clr_mask); \
|
|
|
|
data_1 = _mm_srli_epi64(y_hi, 4); \
|
|
|
|
data_1 = _mm_and_si128(data_1, clr_mask); \
|
|
|
|
prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T2_lo, data_0)); \
|
|
|
|
prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T3_lo, data_1)); \
|
|
|
|
prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T2_hi, data_0)); \
|
|
|
|
prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T3_hi, data_1)); \
|
2017-05-30 09:23:33 +00:00
|
|
|
LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
|
2017-05-30 08:37:27 +00:00
|
|
|
LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
|
|
|
|
x_lo = _mm_xor_si128(prod_lo, x_lo); \
|
|
|
|
_mm_storeu_si128(x_ptr, x_lo); \
|
|
|
|
x_hi = _mm_xor_si128(prod_hi, x_hi); \
|
|
|
|
_mm_storeu_si128(x_ptr + 2, x_hi); \
|
|
|
|
y_lo = _mm_xor_si128(y_lo, x_lo); \
|
|
|
|
_mm_storeu_si128(y_ptr, y_lo); \
|
|
|
|
y_hi = _mm_xor_si128(y_hi, x_hi); \
|
|
|
|
_mm_storeu_si128(y_ptr + 2, y_hi); }
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
LEO_FFTB_128(x16 + 1, y16 + 1);
|
|
|
|
LEO_FFTB_128(x16, y16);
|
|
|
|
x16 += 4, y16 += 4;
|
|
|
|
|
|
|
|
bytes -= 64;
|
|
|
|
} while (bytes > 0);
|
2017-05-27 02:51:30 +00:00
|
|
|
}
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
#ifdef LEO_USE_VECTOR4_OPT
|
|
|
|
|
2017-05-27 03:30:48 +00:00
|
|
|
void fft_butterfly4(
|
2017-05-27 02:51:30 +00:00
|
|
|
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
|
|
|
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
|
|
|
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
2017-05-27 03:30:48 +00:00
|
|
|
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
|
2017-05-29 22:01:01 +00:00
|
|
|
ffe_t log_m, uint64_t bytes)
|
2017-05-27 02:51:30 +00:00
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
#if defined(LEO_TRY_AVX2)
|
|
|
|
if (CpuHasAVX2)
|
|
|
|
{
|
2017-05-30 09:05:41 +00:00
|
|
|
LEO_MUL_TABLES_256();
|
2017-05-29 22:01:01 +00:00
|
|
|
|
|
|
|
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(x_0);
|
|
|
|
LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<LEO_M256 *>(y_0);
|
|
|
|
LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(x_1);
|
|
|
|
LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<LEO_M256 *>(y_1);
|
|
|
|
LEO_M256 * LEO_RESTRICT x32_2 = reinterpret_cast<LEO_M256 *>(x_2);
|
|
|
|
LEO_M256 * LEO_RESTRICT y32_2 = reinterpret_cast<LEO_M256 *>(y_2);
|
|
|
|
LEO_M256 * LEO_RESTRICT x32_3 = reinterpret_cast<LEO_M256 *>(x_3);
|
|
|
|
LEO_M256 * LEO_RESTRICT y32_3 = reinterpret_cast<LEO_M256 *>(y_3);
|
|
|
|
|
|
|
|
do
|
|
|
|
{
|
|
|
|
LEO_FFTB_256(x32_0, y32_0);
|
|
|
|
y32_0 += 2, x32_0 += 2;
|
|
|
|
|
|
|
|
LEO_FFTB_256(x32_1, y32_1);
|
|
|
|
y32_1 += 2, x32_1 += 2;
|
|
|
|
|
|
|
|
LEO_FFTB_256(x32_2, y32_2);
|
|
|
|
y32_2 += 2, x32_2 += 2;
|
|
|
|
|
|
|
|
LEO_FFTB_256(x32_3, y32_3);
|
|
|
|
y32_3 += 2, x32_3 += 2;
|
|
|
|
|
|
|
|
bytes -= 64;
|
|
|
|
} while (bytes > 0);
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // LEO_TRY_AVX2
|
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
LEO_MUL_TABLES_128();
|
2017-05-29 22:01:01 +00:00
|
|
|
|
|
|
|
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
|
|
|
|
|
|
|
|
LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(x_0);
|
|
|
|
LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<LEO_M128 *>(y_0);
|
|
|
|
LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(x_1);
|
|
|
|
LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<LEO_M128 *>(y_1);
|
|
|
|
LEO_M128 * LEO_RESTRICT x16_2 = reinterpret_cast<LEO_M128 *>(x_2);
|
|
|
|
LEO_M128 * LEO_RESTRICT y16_2 = reinterpret_cast<LEO_M128 *>(y_2);
|
|
|
|
LEO_M128 * LEO_RESTRICT x16_3 = reinterpret_cast<LEO_M128 *>(x_3);
|
|
|
|
LEO_M128 * LEO_RESTRICT y16_3 = reinterpret_cast<LEO_M128 *>(y_3);
|
|
|
|
|
|
|
|
do
|
|
|
|
{
|
|
|
|
LEO_FFTB_128(x16_0 + 1, y16_0 + 1);
|
|
|
|
LEO_FFTB_128(x16_0, y16_0);
|
|
|
|
x16_0 += 4, y16_0 += 4;
|
|
|
|
|
|
|
|
LEO_FFTB_128(x16_1 + 1, y16_1 + 1);
|
|
|
|
LEO_FFTB_128(x16_1, y16_1);
|
|
|
|
x16_1 += 4, y16_1 += 4;
|
|
|
|
|
|
|
|
LEO_FFTB_128(x16_2 + 1, y16_2 + 1);
|
|
|
|
LEO_FFTB_128(x16_2, y16_2);
|
|
|
|
x16_2 += 4, y16_2 += 4;
|
|
|
|
|
|
|
|
LEO_FFTB_128(x16_3 + 1, y16_3 + 1);
|
|
|
|
LEO_FFTB_128(x16_3, y16_3);
|
|
|
|
x16_3 += 4, y16_3 += 4;
|
|
|
|
|
|
|
|
bytes -= 64;
|
|
|
|
} while (bytes > 0);
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
#endif // LEO_USE_VECTOR4_OPT
|
|
|
|
|
2017-05-18 03:06:13 +00:00
|
|
|
|
|
|
|
//------------------------------------------------------------------------------
|
2017-05-27 02:51:30 +00:00
|
|
|
// IFFT Operations
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
void ifft_butterfly(
|
|
|
|
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
2017-05-29 22:01:01 +00:00
|
|
|
ffe_t log_m, uint64_t bytes)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
#if defined(LEO_TRY_AVX2)
|
|
|
|
if (CpuHasAVX2)
|
|
|
|
{
|
2017-05-30 08:37:27 +00:00
|
|
|
LEO_MUL_TABLES_256();
|
2017-05-29 22:01:01 +00:00
|
|
|
|
|
|
|
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
|
|
|
|
|
|
|
|
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(x);
|
|
|
|
LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<LEO_M256 *>(y);
|
|
|
|
|
|
|
|
do
|
|
|
|
{
|
|
|
|
#define LEO_IFFTB_256(x_ptr, y_ptr) { \
|
2017-05-30 09:23:33 +00:00
|
|
|
LEO_M256 x_lo = _mm256_loadu_si256(x_ptr); \
|
|
|
|
LEO_M256 y_lo = _mm256_loadu_si256(y_ptr); \
|
2017-05-30 08:37:27 +00:00
|
|
|
y_lo = _mm256_xor_si256(y_lo, x_lo); \
|
|
|
|
_mm256_storeu_si256(y_ptr, y_lo); \
|
|
|
|
LEO_M256 data_0 = _mm256_and_si256(y_lo, clr_mask); \
|
|
|
|
LEO_M256 data_1 = _mm256_srli_epi64(y_lo, 4); \
|
|
|
|
data_1 = _mm256_and_si256(data_1, clr_mask); \
|
|
|
|
LEO_M256 prod_lo = _mm256_shuffle_epi8(T0_lo, data_0); \
|
|
|
|
prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T1_lo, data_1)); \
|
|
|
|
LEO_M256 prod_hi = _mm256_shuffle_epi8(T0_hi, data_0); \
|
|
|
|
prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T1_hi, data_1)); \
|
|
|
|
LEO_M256 x_hi = _mm256_loadu_si256(x_ptr + 1); \
|
|
|
|
LEO_M256 y_hi = _mm256_loadu_si256(y_ptr + 1); \
|
|
|
|
y_hi = _mm256_xor_si256(y_hi, x_hi); \
|
|
|
|
_mm256_storeu_si256(y_ptr + 1, y_hi); \
|
|
|
|
data_0 = _mm256_and_si256(y_hi, clr_mask); \
|
|
|
|
data_1 = _mm256_srli_epi64(y_hi, 4); \
|
|
|
|
data_1 = _mm256_and_si256(data_1, clr_mask); \
|
|
|
|
prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T2_lo, data_0)); \
|
|
|
|
prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T3_lo, data_1)); \
|
|
|
|
prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T2_hi, data_0)); \
|
|
|
|
prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T3_hi, data_1)); \
|
|
|
|
x_lo = _mm256_xor_si256(prod_lo, x_lo); \
|
|
|
|
_mm256_storeu_si256(x_ptr, x_lo); \
|
|
|
|
x_hi = _mm256_xor_si256(prod_hi, x_hi); \
|
|
|
|
_mm256_storeu_si256(x_ptr + 1, x_hi); }
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
LEO_IFFTB_256(x32, y32);
|
|
|
|
y32 += 2, x32 += 2;
|
|
|
|
|
|
|
|
bytes -= 64;
|
|
|
|
} while (bytes > 0);
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // LEO_TRY_AVX2
|
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
LEO_MUL_TABLES_128();
|
2017-05-29 22:01:01 +00:00
|
|
|
|
|
|
|
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
|
|
|
|
|
|
|
|
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
|
|
|
|
LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<LEO_M128 *>(y);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
do
|
|
|
|
{
|
|
|
|
#define LEO_IFFTB_128(x_ptr, y_ptr) { \
|
2017-05-30 09:23:33 +00:00
|
|
|
LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
|
|
|
|
LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
|
2017-05-30 08:37:27 +00:00
|
|
|
y_lo = _mm_xor_si128(y_lo, x_lo); \
|
|
|
|
_mm_storeu_si128(y_ptr, y_lo); \
|
|
|
|
LEO_M128 data_0 = _mm_and_si128(y_lo, clr_mask); \
|
|
|
|
LEO_M128 data_1 = _mm_srli_epi64(y_lo, 4); \
|
|
|
|
data_1 = _mm_and_si128(data_1, clr_mask); \
|
|
|
|
LEO_M128 prod_lo = _mm_shuffle_epi8(T0_lo, data_0); \
|
|
|
|
prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T1_lo, data_1)); \
|
|
|
|
LEO_M128 prod_hi = _mm_shuffle_epi8(T0_hi, data_0); \
|
|
|
|
prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T1_hi, data_1)); \
|
|
|
|
LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
|
|
|
|
LEO_M128 y_hi = _mm_loadu_si128(y_ptr + 2); \
|
|
|
|
y_hi = _mm_xor_si128(y_hi, x_hi); \
|
|
|
|
_mm_storeu_si128(y_ptr + 2, y_hi); \
|
|
|
|
data_0 = _mm_and_si128(y_hi, clr_mask); \
|
|
|
|
data_1 = _mm_srli_epi64(y_hi, 4); \
|
|
|
|
data_1 = _mm_and_si128(data_1, clr_mask); \
|
|
|
|
prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T2_lo, data_0)); \
|
|
|
|
prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T3_lo, data_1)); \
|
|
|
|
prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T2_hi, data_0)); \
|
|
|
|
prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T3_hi, data_1)); \
|
|
|
|
x_lo = _mm_xor_si128(prod_lo, x_lo); \
|
|
|
|
_mm_storeu_si128(x_ptr, x_lo); \
|
|
|
|
x_hi = _mm_xor_si128(prod_hi, x_hi); \
|
|
|
|
_mm_storeu_si128(x_ptr + 2, x_hi); }
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
LEO_IFFTB_128(x16 + 1, y16 + 1);
|
|
|
|
LEO_IFFTB_128(x16, y16);
|
|
|
|
x16 += 4, y16 += 4;
|
|
|
|
|
|
|
|
bytes -= 64;
|
|
|
|
} while (bytes > 0);
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
#ifdef LEO_USE_VECTOR4_OPT
|
|
|
|
|
2017-05-27 03:30:48 +00:00
|
|
|
void ifft_butterfly4(
|
2017-05-27 02:51:30 +00:00
|
|
|
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
|
|
|
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
|
|
|
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
2017-05-27 03:30:48 +00:00
|
|
|
void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
|
2017-05-29 22:01:01 +00:00
|
|
|
ffe_t log_m, uint64_t bytes)
|
2017-05-27 02:51:30 +00:00
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
#if defined(LEO_TRY_AVX2)
|
|
|
|
if (CpuHasAVX2)
|
|
|
|
{
|
2017-05-30 09:05:41 +00:00
|
|
|
LEO_MUL_TABLES_256();
|
2017-05-29 22:01:01 +00:00
|
|
|
|
|
|
|
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
|
|
|
|
|
|
|
|
LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(x_0);
|
|
|
|
LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<LEO_M256 *>(y_0);
|
|
|
|
LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(x_1);
|
|
|
|
LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<LEO_M256 *>(y_1);
|
|
|
|
LEO_M256 * LEO_RESTRICT x32_2 = reinterpret_cast<LEO_M256 *>(x_2);
|
|
|
|
LEO_M256 * LEO_RESTRICT y32_2 = reinterpret_cast<LEO_M256 *>(y_2);
|
|
|
|
LEO_M256 * LEO_RESTRICT x32_3 = reinterpret_cast<LEO_M256 *>(x_3);
|
|
|
|
LEO_M256 * LEO_RESTRICT y32_3 = reinterpret_cast<LEO_M256 *>(y_3);
|
|
|
|
|
|
|
|
do
|
|
|
|
{
|
|
|
|
LEO_IFFTB_256(x32_0, y32_0);
|
|
|
|
y32_0 += 2, x32_0 += 2;
|
|
|
|
|
|
|
|
LEO_IFFTB_256(x32_1, y32_1);
|
|
|
|
y32_1 += 2, x32_1 += 2;
|
|
|
|
|
|
|
|
LEO_IFFTB_256(x32_2, y32_2);
|
|
|
|
y32_2 += 2, x32_2 += 2;
|
|
|
|
|
|
|
|
LEO_IFFTB_256(x32_3, y32_3);
|
|
|
|
y32_3 += 2, x32_3 += 2;
|
|
|
|
|
|
|
|
bytes -= 64;
|
|
|
|
} while (bytes > 0);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // LEO_TRY_AVX2
|
|
|
|
|
2017-05-30 09:05:41 +00:00
|
|
|
LEO_MUL_TABLES_128();
|
2017-05-29 22:01:01 +00:00
|
|
|
|
|
|
|
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
|
|
|
|
|
|
|
|
LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(x_0);
|
|
|
|
LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<LEO_M128 *>(y_0);
|
|
|
|
LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(x_1);
|
|
|
|
LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<LEO_M128 *>(y_1);
|
|
|
|
LEO_M128 * LEO_RESTRICT x16_2 = reinterpret_cast<LEO_M128 *>(x_2);
|
|
|
|
LEO_M128 * LEO_RESTRICT y16_2 = reinterpret_cast<LEO_M128 *>(y_2);
|
|
|
|
LEO_M128 * LEO_RESTRICT x16_3 = reinterpret_cast<LEO_M128 *>(x_3);
|
|
|
|
LEO_M128 * LEO_RESTRICT y16_3 = reinterpret_cast<LEO_M128 *>(y_3);
|
|
|
|
|
|
|
|
do
|
|
|
|
{
|
|
|
|
LEO_IFFTB_128(x16_0 + 1, y16_0 + 1);
|
|
|
|
LEO_IFFTB_128(x16_0, y16_0);
|
|
|
|
x16_0 += 4, y16_0 += 4;
|
|
|
|
|
|
|
|
LEO_IFFTB_128(x16_1 + 1, y16_1 + 1);
|
|
|
|
LEO_IFFTB_128(x16_1, y16_1);
|
|
|
|
x16_1 += 4, y16_1 += 4;
|
|
|
|
|
|
|
|
LEO_IFFTB_128(x16_2 + 1, y16_2 + 1);
|
|
|
|
LEO_IFFTB_128(x16_2, y16_2);
|
|
|
|
x16_2 += 4, y16_2 += 4;
|
|
|
|
|
|
|
|
LEO_IFFTB_128(x16_3 + 1, y16_3 + 1);
|
|
|
|
LEO_IFFTB_128(x16_3, y16_3);
|
|
|
|
x16_3 += 4, y16_3 += 4;
|
|
|
|
|
|
|
|
bytes -= 64;
|
|
|
|
} while (bytes > 0);
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
#endif // LEO_USE_VECTOR4_OPT
|
|
|
|
|
2017-05-18 03:06:13 +00:00
|
|
|
|
|
|
|
//------------------------------------------------------------------------------
|
2017-05-27 02:51:30 +00:00
|
|
|
// FFT
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
// Twisted factors used in FFT
|
|
|
|
static ffe_t FFTSkew[kModulus];
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
// Factors used in the evaluation of the error locator polynomial
|
|
|
|
static ffe_t LogWalsh[kOrder];
|
|
|
|
|
2017-05-31 08:11:20 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
static void FFTInitialize()
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-27 02:51:30 +00:00
|
|
|
ffe_t temp[kBits - 1];
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
// Generate FFT skew vector {1}:
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned i = 1; i < kBits; ++i)
|
2017-05-29 22:01:01 +00:00
|
|
|
temp[i - 1] = static_cast<ffe_t>(1UL << i);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned m = 0; m < (kBits - 1); ++m)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
const unsigned step = 1UL << (m + 1);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
FFTSkew[(1UL << m) - 1] = 0;
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned i = m; i < (kBits - 1); ++i)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
const unsigned s = (1UL << (i + 1));
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
for (unsigned j = (1UL << m) - 1; j < s; j += step)
|
2017-05-27 02:51:30 +00:00
|
|
|
FFTSkew[j + s] = FFTSkew[j] ^ temp[i];
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
temp[m] = kModulus - LogLUT[MultiplyLog(temp[m], LogLUT[temp[m] ^ 1])];
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned i = m + 1; i < (kBits - 1); ++i)
|
2017-05-29 22:01:01 +00:00
|
|
|
{
|
|
|
|
const ffe_t sum = AddMod(LogLUT[temp[i] ^ 1], temp[m]);
|
|
|
|
temp[i] = MultiplyLog(temp[i], sum);
|
|
|
|
}
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
|
2017-06-02 15:12:03 +00:00
|
|
|
for (unsigned i = 0; i < kModulus; ++i)
|
2017-05-27 02:51:30 +00:00
|
|
|
FFTSkew[i] = LogLUT[FFTSkew[i]];
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
// Precalculate FWHT(Log[i]):
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned i = 0; i < kOrder; ++i)
|
|
|
|
LogWalsh[i] = LogLUT[i];
|
|
|
|
LogWalsh[0] = 0;
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-06-03 23:23:49 +00:00
|
|
|
FWHT(LogWalsh, kOrder, kOrder);
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
void VectorFFTButterfly(
|
|
|
|
const uint64_t bytes,
|
|
|
|
unsigned count,
|
|
|
|
void** x,
|
|
|
|
void** y,
|
|
|
|
const ffe_t log_m)
|
|
|
|
{
|
|
|
|
if (log_m == kModulus)
|
|
|
|
{
|
|
|
|
VectorXOR(bytes, count, y, x);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef LEO_USE_VECTOR4_OPT
|
|
|
|
while (count >= 4)
|
|
|
|
{
|
|
|
|
fft_butterfly4(
|
|
|
|
x[0], y[0],
|
|
|
|
x[1], y[1],
|
|
|
|
x[2], y[2],
|
|
|
|
x[3], y[3],
|
|
|
|
log_m, bytes);
|
|
|
|
x += 4, y += 4;
|
|
|
|
count -= 4;
|
|
|
|
}
|
|
|
|
#endif // LEO_USE_VECTOR4_OPT
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < count; ++i)
|
|
|
|
fft_butterfly(x[i], y[i], log_m, bytes);
|
|
|
|
}
|
|
|
|
|
|
|
|
void VectorIFFTButterfly(
|
|
|
|
const uint64_t bytes,
|
|
|
|
unsigned count,
|
|
|
|
void** x,
|
|
|
|
void** y,
|
|
|
|
const ffe_t log_m)
|
|
|
|
{
|
|
|
|
if (log_m == kModulus)
|
|
|
|
{
|
|
|
|
VectorXOR(bytes, count, y, x);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef LEO_USE_VECTOR4_OPT
|
|
|
|
while (count >= 4)
|
|
|
|
{
|
|
|
|
ifft_butterfly4(
|
|
|
|
x[0], y[0],
|
|
|
|
x[1], y[1],
|
|
|
|
x[2], y[2],
|
|
|
|
x[3], y[3],
|
|
|
|
log_m, bytes);
|
|
|
|
x += 4, y += 4;
|
|
|
|
count -= 4;
|
|
|
|
}
|
|
|
|
#endif // LEO_USE_VECTOR4_OPT
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < count; ++i)
|
|
|
|
ifft_butterfly(x[i], y[i], log_m, bytes);
|
|
|
|
}
|
|
|
|
|
2017-05-18 03:06:13 +00:00
|
|
|
|
|
|
|
//------------------------------------------------------------------------------
|
2017-05-29 22:01:01 +00:00
|
|
|
// Reed-Solomon Encode
|
2017-05-27 02:51:30 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
void ReedSolomonEncode(
|
2017-05-27 02:51:30 +00:00
|
|
|
uint64_t buffer_bytes,
|
|
|
|
unsigned original_count,
|
|
|
|
unsigned recovery_count,
|
|
|
|
unsigned m,
|
2017-06-02 15:43:54 +00:00
|
|
|
const void* const * data,
|
2017-05-27 02:51:30 +00:00
|
|
|
void** work)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-27 02:51:30 +00:00
|
|
|
// work <- data
|
|
|
|
|
2017-05-28 20:50:32 +00:00
|
|
|
// TBD: Unroll first loop to eliminate this
|
2017-05-29 22:01:01 +00:00
|
|
|
unsigned first_end = m;
|
|
|
|
if (original_count < m)
|
|
|
|
{
|
|
|
|
first_end = original_count;
|
|
|
|
for (unsigned i = original_count; i < m; ++i)
|
|
|
|
memset(work[i], 0, buffer_bytes);
|
|
|
|
}
|
|
|
|
for (unsigned i = 0; i < first_end; ++i)
|
2017-05-27 02:51:30 +00:00
|
|
|
memcpy(work[i], data[i], buffer_bytes);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// work <- IFFT(data, m, m)
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned width = 1; width < m; width <<= 1)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
const unsigned range = width << 1;
|
|
|
|
const ffe_t* skewLUT = FFTSkew + width + m - 1;
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
#ifdef LEO_SCHEDULE_OPT
|
|
|
|
for (unsigned j = 0; j < first_end; j += range)
|
|
|
|
#else
|
|
|
|
for (unsigned j = 0; j < m; j += range)
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
VectorIFFTButterfly(
|
|
|
|
buffer_bytes,
|
|
|
|
width,
|
|
|
|
work + j,
|
|
|
|
work + j + width,
|
|
|
|
skewLUT[j]);
|
2017-05-27 02:51:30 +00:00
|
|
|
}
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
|
2017-06-02 15:12:03 +00:00
|
|
|
const unsigned last_count = original_count % m;
|
2017-05-29 22:01:01 +00:00
|
|
|
if (m >= original_count)
|
|
|
|
goto skip_body;
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned i = m; i + m <= original_count; i += m)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-27 02:51:30 +00:00
|
|
|
// temp <- data + i
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
data += m;
|
2017-05-27 02:51:30 +00:00
|
|
|
void** temp = work + m;
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-28 20:50:32 +00:00
|
|
|
// TBD: Unroll first loop to eliminate this
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned j = 0; j < m; ++j)
|
|
|
|
memcpy(temp[j], data[j], buffer_bytes);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// temp <- IFFT(temp, m, m + i)
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
const ffe_t* skewLUT = FFTSkew + m + i - 1;
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned width = 1; width < m; width <<= 1)
|
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
const unsigned range = width << 1;
|
|
|
|
|
|
|
|
for (unsigned j = width; j < m; j += range)
|
2017-05-27 02:51:30 +00:00
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
VectorIFFTButterfly(
|
|
|
|
buffer_bytes,
|
|
|
|
width,
|
|
|
|
temp + j - width,
|
|
|
|
temp + j,
|
|
|
|
skewLUT[j]);
|
2017-05-27 02:51:30 +00:00
|
|
|
}
|
|
|
|
}
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// work <- work XOR temp
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-28 20:50:32 +00:00
|
|
|
// TBD: Unroll last loop to eliminate this
|
2017-05-29 22:01:01 +00:00
|
|
|
VectorXOR(
|
|
|
|
buffer_bytes,
|
|
|
|
m,
|
|
|
|
work,
|
|
|
|
temp);
|
2017-05-27 02:51:30 +00:00
|
|
|
}
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
if (last_count != 0)
|
|
|
|
{
|
|
|
|
const unsigned i = original_count - last_count;
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// temp <- data + i
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
data += m;
|
2017-05-27 02:51:30 +00:00
|
|
|
void** temp = work + m;
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned j = 0; j < last_count; ++j)
|
|
|
|
memcpy(temp[j], data[j], buffer_bytes);
|
|
|
|
for (unsigned j = last_count; j < m; ++j)
|
|
|
|
memset(temp[j], 0, buffer_bytes);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// temp <- IFFT(temp, m, m + i)
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned width = 1, shift = 1; width < m; width <<= 1, ++shift)
|
2017-05-24 08:23:19 +00:00
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
const unsigned range = width << 1;
|
|
|
|
const ffe_t* skewLUT = FFTSkew + width + m + i - 1;
|
2017-05-27 02:51:30 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
#ifdef LEO_SCHEDULE_OPT
|
|
|
|
// Calculate stop considering that the right is all zeroes
|
|
|
|
const unsigned stop = ((last_count + range - 1) >> shift) << shift;
|
|
|
|
for (unsigned j = 0; j < stop; j += range)
|
|
|
|
#else
|
|
|
|
for (unsigned j = 0; j < m; j += range)
|
|
|
|
#endif
|
2017-05-27 02:51:30 +00:00
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
VectorIFFTButterfly(
|
|
|
|
buffer_bytes,
|
|
|
|
width,
|
|
|
|
temp + j,
|
|
|
|
temp + j + width,
|
|
|
|
skewLUT[j]);
|
2017-05-27 02:51:30 +00:00
|
|
|
}
|
2017-05-24 08:23:19 +00:00
|
|
|
}
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// work <- work XOR temp
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-28 20:50:32 +00:00
|
|
|
// TBD: Unroll last loop to eliminate this
|
2017-05-29 22:01:01 +00:00
|
|
|
VectorXOR(
|
|
|
|
buffer_bytes,
|
|
|
|
m,
|
|
|
|
work,
|
|
|
|
temp);
|
2017-05-24 08:23:19 +00:00
|
|
|
}
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
skip_body:
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// work <- FFT(work, m, 0)
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned width = (m >> 1); width > 0; width >>= 1)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-27 02:51:30 +00:00
|
|
|
const ffe_t* skewLUT = FFTSkew + width - 1;
|
|
|
|
const unsigned range = width << 1;
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
#ifdef LEO_SCHEDULE_OPT
|
|
|
|
for (unsigned j = 0; j < recovery_count; j += range)
|
|
|
|
#else
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned j = 0; j < m; j += range)
|
2017-05-29 22:01:01 +00:00
|
|
|
#endif
|
2017-05-24 08:23:19 +00:00
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
VectorFFTButterfly(
|
|
|
|
buffer_bytes,
|
|
|
|
width,
|
|
|
|
work + j,
|
|
|
|
work + j + width,
|
|
|
|
skewLUT[j]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2017-05-27 02:51:30 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
|
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
// ErrorBitfield
|
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
#ifdef LEO_ERROR_BITFIELD_OPT
|
2017-05-29 22:01:01 +00:00
|
|
|
|
|
|
|
// Used in decoding to decide which final FFT operations to perform
|
|
|
|
class ErrorBitfield
|
|
|
|
{
|
2017-05-30 08:37:27 +00:00
|
|
|
static const unsigned kWordMips = 5;
|
2017-05-29 22:01:01 +00:00
|
|
|
static const unsigned kWords = kOrder / 64;
|
2017-05-30 08:37:27 +00:00
|
|
|
uint64_t Words[kWordMips][kWords] = {};
|
|
|
|
|
2017-05-30 08:49:48 +00:00
|
|
|
static const unsigned kBigMips = 6;
|
2017-05-30 08:37:27 +00:00
|
|
|
static const unsigned kBigWords = (kWords + 63) / 64;
|
|
|
|
uint64_t BigWords[kBigMips][kBigWords] = {};
|
|
|
|
|
2017-05-30 08:49:48 +00:00
|
|
|
static const unsigned kBiggestMips = 4;
|
2017-05-30 08:37:27 +00:00
|
|
|
uint64_t BiggestWords[kBiggestMips] = {};
|
2017-05-29 22:01:01 +00:00
|
|
|
|
|
|
|
public:
|
|
|
|
LEO_FORCE_INLINE void Set(unsigned i)
|
|
|
|
{
|
|
|
|
Words[0][i / 64] |= (uint64_t)1 << (i % 64);
|
|
|
|
}
|
|
|
|
|
|
|
|
void Prepare();
|
|
|
|
|
|
|
|
LEO_FORCE_INLINE bool IsNeeded(unsigned mip_level, unsigned bit) const
|
|
|
|
{
|
2017-05-30 08:37:27 +00:00
|
|
|
if (mip_level >= 16)
|
2017-05-29 22:01:01 +00:00
|
|
|
return true;
|
2017-05-30 08:49:48 +00:00
|
|
|
if (mip_level >= 12)
|
2017-05-30 08:37:27 +00:00
|
|
|
{
|
|
|
|
bit /= 4096;
|
2017-05-30 08:49:48 +00:00
|
|
|
return 0 != (BiggestWords[mip_level - 12] & ((uint64_t)1 << bit));
|
2017-05-30 08:37:27 +00:00
|
|
|
}
|
|
|
|
if (mip_level >= 6)
|
|
|
|
{
|
|
|
|
bit /= 64;
|
|
|
|
return 0 != (BigWords[mip_level - 6][bit / 64] & ((uint64_t)1 << (bit % 64)));
|
|
|
|
}
|
2017-05-29 22:01:01 +00:00
|
|
|
return 0 != (Words[mip_level - 1][bit / 64] & ((uint64_t)1 << (bit % 64)));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
static const uint64_t kHiMasks[5] = {
|
|
|
|
0xAAAAAAAAAAAAAAAAULL,
|
|
|
|
0xCCCCCCCCCCCCCCCCULL,
|
|
|
|
0xF0F0F0F0F0F0F0F0ULL,
|
|
|
|
0xFF00FF00FF00FF00ULL,
|
|
|
|
0xFFFF0000FFFF0000ULL,
|
|
|
|
};
|
|
|
|
|
|
|
|
void ErrorBitfield::Prepare()
|
|
|
|
{
|
|
|
|
// First mip level is for final layer of FFT: pairs of data
|
|
|
|
for (unsigned i = 0; i < kWords; ++i)
|
|
|
|
{
|
2017-05-30 08:37:27 +00:00
|
|
|
uint64_t w_i = Words[0][i];
|
|
|
|
const uint64_t hi2lo0 = w_i | ((w_i & kHiMasks[0]) >> 1);
|
|
|
|
const uint64_t lo2hi0 = ((w_i & (kHiMasks[0] >> 1)) << 1);
|
|
|
|
Words[0][i] = w_i = hi2lo0 | lo2hi0;
|
2017-05-29 22:01:01 +00:00
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
for (unsigned j = 1, bits = 2; j < kWordMips; ++j, bits <<= 1)
|
2017-05-29 22:01:01 +00:00
|
|
|
{
|
2017-05-30 08:37:27 +00:00
|
|
|
const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j]) >> bits);
|
|
|
|
const uint64_t lo2hi_j = ((w_i & (kHiMasks[j] >> bits)) << bits);
|
|
|
|
Words[j][i] = w_i = hi2lo_j | lo2hi_j;
|
2017-05-24 08:23:19 +00:00
|
|
|
}
|
|
|
|
}
|
2017-05-29 22:01:01 +00:00
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
for (unsigned i = 0; i < kBigWords; ++i)
|
2017-05-29 22:01:01 +00:00
|
|
|
{
|
2017-05-30 08:37:27 +00:00
|
|
|
uint64_t w_i = 0;
|
|
|
|
uint64_t bit = 1;
|
2017-05-30 08:49:48 +00:00
|
|
|
const uint64_t* src = &Words[kWordMips - 1][i * 64];
|
2017-05-30 08:37:27 +00:00
|
|
|
for (unsigned j = 0; j < 64; ++j, bit <<= 1)
|
|
|
|
{
|
|
|
|
const uint64_t w = src[j];
|
|
|
|
w_i |= (w | (w >> 32) | (w << 32)) & bit;
|
|
|
|
}
|
|
|
|
BigWords[0][i] = w_i;
|
|
|
|
|
|
|
|
for (unsigned j = 1, bits = 1; j < kBigMips; ++j, bits <<= 1)
|
|
|
|
{
|
|
|
|
const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j - 1]) >> bits);
|
|
|
|
const uint64_t lo2hi_j = ((w_i & (kHiMasks[j - 1] >> bits)) << bits);
|
|
|
|
BigWords[j][i] = w_i = hi2lo_j | lo2hi_j;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t w_i = 0;
|
|
|
|
uint64_t bit = 1;
|
2017-05-30 08:49:48 +00:00
|
|
|
const uint64_t* src = &BigWords[kBigMips - 1][0];
|
2017-05-30 08:37:27 +00:00
|
|
|
for (unsigned j = 0; j < kBigWords; ++j, bit <<= 1)
|
|
|
|
{
|
2017-05-30 08:49:48 +00:00
|
|
|
const uint64_t w = src[j];
|
2017-05-30 08:37:27 +00:00
|
|
|
w_i |= (w | (w >> 32) | (w << 32)) & bit;
|
2017-05-29 22:01:01 +00:00
|
|
|
}
|
2017-05-30 08:37:27 +00:00
|
|
|
BiggestWords[0] = w_i;
|
2017-05-29 22:01:01 +00:00
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
for (unsigned j = 1, bits = 1; j < kBiggestMips; ++j, bits <<= 1)
|
|
|
|
{
|
|
|
|
const uint64_t hi2lo_j = w_i | ((w_i & kHiMasks[j - 1]) >> bits);
|
|
|
|
const uint64_t lo2hi_j = ((w_i & (kHiMasks[j - 1] >> bits)) << bits);
|
|
|
|
BiggestWords[j] = w_i = hi2lo_j | lo2hi_j;
|
|
|
|
}
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
#endif // LEO_ERROR_BITFIELD_OPT
|
2017-05-29 22:01:01 +00:00
|
|
|
|
2017-05-18 03:06:13 +00:00
|
|
|
|
|
|
|
//------------------------------------------------------------------------------
|
2017-05-29 22:01:01 +00:00
|
|
|
// Reed-Solomon Decode
|
2017-05-27 02:51:30 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
void ReedSolomonDecode(
|
2017-05-27 02:51:30 +00:00
|
|
|
uint64_t buffer_bytes,
|
|
|
|
unsigned original_count,
|
|
|
|
unsigned recovery_count,
|
|
|
|
unsigned m, // NextPow2(recovery_count)
|
|
|
|
unsigned n, // NextPow2(m + original_count) = work_count
|
2017-06-02 15:43:54 +00:00
|
|
|
const void* const * const original, // original_count entries
|
|
|
|
const void* const * const recovery, // recovery_count entries
|
2017-05-27 02:51:30 +00:00
|
|
|
void** work) // n entries
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-27 02:51:30 +00:00
|
|
|
// Fill in error locations
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
#ifdef LEO_ERROR_BITFIELD_OPT
|
2017-05-29 22:01:01 +00:00
|
|
|
ErrorBitfield ErrorBits;
|
2017-05-30 08:37:27 +00:00
|
|
|
#endif // LEO_ERROR_BITFIELD_OPT
|
2017-05-29 22:01:01 +00:00
|
|
|
|
|
|
|
ffe_t ErrorLocations[kOrder] = {};
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned i = 0; i < recovery_count; ++i)
|
2017-05-29 22:01:01 +00:00
|
|
|
if (!recovery[i])
|
|
|
|
ErrorLocations[i] = 1;
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned i = recovery_count; i < m; ++i)
|
|
|
|
ErrorLocations[i] = 1;
|
|
|
|
for (unsigned i = 0; i < original_count; ++i)
|
2017-05-29 22:01:01 +00:00
|
|
|
{
|
|
|
|
if (!original[i])
|
|
|
|
{
|
|
|
|
ErrorLocations[i + m] = 1;
|
2017-05-30 08:37:27 +00:00
|
|
|
#ifdef LEO_ERROR_BITFIELD_OPT
|
2017-05-29 22:01:01 +00:00
|
|
|
ErrorBits.Set(i + m);
|
2017-05-30 08:37:27 +00:00
|
|
|
#endif // LEO_ERROR_BITFIELD_OPT
|
2017-05-29 22:01:01 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-05-30 08:37:27 +00:00
|
|
|
#ifdef LEO_ERROR_BITFIELD_OPT
|
2017-05-29 22:01:01 +00:00
|
|
|
ErrorBits.Prepare();
|
2017-05-30 08:37:27 +00:00
|
|
|
#endif // LEO_ERROR_BITFIELD_OPT
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// Evaluate error locator polynomial
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-06-03 23:23:49 +00:00
|
|
|
FWHT(ErrorLocations, kOrder, m + original_count);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned i = 0; i < kOrder; ++i)
|
2017-05-27 03:30:48 +00:00
|
|
|
ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kModulus;
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-06-03 23:23:49 +00:00
|
|
|
FWHT(ErrorLocations, kOrder, kOrder);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// work <- recovery data
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned i = 0; i < recovery_count; ++i)
|
|
|
|
{
|
|
|
|
if (recovery[i])
|
2017-05-29 22:01:01 +00:00
|
|
|
mul_mem(work[i], recovery[i], ErrorLocations[i], buffer_bytes);
|
2017-05-27 02:51:30 +00:00
|
|
|
else
|
|
|
|
memset(work[i], 0, buffer_bytes);
|
|
|
|
}
|
|
|
|
for (unsigned i = recovery_count; i < m; ++i)
|
|
|
|
memset(work[i], 0, buffer_bytes);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// work <- original data
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned i = 0; i < original_count; ++i)
|
|
|
|
{
|
|
|
|
if (original[i])
|
2017-05-29 22:01:01 +00:00
|
|
|
mul_mem(work[m + i], original[i], ErrorLocations[m + i], buffer_bytes);
|
2017-05-27 02:51:30 +00:00
|
|
|
else
|
|
|
|
memset(work[m + i], 0, buffer_bytes);
|
|
|
|
}
|
|
|
|
for (unsigned i = m + original_count; i < n; ++i)
|
|
|
|
memset(work[i], 0, buffer_bytes);
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// work <- IFFT(work, n, 0)
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
const unsigned input_count = m + original_count;
|
|
|
|
unsigned mip_level = 0;
|
|
|
|
|
|
|
|
for (unsigned width = 1; width < n; width <<= 1, ++mip_level)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-29 22:01:01 +00:00
|
|
|
const unsigned range = width << 1;
|
2017-05-27 02:51:30 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
for (unsigned j = width; j < n; j += range)
|
|
|
|
{
|
|
|
|
VectorIFFTButterfly(
|
|
|
|
buffer_bytes,
|
|
|
|
width,
|
|
|
|
work + j - width,
|
|
|
|
work + j,
|
|
|
|
FFTSkew[j - 1]);
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// work <- FormalDerivative(work, n)
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned i = 1; i < n; ++i)
|
|
|
|
{
|
|
|
|
const unsigned width = ((i ^ (i - 1)) + 1) >> 1;
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
VectorXOR(
|
|
|
|
buffer_bytes,
|
|
|
|
width,
|
|
|
|
work + i - width,
|
|
|
|
work + i);
|
2017-05-27 02:51:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// work <- FFT(work, n, 0) truncated to m + original_count
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
const unsigned output_count = m + original_count;
|
2017-05-29 22:01:01 +00:00
|
|
|
for (unsigned width = (n >> 1); width > 0; width >>= 1, --mip_level)
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-27 02:51:30 +00:00
|
|
|
const ffe_t* skewLUT = FFTSkew + width - 1;
|
|
|
|
const unsigned range = width << 1;
|
|
|
|
|
2017-05-29 22:01:01 +00:00
|
|
|
#ifdef LEO_SCHEDULE_OPT
|
2017-05-27 02:51:30 +00:00
|
|
|
for (unsigned j = (m < range) ? 0 : m; j < output_count; j += range)
|
2017-05-29 22:01:01 +00:00
|
|
|
#else
|
|
|
|
for (unsigned j = 0; j < n; j += range)
|
|
|
|
#endif
|
2017-05-18 03:06:13 +00:00
|
|
|
{
|
2017-05-30 08:37:27 +00:00
|
|
|
#ifdef LEO_ERROR_BITFIELD_OPT
|
2017-05-29 22:01:01 +00:00
|
|
|
if (!ErrorBits.IsNeeded(mip_level, j))
|
|
|
|
continue;
|
2017-05-30 08:37:27 +00:00
|
|
|
#endif // LEO_ERROR_BITFIELD_OPT
|
2017-05-29 22:01:01 +00:00
|
|
|
|
|
|
|
VectorFFTButterfly(
|
|
|
|
buffer_bytes,
|
|
|
|
width,
|
|
|
|
work + j,
|
|
|
|
work + j + width,
|
|
|
|
skewLUT[j]);
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
// Reveal erasures
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < original_count; ++i)
|
|
|
|
if (!original[i])
|
2017-05-29 22:01:01 +00:00
|
|
|
mul_mem(work[i], work[i + m], kModulus - ErrorLocations[i + m], buffer_bytes);
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------------
|
2017-05-27 02:51:30 +00:00
|
|
|
// API
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
static bool IsInitialized = false;
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
bool Initialize()
|
|
|
|
{
|
|
|
|
if (IsInitialized)
|
|
|
|
return true;
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
if (!CpuHasSSSE3)
|
|
|
|
return false;
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
InitializeLogarithmTables();
|
2017-05-29 22:01:01 +00:00
|
|
|
InitializeMultiplyTables();
|
2017-05-27 02:51:30 +00:00
|
|
|
FFTInitialize();
|
2017-05-18 03:06:13 +00:00
|
|
|
|
2017-05-27 02:51:30 +00:00
|
|
|
IsInitialized = true;
|
|
|
|
return true;
|
2017-05-18 03:06:13 +00:00
|
|
|
}
|
2017-05-27 02:51:30 +00:00
|
|
|
|
|
|
|
|
|
|
|
}} // namespace leopard::ff16
|
2017-05-27 03:10:53 +00:00
|
|
|
|
|
|
|
#endif // LEO_HAS_FF16
|