leopard/LeopardCommon.cpp

958 lines
26 KiB
C++
Raw Normal View History

2017-05-18 03:06:13 +00:00
/*
2017-05-25 09:24:15 +00:00
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Leopard-RS nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
2017-05-18 03:06:13 +00:00
*/
2017-05-25 09:24:15 +00:00
#include "LeopardCommon.h"
2017-05-18 03:06:13 +00:00
2017-05-25 09:24:15 +00:00
namespace leopard {
2017-05-18 03:06:13 +00:00
//------------------------------------------------------------------------------
// Runtime CPU Architecture Check
//
// Feature checks stolen shamelessly from
// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c
// Android builds query CPU features through the NDK cpufeatures helper.
#if defined(HAVE_ANDROID_GETCPUFEATURES)
#include <cpu-features.h>
#endif
2017-05-25 09:24:15 +00:00
// NEON capability flags. They start out false and are raised by
// InitializeCPUArch() when the Android feature query reports support.
#if defined(LEO_TRY_NEON)
2017-05-18 03:06:13 +00:00
# if defined(IOS) && defined(__ARM_NEON__)
2017-05-25 09:24:15 +00:00
// Requires iPhone 5S or newer
// NOTE(review): no CpuHasNeon/CpuHasNeon64 variables are defined in this
// branch; presumably they are provided elsewhere for iOS - confirm.
2017-05-18 03:06:13 +00:00
# else
2017-05-25 09:24:15 +00:00
// Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
bool CpuHasNeon = false; // V6 / V7
bool CpuHasNeon64 = false; // 64-bit
2017-05-18 03:06:13 +00:00
# endif
#endif
2017-05-25 09:24:15 +00:00
// Everything from here to the matching #endif (after _cpuid) is compiled only
// for non-mobile targets, where features are detected with the CPUID
// instruction.
#if !defined(LEO_TARGET_MOBILE)
2017-05-18 03:06:13 +00:00
#ifdef _MSC_VER
#include <intrin.h> // __cpuid
#pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
#endif
2017-05-25 09:24:15 +00:00
// Set by InitializeCPUArch() from bit CPUID_EBX_AVX2 of CPUID leaf 7 (EBX).
#ifdef LEO_TRY_AVX2
bool CpuHasAVX2 = false;
2017-05-18 03:06:13 +00:00
#endif
2017-05-25 09:24:15 +00:00
// Set by InitializeCPUArch() from bit CPUID_ECX_SSSE3 of CPUID leaf 1 (ECX).
bool CpuHasSSSE3 = false;
2017-05-18 03:06:13 +00:00
// Bit masks applied to the registers returned by _cpuid():
// CPUID_EBX_AVX2 is tested against cpu_info[1] (EBX) of leaf 7,
// CPUID_ECX_SSSE3 is tested against cpu_info[2] (ECX) of leaf 1.
#define CPUID_EBX_AVX2 0x00000020
#define CPUID_ECX_SSSE3 0x00000200
// Executes the x86 CPUID instruction for leaf `cpu_info_type` and stores the
// resulting EAX, EBX, ECX, EDX values in cpu_info[0..3].
//
// cpu_info       Output array of four register values.
// cpu_info_type  CPUID leaf number placed in EAX before the instruction.
//
// On the GCC-style paths ECX is loaded with 0 (the sub-leaf) and the output is
// zeroed first, so a 32-bit CPU without CPUID support yields all zeros.
static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type)
{
#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
// MSVC on x86/x64: use the compiler intrinsic.
__cpuid((int *) cpu_info, cpu_info_type);
#else //if defined(HAVE_CPUID)
// Default result when CPUID turns out to be unavailable.
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
# ifdef __i386__
// 32-bit only: try to flip bit 0x200000 of EFLAGS (the ID flag). The flags
// are read, the bit is toggled and written back, then the flags are read
// again and finally restored. If the bit did not change, CPUID is not
// supported and the zeroed result is returned.
__asm__ __volatile__ ("pushfl; pushfl; "
"popl %0; "
"movl %0, %1; xorl %2, %0; "
"pushl %0; "
"popfl; pushfl; popl %0; popfl" :
"=&r" (cpu_info[0]), "=&r" (cpu_info[1]) :
"i" (0x200000));
if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) {
return; /* LCOV_EXCL_LINE */
}
# endif
# ifdef __i386__
// EBX is swapped with a scratch register around CPUID so that the compiler
// never sees EBX clobbered (presumably because EBX can be reserved, e.g. as
// the PIC register - verify against the build flags).
__asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" :
"=a" (cpu_info[0]), "=&r" (cpu_info[1]),
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
"0" (cpu_info_type), "2" (0U));
# elif defined(__x86_64__)
// Same technique with the 64-bit RBX register.
__asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" :
"=a" (cpu_info[0]), "=&r" (cpu_info[1]),
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
"0" (cpu_info_type), "2" (0U));
# else
// Any other target: plain CPUID with EBX bound directly as an output.
__asm__ __volatile__ ("cpuid" :
"=a" (cpu_info[0]), "=b" (cpu_info[1]),
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
"0" (cpu_info_type), "2" (0U));
# endif
#endif
}
2017-05-25 09:24:15 +00:00
#endif // !defined(LEO_TARGET_MOBILE)
2017-05-18 03:06:13 +00:00
2017-05-25 09:24:15 +00:00
void InitializeCPUArch()
2017-05-18 03:06:13 +00:00
{
2017-05-25 09:24:15 +00:00
#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES)
2017-05-18 03:06:13 +00:00
AndroidCpuFamily family = android_getCpuFamily();
if (family == ANDROID_CPU_FAMILY_ARM)
{
if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON)
CpuHasNeon = true;
}
else if (family == ANDROID_CPU_FAMILY_ARM64)
{
CpuHasNeon = true;
if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD)
CpuHasNeon64 = true;
}
#endif
2017-05-25 09:24:15 +00:00
#if !defined(LEO_TARGET_MOBILE)
2017-05-18 03:06:13 +00:00
unsigned int cpu_info[4];
_cpuid(cpu_info, 1);
CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0);
2017-05-25 09:24:15 +00:00
#if defined(LEO_TRY_AVX2)
2017-05-18 03:06:13 +00:00
_cpuid(cpu_info, 7);
CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0);
2017-05-25 09:24:15 +00:00
#endif // LEO_TRY_AVX2
2017-05-18 03:06:13 +00:00
2017-05-25 09:24:15 +00:00
#endif // LEO_TARGET_MOBILE
2017-05-18 03:06:13 +00:00
}
// vx[] += vy[] * z
2017-05-25 09:24:15 +00:00
// Multiply-accumulate over the field: vx[i] ^= vy[i] * z for every symbol.
//
// vx           Destination symbols, updated in place.
// vy           Source symbols (must not alias vx).
// z            Multiplier given as a logarithm: it is added (mod Q) to the
//              log of each source term before exponentiation.
// symbolCount  Number of symbols to process.
//
// Each source symbol is split into its four 4-bit groups; every non-zero
// group is multiplied separately through the log/exp tables and the partial
// products are XORed into the destination.
static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount)
{
    static const unsigned kNibbleMasks[4] = { 0x000f, 0x00f0, 0x0f00, 0xf000 };

    for (unsigned idx = 0; idx < symbolCount; ++idx)
    {
        const GFSymbol src = vy[idx];
        if (src != 0)
        {
            GFSymbol acc = vx[idx];
            for (unsigned n = 0; n < 4; ++n)
            {
                const unsigned part = src & kNibbleMasks[n];
                const GFSymbol logSum = static_cast<GFSymbol>(AddModQ(GFLog[part], z));
                const GFSymbol product = GFExp[logSum];

                // A zero group contributes nothing to the product.
                if (part != 0)
                    acc ^= product;
            }
            vx[idx] = acc;
        }
    }
}
// return a*GFExp[b] over GF(2^r)
// Returns a * GFExp[b]: `a` is a field element, `b` is a logarithm.
// The result is 0 when `a` is 0; otherwise the logs are added (mod Q) and the
// sum is mapped back through the exponent table.
static GFSymbol mulE(GFSymbol a, GFSymbol b)
{
    GFSymbol product = 0;
    if (a != 0)
    {
        const GFSymbol logSum = static_cast<GFSymbol>(AddModQ(GFLog[a], b));
        product = GFExp[logSum];
    }
    return product;
}
2017-05-24 08:23:19 +00:00
2017-05-18 03:06:13 +00:00
//------------------------------------------------------------------------------
// Fast Walsh-Hadamard Transform (FWHT) Mod Q
//
// Q is the maximum symbol value, e.g. 255 or 65535.
// Define this to enable the optimized version of FWHT()
2017-05-25 09:24:15 +00:00
#define LEO_FWHT_OPTIMIZED
2017-05-18 03:06:13 +00:00
2017-05-24 08:23:19 +00:00
// Element type used by the FWHT routines below; an alias of GFSymbol.
typedef GFSymbol fwht_t;
// {a, b} = {a + b, a - b} (Mod Q)
2017-05-25 09:24:15 +00:00
// Two-point butterfly: replaces (a, b) with (a + b, a - b), both mod Q.
// Both results are computed from the original input values.
static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b)
{
    const fwht_t lhs = a;
    const fwht_t rhs = b;
    a = AddModQ(lhs, rhs);
    b = SubModQ(lhs, rhs);
}
/*
FWHT is a minor slice of the runtime and does not grow with data size,
but I did attempt a few additional optimizations that failed:
I've attempted to vectorize (with partial reductions) FWHT_4(data, s),
which is 70% of the algorithm, but it was slower. Left in _attic_.
I've attempted to avoid reductions in all or parts of the FWHT.
The final modular reduction ends up being slower than the savings.
Specifically I tried doing it for the whole FWHT and also I tried
doing it just for the FWHT_2 loop in the main routine, but both
approaches are slower than partial reductions.
Replacing word reads with wider reads does speed up the operation, but
at too high a complexity cost relative to minor perf improvement.
*/
2017-05-25 09:24:15 +00:00
#ifndef LEO_FWHT_OPTIMIZED
2017-05-18 03:06:13 +00:00
// Reference implementation
2017-05-24 08:23:19 +00:00
// Reference (unoptimized) transform over 2^bits elements, in place.
// Runs one pass per power-of-two half-width; within each pass every block of
// 2*half elements has its lower half butterflied against its upper half.
static void FWHT(fwht_t* data, const unsigned bits)
{
    const unsigned count = (unsigned)(1UL << bits);

    unsigned half = 1;
    while (half < count)
    {
        const unsigned span = half << 1;
        for (unsigned base = 0; base < count; base += span)
        {
            fwht_t* lo = data + base;
            fwht_t* hi = lo + half;
            for (unsigned n = 0; n < half; ++n)
                FWHT_2(lo[n], hi[n]);
        }
        half = span;
    }
}
#else
2017-05-25 09:24:15 +00:00
// Four-point transform of data[0..3], in place.
// Works on local copies: two butterflies on adjacent pairs, then two
// butterflies at distance 2.
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data)
{
    fwht_t v[4] = { data[0], data[1], data[2], data[3] };

    // Distance 1
    FWHT_2(v[0], v[1]);
    FWHT_2(v[2], v[3]);

    // Distance 2
    FWHT_2(v[0], v[2]);
    FWHT_2(v[1], v[3]);

    for (unsigned n = 0; n < 4; ++n)
        data[n] = v[n];
}
2017-05-25 09:24:15 +00:00
// Strided four-point transform: operates in place on the elements
// data[0], data[s], data[2*s] and data[3*s].
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s)
{
    fwht_t* const p0 = data;
    fwht_t* const p1 = p0 + s;
    fwht_t* const p2 = p1 + s;
    fwht_t* const p3 = p2 + s;

    fwht_t a = *p0;
    fwht_t b = *p1;
    fwht_t c = *p2;
    fwht_t d = *p3;

    // Neighbouring pairs first, then pairs two strides apart.
    FWHT_2(a, b);
    FWHT_2(c, d);
    FWHT_2(a, c);
    FWHT_2(b, d);

    *p0 = a;
    *p1 = b;
    *p2 = c;
    *p3 = d;
}
2017-05-24 08:23:19 +00:00
// Eight-point transform of data[0..7], in place.
// Three butterfly passes (distance 1, 2, 4) over a local copy; all loop
// bounds are compile-time constants.
static inline void FWHT_8(fwht_t* data)
{
    const unsigned kPoints = 8;
    fwht_t v[8];

    for (unsigned n = 0; n < kPoints; ++n)
        v[n] = data[n];

    for (unsigned half = 1; half < kPoints; half <<= 1)
    {
        for (unsigned base = 0; base < kPoints; base += (half << 1))
        {
            for (unsigned n = base; n < base + half; ++n)
                FWHT_2(v[n], v[n + half]);
        }
    }

    for (unsigned n = 0; n < kPoints; ++n)
        data[n] = v[n];
}
2017-05-24 08:23:19 +00:00
// Sixteen-point transform of data[0..15], in place.
// Four butterfly passes (distance 1, 2, 4, 8) over a local copy; all loop
// bounds are compile-time constants.
static inline void FWHT_16(fwht_t* data)
{
    const unsigned kPoints = 16;
    fwht_t v[16];

    for (unsigned n = 0; n < kPoints; ++n)
        v[n] = data[n];

    for (unsigned half = 1; half < kPoints; half <<= 1)
    {
        for (unsigned base = 0; base < kPoints; base += (half << 1))
        {
            for (unsigned n = base; n < base + half; ++n)
                FWHT_2(v[n], v[n + half]);
        }
    }

    for (unsigned n = 0; n < kPoints; ++n)
        data[n] = v[n];
}
2017-05-24 08:23:19 +00:00
// In-place transform of 2^ldn elements for small inputs.
//
// Sizes 1 and 2 are handled directly. Larger sizes are processed with strided
// four-point passes that consume two levels at a time while the level is
// above 3, followed by one pass of contiguous 8-point blocks (odd ldn) or
// 4-point blocks (even ldn).
static void FWHT_SmallData(fwht_t* data, unsigned ldn)
{
    const unsigned count = (1UL << ldn);

    if (count < 2)
        return;
    if (count == 2)
    {
        FWHT_2(data[0], data[1]);
        return;
    }

    unsigned level = ldn;
    while (level > 3)
    {
        const unsigned block = (1UL << level);
        const unsigned quarter = (block >> 2);
        for (unsigned start = 0; start < count; start += block)
        {
            fwht_t* group = data + start;
            for (unsigned k = 0; k < quarter; k++)
                FWHT_4(group + k, quarter);
        }
        level -= 2;
    }

    const bool oddLevels = (ldn & 1) != 0;
    if (oddLevels)
    {
        for (unsigned pos = 0; pos < count; pos += 8)
            FWHT_8(data + pos);
    }
    else
    {
        for (unsigned pos = 0; pos < count; pos += 4)
            FWHT_4(data + pos);
    }
}
2017-05-24 08:23:19 +00:00
// Decimation in time (DIT) version
// In-place transform of 2^ldn elements (decimation-in-time form).
//
// Inputs with ldn <= 13 are delegated to FWHT_SmallData(). Larger inputs are
// split into sub-ranges [2^m, 2^(m+1)): the first few are transformed with the
// fixed-size kernels, the rest recursively. A final sequence of butterfly
// passes then combines [0, 2^m) with [2^m, 2^(m+1)) for m = 0 .. ldn-1.
static void FWHT(fwht_t* data, const unsigned ldn)
{
    if (ldn > 13)
    {
        // Sub-ranges of length 2, 4, 8 and 16.
        FWHT_2(data[2], data[3]);
        FWHT_4(data + 4);
        FWHT_8(data + 8);
        FWHT_16(data + 16);

        // Sub-ranges of length 32 and up.
        for (unsigned level = 5; level < ldn; ++level)
            FWHT(data + (unsigned)(1UL << level), level);

        // Merge passes.
        for (unsigned level = 0; level < ldn; ++level)
        {
            const unsigned half = (1UL << level);
            fwht_t* upper = data + half;
            for (unsigned n = 0; n < half; ++n)
                FWHT_2(data[n], upper[n]);
        }
    }
    else
    {
        FWHT_SmallData(data, ldn);
    }
}
#endif
//------------------------------------------------------------------------------
// Memory Buffer XOR
2017-05-25 09:24:15 +00:00
static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes)
2017-05-18 03:06:13 +00:00
{
2017-05-25 09:24:15 +00:00
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
2017-05-18 03:06:13 +00:00
2017-05-25 09:24:15 +00:00
#if defined(LEO_TARGET_MOBILE)
# if defined(LEO_TRY_NEON)
2017-05-18 03:06:13 +00:00
// Handle multiples of 64 bytes
if (CpuHasNeon)
{
while (bytes >= 64)
{
2017-05-25 09:24:15 +00:00
LEO_M128 x0 = vld1q_u8(x16);
LEO_M128 x1 = vld1q_u8(x16 + 1);
LEO_M128 x2 = vld1q_u8(x16 + 2);
LEO_M128 x3 = vld1q_u8(x16 + 3);
LEO_M128 y0 = vld1q_u8(y16);
LEO_M128 y1 = vld1q_u8(y16 + 1);
LEO_M128 y2 = vld1q_u8(y16 + 2);
LEO_M128 y3 = vld1q_u8(y16 + 3);
2017-05-18 03:06:13 +00:00
vst1q_u8(x16, veorq_u8(x0, y0));
vst1q_u8(x16 + 1, veorq_u8(x1, y1));
vst1q_u8(x16 + 2, veorq_u8(x2, y2));
vst1q_u8(x16 + 3, veorq_u8(x3, y3));
bytes -= 64, x16 += 4, y16 += 4;
}
// Handle multiples of 16 bytes
while (bytes >= 16)
{
2017-05-25 09:24:15 +00:00
LEO_M128 x0 = vld1q_u8(x16);
LEO_M128 y0 = vld1q_u8(y16);
2017-05-18 03:06:13 +00:00
vst1q_u8(x16, veorq_u8(x0, y0));
bytes -= 16, ++x16, ++y16;
}
}
else
2017-05-25 09:24:15 +00:00
# endif // LEO_TRY_NEON
2017-05-18 03:06:13 +00:00
{
2017-05-25 09:24:15 +00:00
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
2017-05-18 03:06:13 +00:00
const unsigned count = (unsigned)bytes / 8;
for (unsigned ii = 0; ii < count; ++ii)
x8[ii] ^= y8[ii];
2017-05-25 09:24:15 +00:00
x16 = reinterpret_cast<LEO_M128 *>(x8 + count);
y16 = reinterpret_cast<const LEO_M128 *>(y8 + count);
2017-05-18 03:06:13 +00:00
}
2017-05-25 09:24:15 +00:00
#else // LEO_TARGET_MOBILE
# if defined(LEO_TRY_AVX2)
2017-05-18 03:06:13 +00:00
if (CpuHasAVX2)
{
2017-05-25 09:24:15 +00:00
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(x16);
const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(y16);
2017-05-18 03:06:13 +00:00
while (bytes >= 128)
{
2017-05-25 09:24:15 +00:00
LEO_M256 x0 = _mm256_loadu_si256(x32);
LEO_M256 y0 = _mm256_loadu_si256(y32);
2017-05-18 03:06:13 +00:00
x0 = _mm256_xor_si256(x0, y0);
2017-05-25 09:24:15 +00:00
LEO_M256 x1 = _mm256_loadu_si256(x32 + 1);
LEO_M256 y1 = _mm256_loadu_si256(y32 + 1);
2017-05-18 03:06:13 +00:00
x1 = _mm256_xor_si256(x1, y1);
2017-05-25 09:24:15 +00:00
LEO_M256 x2 = _mm256_loadu_si256(x32 + 2);
LEO_M256 y2 = _mm256_loadu_si256(y32 + 2);
2017-05-18 03:06:13 +00:00
x2 = _mm256_xor_si256(x2, y2);
2017-05-25 09:24:15 +00:00
LEO_M256 x3 = _mm256_loadu_si256(x32 + 3);
LEO_M256 y3 = _mm256_loadu_si256(y32 + 3);
2017-05-18 03:06:13 +00:00
x3 = _mm256_xor_si256(x3, y3);
_mm256_storeu_si256(x32, x0);
_mm256_storeu_si256(x32 + 1, x1);
_mm256_storeu_si256(x32 + 2, x2);
_mm256_storeu_si256(x32 + 3, x3);
bytes -= 128, x32 += 4, y32 += 4;
}
// Handle multiples of 32 bytes
while (bytes >= 32)
{
// x[i] = x[i] xor y[i]
_mm256_storeu_si256(x32,
_mm256_xor_si256(
_mm256_loadu_si256(x32),
_mm256_loadu_si256(y32)));
bytes -= 32, ++x32, ++y32;
}
2017-05-25 09:24:15 +00:00
x16 = reinterpret_cast<LEO_M128 *>(x32);
y16 = reinterpret_cast<const LEO_M128 *>(y32);
2017-05-18 03:06:13 +00:00
}
else
2017-05-25 09:24:15 +00:00
# endif // LEO_TRY_AVX2
2017-05-18 03:06:13 +00:00
{
while (bytes >= 64)
{
2017-05-25 09:24:15 +00:00
LEO_M128 x0 = _mm_loadu_si128(x16);
LEO_M128 y0 = _mm_loadu_si128(y16);
2017-05-18 03:06:13 +00:00
x0 = _mm_xor_si128(x0, y0);
2017-05-25 09:24:15 +00:00
LEO_M128 x1 = _mm_loadu_si128(x16 + 1);
LEO_M128 y1 = _mm_loadu_si128(y16 + 1);
2017-05-18 03:06:13 +00:00
x1 = _mm_xor_si128(x1, y1);
2017-05-25 09:24:15 +00:00
LEO_M128 x2 = _mm_loadu_si128(x16 + 2);
LEO_M128 y2 = _mm_loadu_si128(y16 + 2);
2017-05-18 03:06:13 +00:00
x2 = _mm_xor_si128(x2, y2);
2017-05-25 09:24:15 +00:00
LEO_M128 x3 = _mm_loadu_si128(x16 + 3);
LEO_M128 y3 = _mm_loadu_si128(y16 + 3);
2017-05-18 03:06:13 +00:00
x3 = _mm_xor_si128(x3, y3);
_mm_storeu_si128(x16, x0);
_mm_storeu_si128(x16 + 1, x1);
_mm_storeu_si128(x16 + 2, x2);
_mm_storeu_si128(x16 + 3, x3);
bytes -= 64, x16 += 4, y16 += 4;
}
}
2017-05-25 09:24:15 +00:00
#endif // LEO_TARGET_MOBILE
2017-05-18 03:06:13 +00:00
// Handle multiples of 16 bytes
while (bytes >= 16)
{
// x[i] = x[i] xor y[i]
_mm_storeu_si128(x16,
_mm_xor_si128(
_mm_loadu_si128(x16),
_mm_loadu_si128(y16)));
bytes -= 16, ++x16, ++y16;
}
2017-05-25 09:24:15 +00:00
uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
2017-05-18 03:06:13 +00:00
// Handle a block of 8 bytes
const unsigned eight = bytes & 8;
if (eight)
{
2017-05-25 09:24:15 +00:00
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
2017-05-18 03:06:13 +00:00
*x8 ^= *y8;
}
// Handle a block of 4 bytes
const unsigned four = bytes & 4;
if (four)
{
2017-05-25 09:24:15 +00:00
uint32_t * LEO_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
const uint32_t * LEO_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
2017-05-18 03:06:13 +00:00
*x4 ^= *y4;
}
// Handle final bytes
const unsigned offset = eight + four;
switch (bytes & 3)
{
case 3: x1[offset + 2] ^= y1[offset + 2];
case 2: x1[offset + 1] ^= y1[offset + 1];
case 1: x1[offset] ^= y1[offset];
default:
break;
}
}
//------------------------------------------------------------------------------
// Formal Derivative
// Formal derivative of polynomial in the new basis
// Formal derivative of a polynomial held in `cos`, computed in place.
//
// cos   Coefficient array (also read at offsets >= size by the second loop).
// size  Number of coefficients to process.
//
// For every position i in [1, size) a run of `run` symbols starting at
// cos[i] is XORed into the run that ends just before cos[i]. Afterwards, the
// block of `size` symbols at each offset size, 2*size, 4*size, ... below
// kFieldSize is XORed into the first `size` symbols.
static void formal_derivative(GFSymbol* cos, const unsigned size)
{
    unsigned pos = 1;
    while (pos < size)
    {
        const unsigned run = ((pos ^ (pos - 1)) + 1) >> 1;
        GFSymbol* dst = cos + pos - run;
        const GFSymbol* src = cos + pos;

        if (run < 8)
        {
            // Short runs: a plain loop is enough.
            for (unsigned n = 0; n < run; n++)
                dst[n] ^= src[n];
        }
        else
        {
            // Long runs go through the bulk XOR routine.
            xor_mem(dst, src, run * sizeof(GFSymbol));
        }
        ++pos;
    }

    for (unsigned shift = size; shift < kFieldSize; shift <<= 1)
        xor_mem(cos, cos + shift, size * sizeof(GFSymbol));
}
//------------------------------------------------------------------------------
// Fast Fourier Transform
// Twisted (skew) factors used by IFLT() and FLT(). Filled in by
// InitFieldOperations(), which finally stores each factor as its logarithm.
// IFLT/FLT read entry [j + index - 1] and skip the multiply when the stored
// value equals kFieldModulus.
static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT
// IFFT in the proposed basis
// Inverse transform in the proposed basis, in place.
//
// data   Symbols to transform.
// size   Number of symbols.
// index  Offset added when selecting the skew factor from skewVec.
//
// Works from the narrowest butterflies outwards. For each pair of adjacent
// blocks the lower block is XORed into the upper block first; the upper
// block, scaled by the skew factor, is then added into the lower block
// unless the factor equals kFieldModulus.
static void IFLT(GFSymbol* data, const unsigned size, const unsigned index)
{
    for (unsigned width = 1; width < size; width <<= 1)
    {
        const unsigned stride = (width << 1);
        for (unsigned hi = width; hi < size; hi += stride)
        {
            GFSymbol* lower = data + hi - width;
            GFSymbol* upper = data + hi;

            if (width < 8)
            {
                for (unsigned n = 0; n < width; ++n)
                    upper[n] ^= lower[n];
            }
            else
            {
                // Large block: use the bulk XOR routine.
                xor_mem(upper, lower, width * sizeof(GFSymbol));
            }

            const GFSymbol skew = skewVec[hi + index - 1];
            if (skew == kFieldModulus)
                continue;
            muladd_mem(lower, upper, skew, width);
        }
    }
}
// FFT in the proposed basis
// Forward transform in the proposed basis, in place.
//
// data   Symbols to transform.
// size   Number of symbols.
// index  Offset added when selecting the skew factor from skewVec.
//
// Works from the widest butterflies inwards. For each pair of adjacent
// blocks the upper block, scaled by the skew factor, is added into the lower
// block first (skipped when the factor equals kFieldModulus); the lower block
// is then XORed into the upper block.
static void FLT(GFSymbol* data, const unsigned size, const unsigned index)
{
    for (unsigned width = (size >> 1); width > 0; width >>= 1)
    {
        const unsigned stride = (width << 1);
        for (unsigned hi = width; hi < size; hi += stride)
        {
            GFSymbol* lower = data + hi - width;
            GFSymbol* upper = data + hi;

            const GFSymbol skew = skewVec[hi + index - 1];
            if (skew != kFieldModulus)
                muladd_mem(lower, upper, skew, width);

            if (width < 8)
            {
                for (unsigned n = 0; n < width; ++n)
                    upper[n] ^= lower[n];
            }
            else
            {
                // Large block: use the bulk XOR routine.
                xor_mem(upper, lower, width * sizeof(GFSymbol));
            }
        }
    }
}
//------------------------------------------------------------------------------
// FFT Initialization
// Both tables are filled in by InitFieldOperations().
// B holds one value (reduced mod kFieldModulus) per pair of symbols; decode()
// reads it as B[i >> 1] when scaling around the formal derivative.
static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative
2017-05-24 08:23:19 +00:00
// log_walsh is the FWHT of the GFLog table with entry 0 forced to 0; decode()
// multiplies it element-wise with the transformed erasure indicator.
static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial
2017-05-18 03:06:13 +00:00
// Initialize skewVec[], B[], log_walsh[]
static void InitFieldOperations()
{
2017-05-24 08:23:19 +00:00
GFSymbol temp[kGFBits - 1];
2017-05-18 03:06:13 +00:00
for (unsigned i = 1; i < kGFBits; ++i)
temp[i - 1] = (GFSymbol)((unsigned)1 << i);
for (unsigned m = 0; m < (kGFBits - 1); ++m)
{
const unsigned step = (unsigned)1 << (m + 1);
skewVec[((unsigned)1 << m) - 1] = 0;
for (unsigned i = m; i < (kGFBits - 1); ++i)
{
const unsigned s = ((unsigned)1 << (i + 1));
for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step)
skewVec[j + s] = skewVec[j] ^ temp[i];
}
temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])];
for (unsigned i = m + 1; i < (kGFBits - 1); ++i)
temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus);
}
for (unsigned i = 0; i < kFieldSize; ++i)
skewVec[i] = GFLog[skewVec[i]];
temp[0] = kFieldModulus - temp[0];
for (unsigned i = 1; i < (kGFBits - 1); ++i)
temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus;
2017-05-24 08:23:19 +00:00
B[0] = 0;
2017-05-18 03:06:13 +00:00
for (unsigned i = 0; i < (kGFBits - 1); ++i)
{
const unsigned depart = ((unsigned)1 << i);
for (unsigned j = 0; j < depart; ++j)
B[j + depart] = (B[j] + temp[i]) % kFieldModulus;
}
for (unsigned i = 0; i < kFieldSize; ++i)
log_walsh[i] = GFLog[i];
2017-05-24 08:23:19 +00:00
log_walsh[0] = 0;
2017-05-18 03:06:13 +00:00
FWHT(log_walsh, kGFBits);
}
//------------------------------------------------------------------------------
// Encoder
// Encoding alg for k/n<0.5: message is a power of two
// Encoder for the low-rate case (k/n < 0.5, per the original note: the
// message length is a power of two).
//
// data      k message symbols.
// k         Message length in symbols.
// codeword  Output; written at every multiple of k below kFieldSize.
//
// The message is inverse-transformed once; each following block of k symbols
// is a forward transform of that result at offset i. The first block is
// finally overwritten with the untouched message.
static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword)
{
    // Inverse transform of the message, computed in the first block.
    memcpy(codeword, data, sizeof(GFSymbol) * k);
    IFLT(codeword, k, 0);

    // Every later block: copy the transformed message, then evaluate it.
    for (unsigned offset = k; offset < kFieldSize; offset += k)
    {
        GFSymbol* block = &codeword[offset];
        memcpy(block, codeword, sizeof(GFSymbol) * k);
        FLT(block, k, offset);
    }

    // Put the plain message back into the first block.
    memcpy(codeword, data, sizeof(GFSymbol) * k);
}
// Encoding alg for k/n>0.5: parity is a power of two.
// data: message array. parity: parity array. mem: buffer(size>= n-k)
static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem)
{
2017-05-24 08:23:19 +00:00
const unsigned t = kFieldSize - k;
2017-05-18 03:06:13 +00:00
memset(parity, 0, sizeof(GFSymbol) * t);
for (unsigned i = t; i < kFieldSize; i += t)
{
memcpy(mem, &data[i - t], sizeof(GFSymbol) * t);
IFLT(mem, t, i);
xor_mem(parity, mem, t * sizeof(GFSymbol));
}
FLT(parity, t, 0);
}
//------------------------------------------------------------------------------
// Decoder
2017-05-24 08:23:19 +00:00
// Erasure decoder: restores the erased symbols of `codeword` in place.
//
// codeword  kFieldSize symbols.
// k         Message length. Currently unused: every stage runs over the full
//           kFieldSize range (an earlier note says it cannot simply be
//           narrowed to k).
// erasure   kFieldSize flags; true marks an erased position.
//
// On return the positions flagged in `erasure` hold the recovered values;
// the other positions hold intermediate results.
static void decode(GFSymbol* codeword, unsigned k, const bool* erasure)
{
    fwht_t locator[kFieldSize];

    // Evaluate the error locator polynomial: transform the erasure
    // indicator, multiply element-wise by log_walsh, transform back.
    for (unsigned n = 0; n < kFieldSize; ++n)
        locator[n] = erasure[n] ? 1 : 0;

    FWHT(locator, kGFBits);

    for (unsigned n = 0; n < kFieldSize; ++n)
        locator[n] = ((unsigned)locator[n] * (unsigned)log_walsh[n]) % kFieldModulus;

    FWHT(locator, kGFBits);

    // Range processed by the stages after the inverse transform.
    const unsigned workSize = kFieldSize;

    // Zero the erased symbols and scale the received ones by the locator.
    for (unsigned n = 0; n < kFieldSize; ++n)
    {
        if (!erasure[n])
            codeword[n] = mulE(codeword[n], locator[n]);
        else
            codeword[n] = 0;
    }

    IFLT(codeword, kFieldSize, 0);

    // Formal derivative, bracketed by scaling with -B and +B. Each B entry
    // applies to a pair of adjacent symbols.
    for (unsigned pair = 0; pair < (kFieldSize >> 1); ++pair)
    {
        const unsigned even = pair << 1;
        codeword[even] = mulE(codeword[even], kFieldModulus - B[pair]);
        codeword[even + 1] = mulE(codeword[even + 1], kFieldModulus - B[pair]);
    }

    formal_derivative(codeword, workSize);

    for (unsigned even = 0; even < workSize; even += 2)
    {
        const unsigned pair = even >> 1;
        codeword[even] = mulE(codeword[even], B[pair]);
        codeword[even + 1] = mulE(codeword[even + 1], B[pair]);
    }

    FLT(codeword, workSize, 0);

    // Undo the locator scaling on the erased positions only.
    for (unsigned n = 0; n < workSize; ++n)
    {
        if (!erasure[n])
            continue;
        codeword[n] = mulE(codeword[n], kFieldModulus - locator[n]);
    }
}
//------------------------------------------------------------------------------
// Test Application
2017-05-24 08:23:19 +00:00
void test(unsigned k, unsigned seed)
2017-05-18 03:06:13 +00:00
{
2017-05-24 08:23:19 +00:00
srand(seed);
//-----------Generating message----------
2017-05-18 03:06:13 +00:00
// Message array
2017-05-24 08:23:19 +00:00
GFSymbol data[kFieldSize] = {0};
2017-05-18 03:06:13 +00:00
// Filled with random numbers
for (unsigned i = kFieldSize - k; i < kFieldSize; ++i)
data[i] = (GFSymbol)rand();
2017-05-24 08:23:19 +00:00
//---------encoding----------
2017-05-18 03:06:13 +00:00
2017-05-24 08:23:19 +00:00
GFSymbol codeword[kFieldSize];
encodeH(&data[kFieldSize - k], k, data, codeword);
//encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change?
2017-05-18 03:06:13 +00:00
2017-05-24 08:23:19 +00:00
memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize);
2017-05-18 03:06:13 +00:00
2017-05-24 08:23:19 +00:00
//--------erasure simulation---------
2017-05-18 03:06:13 +00:00
// Array indicating erasures
2017-05-24 08:23:19 +00:00
bool erasure[kFieldSize] = {
2017-05-18 03:06:13 +00:00
false
};
for (unsigned i = k; i < kFieldSize; ++i)
erasure[i] = true;
// permuting the erasure array
for (unsigned i = kFieldSize - 1; i > 0; --i)
{
unsigned pos = rand() % (i + 1);
if (i != pos)
{
bool tmp = erasure[i];
erasure[i] = erasure[pos];
erasure[pos] = tmp;
}
}
// erasure codeword symbols
for (unsigned i = 0; i < kFieldSize; ++i)
if (erasure[i])
codeword[i] = 0;
2017-05-24 08:23:19 +00:00
//---------main processing----------
decode(codeword, k, erasure);
2017-05-18 03:06:13 +00:00
// Check the correctness of the result
for (unsigned i = 0; i < kFieldSize; ++i)
{
if (erasure[i] == 1)
{
if (data[i] != codeword[i])
{
2017-05-24 08:23:19 +00:00
printf("Decoding Error with seed = %d!\n", seed);
2017-05-25 09:24:15 +00:00
LEO_DEBUG_BREAK;
2017-05-18 03:06:13 +00:00
return;
}
}
}
2017-05-24 08:23:19 +00:00
//printf("Decoding is successful!\n");
2017-05-18 03:06:13 +00:00
}
//------------------------------------------------------------------------------
// Entrypoint
int main(int argc, char **argv)
{
// Initialize architecture-specific code
2017-05-25 09:24:15 +00:00
leo_architecture_init();
2017-05-18 03:06:13 +00:00
// Fill GFLog table and GFExp table
InitField();
// Compute factors used in erasure decoder
InitFieldOperations();
2017-05-24 08:23:19 +00:00
unsigned seed = (unsigned)time(NULL);
2017-05-18 03:06:13 +00:00
for (;;)
{
// test(int k), k: message size
2017-05-24 08:23:19 +00:00
/*
EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc,
s.t. the number of recovery pieces is a power of two
*/
test(kFieldSize / 2, seed);
++seed;
2017-05-18 03:06:13 +00:00
}
return 0;
}
2017-05-25 09:24:15 +00:00
} // namespace leopard