mirror of https://github.com/status-im/leopard.git

Project structure

This commit is contained in:
parent 4d78561689
commit 49dbcdc8b1

@@ -0,0 +1,957 @@
/*
    Copyright (c) 2017 Christopher A. Taylor. All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice,
      this list of conditions and the following disclaimer in the documentation
      and/or other materials provided with the distribution.
    * Neither the name of Leopard-RS nor the names of its contributors may be
      used to endorse or promote products derived from this software without
      specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
*/

#include "LeopardCommon.h"

namespace leopard {
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Runtime CPU Architecture Check
|
||||
//
|
||||
// Feature checks stolen shamelessly from
|
||||
// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c
|
||||
|
||||
#if defined(HAVE_ANDROID_GETCPUFEATURES)
|
||||
#include <cpu-features.h>
|
||||
#endif
|
||||
|
||||
#if defined(LEO_TRY_NEON)
|
||||
# if defined(IOS) && defined(__ARM_NEON__)
|
||||
// Requires iPhone 5S or newer
|
||||
# else
|
||||
// Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
|
||||
bool CpuHasNeon = false; // V6 / V7
|
||||
bool CpuHasNeon64 = false; // 64-bit
|
||||
# endif
|
||||
#endif
|
||||
|
||||
|
||||
#if !defined(LEO_TARGET_MOBILE)
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h> // __cpuid
|
||||
#pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
|
||||
#endif
|
||||
|
||||
#ifdef LEO_TRY_AVX2
|
||||
bool CpuHasAVX2 = false;
|
||||
#endif
|
||||
bool CpuHasSSSE3 = false;
|
||||
|
||||
#define CPUID_EBX_AVX2 0x00000020
|
||||
#define CPUID_ECX_SSSE3 0x00000200
|
||||
|
||||
static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type)
|
||||
{
|
||||
#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
|
||||
__cpuid((int *) cpu_info, cpu_info_type);
|
||||
#else //if defined(HAVE_CPUID)
|
||||
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
|
||||
# ifdef __i386__
|
||||
__asm__ __volatile__ ("pushfl; pushfl; "
|
||||
"popl %0; "
|
||||
"movl %0, %1; xorl %2, %0; "
|
||||
"pushl %0; "
|
||||
"popfl; pushfl; popl %0; popfl" :
|
||||
"=&r" (cpu_info[0]), "=&r" (cpu_info[1]) :
|
||||
"i" (0x200000));
|
||||
if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) {
|
||||
return; /* LCOV_EXCL_LINE */
|
||||
}
|
||||
# endif
|
||||
# ifdef __i386__
|
||||
__asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" :
|
||||
"=a" (cpu_info[0]), "=&r" (cpu_info[1]),
|
||||
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
|
||||
"0" (cpu_info_type), "2" (0U));
|
||||
# elif defined(__x86_64__)
|
||||
__asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" :
|
||||
"=a" (cpu_info[0]), "=&r" (cpu_info[1]),
|
||||
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
|
||||
"0" (cpu_info_type), "2" (0U));
|
||||
# else
|
||||
__asm__ __volatile__ ("cpuid" :
|
||||
"=a" (cpu_info[0]), "=b" (cpu_info[1]),
|
||||
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
|
||||
"0" (cpu_info_type), "2" (0U));
|
||||
# endif
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // defined(LEO_TARGET_MOBILE)
|
||||
|
||||
|
||||
void InitializeCPUArch()
|
||||
{
|
||||
#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES)
|
||||
AndroidCpuFamily family = android_getCpuFamily();
|
||||
if (family == ANDROID_CPU_FAMILY_ARM)
|
||||
{
|
||||
if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON)
|
||||
CpuHasNeon = true;
|
||||
}
|
||||
else if (family == ANDROID_CPU_FAMILY_ARM64)
|
||||
{
|
||||
CpuHasNeon = true;
|
||||
if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD)
|
||||
CpuHasNeon64 = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined(LEO_TARGET_MOBILE)
|
||||
unsigned int cpu_info[4];
|
||||
|
||||
_cpuid(cpu_info, 1);
|
||||
CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0);
|
||||
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
_cpuid(cpu_info, 7);
|
||||
CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0);
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
}
|
||||
|
||||
|
||||
|
||||
// vx[] += vy[] * z
static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount)
{
    for (unsigned i = 0; i < symbolCount; ++i)
    {
        const GFSymbol a = vy[i];
        if (a == 0)
            continue;

        GFSymbol sum1 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f], z));
        GFSymbol value1 = GFExp[sum1];
        if ((a & 0x0f) == 0)
        {
            value1 = 0;
        }
        GFSymbol sum2 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf0], z));
        GFSymbol value2 = GFExp[sum2];
        if ((a & 0xf0) == 0)
        {
            value2 = 0;
        }
        GFSymbol sum3 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f00], z));
        GFSymbol value3 = GFExp[sum3];
        if ((a & 0x0f00) == 0)
        {
            value3 = 0;
        }
        GFSymbol sum4 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf000], z));
        GFSymbol value4 = GFExp[sum4];
        if ((a & 0xf000) == 0)
        {
            value4 = 0;
        }

        vx[i] ^= value1;
        vx[i] ^= value2;
        vx[i] ^= value3;
        vx[i] ^= value4;
    }
}

// return a*GFExp[b] over GF(2^r)
static GFSymbol mulE(GFSymbol a, GFSymbol b)
{
    if (a == 0)
        return 0;

    const GFSymbol sum = static_cast<GFSymbol>(AddModQ(GFLog[a], b));
    return GFExp[sum];
}
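
// --- Illustrative sketch (added for exposition; not part of the original source) ---
// mulE(a, b) returns a * GFExp[b], so an ordinary field multiply a * b can be
// written as mulE(a, GFLog[b]) once InitField() has filled the log/exp tables:
static LEO_FORCE_INLINE GFSymbol gf_mul_example(GFSymbol a, GFSymbol b)
{
    if (a == 0 || b == 0)
        return 0;

    // exp(log(a) + log(b)) mod kFieldModulus
    return mulE(a, GFLog[b]);
}

// muladd_mem() above relies on the same identity applied piecewise: multiplying by
// the fixed constant GFExp[z] is GF(2)-linear, so the four masked parts of vy[i]
// ((a & 0x0f), (a & 0xf0), (a & 0x0f00), (a & 0xf000)) can each be sent through the
// log/exp tables separately and XORed into vx[i].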


//------------------------------------------------------------------------------
// Fast Walsh-Hadamard Transform (FWHT) Mod Q
//
// Q is the maximum symbol value, e.g. 255 or 65535.

// Define this to enable the optimized version of FWHT()
#define LEO_FWHT_OPTIMIZED

typedef GFSymbol fwht_t;

// {a, b} = {a + b, a - b} (Mod Q)
static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b)
{
    const fwht_t sum = AddModQ(a, b);
    const fwht_t dif = SubModQ(a, b);
    a = sum;
    b = dif;
}

/*
    FWHT is a minor slice of the runtime and does not grow with data size,
    but I did attempt a few additional optimizations that failed:

    I've attempted to vectorize (with partial reductions) FWHT_4(data, s),
    which is 70% of the algorithm, but it was slower.  Left in _attic_.

    I've attempted to avoid reductions in all or parts of the FWHT.
    The final modular reduction ends up being slower than the savings.
    Specifically I tried doing it for the whole FWHT and also I tried
    doing it just for the FWHT_2 loop in the main routine, but both
    approaches are slower than partial reductions.

    Replacing word reads with wider reads does speed up the operation, but
    at too high a complexity cost relative to minor perf improvement.
*/
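
/*
    Note (added for exposition; not part of the original source):

    The transform length kFieldSize is one more than the modulus Q = kFieldModulus
    (256 vs. 255, or 65536 vs. 65535), so kFieldSize == 1 (mod Q).  Applying FWHT()
    twice therefore reproduces the input vector mod Q -- the usual 1/N scaling
    factor vanishes.  The decoder below exploits this: FWHT, pointwise multiply
    mod Q, FWHT again yields an XOR-convolution mod Q with no explicit
    normalization step.
*/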

#ifndef LEO_FWHT_OPTIMIZED

// Reference implementation
static void FWHT(fwht_t* data, const unsigned bits)
{
    const unsigned size = (unsigned)(1UL << bits);
    for (unsigned width = 1; width < size; width <<= 1)
        for (unsigned i = 0; i < size; i += (width << 1))
            for (unsigned j = i; j < (width + i); ++j)
                FWHT_2(data[j], data[j + width]);
}

#else
|
||||
|
||||
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data)
|
||||
{
|
||||
fwht_t t0 = data[0];
|
||||
fwht_t t1 = data[1];
|
||||
fwht_t t2 = data[2];
|
||||
fwht_t t3 = data[3];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
data[0] = t0;
|
||||
data[1] = t1;
|
||||
data[2] = t2;
|
||||
data[3] = t3;
|
||||
}
|
||||
|
||||
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s)
|
||||
{
|
||||
unsigned x = 0;
|
||||
fwht_t t0 = data[x]; x += s;
|
||||
fwht_t t1 = data[x]; x += s;
|
||||
fwht_t t2 = data[x]; x += s;
|
||||
fwht_t t3 = data[x];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
unsigned y = 0;
|
||||
data[y] = t0; y += s;
|
||||
data[y] = t1; y += s;
|
||||
data[y] = t2; y += s;
|
||||
data[y] = t3;
|
||||
}
|
||||
|
||||
static inline void FWHT_8(fwht_t* data)
|
||||
{
|
||||
fwht_t t0 = data[0];
|
||||
fwht_t t1 = data[1];
|
||||
fwht_t t2 = data[2];
|
||||
fwht_t t3 = data[3];
|
||||
fwht_t t4 = data[4];
|
||||
fwht_t t5 = data[5];
|
||||
fwht_t t6 = data[6];
|
||||
fwht_t t7 = data[7];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t4, t5);
|
||||
FWHT_2(t6, t7);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
FWHT_2(t4, t6);
|
||||
FWHT_2(t5, t7);
|
||||
FWHT_2(t0, t4);
|
||||
FWHT_2(t1, t5);
|
||||
FWHT_2(t2, t6);
|
||||
FWHT_2(t3, t7);
|
||||
data[0] = t0;
|
||||
data[1] = t1;
|
||||
data[2] = t2;
|
||||
data[3] = t3;
|
||||
data[4] = t4;
|
||||
data[5] = t5;
|
||||
data[6] = t6;
|
||||
data[7] = t7;
|
||||
}
|
||||
|
||||
static inline void FWHT_16(fwht_t* data)
|
||||
{
|
||||
fwht_t t0 = data[0];
|
||||
fwht_t t1 = data[1];
|
||||
fwht_t t2 = data[2];
|
||||
fwht_t t3 = data[3];
|
||||
fwht_t t4 = data[4];
|
||||
fwht_t t5 = data[5];
|
||||
fwht_t t6 = data[6];
|
||||
fwht_t t7 = data[7];
|
||||
fwht_t t8 = data[8];
|
||||
fwht_t t9 = data[9];
|
||||
fwht_t t10 = data[10];
|
||||
fwht_t t11 = data[11];
|
||||
fwht_t t12 = data[12];
|
||||
fwht_t t13 = data[13];
|
||||
fwht_t t14 = data[14];
|
||||
fwht_t t15 = data[15];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t4, t5);
|
||||
FWHT_2(t6, t7);
|
||||
FWHT_2(t8, t9);
|
||||
FWHT_2(t10, t11);
|
||||
FWHT_2(t12, t13);
|
||||
FWHT_2(t14, t15);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
FWHT_2(t4, t6);
|
||||
FWHT_2(t5, t7);
|
||||
FWHT_2(t8, t10);
|
||||
FWHT_2(t9, t11);
|
||||
FWHT_2(t12, t14);
|
||||
FWHT_2(t13, t15);
|
||||
FWHT_2(t0, t4);
|
||||
FWHT_2(t1, t5);
|
||||
FWHT_2(t2, t6);
|
||||
FWHT_2(t3, t7);
|
||||
FWHT_2(t8, t12);
|
||||
FWHT_2(t9, t13);
|
||||
FWHT_2(t10, t14);
|
||||
FWHT_2(t11, t15);
|
||||
FWHT_2(t0, t8);
|
||||
FWHT_2(t1, t9);
|
||||
FWHT_2(t2, t10);
|
||||
FWHT_2(t3, t11);
|
||||
FWHT_2(t4, t12);
|
||||
FWHT_2(t5, t13);
|
||||
FWHT_2(t6, t14);
|
||||
FWHT_2(t7, t15);
|
||||
data[0] = t0;
|
||||
data[1] = t1;
|
||||
data[2] = t2;
|
||||
data[3] = t3;
|
||||
data[4] = t4;
|
||||
data[5] = t5;
|
||||
data[6] = t6;
|
||||
data[7] = t7;
|
||||
data[8] = t8;
|
||||
data[9] = t9;
|
||||
data[10] = t10;
|
||||
data[11] = t11;
|
||||
data[12] = t12;
|
||||
data[13] = t13;
|
||||
data[14] = t14;
|
||||
data[15] = t15;
|
||||
}
|
||||
|
||||
static void FWHT_SmallData(fwht_t* data, unsigned ldn)
|
||||
{
|
||||
const unsigned n = (1UL << ldn);
|
||||
|
||||
if (n <= 2)
|
||||
{
|
||||
if (n == 2)
|
||||
FWHT_2(data[0], data[1]);
|
||||
return;
|
||||
}
|
||||
|
||||
for (unsigned ldm = ldn; ldm > 3; ldm -= 2)
|
||||
{
|
||||
unsigned m = (1UL << ldm);
|
||||
unsigned m4 = (m >> 2);
|
||||
for (unsigned r = 0; r < n; r += m)
|
||||
for (unsigned j = 0; j < m4; j++)
|
||||
FWHT_4(data + j + r, m4);
|
||||
}
|
||||
|
||||
if (ldn & 1)
|
||||
{
|
||||
for (unsigned i0 = 0; i0 < n; i0 += 8)
|
||||
FWHT_8(data + i0);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (unsigned i0 = 0; i0 < n; i0 += 4)
|
||||
FWHT_4(data + i0);
|
||||
}
|
||||
}
|
||||
|
||||
// Decimation in time (DIT) version
|
||||
static void FWHT(fwht_t* data, const unsigned ldn)
|
||||
{
|
||||
if (ldn <= 13)
|
||||
{
|
||||
FWHT_SmallData(data, ldn);
|
||||
return;
|
||||
}
|
||||
|
||||
FWHT_2(data[2], data[3]);
|
||||
FWHT_4(data + 4);
|
||||
FWHT_8(data + 8);
|
||||
FWHT_16(data + 16);
|
||||
for (unsigned ldm = 5; ldm < ldn; ++ldm)
|
||||
FWHT(data + (unsigned)(1UL << ldm), ldm);
|
||||
|
||||
for (unsigned ldm = 0; ldm < ldn; ++ldm)
|
||||
{
|
||||
const unsigned mh = (1UL << ldm);
|
||||
for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2)
|
||||
FWHT_2(data[t1], data[t2]);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Memory Buffer XOR
|
||||
|
||||
static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes)
|
||||
{
|
||||
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
|
||||
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
|
||||
|
||||
#if defined(LEO_TARGET_MOBILE)
|
||||
# if defined(LEO_TRY_NEON)
|
||||
// Handle multiples of 64 bytes
|
||||
if (CpuHasNeon)
|
||||
{
|
||||
while (bytes >= 64)
|
||||
{
|
||||
LEO_M128 x0 = vld1q_u8(x16);
|
||||
LEO_M128 x1 = vld1q_u8(x16 + 1);
|
||||
LEO_M128 x2 = vld1q_u8(x16 + 2);
|
||||
LEO_M128 x3 = vld1q_u8(x16 + 3);
|
||||
LEO_M128 y0 = vld1q_u8(y16);
|
||||
LEO_M128 y1 = vld1q_u8(y16 + 1);
|
||||
LEO_M128 y2 = vld1q_u8(y16 + 2);
|
||||
LEO_M128 y3 = vld1q_u8(y16 + 3);
|
||||
|
||||
vst1q_u8(x16, veorq_u8(x0, y0));
|
||||
vst1q_u8(x16 + 1, veorq_u8(x1, y1));
|
||||
vst1q_u8(x16 + 2, veorq_u8(x2, y2));
|
||||
vst1q_u8(x16 + 3, veorq_u8(x3, y3));
|
||||
|
||||
bytes -= 64, x16 += 4, y16 += 4;
|
||||
}
|
||||
|
||||
// Handle multiples of 16 bytes
|
||||
while (bytes >= 16)
|
||||
{
|
||||
LEO_M128 x0 = vld1q_u8(x16);
|
||||
LEO_M128 y0 = vld1q_u8(y16);
|
||||
|
||||
vst1q_u8(x16, veorq_u8(x0, y0));
|
||||
|
||||
bytes -= 16, ++x16, ++y16;
|
||||
}
|
||||
}
|
||||
else
|
||||
# endif // LEO_TRY_NEON
|
||||
{
|
||||
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
|
||||
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
|
||||
|
||||
const unsigned count = (unsigned)bytes / 8;
|
||||
for (unsigned ii = 0; ii < count; ++ii)
|
||||
x8[ii] ^= y8[ii];
|
||||
|
||||
x16 = reinterpret_cast<LEO_M128 *>(x8 + count);
|
||||
y16 = reinterpret_cast<const LEO_M128 *>(y8 + count);
|
||||
}
|
||||
#else // LEO_TARGET_MOBILE
|
||||
# if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(x16);
|
||||
const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(y16);
|
||||
|
||||
while (bytes >= 128)
|
||||
{
|
||||
LEO_M256 x0 = _mm256_loadu_si256(x32);
|
||||
LEO_M256 y0 = _mm256_loadu_si256(y32);
|
||||
x0 = _mm256_xor_si256(x0, y0);
|
||||
LEO_M256 x1 = _mm256_loadu_si256(x32 + 1);
|
||||
LEO_M256 y1 = _mm256_loadu_si256(y32 + 1);
|
||||
x1 = _mm256_xor_si256(x1, y1);
|
||||
LEO_M256 x2 = _mm256_loadu_si256(x32 + 2);
|
||||
LEO_M256 y2 = _mm256_loadu_si256(y32 + 2);
|
||||
x2 = _mm256_xor_si256(x2, y2);
|
||||
LEO_M256 x3 = _mm256_loadu_si256(x32 + 3);
|
||||
LEO_M256 y3 = _mm256_loadu_si256(y32 + 3);
|
||||
x3 = _mm256_xor_si256(x3, y3);
|
||||
|
||||
_mm256_storeu_si256(x32, x0);
|
||||
_mm256_storeu_si256(x32 + 1, x1);
|
||||
_mm256_storeu_si256(x32 + 2, x2);
|
||||
_mm256_storeu_si256(x32 + 3, x3);
|
||||
|
||||
bytes -= 128, x32 += 4, y32 += 4;
|
||||
}
|
||||
|
||||
// Handle multiples of 32 bytes
|
||||
while (bytes >= 32)
|
||||
{
|
||||
// x[i] = x[i] xor y[i]
|
||||
_mm256_storeu_si256(x32,
|
||||
_mm256_xor_si256(
|
||||
_mm256_loadu_si256(x32),
|
||||
_mm256_loadu_si256(y32)));
|
||||
|
||||
bytes -= 32, ++x32, ++y32;
|
||||
}
|
||||
|
||||
x16 = reinterpret_cast<LEO_M128 *>(x32);
|
||||
y16 = reinterpret_cast<const LEO_M128 *>(y32);
|
||||
}
|
||||
else
|
||||
# endif // LEO_TRY_AVX2
|
||||
{
|
||||
while (bytes >= 64)
|
||||
{
|
||||
LEO_M128 x0 = _mm_loadu_si128(x16);
|
||||
LEO_M128 y0 = _mm_loadu_si128(y16);
|
||||
x0 = _mm_xor_si128(x0, y0);
|
||||
LEO_M128 x1 = _mm_loadu_si128(x16 + 1);
|
||||
LEO_M128 y1 = _mm_loadu_si128(y16 + 1);
|
||||
x1 = _mm_xor_si128(x1, y1);
|
||||
LEO_M128 x2 = _mm_loadu_si128(x16 + 2);
|
||||
LEO_M128 y2 = _mm_loadu_si128(y16 + 2);
|
||||
x2 = _mm_xor_si128(x2, y2);
|
||||
LEO_M128 x3 = _mm_loadu_si128(x16 + 3);
|
||||
LEO_M128 y3 = _mm_loadu_si128(y16 + 3);
|
||||
x3 = _mm_xor_si128(x3, y3);
|
||||
|
||||
_mm_storeu_si128(x16, x0);
|
||||
_mm_storeu_si128(x16 + 1, x1);
|
||||
_mm_storeu_si128(x16 + 2, x2);
|
||||
_mm_storeu_si128(x16 + 3, x3);
|
||||
|
||||
bytes -= 64, x16 += 4, y16 += 4;
|
||||
}
|
||||
}
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
|
||||
// Handle multiples of 16 bytes
|
||||
while (bytes >= 16)
|
||||
{
|
||||
// x[i] = x[i] xor y[i]
|
||||
_mm_storeu_si128(x16,
|
||||
_mm_xor_si128(
|
||||
_mm_loadu_si128(x16),
|
||||
_mm_loadu_si128(y16)));
|
||||
|
||||
bytes -= 16, ++x16, ++y16;
|
||||
}
|
||||
|
||||
uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
|
||||
const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
|
||||
|
||||
// Handle a block of 8 bytes
|
||||
const unsigned eight = bytes & 8;
|
||||
if (eight)
|
||||
{
|
||||
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
|
||||
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
|
||||
*x8 ^= *y8;
|
||||
}
|
||||
|
||||
// Handle a block of 4 bytes
|
||||
const unsigned four = bytes & 4;
|
||||
if (four)
|
||||
{
|
||||
uint32_t * LEO_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
|
||||
const uint32_t * LEO_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
|
||||
*x4 ^= *y4;
|
||||
}
|
||||
|
||||
// Handle final bytes
|
||||
const unsigned offset = eight + four;
|
||||
switch (bytes & 3)
|
||||
{
|
||||
case 3: x1[offset + 2] ^= y1[offset + 2];
|
||||
case 2: x1[offset + 1] ^= y1[offset + 1];
|
||||
case 1: x1[offset] ^= y1[offset];
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
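
// Note (added for exposition; not part of the original source): xor_mem() computes
// vx[i] ^= vy[i] over `bytes` bytes.  The SIMD paths above (NEON, AVX2 or SSE2,
// depending on the build) process wide blocks first (128/64/32/16 bytes) and the
// scalar code at the end handles the remaining 8-, 4- and 1..3-byte tail.
// A plain-C equivalent is simply:  for (i = 0; i < bytes; ++i) x[i] ^= y[i];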


//------------------------------------------------------------------------------
// Formal Derivative

// Formal derivative of polynomial in the new basis
static void formal_derivative(GFSymbol* cos, const unsigned size)
{
    for (unsigned i = 1; i < size; ++i)
    {
        // leng is the lowest set bit of i (equivalent to i & -i)
        const unsigned leng = ((i ^ (i - 1)) + 1) >> 1;

        // If a large number of values are being XORed:
        if (leng >= 8)
            xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol));
        else
            for (unsigned j = i - leng; j < i; j++)
                cos[j] ^= cos[j + leng];
    }

    for (unsigned i = size; i < kFieldSize; i <<= 1)
        xor_mem(cos, cos + i, size * sizeof(GFSymbol));
}


//------------------------------------------------------------------------------
// Fast Fourier Transform

static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT

// IFFT in the proposed basis
static void IFLT(GFSymbol* data, const unsigned size, const unsigned index)
{
    for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1)
    {
        for (unsigned j = depart_no; j < size; j += (depart_no << 1))
        {
            // If a large number of values are being XORed:
            if (depart_no >= 8)
                xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol));
            else
                for (unsigned i = j - depart_no; i < j; ++i)
                    data[i + depart_no] ^= data[i];

            const GFSymbol skew = skewVec[j + index - 1];

            if (skew != kFieldModulus)
                muladd_mem(data + j - depart_no, data + j, skew, depart_no);
        }
    }
}

// FFT in the proposed basis
static void FLT(GFSymbol* data, const unsigned size, const unsigned index)
{
    for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1)
    {
        for (unsigned j = depart_no; j < size; j += (depart_no << 1))
        {
            const GFSymbol skew = skewVec[j + index - 1];

            if (skew != kFieldModulus)
                muladd_mem(data + j - depart_no, data + j, skew, depart_no);

            // If a large number of values are being XORed:
            if (depart_no >= 8)
                xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol));
            else
                for (unsigned i = j - depart_no; i < j; ++i)
                    data[i + depart_no] ^= data[i];
        }
    }
}
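
// Note (added for exposition; not part of the original source): IFLT() and FLT()
// are the inverse and forward transforms in the proposed polynomial basis, per the
// comments above.  The `index` argument offsets the lookups into skewVec[], which
// roughly selects the block (coset) of field positions being transformed.
// encodeL()/encodeH() below run IFLT on one block and then FLT the result at other
// indices to evaluate the same polynomial over the remaining positions.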


//------------------------------------------------------------------------------
// FFT Initialization

static GFSymbol B[kFieldSize >> 1];     // factors used in formal derivative
static fwht_t log_walsh[kFieldSize];    // factors used in the evaluation of the error locator polynomial

// Initialize skewVec[], B[], log_walsh[]
static void InitFieldOperations()
{
    GFSymbol temp[kGFBits - 1];

    for (unsigned i = 1; i < kGFBits; ++i)
        temp[i - 1] = (GFSymbol)((unsigned)1 << i);

    for (unsigned m = 0; m < (kGFBits - 1); ++m)
    {
        const unsigned step = (unsigned)1 << (m + 1);

        skewVec[((unsigned)1 << m) - 1] = 0;

        for (unsigned i = m; i < (kGFBits - 1); ++i)
        {
            const unsigned s = ((unsigned)1 << (i + 1));

            for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step)
                skewVec[j + s] = skewVec[j] ^ temp[i];
        }

        temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])];

        for (unsigned i = m + 1; i < (kGFBits - 1); ++i)
            temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus);
    }

    for (unsigned i = 0; i < kFieldSize; ++i)
        skewVec[i] = GFLog[skewVec[i]];

    temp[0] = kFieldModulus - temp[0];

    for (unsigned i = 1; i < (kGFBits - 1); ++i)
        temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus;

    B[0] = 0;
    for (unsigned i = 0; i < (kGFBits - 1); ++i)
    {
        const unsigned depart = ((unsigned)1 << i);

        for (unsigned j = 0; j < depart; ++j)
            B[j + depart] = (B[j] + temp[i]) % kFieldModulus;
    }

    for (unsigned i = 0; i < kFieldSize; ++i)
        log_walsh[i] = GFLog[i];

    log_walsh[0] = 0;

    FWHT(log_walsh, kGFBits);
}


//------------------------------------------------------------------------------
// Encoder

// Encoding algorithm for k/n < 0.5: the message length k is a power of two
static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword)
{
    memcpy(codeword, data, sizeof(GFSymbol) * k);

    IFLT(codeword, k, 0);

    for (unsigned i = k; i < kFieldSize; i += k)
    {
        memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k);

        FLT(&codeword[i], k, i);
    }

    memcpy(codeword, data, sizeof(GFSymbol) * k);
}

// Encoding algorithm for k/n > 0.5: the parity count (n - k) is a power of two.
// data: message array.  parity: parity array.  mem: scratch buffer (size >= n - k)
static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem)
{
    const unsigned t = kFieldSize - k;

    memset(parity, 0, sizeof(GFSymbol) * t);

    for (unsigned i = t; i < kFieldSize; i += t)
    {
        memcpy(mem, &data[i - t], sizeof(GFSymbol) * t);

        IFLT(mem, t, i);

        xor_mem(parity, mem, t * sizeof(GFSymbol));
    }

    FLT(parity, t, 0);
}
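
// --- Usage sketch (added for exposition; not part of the original source) ---
// encodeH() expects the recovery count t = kFieldSize - k to be a power of two,
// as noted in main() below.  A minimal call, mirroring how test() invokes it
// (filling message[] with real data is omitted here):
static void encodeH_usage_example()
{
    static GFSymbol message[kFieldSize];    // last k entries hold the data symbols
    static GFSymbol scratch[kFieldSize];    // work buffer, must hold at least t symbols

    const unsigned k = kFieldSize / 2;      // t = kFieldSize - k is then a power of two

    // Parity is written over the first t entries of message[] here, exactly as in
    // test(); any other buffer of t symbols could receive it instead.
    encodeH(&message[kFieldSize - k], k, message, scratch);
}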


//------------------------------------------------------------------------------
// Decoder

static void decode(GFSymbol* codeword, unsigned k, const bool* erasure)
{
    fwht_t log_walsh2[kFieldSize];

    // Compute the evaluations of the error locator polynomial
    for (unsigned i = 0; i < kFieldSize; ++i)
        log_walsh2[i] = erasure[i] ? 1 : 0;

    FWHT(log_walsh2, kGFBits);

    for (unsigned i = 0; i < kFieldSize; ++i)
        log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus;

    FWHT(log_walsh2, kGFBits);

    // k2 should eventually be replaceable with k, but that does not work yet (see below)
    const unsigned k2 = kFieldSize;
    //const unsigned k2 = k; // cannot actually be replaced with k -- what else needs to change?

    for (unsigned i = 0; i < kFieldSize; ++i)
    {
        if (erasure[i])
        {
            codeword[i] = 0;
        }
        else
        {
            codeword[i] = mulE(codeword[i], log_walsh2[i]);
        }
    }

    IFLT(codeword, kFieldSize, 0);

    // Formal derivative, with the B[] factors applied before and after
    for (unsigned i = 0; i < kFieldSize; i += 2)
    {
        codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]);
        codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]);
    }

    formal_derivative(codeword, k2);

    for (unsigned i = 0; i < k2; i += 2)
    {
        codeword[i] = mulE(codeword[i], B[i >> 1]);
        codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]);
    }

    FLT(codeword, k2, 0);

    for (unsigned i = 0; i < k2; ++i)
    {
        if (erasure[i])
        {
            codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]);
        }
    }
}
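
/*
    Note (added for exposition; not part of the original source) -- rough outline
    of decode(), following the Lin et al. construction:

    1. log_walsh2[] starts as the 0/1 erasure indicator.  FWHT, a pointwise
       multiply by log_walsh[] (the transformed GFLog table), and a second FWHT
       compute, mod Q, the XOR-convolution: for each position i, the sum of
       GFLog[i ^ j] over erased positions j -- i.e. the logarithm of the error
       locator evaluated at i.
    2. Surviving symbols are scaled by those locator values with mulE(); erased
       symbols are zeroed.
    3. An inverse transform (IFLT), the formal derivative in the new basis (with
       the B[] factors applied before and after), and a forward transform (FLT)
       reconstruct the codeword, after which the erased positions are divided
       back out by their locator values (multiplication by
       kFieldModulus - log_walsh2[i]).
*/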


//------------------------------------------------------------------------------
// Test Application

void test(unsigned k, unsigned seed)
{
    srand(seed);

    //----------- Generating message ----------

    // Message array
    GFSymbol data[kFieldSize] = {0};

    // Fill the message with random numbers
    for (unsigned i = kFieldSize - k; i < kFieldSize; ++i)
        data[i] = (GFSymbol)rand();


    //----------- Encoding ----------

    GFSymbol codeword[kFieldSize];
    encodeH(&data[kFieldSize - k], k, data, codeword);
    //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change?

    memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize);


    //----------- Erasure simulation ----------

    // Array indicating erasures
    bool erasure[kFieldSize] = {
        false
    };

    for (unsigned i = k; i < kFieldSize; ++i)
        erasure[i] = true;

    // Randomly permute the erasure array (Fisher-Yates shuffle)
    for (unsigned i = kFieldSize - 1; i > 0; --i)
    {
        unsigned pos = rand() % (i + 1);

        if (i != pos)
        {
            bool tmp = erasure[i];
            erasure[i] = erasure[pos];
            erasure[pos] = tmp;
        }
    }

    // Zero out the erased codeword symbols
    for (unsigned i = 0; i < kFieldSize; ++i)
        if (erasure[i])
            codeword[i] = 0;


    //----------- Main processing ----------
    decode(codeword, k, erasure);

    // Check the correctness of the result
    for (unsigned i = 0; i < kFieldSize; ++i)
    {
        if (erasure[i])
        {
            if (data[i] != codeword[i])
            {
                printf("Decoding Error with seed = %u!\n", seed);
                LEO_DEBUG_BREAK;
                return;
            }
        }
    }

    //printf("Decoding is successful!\n");
}


//------------------------------------------------------------------------------
// Entrypoint

int main(int argc, char **argv)
{
    // Initialize architecture-specific code
    leo_architecture_init();

    // Fill GFLog table and GFExp table
    InitField();

    // Compute factors used in erasure decoder
    InitFieldOperations();

    unsigned seed = (unsigned)time(NULL);
    for (;;)
    {
        // test(k, seed), k: message size
        /*
            encodeH() works for k = kFieldSize / 2, kFieldSize * 3 / 4, etc.,
            such that the number of recovery pieces is a power of two
        */
        test(kFieldSize / 2, seed);

        ++seed;
    }

    return 0;
}


} // namespace leopard

@@ -0,0 +1,194 @@
/*
|
||||
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of Leopard-RS nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
/*
|
||||
TODO:
|
||||
+ Refactor software
|
||||
+ I think it should be split up into several C++ modules
|
||||
+ Replace GFSymbol with a file data pointer
|
||||
+ New 16-bit Muladd inner loops
|
||||
+ Class to contain the (large) muladd tables
|
||||
+ Preliminary benchmarks for large data!
|
||||
+ New 8-bit Muladd inner loops
|
||||
+ Benchmarks for smaller data!
|
||||
+ Write detailed comments for all the routines
|
||||
+ Look into getting EncodeL working so we can support smaller data (Ask Lin)
|
||||
+ Look into using k instead of k2 to speed up decoder (Ask Lin)
|
||||
+ Avoid performing FFT/IFFT intermediate calculations we're not going to use
|
||||
+ Benchmarks, fun!
|
||||
+ Add multi-threading to split up long parallelizable calculations
|
||||
+ Final benchmarks!
|
||||
+ Finish up documentation
|
||||
+ Release version 1
|
||||
|
||||
|
||||
Muladd implementation notes:
|
||||
|
||||
Specialize for 1-3 rows at a time since often times we're multiplying by
|
||||
the same (skew) value repeatedly, as the ISA-L library does here:
|
||||
|
||||
https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258
|
||||
|
||||
Except we should be doing it for 16-bit Galois Field.
|
||||
To implement that use the ALTMAP trick from Jerasure:
|
||||
|
||||
http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140
|
||||
|
||||
Except we should also support AVX2 since that is a 40% perf boost, so put
|
||||
the high and low bytes 32 bytes instead of 16 bytes apart.
|
||||
|
||||
Also I think we should go ahead and precompute the multiply tables since
|
||||
it avoids a bunch of memory lookups for each muladd, and only costs 8 MB.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Debug
|
||||
|
||||
// Some bugs only repro in release mode, so this can be helpful
|
||||
//#define LEO_DEBUG_IN_RELEASE
|
||||
|
||||
#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE)
|
||||
#define LEO_DEBUG
|
||||
#ifdef _WIN32
|
||||
#define LEO_DEBUG_BREAK __debugbreak()
|
||||
#else
|
||||
#define LEO_DEBUG_BREAK __builtin_trap()
|
||||
#endif
|
||||
#define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } }
|
||||
#else
|
||||
#define LEO_DEBUG_BREAK ;
|
||||
#define LEO_DEBUG_ASSERT(cond) ;
|
||||
#endif
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Platform/Architecture
|
||||
|
||||
#if defined(ANDROID) || defined(IOS)
|
||||
#define LEO_TARGET_MOBILE
|
||||
#endif // ANDROID
|
||||
|
||||
#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900)
|
||||
#define LEO_TRY_AVX2 /* 256-bit */
|
||||
#include <immintrin.h>
|
||||
#define LEO_ALIGN_BYTES 32
|
||||
#else // __AVX2__
|
||||
#define LEO_ALIGN_BYTES 16
|
||||
#endif // __AVX2__
|
||||
|
||||
#if !defined(LEO_TARGET_MOBILE)
|
||||
// Note: MSVC currently only supports SSSE3 but not AVX2
|
||||
#include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8
|
||||
#include <emmintrin.h> // SSE2
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
|
||||
#if defined(HAVE_ARM_NEON_H)
|
||||
#include <arm_neon.h>
|
||||
#endif // HAVE_ARM_NEON_H
|
||||
|
||||
#if defined(LEO_TARGET_MOBILE)
|
||||
|
||||
#define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */
|
||||
|
||||
# if defined(HAVE_ARM_NEON_H)
|
||||
// Compiler-specific 128-bit SIMD register keyword
|
||||
#define LEO_M128 uint8x16_t
|
||||
#define LEO_TRY_NEON
|
||||
#else
|
||||
#define LEO_M128 uint64_t
|
||||
# endif
|
||||
|
||||
#else // LEO_TARGET_MOBILE
|
||||
|
||||
// Compiler-specific 128-bit SIMD register keyword
|
||||
#define LEO_M128 __m128i
|
||||
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
|
||||
#ifdef LEO_TRY_AVX2
|
||||
// Compiler-specific 256-bit SIMD register keyword
|
||||
#define LEO_M256 __m256i
|
||||
#endif
|
||||
|
||||
// Compiler-specific C++11 restrict keyword
|
||||
#define LEO_RESTRICT __restrict
|
||||
|
||||
// Compiler-specific force inline keyword
|
||||
#ifdef _MSC_VER
|
||||
#define LEO_FORCE_INLINE inline __forceinline
|
||||
#else
|
||||
#define LEO_FORCE_INLINE inline __attribute__((always_inline))
|
||||
#endif
|
||||
|
||||
// Compiler-specific alignment keyword
|
||||
// Note: Alignment only matters for ARM NEON where it should be 16
|
||||
#ifdef _MSC_VER
|
||||
#define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES))
|
||||
#else // _MSC_VER
|
||||
#define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES)))
|
||||
#endif // _MSC_VER
|
||||
|
||||
|
||||
namespace leopard {
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Runtime CPU Architecture Check
|
||||
|
||||
// Initialize CPU architecture flags
|
||||
void InitializeCPUArch();
|
||||
|
||||
#if defined(LEO_TRY_NEON)
|
||||
# if defined(IOS) && defined(__ARM_NEON__)
|
||||
// Does device support NEON?
|
||||
static const bool CpuHasNeon = true;
|
||||
static const bool CpuHasNeon64 = true;
|
||||
# else
|
||||
// Does device support NEON?
|
||||
// Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
|
||||
extern bool CpuHasNeon; // V6 / V7
|
||||
extern bool CpuHasNeon64; // 64-bit
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if !defined(LEO_TARGET_MOBILE)
|
||||
# if defined(LEO_TRY_AVX2)
|
||||
// Does CPU support AVX2?
|
||||
extern bool CpuHasAVX2;
|
||||
# endif
|
||||
// Does CPU support SSSE3?
|
||||
extern bool CpuHasSSSE3;
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
|
||||
|
||||
} // namespace leopard
|
|
@ -1,8 +1,29 @@
|
|||
/*
|
||||
S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
|
||||
"Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
|
||||
IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
|
||||
http://ct.ee.ntust.edu.tw/it2016-2.pdf
|
||||
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of LHC-RS nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
@ -23,7 +44,7 @@
|
|||
+ New 8-bit Muladd inner loops
|
||||
+ Benchmarks for smaller data!
|
||||
+ Refactor software
|
||||
+ Pick a name for the software better than LHC_RS
|
||||
+ Pick a name for the software better than LEO_RS
|
||||
+ I think it should be split up into several C++ modules
|
||||
+ Write detailed comments for all the routines
|
||||
+ Look into getting EncodeL working so we can support smaller data (Ask Lin)
|
||||
|
@ -60,19 +81,19 @@
|
|||
// Debug
|
||||
|
||||
// Some bugs only repro in release mode, so this can be helpful
|
||||
//#define LHC_DEBUG_IN_RELEASE
|
||||
//#define LEO_DEBUG_IN_RELEASE
|
||||
|
||||
#if defined(_DEBUG) || defined(DEBUG) || defined(LHC_DEBUG_IN_RELEASE)
|
||||
#define LHC_DEBUG
|
||||
#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE)
|
||||
#define LEO_DEBUG
|
||||
#ifdef _WIN32
|
||||
#define LHC_DEBUG_BREAK __debugbreak()
|
||||
#define LEO_DEBUG_BREAK __debugbreak()
|
||||
#else
|
||||
#define LHC_DEBUG_BREAK __builtin_trap()
|
||||
#define LEO_DEBUG_BREAK __builtin_trap()
|
||||
#endif
|
||||
#define LHC_DEBUG_ASSERT(cond) { if (!(cond)) { LHC_DEBUG_BREAK; } }
|
||||
#define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } }
|
||||
#else
|
||||
#define LHC_DEBUG_BREAK ;
|
||||
#define LHC_DEBUG_ASSERT(cond) ;
|
||||
#define LEO_DEBUG_BREAK ;
|
||||
#define LEO_DEBUG_ASSERT(cond) ;
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -80,67 +101,67 @@
|
|||
// Platform/Architecture
|
||||
|
||||
#if defined(ANDROID) || defined(IOS)
|
||||
#define LHC_TARGET_MOBILE
|
||||
#define LEO_TARGET_MOBILE
|
||||
#endif // ANDROID
|
||||
|
||||
#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900)
|
||||
#define LHC_TRY_AVX2 /* 256-bit */
|
||||
#define LEO_TRY_AVX2 /* 256-bit */
|
||||
#include <immintrin.h>
|
||||
#define LHC_ALIGN_BYTES 32
|
||||
#define LEO_ALIGN_BYTES 32
|
||||
#else // __AVX2__
|
||||
#define LHC_ALIGN_BYTES 16
|
||||
#define LEO_ALIGN_BYTES 16
|
||||
#endif // __AVX2__
|
||||
|
||||
#if !defined(LHC_TARGET_MOBILE)
|
||||
#if !defined(LEO_TARGET_MOBILE)
|
||||
// Note: MSVC currently only supports SSSE3 but not AVX2
|
||||
#include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8
|
||||
#include <emmintrin.h> // SSE2
|
||||
#endif // LHC_TARGET_MOBILE
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
|
||||
#if defined(HAVE_ARM_NEON_H)
|
||||
#include <arm_neon.h>
|
||||
#endif // HAVE_ARM_NEON_H
|
||||
|
||||
#if defined(LHC_TARGET_MOBILE)
|
||||
#if defined(LEO_TARGET_MOBILE)
|
||||
|
||||
#define LHC_ALIGNED_ACCESSES /* Inputs must be aligned to LHC_ALIGN_BYTES */
|
||||
#define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */
|
||||
|
||||
# if defined(HAVE_ARM_NEON_H)
|
||||
// Compiler-specific 128-bit SIMD register keyword
|
||||
#define LHC_M128 uint8x16_t
|
||||
#define LHC_TRY_NEON
|
||||
#define LEO_M128 uint8x16_t
|
||||
#define LEO_TRY_NEON
|
||||
#else
|
||||
#define LHC_M128 uint64_t
|
||||
#define LEO_M128 uint64_t
|
||||
# endif
|
||||
|
||||
#else // LHC_TARGET_MOBILE
|
||||
#else // LEO_TARGET_MOBILE
|
||||
|
||||
// Compiler-specific 128-bit SIMD register keyword
|
||||
#define LHC_M128 __m128i
|
||||
#define LEO_M128 __m128i
|
||||
|
||||
#endif // LHC_TARGET_MOBILE
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
|
||||
#ifdef LHC_TRY_AVX2
|
||||
#ifdef LEO_TRY_AVX2
|
||||
// Compiler-specific 256-bit SIMD register keyword
|
||||
#define LHC_M256 __m256i
|
||||
#define LEO_M256 __m256i
|
||||
#endif
|
||||
|
||||
// Compiler-specific C++11 restrict keyword
|
||||
#define LHC_RESTRICT __restrict
|
||||
#define LEO_RESTRICT __restrict
|
||||
|
||||
// Compiler-specific force inline keyword
|
||||
#ifdef _MSC_VER
|
||||
#define LHC_FORCE_INLINE inline __forceinline
|
||||
#define LEO_FORCE_INLINE inline __forceinline
|
||||
#else
|
||||
#define LHC_FORCE_INLINE inline __attribute__((always_inline))
|
||||
#define LEO_FORCE_INLINE inline __attribute__((always_inline))
|
||||
#endif
|
||||
|
||||
// Compiler-specific alignment keyword
|
||||
// Note: Alignment only matters for ARM NEON where it should be 16
|
||||
#ifdef _MSC_VER
|
||||
#define LHC_ALIGNED __declspec(align(LHC_ALIGN_BYTES))
|
||||
#define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES))
|
||||
#else // _MSC_VER
|
||||
#define LHC_ALIGNED __attribute__((aligned(LHC_ALIGN_BYTES)))
|
||||
#define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES)))
|
||||
#endif // _MSC_VER
|
||||
|
||||
|
||||
|
@ -154,7 +175,7 @@
|
|||
#include <cpu-features.h>
|
||||
#endif
|
||||
|
||||
#if defined(LHC_TRY_NEON)
|
||||
#if defined(LEO_TRY_NEON)
|
||||
# if defined(IOS) && defined(__ARM_NEON__)
|
||||
// Requires iPhone 5S or newer
|
||||
static const bool CpuHasNeon = true;
|
||||
|
@ -167,14 +188,14 @@
|
|||
#endif
|
||||
|
||||
|
||||
#if !defined(LHC_TARGET_MOBILE)
|
||||
#if !defined(LEO_TARGET_MOBILE)
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h> // __cpuid
|
||||
#pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
|
||||
#endif
|
||||
|
||||
#ifdef LHC_TRY_AVX2
|
||||
#ifdef LEO_TRY_AVX2
|
||||
static bool CpuHasAVX2 = false;
|
||||
#endif
|
||||
static bool CpuHasSSSE3 = false;
|
||||
|
@ -219,12 +240,12 @@ static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type)
|
|||
#endif
|
||||
}
|
||||
|
||||
#endif // defined(LHC_TARGET_MOBILE)
|
||||
#endif // defined(LEO_TARGET_MOBILE)
|
||||
|
||||
|
||||
static void lhc_architecture_init()
|
||||
static void leo_architecture_init()
|
||||
{
|
||||
#if defined(LHC_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES)
|
||||
#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES)
|
||||
AndroidCpuFamily family = android_getCpuFamily();
|
||||
if (family == ANDROID_CPU_FAMILY_ARM)
|
||||
{
|
||||
|
@ -239,32 +260,32 @@ static void lhc_architecture_init()
|
|||
}
|
||||
#endif
|
||||
|
||||
#if !defined(LHC_TARGET_MOBILE)
|
||||
#if !defined(LEO_TARGET_MOBILE)
|
||||
unsigned int cpu_info[4];
|
||||
|
||||
_cpuid(cpu_info, 1);
|
||||
CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0);
|
||||
|
||||
#if defined(LHC_TRY_AVX2)
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
_cpuid(cpu_info, 7);
|
||||
CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0);
|
||||
#endif // LHC_TRY_AVX2
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
#endif // LHC_TARGET_MOBILE
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// SIMD-Safe Aligned Memory Allocations
|
||||
|
||||
static const unsigned kAlignmentBytes = LHC_ALIGN_BYTES;
|
||||
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
|
||||
|
||||
LHC_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
|
||||
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
|
||||
{
|
||||
return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
|
||||
}
|
||||
|
||||
static LHC_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
|
||||
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
|
||||
{
|
||||
uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
|
||||
if (!data)
|
||||
|
@ -275,7 +296,7 @@ static LHC_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
|
|||
return data;
|
||||
}
|
||||
|
||||
static LHC_FORCE_INLINE void SIMDSafeFree(void* ptr)
|
||||
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
|
||||
{
|
||||
if (!ptr)
|
||||
return;
|
||||
|
@ -283,7 +304,7 @@ static LHC_FORCE_INLINE void SIMDSafeFree(void* ptr)
|
|||
unsigned offset = data[-1];
|
||||
if (offset >= kAlignmentBytes)
|
||||
{
|
||||
LHC_DEBUG_BREAK; // Should never happen
|
||||
LEO_DEBUG_BREAK; // Should never happen
|
||||
return;
|
||||
}
|
||||
data -= kAlignmentBytes - offset;
|
||||
|
@ -294,9 +315,9 @@ static LHC_FORCE_INLINE void SIMDSafeFree(void* ptr)
|
|||
//------------------------------------------------------------------------------
|
||||
// Field
|
||||
|
||||
//#define LHC_SHORT_FIELD
|
||||
//#define LEO_SHORT_FIELD
|
||||
|
||||
#ifdef LHC_SHORT_FIELD
|
||||
#ifdef LEO_SHORT_FIELD
|
||||
typedef uint8_t GFSymbol;
|
||||
static const unsigned kGFBits = 8;
|
||||
static const unsigned kGFPolynomial = 0x11D;
|
||||
|
@ -386,7 +407,7 @@ static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b)
|
|||
}
|
||||
|
||||
// vx[] += vy[] * z
|
||||
static void muladd_mem(GFSymbol * LHC_RESTRICT vx, const GFSymbol * LHC_RESTRICT vy, GFSymbol z, unsigned symbolCount)
|
||||
static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount)
|
||||
{
|
||||
for (unsigned i = 0; i < symbolCount; ++i)
|
||||
{
|
||||
|
@ -443,12 +464,12 @@ static GFSymbol mulE(GFSymbol a, GFSymbol b)
|
|||
// Q is the maximum symbol value, e.g. 255 or 65535.
|
||||
|
||||
// Define this to enable the optimized version of FWHT()
|
||||
#define LHC_FWHT_OPTIMIZED
|
||||
#define LEO_FWHT_OPTIMIZED
|
||||
|
||||
typedef GFSymbol fwht_t;
|
||||
|
||||
// {a, b} = {a + b, a - b} (Mod Q)
|
||||
static LHC_FORCE_INLINE void FWHT_2(fwht_t& LHC_RESTRICT a, fwht_t& LHC_RESTRICT b)
|
||||
static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b)
|
||||
{
|
||||
const fwht_t sum = AddModQ(a, b);
|
||||
const fwht_t dif = SubModQ(a, b);
|
||||
|
@ -473,7 +494,7 @@ static LHC_FORCE_INLINE void FWHT_2(fwht_t& LHC_RESTRICT a, fwht_t& LHC_RESTRICT
|
|||
at too high a complexity cost relative to minor perf improvement.
|
||||
*/
|
||||
|
||||
#ifndef LHC_FWHT_OPTIMIZED
|
||||
#ifndef LEO_FWHT_OPTIMIZED
|
||||
|
||||
// Reference implementation
|
||||
static void FWHT(fwht_t* data, const unsigned bits)
|
||||
|
@ -487,7 +508,7 @@ static void FWHT(fwht_t* data, const unsigned bits)
|
|||
|
||||
#else
|
||||
|
||||
static LHC_FORCE_INLINE void FWHT_4(fwht_t* data)
|
||||
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data)
|
||||
{
|
||||
fwht_t t0 = data[0];
|
||||
fwht_t t1 = data[1];
|
||||
|
@ -503,7 +524,7 @@ static LHC_FORCE_INLINE void FWHT_4(fwht_t* data)
|
|||
data[3] = t3;
|
||||
}
|
||||
|
||||
static LHC_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s)
|
||||
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s)
|
||||
{
|
||||
unsigned x = 0;
|
||||
fwht_t t0 = data[x]; x += s;
|
||||
|
@ -683,26 +704,26 @@ static void FWHT(fwht_t* data, const unsigned ldn)
|
|||
//------------------------------------------------------------------------------
|
||||
// Memory Buffer XOR
|
||||
|
||||
static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsigned bytes)
|
||||
static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes)
|
||||
{
|
||||
LHC_M128 * LHC_RESTRICT x16 = reinterpret_cast<LHC_M128 *>(vx);
|
||||
const LHC_M128 * LHC_RESTRICT y16 = reinterpret_cast<const LHC_M128 *>(vy);
|
||||
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
|
||||
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
|
||||
|
||||
#if defined(LHC_TARGET_MOBILE)
|
||||
# if defined(LHC_TRY_NEON)
|
||||
#if defined(LEO_TARGET_MOBILE)
|
||||
# if defined(LEO_TRY_NEON)
|
||||
// Handle multiples of 64 bytes
|
||||
if (CpuHasNeon)
|
||||
{
|
||||
while (bytes >= 64)
|
||||
{
|
||||
LHC_M128 x0 = vld1q_u8(x16);
|
||||
LHC_M128 x1 = vld1q_u8(x16 + 1);
|
||||
LHC_M128 x2 = vld1q_u8(x16 + 2);
|
||||
LHC_M128 x3 = vld1q_u8(x16 + 3);
|
||||
LHC_M128 y0 = vld1q_u8(y16);
|
||||
LHC_M128 y1 = vld1q_u8(y16 + 1);
|
||||
LHC_M128 y2 = vld1q_u8(y16 + 2);
|
||||
LHC_M128 y3 = vld1q_u8(y16 + 3);
|
||||
LEO_M128 x0 = vld1q_u8(x16);
|
||||
LEO_M128 x1 = vld1q_u8(x16 + 1);
|
||||
LEO_M128 x2 = vld1q_u8(x16 + 2);
|
||||
LEO_M128 x3 = vld1q_u8(x16 + 3);
|
||||
LEO_M128 y0 = vld1q_u8(y16);
|
||||
LEO_M128 y1 = vld1q_u8(y16 + 1);
|
||||
LEO_M128 y2 = vld1q_u8(y16 + 2);
|
||||
LEO_M128 y3 = vld1q_u8(y16 + 3);
|
||||
|
||||
vst1q_u8(x16, veorq_u8(x0, y0));
|
||||
vst1q_u8(x16 + 1, veorq_u8(x1, y1));
|
||||
|
@ -715,8 +736,8 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
|
|||
// Handle multiples of 16 bytes
|
||||
while (bytes >= 16)
|
||||
{
|
||||
LHC_M128 x0 = vld1q_u8(x16);
|
||||
LHC_M128 y0 = vld1q_u8(y16);
|
||||
LEO_M128 x0 = vld1q_u8(x16);
|
||||
LEO_M128 y0 = vld1q_u8(y16);
|
||||
|
||||
vst1q_u8(x16, veorq_u8(x0, y0));
|
||||
|
||||
|
@ -724,38 +745,38 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
|
|||
}
|
||||
}
|
||||
else
|
||||
# endif // LHC_TRY_NEON
|
||||
# endif // LEO_TRY_NEON
|
||||
{
|
||||
uint64_t * LHC_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
|
||||
const uint64_t * LHC_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
|
||||
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
|
||||
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
|
||||
|
||||
const unsigned count = (unsigned)bytes / 8;
|
||||
for (unsigned ii = 0; ii < count; ++ii)
|
||||
x8[ii] ^= y8[ii];
|
||||
|
||||
x16 = reinterpret_cast<LHC_M128 *>(x8 + count);
|
||||
y16 = reinterpret_cast<const LHC_M128 *>(y8 + count);
|
||||
x16 = reinterpret_cast<LEO_M128 *>(x8 + count);
|
||||
y16 = reinterpret_cast<const LEO_M128 *>(y8 + count);
|
||||
}
|
||||
#else // LHC_TARGET_MOBILE
|
||||
# if defined(LHC_TRY_AVX2)
|
||||
#else // LEO_TARGET_MOBILE
|
||||
# if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
LHC_M256 * LHC_RESTRICT x32 = reinterpret_cast<LHC_M256 *>(x16);
|
||||
const LHC_M256 * LHC_RESTRICT y32 = reinterpret_cast<const LHC_M256 *>(y16);
|
||||
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(x16);
|
||||
const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(y16);
|
||||
|
||||
while (bytes >= 128)
|
||||
{
|
||||
LHC_M256 x0 = _mm256_loadu_si256(x32);
|
||||
LHC_M256 y0 = _mm256_loadu_si256(y32);
|
||||
LEO_M256 x0 = _mm256_loadu_si256(x32);
|
||||
LEO_M256 y0 = _mm256_loadu_si256(y32);
|
||||
x0 = _mm256_xor_si256(x0, y0);
|
||||
LHC_M256 x1 = _mm256_loadu_si256(x32 + 1);
|
||||
LHC_M256 y1 = _mm256_loadu_si256(y32 + 1);
|
||||
LEO_M256 x1 = _mm256_loadu_si256(x32 + 1);
|
||||
LEO_M256 y1 = _mm256_loadu_si256(y32 + 1);
|
||||
x1 = _mm256_xor_si256(x1, y1);
|
||||
LHC_M256 x2 = _mm256_loadu_si256(x32 + 2);
|
||||
LHC_M256 y2 = _mm256_loadu_si256(y32 + 2);
|
||||
LEO_M256 x2 = _mm256_loadu_si256(x32 + 2);
|
||||
LEO_M256 y2 = _mm256_loadu_si256(y32 + 2);
|
||||
x2 = _mm256_xor_si256(x2, y2);
|
||||
LHC_M256 x3 = _mm256_loadu_si256(x32 + 3);
|
||||
LHC_M256 y3 = _mm256_loadu_si256(y32 + 3);
|
||||
LEO_M256 x3 = _mm256_loadu_si256(x32 + 3);
|
||||
LEO_M256 y3 = _mm256_loadu_si256(y32 + 3);
|
||||
x3 = _mm256_xor_si256(x3, y3);
|
||||
|
||||
_mm256_storeu_si256(x32, x0);
|
||||
|
@ -778,25 +799,25 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
|
|||
bytes -= 32, ++x32, ++y32;
|
||||
}
|
||||
|
||||
x16 = reinterpret_cast<LHC_M128 *>(x32);
|
||||
y16 = reinterpret_cast<const LHC_M128 *>(y32);
|
||||
x16 = reinterpret_cast<LEO_M128 *>(x32);
|
||||
y16 = reinterpret_cast<const LEO_M128 *>(y32);
|
||||
}
|
||||
else
|
||||
# endif // LHC_TRY_AVX2
|
||||
# endif // LEO_TRY_AVX2
|
||||
{
|
||||
while (bytes >= 64)
|
||||
{
|
||||
LHC_M128 x0 = _mm_loadu_si128(x16);
|
||||
LHC_M128 y0 = _mm_loadu_si128(y16);
|
||||
LEO_M128 x0 = _mm_loadu_si128(x16);
|
||||
LEO_M128 y0 = _mm_loadu_si128(y16);
|
||||
x0 = _mm_xor_si128(x0, y0);
|
||||
LHC_M128 x1 = _mm_loadu_si128(x16 + 1);
|
||||
LHC_M128 y1 = _mm_loadu_si128(y16 + 1);
|
||||
LEO_M128 x1 = _mm_loadu_si128(x16 + 1);
|
||||
LEO_M128 y1 = _mm_loadu_si128(y16 + 1);
|
||||
x1 = _mm_xor_si128(x1, y1);
|
||||
LHC_M128 x2 = _mm_loadu_si128(x16 + 2);
|
||||
LHC_M128 y2 = _mm_loadu_si128(y16 + 2);
|
||||
LEO_M128 x2 = _mm_loadu_si128(x16 + 2);
|
||||
LEO_M128 y2 = _mm_loadu_si128(y16 + 2);
|
||||
x2 = _mm_xor_si128(x2, y2);
|
||||
LHC_M128 x3 = _mm_loadu_si128(x16 + 3);
|
||||
LHC_M128 y3 = _mm_loadu_si128(y16 + 3);
|
||||
LEO_M128 x3 = _mm_loadu_si128(x16 + 3);
|
||||
LEO_M128 y3 = _mm_loadu_si128(y16 + 3);
|
||||
x3 = _mm_xor_si128(x3, y3);
|
||||
|
||||
_mm_storeu_si128(x16, x0);
|
||||
|
@ -807,7 +828,7 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
|
|||
bytes -= 64, x16 += 4, y16 += 4;
|
||||
}
|
||||
}
|
||||
#endif // LHC_TARGET_MOBILE
|
||||
#endif // LEO_TARGET_MOBILE
|
||||
|
||||
// Handle multiples of 16 bytes
|
||||
while (bytes >= 16)
|
||||
|
@ -821,15 +842,15 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
|
|||
bytes -= 16, ++x16, ++y16;
|
||||
}
|
||||
|
||||
uint8_t * LHC_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
|
||||
const uint8_t * LHC_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
|
||||
uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
|
||||
const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
|
||||
|
||||
// Handle a block of 8 bytes
|
||||
const unsigned eight = bytes & 8;
|
||||
if (eight)
|
||||
{
|
||||
uint64_t * LHC_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
|
||||
const uint64_t * LHC_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
|
||||
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
|
||||
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
|
||||
*x8 ^= *y8;
|
||||
}
|
||||
|
||||
|
@ -837,8 +858,8 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
|
|||
const unsigned four = bytes & 4;
|
||||
if (four)
|
||||
{
|
||||
uint32_t * LHC_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
|
||||
const uint32_t * LHC_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
|
||||
uint32_t * LEO_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
|
||||
const uint32_t * LEO_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
|
||||
*x4 ^= *y4;
|
||||
}
|
||||
|
||||
|
@ -1158,7 +1179,7 @@ void test(unsigned k, unsigned seed)
|
|||
if (data[i] != codeword[i])
|
||||
{
|
||||
printf("Decoding Error with seed = %d!\n", seed);
|
||||
LHC_DEBUG_BREAK;
|
||||
LEO_DEBUG_BREAK;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
@ -1174,7 +1195,7 @@ void test(unsigned k, unsigned seed)
|
|||
int main(int argc, char **argv)
|
||||
{
|
||||
// Initialize architecture-specific code
|
||||
lhc_architecture_init();
|
||||
leo_architecture_init();
|
||||
|
||||
// Fill GFLog table and GFExp table
|
||||
InitField();
|
|
@@ -0,0 +1,840 @@
|
|||
/*
|
||||
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of LHC-RS nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "LeopardFF8.h"
|
||||
|
||||
namespace leopard { namespace ff8 {
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Datatypes and Constants
|
||||
|
||||
// LFSR Polynomial that generates the field elements
|
||||
static const unsigned kPolynomial = 0x11D;
|
||||
|
||||
// Basis used for generating logarithm tables
|
||||
static const ffe_t kBasis[kBits] = {
|
||||
1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis
|
||||
// 1, 2, 4, 8, 16, 32, 64, 128 // Monomial basis
|
||||
};
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Field Operations
|
||||
|
||||
// Modulus for field operations
|
||||
static const ffe_t kModulus = 255;
|
||||
|
||||
// z = x + y (mod kModulus)
|
||||
static inline ffe_t AddMod(const ffe_t a, const ffe_t b)
|
||||
{
|
||||
const unsigned sum = (unsigned)a + b;
|
||||
|
||||
// Partial reduction step, allowing for kModulus to be returned
|
||||
return static_cast<ffe_t>(sum + (sum >> kBits));
|
||||
}
|
||||
|
||||
// z = x - y (mod kModulus)
|
||||
static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
|
||||
{
|
||||
const unsigned dif = (unsigned)a - b;
|
||||
|
||||
// Partial reduction step, allowing for kModulus to be returned
|
||||
return static_cast<ffe_t>(dif + (dif >> kBits));
|
||||
}
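// Worked example of the partial reduction above: with kBits = 8,
// AddMod(250, 10) computes sum = 260, sum >> 8 = 1, and the ffe_t cast keeps
// the low byte of 261, which is 5 = 260 mod 255. SubMod(3, 10) wraps dif to
// 0xFFFFFFF9, and the low byte of dif + (dif >> 8) is 0xF8 = 248 = -7 mod 255.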
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Logarithm Tables
|
||||
|
||||
static ffe_t LogLUT[kOrder];
|
||||
static ffe_t ExpLUT[kOrder];
|
||||
|
||||
|
||||
// Initialize LogLUT[], ExpLUT[]
|
||||
static void InitializeLogarithmTables()
|
||||
{
|
||||
// LFSR table generation:
|
||||
|
||||
unsigned state = 1;
|
||||
for (unsigned i = 0; i < kModulus; ++i)
|
||||
{
|
||||
ExpLUT[state] = static_cast<ffe_t>(i);
|
||||
state <<= 1;
|
||||
if (state >= kOrder)
|
||||
state ^= kPolynomial;
|
||||
}
|
||||
ExpLUT[0] = kModulus;
|
||||
|
||||
// Conversion to chosen basis:
|
||||
|
||||
LogLUT[0] = 0;
|
||||
for (unsigned i = 0; i < kBits; ++i)
|
||||
{
|
||||
const ffe_t basis = kBasis[i];
|
||||
const unsigned width = static_cast<unsigned>(1UL << i);
|
||||
|
||||
for (unsigned j = 0; j < width; ++j)
|
||||
LogLUT[j + width] = LogLUT[j] ^ basis;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < kOrder; ++i)
|
||||
LogLUT[i] = ExpLUT[LogLUT[i]];
|
||||
|
||||
for (unsigned i = 0; i < kOrder; ++i)
|
||||
ExpLUT[LogLUT[i]] = i;
|
||||
|
||||
ExpLUT[kModulus] = ExpLUT[0];
|
||||
}
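// After initialization, LogLUT[x] holds the discrete logarithm of a nonzero
// element x (in the chosen basis representation) and ExpLUT[] maps logarithms
// back to elements, so a product of nonzero elements can be computed as
// ExpLUT[AddMod(LogLUT[a], LogLUT[b])] -- see FFEMultiply() below.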
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
|
||||
|
||||
#if defined(LEO_FF8_FWHT_OPTIMIZED)
|
||||
|
||||
// {a, b} = {a + b, a - b} (Mod Q)
|
||||
static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
|
||||
{
|
||||
const ffe_t sum = AddMod(a, b);
|
||||
const ffe_t dif = SubMod(a, b);
|
||||
a = sum;
|
||||
b = dif;
|
||||
}
|
||||
|
||||
static LEO_FORCE_INLINE void FWHT_4(ffe_t* data)
|
||||
{
|
||||
ffe_t t0 = data[0];
|
||||
ffe_t t1 = data[1];
|
||||
ffe_t t2 = data[2];
|
||||
ffe_t t3 = data[3];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
data[0] = t0;
|
||||
data[1] = t1;
|
||||
data[2] = t2;
|
||||
data[3] = t3;
|
||||
}
|
||||
|
||||
static LEO_FORCE_INLINE void FWHT_4(ffe_t* data, unsigned s)
|
||||
{
|
||||
unsigned x = 0;
|
||||
ffe_t t0 = data[x]; x += s;
|
||||
ffe_t t1 = data[x]; x += s;
|
||||
ffe_t t2 = data[x]; x += s;
|
||||
ffe_t t3 = data[x];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
unsigned y = 0;
|
||||
data[y] = t0; y += s;
|
||||
data[y] = t1; y += s;
|
||||
data[y] = t2; y += s;
|
||||
data[y] = t3;
|
||||
}
|
||||
|
||||
static inline void FWHT_8(ffe_t* data)
|
||||
{
|
||||
ffe_t t0 = data[0];
|
||||
ffe_t t1 = data[1];
|
||||
ffe_t t2 = data[2];
|
||||
ffe_t t3 = data[3];
|
||||
ffe_t t4 = data[4];
|
||||
ffe_t t5 = data[5];
|
||||
ffe_t t6 = data[6];
|
||||
ffe_t t7 = data[7];
|
||||
FWHT_2(t0, t1);
|
||||
FWHT_2(t2, t3);
|
||||
FWHT_2(t4, t5);
|
||||
FWHT_2(t6, t7);
|
||||
FWHT_2(t0, t2);
|
||||
FWHT_2(t1, t3);
|
||||
FWHT_2(t4, t6);
|
||||
FWHT_2(t5, t7);
|
||||
FWHT_2(t0, t4);
|
||||
FWHT_2(t1, t5);
|
||||
FWHT_2(t2, t6);
|
||||
FWHT_2(t3, t7);
|
||||
data[0] = t0;
|
||||
data[1] = t1;
|
||||
data[2] = t2;
|
||||
data[3] = t3;
|
||||
data[4] = t4;
|
||||
data[5] = t5;
|
||||
data[6] = t6;
|
||||
data[7] = t7;
|
||||
}
|
||||
|
||||
// Decimation in time (DIT) version
|
||||
static void FWHT(ffe_t* data, const unsigned ldn)
|
||||
{
|
||||
const unsigned n = (1UL << ldn);
|
||||
|
||||
if (n <= 2)
|
||||
{
|
||||
if (n == 2)
|
||||
FWHT_2(data[0], data[1]);
|
||||
return;
|
||||
}
|
||||
|
||||
for (unsigned ldm = ldn; ldm > 3; ldm -= 2)
|
||||
{
|
||||
unsigned m = (1UL << ldm);
|
||||
unsigned m4 = (m >> 2);
|
||||
for (unsigned r = 0; r < n; r += m)
|
||||
for (unsigned j = 0; j < m4; j++)
|
||||
FWHT_4(data + j + r, m4);
|
||||
}
|
||||
|
||||
if (ldn & 1)
|
||||
{
|
||||
for (unsigned i0 = 0; i0 < n; i0 += 8)
|
||||
FWHT_8(data + i0);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (unsigned i0 = 0; i0 < n; i0 += 4)
|
||||
FWHT_4(data + i0);
|
||||
}
|
||||
}
|
||||
|
||||
#else // LEO_FF8_FWHT_OPTIMIZED
|
||||
|
||||
// Reference implementation
|
||||
void FWHT(ffe_t* data, const unsigned bits)
|
||||
{
|
||||
const unsigned size = (unsigned)(1UL << bits);
|
||||
for (unsigned width = 1; width < size; width <<= 1)
|
||||
for (unsigned i = 0; i < size; i += (width << 1))
|
||||
for (unsigned j = i; j < (width + i); ++j)
|
||||
FWHT_2(data[j], data[j + width]);
|
||||
}
|
||||
|
||||
#endif // LEO_FF8_FWHT_OPTIMIZED
|
||||
|
||||
// Transform specialized for the finite field order
|
||||
void FWHT(ffe_t data[kOrder])
|
||||
{
|
||||
FWHT(data, kBits);
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// XOR Memory
|
||||
|
||||
void xor_mem(
|
||||
void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy,
|
||||
unsigned bytes)
|
||||
{
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(vx);
|
||||
const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(vy);
|
||||
do
|
||||
{
|
||||
const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32));
|
||||
const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1));
|
||||
const LEO_M256 x2 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 2), _mm256_loadu_si256(y32 + 2));
|
||||
const LEO_M256 x3 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 3), _mm256_loadu_si256(y32 + 3));
|
||||
_mm256_storeu_si256(x32, x0);
|
||||
_mm256_storeu_si256(x32 + 1, x1);
|
||||
_mm256_storeu_si256(x32 + 2, x2);
|
||||
_mm256_storeu_si256(x32 + 3, x3);
|
||||
bytes -= 128, x32 += 4, y32 += 4;
|
||||
} while (bytes >= 128);
|
||||
if (bytes > 0)
|
||||
{
|
||||
const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32));
|
||||
const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1));
|
||||
_mm256_storeu_si256(x32, x0);
|
||||
_mm256_storeu_si256(x32 + 1, x1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
|
||||
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
|
||||
do
|
||||
{
|
||||
const LEO_M128 x0 = _mm_xor_si128(_mm_loadu_si128(x16), _mm_loadu_si128(y16));
|
||||
const LEO_M128 x1 = _mm_xor_si128(_mm_loadu_si128(x16 + 1), _mm_loadu_si128(y16 + 1));
|
||||
const LEO_M128 x2 = _mm_xor_si128(_mm_loadu_si128(x16 + 2), _mm_loadu_si128(y16 + 2));
|
||||
const LEO_M128 x3 = _mm_xor_si128(_mm_loadu_si128(x16 + 3), _mm_loadu_si128(y16 + 3));
|
||||
_mm_storeu_si128(x16, x0);
|
||||
_mm_storeu_si128(x16 + 1, x1);
|
||||
_mm_storeu_si128(x16 + 2, x2);
|
||||
_mm_storeu_si128(x16 + 3, x3);
|
||||
bytes -= 64, x16 += 4, y16 += 4;
|
||||
} while (bytes > 0);
|
||||
}
|
||||
|
||||
void xor_mem2(
|
||||
void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
|
||||
void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
|
||||
unsigned bytes)
|
||||
{
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *> (vx_0);
|
||||
const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<const LEO_M256 *>(vy_0);
|
||||
LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *> (vx_1);
|
||||
const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1);
|
||||
do
|
||||
{
|
||||
const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
|
||||
const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
|
||||
const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2));
|
||||
const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3));
|
||||
const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
|
||||
const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
|
||||
const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2));
|
||||
const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3));
|
||||
_mm256_storeu_si256(x32_0, x0_0);
|
||||
_mm256_storeu_si256(x32_0 + 1, x1_0);
|
||||
_mm256_storeu_si256(x32_0 + 2, x2_0);
|
||||
_mm256_storeu_si256(x32_0 + 3, x3_0);
|
||||
_mm256_storeu_si256(x32_1, x0_1);
|
||||
_mm256_storeu_si256(x32_1 + 1, x1_1);
|
||||
_mm256_storeu_si256(x32_1 + 2, x2_1);
|
||||
_mm256_storeu_si256(x32_1 + 3, x3_1);
|
||||
x32_0 += 4, y32_0 += 4;
|
||||
x32_1 += 4, y32_1 += 4;
|
||||
bytes -= 128;
|
||||
} while (bytes >= 128);
|
||||
if (bytes > 0)
|
||||
{
|
||||
const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
|
||||
const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
|
||||
const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
|
||||
const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
|
||||
_mm256_storeu_si256(x32_0, x0_0);
|
||||
_mm256_storeu_si256(x32_0 + 1, x1_0);
|
||||
_mm256_storeu_si256(x32_1, x0_1);
|
||||
_mm256_storeu_si256(x32_1 + 1, x1_1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *> (vx_0);
|
||||
const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<const LEO_M128 *>(vy_0);
|
||||
LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *> (vx_1);
|
||||
const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1);
|
||||
do
|
||||
{
|
||||
const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0));
|
||||
const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1));
|
||||
const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2));
|
||||
const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3));
|
||||
const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1));
|
||||
const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1));
|
||||
const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2));
|
||||
const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3));
|
||||
_mm_storeu_si128(x16_0, x0_0);
|
||||
_mm_storeu_si128(x16_0 + 1, x1_0);
|
||||
_mm_storeu_si128(x16_0 + 2, x2_0);
|
||||
_mm_storeu_si128(x16_0 + 3, x3_0);
|
||||
_mm_storeu_si128(x16_1, x0_1);
|
||||
_mm_storeu_si128(x16_1 + 1, x1_1);
|
||||
_mm_storeu_si128(x16_1 + 2, x2_1);
|
||||
_mm_storeu_si128(x16_1 + 3, x3_1);
|
||||
x16_0 += 4, y16_0 += 4;
|
||||
x16_1 += 4, y16_1 += 4;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
}
|
||||
|
||||
void xor_mem3(
|
||||
void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
|
||||
void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
|
||||
void * LEO_RESTRICT vx_2, const void * LEO_RESTRICT vy_2,
|
||||
unsigned bytes)
|
||||
{
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *> (vx_0);
|
||||
const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<const LEO_M256 *>(vy_0);
|
||||
LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *> (vx_1);
|
||||
const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1);
|
||||
LEO_M256 * LEO_RESTRICT x32_2 = reinterpret_cast<LEO_M256 *> (vx_2);
|
||||
const LEO_M256 * LEO_RESTRICT y32_2 = reinterpret_cast<const LEO_M256 *>(vy_2);
|
||||
do
|
||||
{
|
||||
const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
|
||||
const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
|
||||
const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2));
|
||||
const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3));
|
||||
const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
|
||||
const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
|
||||
const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2));
|
||||
const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3));
|
||||
const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2));
|
||||
const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1));
|
||||
const LEO_M256 x2_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 2), _mm256_loadu_si256(y32_2 + 2));
|
||||
const LEO_M256 x3_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 3), _mm256_loadu_si256(y32_2 + 3));
|
||||
_mm256_storeu_si256(x32_0, x0_0);
|
||||
_mm256_storeu_si256(x32_0 + 1, x1_0);
|
||||
_mm256_storeu_si256(x32_0 + 2, x2_0);
|
||||
_mm256_storeu_si256(x32_0 + 3, x3_0);
|
||||
_mm256_storeu_si256(x32_1, x0_1);
|
||||
_mm256_storeu_si256(x32_1 + 1, x1_1);
|
||||
_mm256_storeu_si256(x32_1 + 2, x2_1);
|
||||
_mm256_storeu_si256(x32_1 + 3, x3_1);
|
||||
_mm256_storeu_si256(x32_2, x0_2);
|
||||
_mm256_storeu_si256(x32_2 + 1, x1_2);
|
||||
_mm256_storeu_si256(x32_2 + 2, x2_2);
|
||||
_mm256_storeu_si256(x32_2 + 3, x3_2);
|
||||
x32_0 += 4, y32_0 += 4;
|
||||
x32_1 += 4, y32_1 += 4;
|
||||
x32_2 += 4, y32_2 += 4;
|
||||
bytes -= 128;
|
||||
} while (bytes >= 128);
|
||||
if (bytes > 0)
|
||||
{
|
||||
const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0));
|
||||
const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
|
||||
const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1));
|
||||
const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
|
||||
const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2));
|
||||
const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1));
|
||||
_mm256_storeu_si256(x32_0, x0_0);
|
||||
_mm256_storeu_si256(x32_0 + 1, x1_0);
|
||||
_mm256_storeu_si256(x32_1, x0_1);
|
||||
_mm256_storeu_si256(x32_1 + 1, x1_1);
|
||||
_mm256_storeu_si256(x32_2, x0_2);
|
||||
_mm256_storeu_si256(x32_2 + 1, x1_2);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *> (vx_0);
|
||||
const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<const LEO_M128 *>(vy_0);
|
||||
LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *> (vx_1);
|
||||
const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1);
|
||||
LEO_M128 * LEO_RESTRICT x16_2 = reinterpret_cast<LEO_M128 *> (vx_2);
|
||||
const LEO_M128 * LEO_RESTRICT y16_2 = reinterpret_cast<const LEO_M128 *>(vy_2);
|
||||
do
|
||||
{
|
||||
const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0));
|
||||
const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1));
|
||||
const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2));
|
||||
const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3));
|
||||
const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1));
|
||||
const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1));
|
||||
const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2));
|
||||
const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3));
|
||||
const LEO_M128 x0_2 = _mm_xor_si128(_mm_loadu_si128(x16_2), _mm_loadu_si128(y16_2));
|
||||
const LEO_M128 x1_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 1), _mm_loadu_si128(y16_2 + 1));
|
||||
const LEO_M128 x2_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 2), _mm_loadu_si128(y16_2 + 2));
|
||||
const LEO_M128 x3_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 3), _mm_loadu_si128(y16_2 + 3));
|
||||
_mm_storeu_si128(x16_0, x0_0);
|
||||
_mm_storeu_si128(x16_0 + 1, x1_0);
|
||||
_mm_storeu_si128(x16_0 + 2, x2_0);
|
||||
_mm_storeu_si128(x16_0 + 3, x3_0);
|
||||
_mm_storeu_si128(x16_1, x0_1);
|
||||
_mm_storeu_si128(x16_1 + 1, x1_1);
|
||||
_mm_storeu_si128(x16_1 + 2, x2_1);
|
||||
_mm_storeu_si128(x16_1 + 3, x3_1);
|
||||
_mm_storeu_si128(x16_2, x0_2);
|
||||
_mm_storeu_si128(x16_2 + 1, x1_2);
|
||||
_mm_storeu_si128(x16_2 + 2, x2_2);
|
||||
_mm_storeu_si128(x16_2 + 3, x3_2);
|
||||
x16_0 += 4, y16_0 += 4;
|
||||
x16_1 += 4, y16_1 += 4;
|
||||
x16_2 += 4, y16_2 += 4;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Multiplies
|
||||
|
||||
// We require memory to be aligned since the SIMD instructions benefit from
|
||||
// or require aligned accesses to the table data.
|
||||
struct {
|
||||
LEO_ALIGNED LEO_M128 Lo[256];
|
||||
LEO_ALIGNED LEO_M128 Hi[256];
|
||||
} Multiply128LUT;
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
struct {
|
||||
LEO_ALIGNED LEO_M256 Lo[256];
|
||||
LEO_ALIGNED LEO_M256 Hi[256];
|
||||
} Multiply256LUT;
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
// Returns a * b
|
||||
static ffe_t FFEMultiply(ffe_t a, ffe_t b)
|
||||
{
|
||||
if (a == 0 || b == 0)
|
||||
return 0;
|
||||
return ExpLUT[AddMod(LogLUT[a], LogLUT[b])];
|
||||
}
|
||||
|
||||
bool InitializeMultiplyTables()
|
||||
{
|
||||
// Reuse aligned self test buffers to load table data
|
||||
uint8_t* lo = m_SelfTestBuffers.A;
|
||||
uint8_t* hi = m_SelfTestBuffers.B;
|
||||
|
||||
for (int y = 0; y < 256; ++y)
|
||||
{
|
||||
for (unsigned char x = 0; x < 16; ++x)
|
||||
{
|
||||
lo[x] = FFEMultiply(x, static_cast<uint8_t>(y));
|
||||
hi[x] = FFEMultiply(x << 4, static_cast<uint8_t>(y));
|
||||
}
|
||||
|
||||
const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo);
|
||||
const LEO_M128 table_hi = _mm_loadu_si128((LEO_M128*)hi);
|
||||
_mm_storeu_si128(Multiply128LUT.Lo + y, table_lo);
|
||||
_mm_storeu_si128(Multiply128LUT.Hi + y, table_hi);
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
const LEO_M256 table_lo2 = _mm256_broadcastsi128_si256(table_lo);
|
||||
const LEO_M256 table_hi2 = _mm256_broadcastsi128_si256(table_hi);
|
||||
_mm256_storeu_si256(Multiply256LUT.Lo + y, table_lo2);
|
||||
_mm256_storeu_si256(Multiply256LUT.Hi + y, table_hi2);
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
}
|
||||
|
||||
return true;
|
||||
}
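// The kernels below use these tables to multiply 16 (SSSE3) or 32 (AVX2) bytes
// at a time: each byte x is split into nibbles, and since multiplication by a
// fixed m distributes over XOR, x * m = Lo[m][x & 15] ^ Hi[m][x >> 4], where
// the 16-entry tables hold Lo[m][n] = n * m and Hi[m][n] = (n << 4) * m.
// The PSHUFB/VPSHUFB shuffle performs all of the nibble lookups in a single
// instruction.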
|
||||
|
||||
// vx[] = vy[] * m
|
||||
void mul_mem_set(
|
||||
void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
if (m <= 1)
|
||||
{
|
||||
if (m == 1)
|
||||
memcpy(vx, vy, bytes);
|
||||
else
|
||||
memset(vx, 0, bytes);
|
||||
return;
|
||||
}
|
||||
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m);
|
||||
const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m);
|
||||
|
||||
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
|
||||
|
||||
LEO_M256 * LEO_RESTRICT z32 = reinterpret_cast<LEO_M256 *>(vx);
|
||||
const LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<const LEO_M256 *>(vy);
|
||||
|
||||
const unsigned count = bytes / 64;
|
||||
for (unsigned i = 0; i < count; ++i)
|
||||
{
|
||||
LEO_M256 x0 = _mm256_loadu_si256(x32 + i * 2);
|
||||
LEO_M256 l0 = _mm256_and_si256(x0, clr_mask);
|
||||
x0 = _mm256_srli_epi64(x0, 4);
|
||||
LEO_M256 h0 = _mm256_and_si256(x0, clr_mask);
|
||||
l0 = _mm256_shuffle_epi8(table_lo_y, l0);
|
||||
h0 = _mm256_shuffle_epi8(table_hi_y, h0);
|
||||
_mm256_storeu_si256(z32 + i * 2, _mm256_xor_si256(l0, h0));
|
||||
|
||||
LEO_M256 x1 = _mm256_loadu_si256(x32 + i * 2 + 1);
|
||||
LEO_M256 l1 = _mm256_and_si256(x1, clr_mask);
|
||||
x1 = _mm256_srli_epi64(x1, 4);
|
||||
LEO_M256 h1 = _mm256_and_si256(x1, clr_mask);
|
||||
l1 = _mm256_shuffle_epi8(table_lo_y, l1);
|
||||
h1 = _mm256_shuffle_epi8(table_hi_y, h1);
|
||||
_mm256_storeu_si256(z32 + i * 2 + 1, _mm256_xor_si256(l1, h1));
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m);
|
||||
const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m);
|
||||
|
||||
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
|
||||
|
||||
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *> (vx);
|
||||
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
|
||||
|
||||
do
|
||||
{
|
||||
LEO_M128 x3 = _mm_loadu_si128(y16 + 3);
|
||||
LEO_M128 l3 = _mm_and_si128(x3, clr_mask);
|
||||
x3 = _mm_srli_epi64(x3, 4);
|
||||
LEO_M128 h3 = _mm_and_si128(x3, clr_mask);
|
||||
l3 = _mm_shuffle_epi8(table_lo_y, l3);
|
||||
h3 = _mm_shuffle_epi8(table_hi_y, h3);
|
||||
|
||||
LEO_M128 x2 = _mm_loadu_si128(y16 + 2);
|
||||
LEO_M128 l2 = _mm_and_si128(x2, clr_mask);
|
||||
x2 = _mm_srli_epi64(x2, 4);
|
||||
LEO_M128 h2 = _mm_and_si128(x2, clr_mask);
|
||||
l2 = _mm_shuffle_epi8(table_lo_y, l2);
|
||||
h2 = _mm_shuffle_epi8(table_hi_y, h2);
|
||||
|
||||
LEO_M128 x1 = _mm_loadu_si128(y16 + 1);
|
||||
LEO_M128 l1 = _mm_and_si128(x1, clr_mask);
|
||||
x1 = _mm_srli_epi64(x1, 4);
|
||||
LEO_M128 h1 = _mm_and_si128(x1, clr_mask);
|
||||
l1 = _mm_shuffle_epi8(table_lo_y, l1);
|
||||
h1 = _mm_shuffle_epi8(table_hi_y, h1);
|
||||
|
||||
LEO_M128 x0 = _mm_loadu_si128(y16);
|
||||
LEO_M128 l0 = _mm_and_si128(x0, clr_mask);
|
||||
x0 = _mm_srli_epi64(x0, 4);
|
||||
LEO_M128 h0 = _mm_and_si128(x0, clr_mask);
|
||||
l0 = _mm_shuffle_epi8(table_lo_y, l0);
|
||||
h0 = _mm_shuffle_epi8(table_hi_y, h0);
|
||||
|
||||
_mm_storeu_si128(x16 + 3, _mm_xor_si128(l3, h3));
|
||||
_mm_storeu_si128(x16 + 2, _mm_xor_si128(l2, h2));
|
||||
_mm_storeu_si128(x16 + 1, _mm_xor_si128(l1, h1));
|
||||
_mm_storeu_si128(x16, _mm_xor_si128(l0, h0));
|
||||
|
||||
x16 += 4, y16 += 4;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
}
|
||||
|
||||
// vx0[] *= m, vx1[] *= m
|
||||
void mul_mem2_inplace(
|
||||
void * LEO_RESTRICT vx_0,
|
||||
void * LEO_RESTRICT vx_1,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
if (m <= 1)
|
||||
{
|
||||
if (m == 0)
|
||||
{
|
||||
memset(vx_0, 0, bytes);
|
||||
memset(vx_1, 0, bytes);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
#if defined(LEO_TRY_AVX2)
|
||||
if (CpuHasAVX2)
|
||||
{
|
||||
const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m);
|
||||
const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m);
|
||||
|
||||
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
|
||||
|
||||
LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(vx_0);
|
||||
LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(vx_1);
|
||||
|
||||
do
|
||||
{
|
||||
LEO_M256 x0_0 = _mm256_loadu_si256(x32_0 + 1);
|
||||
LEO_M256 l0_0 = _mm256_and_si256(x0_0, clr_mask);
|
||||
x0_0 = _mm256_srli_epi64(x0_0, 4);
|
||||
LEO_M256 h0_0 = _mm256_and_si256(x0_0, clr_mask);
|
||||
l0_0 = _mm256_shuffle_epi8(table_lo_y, l0_0);
|
||||
h0_0 = _mm256_shuffle_epi8(table_hi_y, h0_0);
|
||||
l0_0 = _mm256_xor_si256(l0_0, h0_0);
|
||||
|
||||
LEO_M256 x1_0 = _mm256_loadu_si256(x32_0);
|
||||
LEO_M256 l1_0 = _mm256_and_si256(x1_0, clr_mask);
|
||||
x1_0 = _mm256_srli_epi64(x1_0, 4);
|
||||
LEO_M256 h1_0 = _mm256_and_si256(x1_0, clr_mask);
|
||||
l1_0 = _mm256_shuffle_epi8(table_lo_y, l1_0);
|
||||
h1_0 = _mm256_shuffle_epi8(table_hi_y, h1_0);
|
||||
l1_0 = _mm256_xor_si256(l1_0, h1_0);
|
||||
|
||||
LEO_M256 x0_1 = _mm256_loadu_si256(x32_1 + 1);
|
||||
LEO_M256 l0_1 = _mm256_and_si256(x0_1, clr_mask);
|
||||
x0_1 = _mm256_srli_epi64(x0_1, 4);
|
||||
LEO_M256 h0_1 = _mm256_and_si256(x0_1, clr_mask);
|
||||
l0_1 = _mm256_shuffle_epi8(table_lo_y, l0_1);
|
||||
h0_1 = _mm256_shuffle_epi8(table_hi_y, h0_1);
|
||||
l0_1 = _mm256_xor_si256(l0_1, h0_1);
|
||||
|
||||
LEO_M256 x1_1 = _mm256_loadu_si256(x32_1);
|
||||
LEO_M256 l1_1 = _mm256_and_si256(x1_1, clr_mask);
|
||||
x1_1 = _mm256_srli_epi64(x1_1, 4);
|
||||
LEO_M256 h1_1 = _mm256_and_si256(x1_1, clr_mask);
|
||||
l1_1 = _mm256_shuffle_epi8(table_lo_y, l1_1);
|
||||
h1_1 = _mm256_shuffle_epi8(table_hi_y, h1_1);
|
||||
l1_1 = _mm256_xor_si256(l1_1, h1_1);
|
||||
|
||||
_mm256_storeu_si256(x32_0 + 1, l0_0);
|
||||
_mm256_storeu_si256(x32_0, l1_0);
|
||||
_mm256_storeu_si256(x32_1 + 1, l0_1);
|
||||
_mm256_storeu_si256(x32_1, l1_1);
|
||||
|
||||
x32_0 += 2;
|
||||
x32_1 += 2;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
return;
|
||||
}
|
||||
#endif // LEO_TRY_AVX2
|
||||
|
||||
const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m);
|
||||
const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m);
|
||||
|
||||
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
|
||||
|
||||
LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(vx_0);
|
||||
LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(vx_1);
|
||||
|
||||
do
|
||||
{
|
||||
LEO_M128 x3 = _mm_loadu_si128(x16_0 + 3);
|
||||
LEO_M128 l3 = _mm_and_si128(x3, clr_mask);
|
||||
x3 = _mm_srli_epi64(x3, 4);
|
||||
LEO_M128 h3 = _mm_and_si128(x3, clr_mask);
|
||||
l3 = _mm_shuffle_epi8(table_lo_y, l3);
|
||||
h3 = _mm_shuffle_epi8(table_hi_y, h3);
|
||||
|
||||
LEO_M128 x2 = _mm_loadu_si128(x16_0 + 2);
|
||||
LEO_M128 l2 = _mm_and_si128(x2, clr_mask);
|
||||
x2 = _mm_srli_epi64(x2, 4);
|
||||
LEO_M128 h2 = _mm_and_si128(x2, clr_mask);
|
||||
l2 = _mm_shuffle_epi8(table_lo_y, l2);
|
||||
h2 = _mm_shuffle_epi8(table_hi_y, h2);
|
||||
|
||||
LEO_M128 x1 = _mm_loadu_si128(x16_0 + 1);
|
||||
LEO_M128 l1 = _mm_and_si128(x1, clr_mask);
|
||||
x1 = _mm_srli_epi64(x1, 4);
|
||||
LEO_M128 h1 = _mm_and_si128(x1, clr_mask);
|
||||
l1 = _mm_shuffle_epi8(table_lo_y, l1);
|
||||
h1 = _mm_shuffle_epi8(table_hi_y, h1);
|
||||
|
||||
LEO_M128 x0 = _mm_loadu_si128(x16_0);
|
||||
LEO_M128 l0 = _mm_and_si128(x0, clr_mask);
|
||||
x0 = _mm_srli_epi64(x0, 4);
|
||||
LEO_M128 h0 = _mm_and_si128(x0, clr_mask);
|
||||
l0 = _mm_shuffle_epi8(table_lo_y, l0);
|
||||
h0 = _mm_shuffle_epi8(table_hi_y, h0);
|
||||
|
||||
_mm_storeu_si128(x16_0 + 3, _mm_xor_si128(l3, h3));
|
||||
_mm_storeu_si128(x16_0 + 2, _mm_xor_si128(l2, h2));
|
||||
_mm_storeu_si128(x16_0 + 1, _mm_xor_si128(l1, h1));
|
||||
_mm_storeu_si128(x16_0, _mm_xor_si128(l0, h0));
|
||||
|
||||
// Second buffer, processed with the same table lookups as the block above:
x3 = _mm_loadu_si128(x16_1 + 3);
l3 = _mm_and_si128(x3, clr_mask);
x3 = _mm_srli_epi64(x3, 4);
h3 = _mm_and_si128(x3, clr_mask);
l3 = _mm_shuffle_epi8(table_lo_y, l3);
h3 = _mm_shuffle_epi8(table_hi_y, h3);
x2 = _mm_loadu_si128(x16_1 + 2);
l2 = _mm_and_si128(x2, clr_mask);
x2 = _mm_srli_epi64(x2, 4);
h2 = _mm_and_si128(x2, clr_mask);
l2 = _mm_shuffle_epi8(table_lo_y, l2);
h2 = _mm_shuffle_epi8(table_hi_y, h2);
x1 = _mm_loadu_si128(x16_1 + 1);
l1 = _mm_and_si128(x1, clr_mask);
x1 = _mm_srli_epi64(x1, 4);
h1 = _mm_and_si128(x1, clr_mask);
l1 = _mm_shuffle_epi8(table_lo_y, l1);
h1 = _mm_shuffle_epi8(table_hi_y, h1);
x0 = _mm_loadu_si128(x16_1);
l0 = _mm_and_si128(x0, clr_mask);
x0 = _mm_srli_epi64(x0, 4);
h0 = _mm_and_si128(x0, clr_mask);
l0 = _mm_shuffle_epi8(table_lo_y, l0);
h0 = _mm_shuffle_epi8(table_hi_y, h0);
_mm_storeu_si128(x16_1 + 3, _mm_xor_si128(l3, h3));
_mm_storeu_si128(x16_1 + 2, _mm_xor_si128(l2, h2));
_mm_storeu_si128(x16_1 + 1, _mm_xor_si128(l1, h1));
_mm_storeu_si128(x16_1, _mm_xor_si128(l0, h0));
|
||||
|
||||
x16_0 += 4;
|
||||
x16_1 += 4;
|
||||
bytes -= 64;
|
||||
} while (bytes > 0);
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// FFT Operations
|
||||
|
||||
// x[] ^= y[] * m, y[] ^= x[]
|
||||
void mul_fft(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
|
||||
void mul_fft2(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
|
||||
void mul_fft3(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// IFFT Operations
|
||||
|
||||
// y[] ^= x[], x[] ^= y[] * m
|
||||
void mul_ifft(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
|
||||
void mul_ifft2(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
|
||||
void mul_ifft3(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
||||
ffe_t m, unsigned bytes)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// API
|
||||
|
||||
static bool IsInitialized = false;
|
||||
|
||||
bool Initialize()
|
||||
{
|
||||
if (IsInitialized)
|
||||
return true;
|
||||
|
||||
if (!CpuHasSSSE3)
|
||||
return false;
|
||||
|
||||
InitializeLogarithmTables();
|
||||
|
||||
IsInitialized = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
}} // namespace leopard::ff8
|
|
@@ -0,0 +1,157 @@
|
|||
/*
|
||||
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of Leopard-RS nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "LeopardCommon.h"
|
||||
|
||||
/*
|
||||
8-bit Finite Field Math
|
||||
|
||||
This finite field contains 256 elements and so each element is one byte.
|
||||
This library is designed for data that is a multiple of 64 bytes in size.
|
||||
*/
|
||||
|
||||
namespace leopard { namespace ff8 {
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Datatypes and Constants
|
||||
|
||||
// Finite field element type
|
||||
typedef uint8_t ffe_t;
|
||||
|
||||
// Number of bits per element
|
||||
static const unsigned kBits = 8;
|
||||
|
||||
// Finite field order: Number of elements in the field
|
||||
static const unsigned kOrder = 256;
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
|
||||
|
||||
// Define this to enable the optimized version of FWHT()
|
||||
#define LEO_FF8_FWHT_OPTIMIZED
|
||||
|
||||
// Transform for a variable number of bits (up to kOrder)
|
||||
void FWHT(ffe_t* data, const unsigned bits);
|
||||
|
||||
// Transform specialized for the finite field order
|
||||
void FWHT(ffe_t data[kOrder]);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// XOR Memory
|
||||
|
||||
// x[] ^= y[]
|
||||
void xor_mem(
|
||||
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
|
||||
unsigned bytes);
|
||||
|
||||
// For i = {0, 1}: x_i[] ^= y_i[]
|
||||
void xor_mem2(
|
||||
void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
|
||||
unsigned bytes);
|
||||
|
||||
// For i = {0, 1, 2}: x_i[] ^= y_i[]
|
||||
void xor_mem3(
|
||||
void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
|
||||
void * LEO_RESTRICT x_2, const void * LEO_RESTRICT y_2,
|
||||
unsigned bytes);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Multiplies
|
||||
|
||||
// x[] = y[] * m
|
||||
void mul_mem_set(
|
||||
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
// For i = {0, 1}: x_i[] *= m
|
||||
void mul_mem2_inplace(
|
||||
void * LEO_RESTRICT x_0,
|
||||
void * LEO_RESTRICT x_1,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// FFT Operations
|
||||
|
||||
// x[] ^= y[] * m, y[] ^= x[]
|
||||
void mul_fft(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
|
||||
void mul_fft2(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
|
||||
void mul_fft3(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// IFFT Operations
|
||||
|
||||
// y[] ^= x[], x[] ^= y[] * m
|
||||
void mul_ifft(
|
||||
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
|
||||
void mul_ifft2(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
|
||||
void mul_ifft3(
|
||||
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
|
||||
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
|
||||
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
|
||||
ffe_t m, unsigned bytes);
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// API
|
||||
|
||||
// Returns false if the self-test fails
|
||||
bool Initialize();
|
||||
|
||||
|
||||
}} // namespace leopard::ff8
|
|
@@ -0,0 +1,29 @@
|
|||
BSD 3-Clause License
|
||||
|
||||
Copyright (c) 2017, Christopher A. Taylor
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
88
README.md
88
README.md
|
@@ -1,9 +1,91 @@
|
|||
# Lin-Han-Chung RS Codes
|
||||
This is an attempt at implementing a fast version of the algorithm described here:
|
||||
# Leopard-RS
|
||||
## Leopard Reed-Solomon Error Correction Codes in C
|
||||
|
||||
Leopard-RS is a portable, fast library for Forward Error Correction.
|
||||
From a block of equally sized original data pieces, it generates recovery
|
||||
symbols that can be used to recover lost original data.
|
||||
|
||||
* It requires that data pieces are all a fixed size, a multiple of 64 bytes.
|
||||
* The original and recovery data must not exceed 65536 pieces.
|
||||
|
||||
|
||||
#### Motivation:
|
||||
|
||||
It scales as O(N log N) with the input data size, and its inner loops are
|
||||
vectorized using the best approaches available on modern processors, using the
|
||||
fastest finite fields (8-bit or 16-bit Galois fields) for bulk data.
|
||||
|
||||
It sets new speed records for MDS encoding and decoding of large data.
|
||||
It is also the only open-source, production-ready software for this purpose
|
||||
available today.
|
||||
|
||||
Example applications are data recovery software and data center replication.
|
||||
|
||||
|
||||
#### Encoder API:
|
||||
|
||||
```
|
||||
#include "leopard.h"
|
||||
```
|
||||
|
||||
For full documentation please read `leopard.h`.
|
||||
|
||||
+ `leo_init()` : Initialize library.
|
||||
+ `leo_encode_work_count()` : Calculate the number of work_data buffers to provide to leo_encode().
|
||||
+ `leo_encode()`: Generate recovery data.
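
A minimal encoder sketch (illustrative only -- the block size and counts are
arbitrary, and real code should check every return value):

```
#include "leopard.h"
#include <cstdint>
#include <vector>

bool encode_example()
{
    if (0 != leo_init())
        return false; // Unsupported platform

    const unsigned buffer_bytes   = 6400; // Must be a multiple of 64
    const unsigned original_count = 100;
    const unsigned recovery_count = 10;

    const unsigned work_count = leo_encode_work_count(original_count, recovery_count);

    // Caller-allocated buffers, each buffer_bytes in size
    std::vector<std::vector<uint8_t>> original(original_count, std::vector<uint8_t>(buffer_bytes));
    std::vector<std::vector<uint8_t>> work(work_count, std::vector<uint8_t>(buffer_bytes));
    std::vector<void*> original_data(original_count), work_data(work_count);
    for (unsigned i = 0; i < original_count; ++i)
        original_data[i] = original[i].data();
    for (unsigned i = 0; i < work_count; ++i)
        work_data[i] = work[i].data();
    // ... fill original[] with the data to protect ...

    const LeopardResult result = leo_encode(
        buffer_bytes, original_count, recovery_count, work_count,
        original_data.data(), work_data.data(), LeopardFlags_Defaults);

    // On success the first recovery_count entries of work_data[] hold the recovery blocks
    return result == Leopard_Success;
}
```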
|
||||
|
||||
|
||||
#### Decoder API:
|
||||
|
||||
```
|
||||
#include "leopard.h"
|
||||
```
|
||||
|
||||
For full documentation please read `leopard.h`.
|
||||
|
||||
+ `leo_init()` : Initialize library.
|
||||
+ `leo_decode_work_count()` : Calculate the number of work_data buffers to provide to leo_decode().
|
||||
+ `leo_decode()` : Recover the original data.
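
A matching decoder sketch, continuing the illustrative setup from the encoder
example above (lost blocks are represented by NULL pointers):

```
// original_data[i] : block i of the original data, or NULL if it was lost
// recovery_data[i] : received recovery block i, or NULL if it was lost
// (both are std::vector<void*> of size original_count / recovery_count here)
const unsigned work_count = leo_decode_work_count(original_count, recovery_count);

std::vector<std::vector<uint8_t>> work(work_count, std::vector<uint8_t>(buffer_bytes));
std::vector<void*> work_data(work_count);
for (unsigned i = 0; i < work_count; ++i)
    work_data[i] = work[i].data();

const LeopardResult result = leo_decode(
    buffer_bytes, original_count, recovery_count, work_count,
    original_data.data(), recovery_data.data(), work_data.data(),
    LeopardFlags_Defaults);
// Leopard_Success means enough blocks survived and the lost data was reconstructed
```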
|
||||
|
||||
|
||||
#### Benchmarks:
|
||||
|
||||
```
|
||||
TODO
|
||||
```
|
||||
|
||||
|
||||
#### Comparisons:
|
||||
|
||||
```
|
||||
TODO
|
||||
```
|
||||
|
||||
|
||||
#### Background
|
||||
|
||||
This library implements an MDS erasure code introduced in this paper:
|
||||
|
||||
~~~
|
||||
S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
|
||||
"Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
|
||||
IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
|
||||
~~~
|
||||
Available here: [http://ct.ee.ntust.edu.tw/it2016-2.pdf](http://ct.ee.ntust.edu.tw/it2016-2.pdf)
|
||||
|
||||
The paper is available here: [http://ct.ee.ntust.edu.tw/it2016-2.pdf](http://ct.ee.ntust.edu.tw/it2016-2.pdf)
|
||||
And also mirrored in the /docs/ folder.
|
||||
|
||||
The high-level summary is that instead of using complicated fields,
|
||||
an additive FFT was introduced that works with familiar Galois fields for the first time.
|
||||
This is actually a huge new result that will change how Reed-Solomon codecs will be written.
|
||||
|
||||
My contribution is extending the ALTMAP approach from Jerasure
|
||||
for 16-bit Galois fields out to 64 bytes to enable AVX2 speedups,
|
||||
and marrying it with the row parallelism introduced by ISA-L.
|
||||
|
||||
|
||||
#### Credits
|
||||
|
||||
The idea is the brain-child of S.-J. Lin. He is a super bright guy who should be recognized more widely!
|
||||
|
||||
This software was written entirely by me (Christopher A. Taylor, mrcatid@gmail.com). If you find it useful and would like to buy me a coffee, consider tipping.
|
||||
|
|
|
@@ -0,0 +1,172 @@
|
|||
/*
|
||||
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of Leopard-RS nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "leopard.h"
|
||||
#include "FecalEncoder.h"
|
||||
#include "FecalDecoder.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Initialization API
|
||||
|
||||
static bool m_Initialized = false;
|
||||
|
||||
FECAL_EXPORT int fecal_init_(int version)
|
||||
{
|
||||
if (version != FECAL_VERSION)
|
||||
return Fecal_InvalidInput;
|
||||
|
||||
if (0 != gf256_init())
|
||||
return Fecal_Platform;
|
||||
|
||||
m_Initialized = true;
|
||||
return Fecal_Success;
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Encoder API
|
||||
|
||||
FECAL_EXPORT FecalEncoder fecal_encoder_create(unsigned input_count, void* const * const input_data, uint64_t total_bytes)
|
||||
{
|
||||
if (input_count <= 0 || !input_data || total_bytes < input_count)
|
||||
{
|
||||
FECAL_DEBUG_BREAK; // Invalid input
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
FECAL_DEBUG_ASSERT(m_Initialized); // Must call fecal_init() first
|
||||
if (!m_Initialized)
|
||||
return nullptr;
|
||||
|
||||
fecal::Encoder* encoder = new(std::nothrow) fecal::Encoder;
|
||||
if (!encoder)
|
||||
{
|
||||
FECAL_DEBUG_BREAK; // Out of memory
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (Fecal_Success != encoder->Initialize(input_count, input_data, total_bytes))
|
||||
{
|
||||
delete encoder;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return reinterpret_cast<FecalEncoder>( encoder );
|
||||
}
|
||||
|
||||
FECAL_EXPORT int fecal_encode(FecalEncoder encoder_v, FecalSymbol* symbol)
|
||||
{
|
||||
fecal::Encoder* encoder = reinterpret_cast<fecal::Encoder*>( encoder_v );
|
||||
if (!encoder || !symbol)
|
||||
return Fecal_InvalidInput;
|
||||
|
||||
return encoder->Encode(*symbol);
|
||||
}
|
||||
|
||||
FECAL_EXPORT void fecal_free(void* codec_v)
|
||||
{
|
||||
if (codec_v)
|
||||
{
|
||||
fecal::ICodec* icodec = reinterpret_cast<fecal::ICodec*>( codec_v );
|
||||
delete icodec;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Decoder API
|
||||
|
||||
FECAL_EXPORT FecalDecoder fecal_decoder_create(unsigned input_count, uint64_t total_bytes)
|
||||
{
|
||||
if (input_count <= 0 || total_bytes < input_count)
|
||||
{
|
||||
FECAL_DEBUG_BREAK; // Invalid input
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
FECAL_DEBUG_ASSERT(m_Initialized); // Must call fecal_init() first
|
||||
if (!m_Initialized)
|
||||
return nullptr;
|
||||
|
||||
fecal::Decoder* decoder = new(std::nothrow) fecal::Decoder;
|
||||
if (!decoder)
|
||||
{
|
||||
FECAL_DEBUG_BREAK; // Out of memory
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (Fecal_Success != decoder->Initialize(input_count, total_bytes))
|
||||
{
|
||||
delete decoder;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return reinterpret_cast<FecalDecoder>( decoder );
|
||||
}
|
||||
|
||||
FECAL_EXPORT int fecal_decoder_add_original(FecalDecoder decoder_v, const FecalSymbol* symbol)
|
||||
{
|
||||
fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
|
||||
if (!decoder || !symbol)
|
||||
return Fecal_InvalidInput;
|
||||
|
||||
return decoder->AddOriginal(*symbol);
|
||||
}
|
||||
|
||||
FECAL_EXPORT int fecal_decoder_add_recovery(FecalDecoder decoder_v, const FecalSymbol* symbol)
|
||||
{
|
||||
fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
|
||||
if (!decoder || !symbol)
|
||||
return Fecal_InvalidInput;
|
||||
|
||||
return decoder->AddRecovery(*symbol);
|
||||
}
|
||||
|
||||
FECAL_EXPORT int fecal_decode(FecalDecoder decoder_v, RecoveredSymbols* symbols)
|
||||
{
|
||||
fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
|
||||
if (!decoder || !symbols)
|
||||
return Fecal_InvalidInput;
|
||||
|
||||
return decoder->Decode(*symbols);
|
||||
}
|
||||
|
||||
FECAL_EXPORT int fecal_decoder_get(FecalDecoder decoder_v, unsigned input_index, FecalSymbol* symbol)
|
||||
{
|
||||
fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
|
||||
if (!decoder || !symbol)
|
||||
return Fecal_InvalidInput;
|
||||
|
||||
return decoder->GetOriginal(input_index, *symbol);
|
||||
}
|
||||
|
||||
|
||||
} // extern "C"
|
|
@@ -0,0 +1,229 @@
|
|||
/*
|
||||
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of Leopard-RS nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef CAT_LEOPARD_RS_H
|
||||
#define CAT_LEOPARD_RS_H
|
||||
|
||||
/*
|
||||
Leopard-RS: Reed-Solomon Error Correction Coding for Extremely Large Data
|
||||
|
||||
S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
|
||||
"Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
|
||||
IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
|
||||
http://ct.ee.ntust.edu.tw/it2016-2.pdf
|
||||
*/
|
||||
|
||||
// Library version
|
||||
#define LEO_VERSION 1
|
||||
|
||||
// Tweak if the functions are exported or statically linked
|
||||
//#define LEO_DLL /* Defined when building/linking as DLL */
|
||||
//#define LEO_BUILDING /* Defined by the library makefile */
|
||||
|
||||
#if defined(LEO_BUILDING)
|
||||
# if defined(LEO_DLL)
|
||||
#define LEO_EXPORT __declspec(dllexport)
|
||||
# else
|
||||
#define LEO_EXPORT
|
||||
# endif
|
||||
#else
|
||||
# if defined(LEO_DLL)
|
||||
#define LEO_EXPORT __declspec(dllimport)
|
||||
# else
|
||||
#define LEO_EXPORT extern
|
||||
# endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Initialization API
|
||||
|
||||
/*
|
||||
leo_init()
|
||||
|
||||
Perform static initialization for the library, verifying that the platform
|
||||
is supported.
|
||||
|
||||
Returns 0 on success and other values on failure.
|
||||
*/
|
||||
|
||||
LEO_EXPORT int leo_init_(int version);
|
||||
#define leo_init() leo_init_(LEO_VERSION)
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Shared Constants / Datatypes
|
||||
|
||||
// Results
|
||||
typedef enum LeopardResultT
|
||||
{
|
||||
Leopard_Success = 0, // Operation succeeded
|
||||
|
||||
Leopard_TooMuchData = -1, // Buffer counts are too high
|
||||
Leopard_InvalidBlockSize = -2, // Buffer size must be a multiple of 64 bytes
|
||||
Leopard_InvalidInput = -3, // A function parameter was invalid
|
||||
Leopard_Platform = -4, // Platform is unsupported
|
||||
Leopard_OutOfMemory = -5, // Out of memory error occurred
|
||||
Leopard_Unexpected = -6, // Unexpected error - Software bug?
|
||||
} LeopardResult;
|
||||
|
||||
// Flags
|
||||
typedef enum LeopardFlagsT
|
||||
{
|
||||
LeopardFlags_Defaults = 0, // Default settings
|
||||
|
||||
LeopardFlags_Multithreaded = 1, // Enable multiple threads
|
||||
} LeopardFlags;
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Encoder API
|
||||
|
||||
/*
|
||||
leo_encode_work_count()
|
||||
|
||||
Calculate the number of work_data buffers to provide to leo_encode().
|
||||
|
||||
The sum of original_count + recovery_count must not exceed 65536.
|
||||
|
||||
Returns the work_count value to pass into leo_encode().
|
||||
Returns 0 on invalid input.
|
||||
*/
|
||||
|
||||
LEO_EXPORT unsigned leo_encode_work_count(
|
||||
unsigned original_count,
|
||||
unsigned recovery_count);
|
||||
|
||||
/*
|
||||
leo_encode()
|
||||
|
||||
Generate recovery data.
|
||||
|
||||
original_count: Number of original_data[] buffers provided.
|
||||
recovery_count: Number of desired recovery data buffers.
|
||||
buffer_bytes: Number of bytes in each data buffer.
|
||||
original_data: Array of pointers to original data buffers.
|
||||
work_count: Number of work_data[] buffers, from leo_encode_work_count().
|
||||
work_data: Array of pointers to work data buffers.
|
||||
flags: Flags for encoding e.g. LeopardFlags_Multithreaded
|
||||
|
||||
The sum of original_count + recovery_count must not exceed 65536.
|
||||
The buffer_bytes must be a multiple of 64.
|
||||
Each buffer should have the same number of bytes.
|
||||
Even the last piece must be rounded up to the block size.
|
||||
|
||||
Let buffer_bytes = The number of bytes in each buffer:
|
||||
|
||||
original_count = static_cast<unsigned>(
|
||||
((uint64_t)total_bytes + buffer_bytes - 1) / buffer_bytes);
|
||||
|
||||
Or if the number of pieces is known:
|
||||
|
||||
buffer_bytes = static_cast<unsigned>(
|
||||
((uint64_t)total_bytes + original_count - 1) / original_count);
|
||||
|
||||
Returns Leopard_Success on success.
|
||||
The first set of recovery_count buffers in work_data will be the result.
|
||||
|
||||
Returns Leopard_TooMuchData if the data is too large.
|
||||
Returns Leopard_InvalidBlockSize if the data is the wrong size.
|
||||
Returns Leopard_InvalidInput on invalid input.
|
||||
Returns other values on errors.
|
||||
*/
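// Worked example of the sizing math above (illustrative numbers): to protect
// total_bytes = 1,000,000 with buffer_bytes = 64,000 (a multiple of 64),
// original_count = (1,000,000 + 64,000 - 1) / 64,000 = 16 buffers, and the
// last buffer is padded up to the full 64,000 bytes.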
|
||||
LEO_EXPORT LeopardResult leo_encode(
|
||||
unsigned buffer_bytes, // Number of bytes in each data buffer
|
||||
unsigned original_count, // Number of original_data[] buffer pointers
|
||||
unsigned recovery_count, // Number of recovery_data[] buffer pointers
|
||||
unsigned work_count, // Number of work_data[] buffer pointers, from leo_encode_work_count()
|
||||
void* const * const original_data, // Array of pointers to original data buffers
|
||||
void** work_data, // Array of work buffers
|
||||
unsigned flags); // Operation flags


//------------------------------------------------------------------------------
// Decoder API

/*
    leo_decode_work_count()

    Calculate the number of work_data buffers to provide to leo_decode().

    The sum of original_count + recovery_count must not exceed 65536.

    Returns the work_count value to pass into leo_decode().
    Returns 0 on invalid input.
*/

LEO_EXPORT unsigned leo_decode_work_count(
    unsigned original_count,
    unsigned recovery_count);

/*
    leo_decode()

    Decode original data from recovery data.

    buffer_bytes:   Number of bytes in each data buffer.
    original_count: Number of original_data[] buffers provided.
    recovery_count: Number of recovery_data[] buffers provided.
    work_count:     Number of work_data[] buffers, from leo_decode_work_count().
    original_data:  Array of pointers to original data buffers.
    recovery_data:  Array of pointers to recovery data buffers.
    work_data:      Array of pointers to work data buffers.
    flags:          Flags for decoding, e.g. LeopardFlags_Multithreaded.

    Lost original/recovery data should be set to NULL.

    The sum of recovery_count and the number of non-NULL original buffers must
    be at least original_count in order to perform recovery.

    Returns Leopard_Success on success.
    Returns other values on errors.
*/
LEO_EXPORT LeopardResult leo_decode(
    unsigned buffer_bytes,              // Number of bytes in each data buffer
    unsigned original_count,            // Number of original_data[] buffer pointers
    unsigned recovery_count,            // Number of recovery_data[] buffer pointers
    unsigned work_count,                // Number of buffer pointers in work_data[]
    void* const * const original_data,  // Array of original data buffers
    void* const * const recovery_data,  // Array of recovery data buffers
    void** work_data,                   // Array of work data buffers
    unsigned flags);                    // Operation flags
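
/*
    Example (illustrative sketch only, not part of the API): `original` and
    `recovery` are arrays of buffer pointers as used with leo_encode() above;
    every buffer that was lost has had its pointer set to NULL. The variable
    names are assumptions for the example.

        const unsigned work_count =
            leo_decode_work_count(original_count, recovery_count);

        std::vector<void*> work(work_count);
        for (unsigned i = 0; i < work_count; ++i)
            work[i] = malloc(buffer_bytes);

        const LeopardResult r = leo_decode(
            buffer_bytes, original_count, recovery_count, work_count,
            (void**)&original[0], (void**)&recovery[0], &work[0],
            LeopardFlags_Defaults);

        // On Leopard_Success, the recovered contents of each original buffer
        // that was passed in as NULL are expected in the work buffer with the
        // same index.
*/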


#ifdef __cplusplus
}
#endif


#endif // CAT_LEOPARD_RS_H

@ -1,9 +1,11 @@
|
|||
|
||||
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||
# Visual Studio 14
|
||||
VisualStudioVersion = 14.0.25420.1
|
||||
# Visual Studio 15
|
||||
VisualStudioVersion = 15.0.26127.3
|
||||
MinimumVisualStudioVersion = 10.0.40219.1
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LHC_RS", "LHC_RS.vcxproj", "{32176592-2F30-4BD5-B645-EB11C8D3453E}"
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Leopard", "Leopard.vcxproj", "{32176592-2F30-4BD5-B645-EB11C8D3453E}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LeopardBenchmark", "..\tests\proj\Benchmark.vcxproj", "{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
|
@ -21,6 +23,14 @@ Global
|
|||
{32176592-2F30-4BD5-B645-EB11C8D3453E}.Release|Win32.Build.0 = Release|Win32
|
||||
{32176592-2F30-4BD5-B645-EB11C8D3453E}.Release|x64.ActiveCfg = Release|x64
|
||||
{32176592-2F30-4BD5-B645-EB11C8D3453E}.Release|x64.Build.0 = Release|x64
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|x64.Build.0 = Debug|x64
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|Win32.Build.0 = Release|Win32
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|x64.ActiveCfg = Release|x64
|
||||
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
|
@ -0,0 +1,193 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="..\leopard.h" />
|
||||
<ClInclude Include="..\LeopardCommon.h" />
|
||||
<ClInclude Include="..\LeopardDecoder.h" />
|
||||
<ClInclude Include="..\LeopardEncoder.h" />
|
||||
<ClInclude Include="..\LeopardFF8.h" />
|
||||
<ClInclude Include="..\LeopardFF16.h" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\leopard.cpp" />
|
||||
<ClCompile Include="..\LeopardCommon.cpp" />
|
||||
<ClCompile Include="..\LeopardDecoder.cpp" />
|
||||
<ClCompile Include="..\LeopardEncoder.cpp" />
|
||||
<ClCompile Include="..\LeopardFF8.cpp" />
|
||||
<ClCompile Include="..\LeopardFF16.cpp" />
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{32176592-2F30-4BD5-B645-EB11C8D3453E}</ProjectGuid>
|
||||
<RootNamespace>GF65536</RootNamespace>
|
||||
<ProjectName>Leopard</ProjectName>
|
||||
<WindowsTargetPlatformVersion>10.0.14393.0</WindowsTargetPlatformVersion>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>StaticLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>StaticLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>StaticLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>StaticLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
|
||||
<IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
|
||||
<IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
|
||||
<IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
|
||||
<IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||
<PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalLibraryDirectories>
|
||||
</AdditionalLibraryDirectories>
|
||||
</Link>
|
||||
<PostBuildEvent>
|
||||
<Command>
|
||||
</Command>
|
||||
</PostBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||
<PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalLibraryDirectories>
|
||||
</AdditionalLibraryDirectories>
|
||||
</Link>
|
||||
<PostBuildEvent>
|
||||
<Command>
|
||||
</Command>
|
||||
</PostBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
|
||||
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
|
||||
<OmitFramePointers>false</OmitFramePointers>
|
||||
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||
<BufferSecurityCheck>true</BufferSecurityCheck>
|
||||
<PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>
|
||||
</AdditionalLibraryDirectories>
|
||||
</Link>
|
||||
<PostBuildEvent>
|
||||
<Command>
|
||||
</Command>
|
||||
</PostBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
|
||||
<FavorSizeOrSpeed>Size</FavorSizeOrSpeed>
|
||||
<OmitFramePointers>false</OmitFramePointers>
|
||||
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||
<BufferSecurityCheck>true</BufferSecurityCheck>
|
||||
<PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>
|
||||
</AdditionalLibraryDirectories>
|
||||
</Link>
|
||||
<PostBuildEvent>
|
||||
<Command>
|
||||
</Command>
|
||||
</PostBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
|
@ -0,0 +1,57 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup>
|
||||
<Filter Include="Source Files">
|
||||
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
|
||||
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
|
||||
</Filter>
|
||||
<Filter Include="Header Files">
|
||||
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
|
||||
<Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
|
||||
</Filter>
|
||||
<Filter Include="Resource Files">
|
||||
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
|
||||
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
|
||||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="..\leopard.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\LeopardCommon.h">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\LeopardDecoder.h">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\LeopardEncoder.h">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\LeopardFF16.h">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\LeopardFF8.h">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\LeopardDecoder.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\LeopardEncoder.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\leopard.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\LeopardCommon.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\LeopardFF16.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\LeopardFF8.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
</Project>
|
|
@ -0,0 +1,567 @@
|
|||
/*
|
||||
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of Leopard nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "../LeopardCommon.h"
|
||||
#include "../leopard.h"
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
using namespace std;
|
||||
|
||||
//#define TEST_DATA_ALL_SAME
|
||||
//#define TEST_LOSE_FIRST_K_PACKETS
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Windows
|
||||
|
||||
#ifdef _WIN32
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
|
||||
#ifndef _WINSOCKAPI_
|
||||
#define DID_DEFINE_WINSOCKAPI
|
||||
#define _WINSOCKAPI_
|
||||
#endif
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#ifndef _WIN32_WINNT
|
||||
#define _WIN32_WINNT 0x0601 /* Windows 7+ */
|
||||
#endif
|
||||
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#ifdef DID_DEFINE_WINSOCKAPI
|
||||
#undef _WINSOCKAPI_
|
||||
#undef DID_DEFINE_WINSOCKAPI
|
||||
#endif
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Threads
|
||||
|
||||
static bool SetCurrentThreadPriority()
|
||||
{
|
||||
#ifdef _WIN32
|
||||
return 0 != ::SetThreadPriority(::GetCurrentThread(), THREAD_PRIORITY_ABOVE_NORMAL);
|
||||
#else
|
||||
return -1 != nice(2);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Timing
|
||||
|
||||
static uint64_t GetTimeUsec()
|
||||
{
|
||||
#ifdef _WIN32
|
||||
LARGE_INTEGER timeStamp = {};
|
||||
if (!::QueryPerformanceCounter(&timeStamp))
|
||||
return 0;
|
||||
static double PerfFrequencyInverse = 0.;
|
||||
if (PerfFrequencyInverse == 0.)
|
||||
{
|
||||
LARGE_INTEGER freq = {};
|
||||
if (!::QueryPerformanceFrequency(&freq) || freq.QuadPart == 0)
|
||||
return 0;
|
||||
PerfFrequencyInverse = 1000000. / (double)freq.QuadPart;
|
||||
}
|
||||
return (uint64_t)(PerfFrequencyInverse * timeStamp.QuadPart);
|
||||
#else
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, nullptr);
|
||||
return 1000000 * tv.tv_sec + tv.tv_usec;
|
||||
#endif // _WIN32
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// PCG PRNG
|
||||
// From http://www.pcg-random.org/
|
||||
|
||||
class PCGRandom
|
||||
{
|
||||
public:
|
||||
inline void Seed(uint64_t y, uint64_t x = 0)
|
||||
{
|
||||
State = 0;
|
||||
Inc = (y << 1u) | 1u;
|
||||
Next();
|
||||
State += x;
|
||||
Next();
|
||||
}
|
||||
|
||||
inline uint32_t Next()
|
||||
{
|
||||
const uint64_t oldstate = State;
|
||||
State = oldstate * UINT64_C(6364136223846793005) + Inc;
|
||||
const uint32_t xorshifted = (uint32_t)(((oldstate >> 18) ^ oldstate) >> 27);
|
||||
const uint32_t rot = oldstate >> 59;
|
||||
return (xorshifted >> rot) | (xorshifted << ((uint32_t)(-(int32_t)rot) & 31));
|
||||
}
|
||||
|
||||
uint64_t State = 0, Inc = 0;
|
||||
};
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Self-Checking Packet
|
||||
|
||||
static void WriteRandomSelfCheckingPacket(PCGRandom& prng, void* packet, unsigned bytes)
|
||||
{
|
||||
uint8_t* buffer = (uint8_t*)packet;
|
||||
#ifdef TEST_DATA_ALL_SAME
|
||||
if (bytes != 0)
|
||||
#else
|
||||
if (bytes < 16)
|
||||
#endif
|
||||
{
|
||||
LEO_DEBUG_ASSERT(bytes >= 2);
|
||||
buffer[0] = (uint8_t)prng.Next();
|
||||
for (unsigned i = 1; i < bytes; ++i)
|
||||
{
|
||||
buffer[i] = buffer[0];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
uint32_t crc = bytes;
|
||||
*(uint32_t*)(buffer + 4) = bytes;
|
||||
for (unsigned i = 8; i < bytes; ++i)
|
||||
{
|
||||
uint8_t v = (uint8_t)prng.Next();
|
||||
buffer[i] = v;
|
||||
crc = (crc << 3) | (crc >> (32 - 3));
|
||||
crc += v;
|
||||
}
|
||||
*(uint32_t*)buffer = crc;
|
||||
}
|
||||
}
|
||||
|
||||
static bool CheckPacket(const void* packet, unsigned bytes)
|
||||
{
|
||||
uint8_t* buffer = (uint8_t*)packet;
|
||||
#ifdef TEST_DATA_ALL_SAME
|
||||
if (bytes != 0)
|
||||
#else
|
||||
if (bytes < 16)
|
||||
#endif
|
||||
{
|
||||
if (bytes < 2)
|
||||
return false;
|
||||
|
||||
uint8_t v = buffer[0];
|
||||
for (unsigned i = 1; i < bytes; ++i)
|
||||
{
|
||||
if (buffer[i] != v)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
uint32_t crc = bytes;
|
||||
uint32_t readBytes = *(uint32_t*)(buffer + 4);
|
||||
if (readBytes != bytes)
|
||||
return false;
|
||||
for (unsigned i = 8; i < bytes; ++i)
|
||||
{
|
||||
uint8_t v = buffer[i];
|
||||
crc = (crc << 3) | (crc >> (32 - 3));
|
||||
crc += v;
|
||||
}
|
||||
uint32_t readCRC = *(uint32_t*)buffer;
|
||||
if (readCRC != crc)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// FunctionTimer
|
||||
|
||||
class FunctionTimer
{
public:
    FunctionTimer(const std::string& name)
    {
        FunctionName = name;
    }
    void BeginCall()
    {
        LEO_DEBUG_ASSERT(t0 == 0);
        t0 = GetTimeUsec();
    }
    void EndCall()
    {
        LEO_DEBUG_ASSERT(t0 != 0);
        uint64_t t1 = GetTimeUsec();
        ++Invocations;
        TotalUsec += t1 - t0;
        t0 = 0;
    }
    void Reset()
    {
        LEO_DEBUG_ASSERT(t0 == 0);
        t0 = 0;
        Invocations = 0;
        TotalUsec = 0;
    }
    void Print(unsigned trials)
    {
        cout << FunctionName << " called " << Invocations / (float)trials
             << " times per trial (avg). " << TotalUsec / (double)Invocations
             << " usec avg for all invocations. " << TotalUsec / (float)trials
             << " usec (avg) of " << trials << " trials" << endl;
    }

    uint64_t t0 = 0;
    uint64_t Invocations = 0;
    uint64_t TotalUsec = 0;
    std::string FunctionName;
};
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Utility: Deck Shuffling function
|
||||
|
||||
/*
|
||||
Given a PRNG, generate a deck of cards in a random order.
|
||||
The deck will contain elements with values between 0 and count - 1.
|
||||
*/
|
||||
|
||||
static void ShuffleDeck16(PCGRandom &prng, uint16_t * LEO_RESTRICT deck, uint32_t count)
|
||||
{
|
||||
deck[0] = 0;
|
||||
|
||||
// If we can unroll 4 times,
|
||||
if (count <= 256)
|
||||
{
|
||||
for (uint32_t ii = 1;;)
|
||||
{
|
||||
uint32_t jj, rv = prng.Next();
|
||||
|
||||
// 8-bit unroll
|
||||
switch (count - ii)
|
||||
{
|
||||
default:
|
||||
jj = (uint8_t)rv % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
jj = (uint8_t)(rv >> 8) % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
jj = (uint8_t)(rv >> 16) % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
jj = (uint8_t)(rv >> 24) % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
break;
|
||||
|
||||
case 3:
|
||||
jj = (uint8_t)rv % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
case 2:
|
||||
jj = (uint8_t)(rv >> 8) % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
case 1:
|
||||
jj = (uint8_t)(rv >> 16) % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
case 0:
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// For each deck entry,
|
||||
for (uint32_t ii = 1;;)
|
||||
{
|
||||
uint32_t jj, rv = prng.Next();
|
||||
|
||||
// 16-bit unroll
|
||||
switch (count - ii)
|
||||
{
|
||||
default:
|
||||
jj = (uint16_t)rv % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
jj = (uint16_t)(rv >> 16) % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
++ii;
|
||||
break;
|
||||
|
||||
case 1:
|
||||
jj = (uint16_t)rv % ii;
|
||||
deck[ii] = deck[jj];
|
||||
deck[jj] = ii;
|
||||
case 0:
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// SIMD-Safe Aligned Memory Allocations
|
||||
|
||||
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
|
||||
|
||||
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
|
||||
{
|
||||
return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
|
||||
}
|
||||
|
||||
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
|
||||
{
|
||||
uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
|
||||
if (!data)
|
||||
return nullptr;
|
||||
unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
|
||||
data += kAlignmentBytes - offset;
|
||||
data[-1] = (uint8_t)offset;
|
||||
return data;
|
||||
}
|
||||
|
||||
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
|
||||
{
|
||||
if (!ptr)
|
||||
return;
|
||||
uint8_t* data = (uint8_t*)ptr;
|
||||
unsigned offset = data[-1];
|
||||
if (offset >= kAlignmentBytes)
|
||||
{
|
||||
LEO_DEBUG_BREAK; // Should never happen
|
||||
return;
|
||||
}
|
||||
data -= kAlignmentBytes - offset;
|
||||
free(data);
|
||||
}
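
/*
    Illustrative note (not part of the benchmark logic): SIMDSafeAllocate()
    stores the alignment offset in the byte just before the pointer it
    returns, which is how SIMDSafeFree() recovers the original calloc()
    address. A minimal round trip looks like:

        uint8_t* block = SIMDSafeAllocate(1024);
        // ((uintptr_t)block % LEO_ALIGN_BYTES) == 0 at this point
        SIMDSafeFree(block);
*/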
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Tests
|
||||
|
||||
struct TestParameters
|
||||
{
|
||||
unsigned original_count = 200; // under 65536
|
||||
unsigned recovery_count = 100; // under 65536 - original_count
|
||||
unsigned buffer_bytes = 64000; // multiple of 64 bytes
|
||||
unsigned loss_count = 20; // some fraction of original_count
|
||||
unsigned seed = 0;
|
||||
bool multithreaded = true;
|
||||
};
|
||||
|
||||
static void BasicTest(const TestParameters& params)
|
||||
{
|
||||
static const unsigned kTrials = 4;
|
||||
|
||||
std::vector<uint8_t*> original_data(params.original_count);
|
||||
|
||||
const unsigned encode_work_count = leo_encode_work_count(params.original_count, params.recovery_count);
|
||||
const unsigned decode_work_count = leo_decode_work_count(params.original_count, params.recovery_count);
|
||||
|
||||
std::vector<uint8_t*> encode_work_data(encode_work_count);
|
||||
std::vector<uint8_t*> decode_work_data(decode_work_count);
|
||||
|
||||
FunctionTimer t_mem_alloc("memory_allocation");
|
||||
FunctionTimer t_leo_encode("leo_encode");
|
||||
FunctionTimer t_leo_decode("leo_decode");
|
||||
FunctionTimer t_mem_free("memory_free");
|
||||
|
||||
const uint64_t total_bytes = (uint64_t)params.buffer_bytes * params.original_count;
|
||||
|
||||
for (unsigned trial = 0; trial < kTrials; ++trial)
|
||||
{
|
||||
// Allocate memory:
|
||||
|
||||
t_mem_alloc.BeginCall();
|
||||
for (unsigned i = 0, count = params.original_count; i < count; ++i)
|
||||
original_data[i] = SIMDSafeAllocate(params.buffer_bytes);
|
||||
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
|
||||
encode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
|
||||
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
|
||||
decode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
|
||||
t_mem_alloc.EndCall();
|
||||
|
||||
// Generate data:
|
||||
|
||||
PCGRandom prng;
|
||||
prng.Seed(params.seed, trial);
|
||||
|
||||
for (unsigned i = 0; i < params.original_count; ++i)
|
||||
WriteRandomSelfCheckingPacket(prng, original_data[i], params.buffer_bytes);
|
||||
|
||||
// Encode:
|
||||
|
||||
t_leo_encode.BeginCall();
|
||||
LeopardResult encodeResult = leo_encode(
|
||||
params.buffer_bytes,
|
||||
params.original_count,
|
||||
params.recovery_count,
|
||||
encode_work_count,
|
||||
(void**)&original_data[0],
|
||||
(void**)&encode_work_data[0], // recovery data written here
|
||||
params.multithreaded ? LeopardFlags_Multithreaded : LeopardFlags_Defaults
|
||||
);
|
||||
t_leo_encode.EndCall();
|
||||
|
||||
if (encodeResult != Leopard_Success)
|
||||
{
|
||||
cout << "Error: Leopard encode failed with result=" << encodeResult << endl;
|
||||
LEO_DEBUG_BREAK;
|
||||
return;
|
||||
}
|
||||
|
||||
// Lose random original data:

std::vector<uint16_t> original_losses(params.original_count);
ShuffleDeck16(prng, &original_losses[0], params.original_count);

for (unsigned i = 0, count = params.loss_count; i < count; ++i)
{
    const unsigned loss_index = original_losses[i];
    // These buffers came from SIMDSafeAllocate(), so release them with
    // SIMDSafeFree() rather than delete[]
    SIMDSafeFree(original_data[loss_index]);
    original_data[loss_index] = nullptr;
}

// Lose random recovery data, keeping only loss_count recovery buffers -
// the minimum needed to recover the lost originals:

const unsigned recovery_loss_count = params.recovery_count - params.loss_count;

std::vector<uint16_t> recovery_losses(params.recovery_count);
ShuffleDeck16(prng, &recovery_losses[0], params.recovery_count);

for (unsigned i = 0, count = recovery_loss_count; i < count; ++i)
{
    const unsigned loss_index = recovery_losses[i];
    SIMDSafeFree(encode_work_data[loss_index]);
    encode_work_data[loss_index] = nullptr;
}
|
||||
|
||||
// Decode:
|
||||
|
||||
t_leo_decode.BeginCall();
|
||||
LeopardResult decodeResult = leo_decode(
|
||||
params.buffer_bytes,
|
||||
params.original_count,
|
||||
params.recovery_count,
|
||||
decode_work_count,
|
||||
(void**)&original_data[0],
|
||||
(void**)&encode_work_data[0],
|
||||
(void**)&decode_work_data[0],
|
||||
params.multithreaded ? LeopardFlags_Multithreaded : LeopardFlags_Defaults);
|
||||
t_leo_decode.EndCall();
|
||||
|
||||
if (decodeResult != Leopard_Success)
|
||||
{
|
||||
cout << "Error: Leopard decode failed with result=" << decodeResult << endl;
|
||||
LEO_DEBUG_BREAK;
|
||||
return;
|
||||
}
|
||||
|
||||
// Free memory:
|
||||
|
||||
t_mem_free.BeginCall();
|
||||
for (unsigned i = 0, count = params.original_count; i < count; ++i)
|
||||
SIMDSafeFree(original_data[i]);
|
||||
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
|
||||
SIMDSafeFree(encode_work_data[i]);
|
||||
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
|
||||
SIMDSafeFree(decode_work_data[i]);
|
||||
t_mem_free.EndCall();
|
||||
}
|
||||
|
||||
t_mem_alloc.Print(kTrials);
|
||||
t_leo_encode.Print(kTrials);
|
||||
t_leo_decode.Print(kTrials);
|
||||
t_mem_free.Print(kTrials);
|
||||
|
||||
float encode_input_MBPS = total_bytes * kTrials / (float)(t_leo_encode.TotalUsec);
|
||||
float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count * kTrials / (float)(t_leo_encode.TotalUsec);
|
||||
float decode_input_MBPS = total_bytes * kTrials / (float)(t_leo_decode.TotalUsec);
|
||||
float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count * kTrials / (float)(t_leo_decode.TotalUsec);
|
||||
|
||||
cout << "Leopard Encoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << encode_input_MBPS << " MB/s, Output=" << encode_output_MBPS << " MB/s" << endl;
|
||||
cout << "Leopard Decoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << decode_input_MBPS << " MB/s, Output=" << decode_output_MBPS << " MB/s" << endl << endl;
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Entrypoint
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
SetCurrentThreadPriority();
|
||||
|
||||
FunctionTimer t_leo_init("leo_init");
|
||||
|
||||
t_leo_init.BeginCall();
|
||||
if (0 != leo_init())
|
||||
{
|
||||
cout << "Failed to initialize" << endl;
|
||||
return -1;
|
||||
}
|
||||
t_leo_init.EndCall();
|
||||
t_leo_init.Print(1);
|
||||
|
||||
TestParameters params;
|
||||
|
||||
if (argc >= 2)
|
||||
params.original_count = atoi(argv[1]);
|
||||
if (argc >= 3)
|
||||
params.recovery_count = atoi(argv[2]);
|
||||
if (argc >= 4)
|
||||
params.buffer_bytes = atoi(argv[3]);
|
||||
if (argc >= 5)
|
||||
params.loss_count = atoi(argv[4]);
|
||||
if (argc >= 6)
|
||||
params.multithreaded = (atoi(argv[5]) != 0);
|
||||
|
||||
cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;
|
||||
|
||||
BasicTest(params);
|
||||
|
||||
getchar();
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -18,41 +18,38 @@
|
|||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\lhc_rs.cpp" />
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{32176592-2F30-4BD5-B645-EB11C8D3453E}</ProjectGuid>
|
||||
<RootNamespace>GF65536</RootNamespace>
|
||||
<ProjectName>LHC_RS</ProjectName>
|
||||
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
|
||||
<ProjectGuid>{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}</ProjectGuid>
|
||||
<RootNamespace>Fecal</RootNamespace>
|
||||
<ProjectName>LeopardBenchmark</ProjectName>
|
||||
<WindowsTargetPlatformVersion>10.0.14393.0</WindowsTargetPlatformVersion>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<PlatformToolset>v141</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
|
@ -155,8 +152,8 @@
|
|||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
|
||||
<FavorSizeOrSpeed>Size</FavorSizeOrSpeed>
|
||||
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
|
||||
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
|
||||
<OmitFramePointers>false</OmitFramePointers>
|
||||
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||
<BufferSecurityCheck>true</BufferSecurityCheck>
|
||||
|
@ -174,6 +171,14 @@
|
|||
</Command>
|
||||
</PostBuildEvent>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\benchmark.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\proj\Leopard.vcxproj">
|
||||
<Project>{32176592-2f30-4bd5-b645-eb11c8d3453e}</Project>
|
||||
</ProjectReference>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
|
@ -15,7 +15,7 @@
|
|||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\lhc_rs.cpp">
|
||||
<ClCompile Include="..\benchmark.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|