Project structure

This commit is contained in:
Christopher Taylor 2017-05-25 02:24:15 -07:00
parent 4d78561689
commit 49dbcdc8b1
20 changed files with 9742 additions and 129 deletions

957
LeopardCommon.cpp Normal file
View File

@ -0,0 +1,957 @@
/*
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Leopard-RS nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include "LeopardCommon.h"
namespace leopard {
//------------------------------------------------------------------------------
// Runtime CPU Architecture Check
//
// Feature checks stolen shamelessly from
// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c
#if defined(HAVE_ANDROID_GETCPUFEATURES)
#include <cpu-features.h>
#endif
#if defined(LEO_TRY_NEON)
# if defined(IOS) && defined(__ARM_NEON__)
// Requires iPhone 5S or newer
# else
// Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
bool CpuHasNeon = false; // V6 / V7
bool CpuHasNeon64 = false; // 64-bit
# endif
#endif
#if !defined(LEO_TARGET_MOBILE)
#ifdef _MSC_VER
#include <intrin.h> // __cpuid
#pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
#endif
#ifdef LEO_TRY_AVX2
bool CpuHasAVX2 = false;
#endif
bool CpuHasSSSE3 = false;
#define CPUID_EBX_AVX2 0x00000020
#define CPUID_ECX_SSSE3 0x00000200
// Execute the x86 CPUID instruction for leaf `cpu_info_type`.
//
// cpu_info:      receives EAX, EBX, ECX, EDX in elements 0, 1, 2, 3.
// cpu_info_type: CPUID leaf number, passed in EAX.
//
// On the GCC-style paths ECX (the sub-leaf) is set to 0 before CPUID runs.
static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type)
{
#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
// MSVC on x86/x64: use the compiler intrinsic
__cpuid((int *) cpu_info, cpu_info_type);
#else //if defined(HAVE_CPUID)
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
# ifdef __i386__
// 32-bit x86 only: probe for CPUID support by trying to flip bit 0x200000 of
// EFLAGS (the ID flag).  cpu_info[0] ends up holding the flags read back after
// the flip attempt, cpu_info[1] the original flags; the original flags are
// restored by the final popfl.
__asm__ __volatile__ ("pushfl; pushfl; "
"popl %0; "
"movl %0, %1; xorl %2, %0; "
"pushl %0; "
"popfl; pushfl; popl %0; popfl" :
"=&r" (cpu_info[0]), "=&r" (cpu_info[1]) :
"i" (0x200000));
// If the bit could not be toggled, CPUID is not available.  Note that on this
// early return cpu_info[0] and cpu_info[1] still hold the EFLAGS probe values;
// only cpu_info[2] and cpu_info[3] are zero.
if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) {
return; /* LCOV_EXCL_LINE */
}
# endif
# ifdef __i386__
// EBX is swapped with a scratch register around CPUID so that EBX itself is
// left unchanged (presumably because it can be reserved, e.g. for PIC code).
__asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" :
"=a" (cpu_info[0]), "=&r" (cpu_info[1]),
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
"0" (cpu_info_type), "2" (0U));
# elif defined(__x86_64__)
// Same EBX-preserving sequence using the 64-bit register names
__asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" :
"=a" (cpu_info[0]), "=&r" (cpu_info[1]),
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
"0" (cpu_info_type), "2" (0U));
# else
// Any other target: issue CPUID directly and read EBX as a normal output
__asm__ __volatile__ ("cpuid" :
"=a" (cpu_info[0]), "=b" (cpu_info[1]),
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
"0" (cpu_info_type), "2" (0U));
# endif
#endif
}
#endif // !defined(LEO_TARGET_MOBILE)
// Detect the SIMD features of the current CPU and record them in the
// CpuHas* flags.  Call once at startup before any code that reads those flags.
//
// Android + NEON builds: queries the NDK cpufeatures library.
// Non-mobile builds:     queries CPUID for SSSE3 (leaf 1, ECX) and, when
//                        LEO_TRY_AVX2 is defined, AVX2 (leaf 7, EBX).
void InitializeCPUArch()
{
#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES)
    AndroidCpuFamily family = android_getCpuFamily();
    if (family == ANDROID_CPU_FAMILY_ARM)
    {
        if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON)
            CpuHasNeon = true;
    }
    else if (family == ANDROID_CPU_FAMILY_ARM64)
    {
        CpuHasNeon = true;
        if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD)
            CpuHasNeon64 = true;
    }
#endif

#if !defined(LEO_TARGET_MOBILE)
    unsigned int cpu_info[4];

    // Leaf 0 reports the highest basic leaf the CPU implements in EAX.
    // Querying a leaf above that limit does not return meaningful feature
    // bits, so each feature leaf is only read when it is within range.
    _cpuid(cpu_info, 0);
    const unsigned int max_leaf = cpu_info[0];

    CpuHasSSSE3 = false;
    if (max_leaf >= 1)
    {
        _cpuid(cpu_info, 1);
        CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0);
    }

# if defined(LEO_TRY_AVX2)
    CpuHasAVX2 = false;
    if (max_leaf >= 7)
    {
        _cpuid(cpu_info, 7);
        CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0);
    }
    // NOTE(review): this checks only the CPU feature bit.  It does not verify
    // that the OS has enabled YMM state (OSXSAVE + XGETBV) -- confirm whether
    // that check is needed for the supported platforms.
# endif // LEO_TRY_AVX2
#endif // !LEO_TARGET_MOBILE
}
// vx[i] ^= vy[i] * GFExp[z] for every i in [0, symbolCount).
//
// Each source symbol is split into its four 4-bit groups; every non-zero
// group is multiplied on its own through the log/exp tables
// (GFExp[AddModQ(GFLog[group], z)]) and the partial products are XORed
// together before being folded into vx[i].
static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount)
{
    static const unsigned kNibbleMasks[4] = { 0x000f, 0x00f0, 0x0f00, 0xf000 };

    for (unsigned i = 0; i < symbolCount; ++i)
    {
        const GFSymbol source = vy[i];
        if (source != 0)
        {
            GFSymbol product = 0;
            for (unsigned m = 0; m < 4; ++m)
            {
                const GFSymbol group = static_cast<GFSymbol>(source & kNibbleMasks[m]);

                // A zero group contributes nothing to the product
                if (group != 0)
                {
                    const GFSymbol logSum = static_cast<GFSymbol>(AddModQ(GFLog[group], z));
                    product ^= GFExp[logSum];
                }
            }
            vx[i] ^= product;
        }
    }
}
// Returns a * GFExp[b] over GF(2^r).
// A zero `a` yields zero without touching the tables; otherwise the product is
// looked up as GFExp[AddModQ(GFLog[a], b)].
static GFSymbol mulE(GFSymbol a, GFSymbol b)
{
    GFSymbol product = 0;
    if (a != 0)
    {
        const GFSymbol logSum = static_cast<GFSymbol>(AddModQ(GFLog[a], b));
        product = GFExp[logSum];
    }
    return product;
}
//------------------------------------------------------------------------------
// Fast Walsh-Hadamard Transform (FWHT) Mod Q
//
// Q is the maximum symbol value, e.g. 255 or 65535.
// Define this to enable the optimized version of FWHT()
#define LEO_FWHT_OPTIMIZED
typedef GFSymbol fwht_t;
// Two-point butterfly: {a, b} <- {a + b, a - b} (Mod Q).
// Both inputs are read before either output is written.
static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b)
{
    const fwht_t lhs = a;
    const fwht_t rhs = b;
    a = AddModQ(lhs, rhs);
    b = SubModQ(lhs, rhs);
}
/*
FWHT is a minor slice of the runtime and does not grow with data size,
but I did attempt a few additional optimizations that failed:
I've attempted to vectorize (with partial reductions) FWHT_4(data, s),
which is 70% of the algorithm, but it was slower. Left in _attic_.
I've attempted to avoid reductions in all or parts of the FWHT.
The final modular reduction ends up being slower than the savings.
Specifically I tried doing it for the whole FWHT and also I tried
doing it just for the FWHT_2 loop in the main routine, but both
approaches are slower than partial reductions.
Replacing word reads with wider reads does speed up the operation, but
at too high a complexity cost relative to minor perf improvement.
*/
#ifndef LEO_FWHT_OPTIMIZED
// Reference (unoptimized) in-place FWHT over 2^bits elements.
// For each butterfly width 1, 2, 4, ... every block of 2*width elements has
// its lower half combined with its upper half via FWHT_2.
static void FWHT(fwht_t* data, const unsigned bits)
{
    const unsigned count = (unsigned)(1UL << bits);
    for (unsigned half = 1; half < count; half <<= 1)
    {
        const unsigned span = (half << 1);
        for (unsigned base = 0; base < count; base += span)
        {
            const unsigned stop = half + base;
            for (unsigned k = base; k < stop; ++k)
            {
                FWHT_2(data[k], data[k + half]);
            }
        }
    }
}
#else
// In-place 4-point FWHT on data[0..3].
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data)
{
    // Work on local copies so the butterflies operate on values, not memory
    fwht_t v[4] = { data[0], data[1], data[2], data[3] };

    // Width-1 butterflies
    FWHT_2(v[0], v[1]);
    FWHT_2(v[2], v[3]);

    // Width-2 butterflies
    FWHT_2(v[0], v[2]);
    FWHT_2(v[1], v[3]);

    for (unsigned i = 0; i < 4; ++i)
        data[i] = v[i];
}
// In-place 4-point FWHT on the strided elements
// data[0], data[s], data[2*s], data[3*s].
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s)
{
    // Element offsets, accumulated in unsigned arithmetic
    const unsigned i1 = s;
    const unsigned i2 = i1 + s;
    const unsigned i3 = i2 + s;

    fwht_t a = data[0];
    fwht_t b = data[i1];
    fwht_t c = data[i2];
    fwht_t d = data[i3];

    FWHT_2(a, b);
    FWHT_2(c, d);
    FWHT_2(a, c);
    FWHT_2(b, d);

    data[0] = a;
    data[i1] = b;
    data[i2] = c;
    data[i3] = d;
}
// In-place 8-point FWHT on data[0..7]: three butterfly stages of width
// 1, 2 and 4, applied to local copies and then written back.
static inline void FWHT_8(fwht_t* data)
{
    const unsigned kCount = 8;
    fwht_t v[8];

    for (unsigned i = 0; i < kCount; ++i)
        v[i] = data[i];

    for (unsigned width = 1; width < kCount; width <<= 1)
    {
        for (unsigned base = 0; base < kCount; base += (width << 1))
        {
            for (unsigned j = base; j < base + width; ++j)
                FWHT_2(v[j], v[j + width]);
        }
    }

    for (unsigned i = 0; i < kCount; ++i)
        data[i] = v[i];
}
// In-place 16-point FWHT on data[0..15]: four butterfly stages of width
// 1, 2, 4 and 8, applied to local copies and then written back.
static inline void FWHT_16(fwht_t* data)
{
    const unsigned kCount = 16;
    fwht_t v[16];

    for (unsigned i = 0; i < kCount; ++i)
        v[i] = data[i];

    for (unsigned width = 1; width < kCount; width <<= 1)
    {
        for (unsigned base = 0; base < kCount; base += (width << 1))
        {
            for (unsigned j = base; j < base + width; ++j)
                FWHT_2(v[j], v[j + width]);
        }
    }

    for (unsigned i = 0; i < kCount; ++i)
        data[i] = v[i];
}
// In-place FWHT over 2^ldn elements.
// Larger butterfly widths are handled two stages at a time with the strided
// FWHT_4; the final two or three stages are done on contiguous groups of 4
// (even ldn) or 8 (odd ldn) elements.
static void FWHT_SmallData(fwht_t* data, unsigned ldn)
{
    const unsigned count = (1UL << ldn);

    // One element (or none) needs no work; two elements need one butterfly
    if (count < 2)
        return;
    if (count == 2)
    {
        FWHT_2(data[0], data[1]);
        return;
    }

    // Radix-4 passes, from the widest butterflies down
    for (unsigned level = ldn; level > 3; level -= 2)
    {
        const unsigned block = (1UL << level);
        const unsigned quarter = (block >> 2);
        for (unsigned start = 0; start < count; start += block)
        {
            for (unsigned k = 0; k < quarter; ++k)
                FWHT_4(data + k + start, quarter);
        }
    }

    // Remaining stages on contiguous groups
    const bool oddLevels = ((ldn & 1) != 0);
    const unsigned group = oddLevels ? 8 : 4;
    for (unsigned pos = 0; pos < count; pos += group)
    {
        if (oddLevels)
            FWHT_8(data + pos);
        else
            FWHT_4(data + pos);
    }
}
// In-place FWHT over 2^ldn elements (decimation in time).
//
// Up to ldn == 13 the work is delegated to FWHT_SmallData().  Above that, each
// sub-range [2^m, 2^(m+1)) is transformed on its own first (fixed-size kernels
// for m = 1..4, a recursive call for m >= 5), and the ranges are then merged
// with one pass of butterflies per level.
static void FWHT(fwht_t* data, const unsigned ldn)
{
    if (ldn > 13)
    {
        // Transform the sub-ranges starting at offsets 2, 4, 8 and 16
        FWHT_2(data[2], data[3]);
        FWHT_4(&data[4]);
        FWHT_8(&data[8]);
        FWHT_16(&data[16]);

        // Transform the sub-ranges starting at offset 2^level, level >= 5
        unsigned level = 5;
        while (level < ldn)
        {
            FWHT(data + (unsigned)(1UL << level), level);
            ++level;
        }

        // Merge: combine [0, half) with [half, 2*half) for each level
        for (level = 0; level < ldn; ++level)
        {
            const unsigned half = (1UL << level);
            for (unsigned k = 0; k < half; ++k)
                FWHT_2(data[k], data[half + k]);
        }
    }
    else
    {
        FWHT_SmallData(data, ldn);
    }
}
#endif
//------------------------------------------------------------------------------
// Memory Buffer XOR
static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes)
{
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
#if defined(LEO_TARGET_MOBILE)
# if defined(LEO_TRY_NEON)
// Handle multiples of 64 bytes
if (CpuHasNeon)
{
while (bytes >= 64)
{
LEO_M128 x0 = vld1q_u8(x16);
LEO_M128 x1 = vld1q_u8(x16 + 1);
LEO_M128 x2 = vld1q_u8(x16 + 2);
LEO_M128 x3 = vld1q_u8(x16 + 3);
LEO_M128 y0 = vld1q_u8(y16);
LEO_M128 y1 = vld1q_u8(y16 + 1);
LEO_M128 y2 = vld1q_u8(y16 + 2);
LEO_M128 y3 = vld1q_u8(y16 + 3);
vst1q_u8(x16, veorq_u8(x0, y0));
vst1q_u8(x16 + 1, veorq_u8(x1, y1));
vst1q_u8(x16 + 2, veorq_u8(x2, y2));
vst1q_u8(x16 + 3, veorq_u8(x3, y3));
bytes -= 64, x16 += 4, y16 += 4;
}
// Handle multiples of 16 bytes
while (bytes >= 16)
{
LEO_M128 x0 = vld1q_u8(x16);
LEO_M128 y0 = vld1q_u8(y16);
vst1q_u8(x16, veorq_u8(x0, y0));
bytes -= 16, ++x16, ++y16;
}
}
else
# endif // LEO_TRY_NEON
{
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
const unsigned count = (unsigned)bytes / 8;
for (unsigned ii = 0; ii < count; ++ii)
x8[ii] ^= y8[ii];
x16 = reinterpret_cast<LEO_M128 *>(x8 + count);
y16 = reinterpret_cast<const LEO_M128 *>(y8 + count);
}
#else // LEO_TARGET_MOBILE
# if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(x16);
const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(y16);
while (bytes >= 128)
{
LEO_M256 x0 = _mm256_loadu_si256(x32);
LEO_M256 y0 = _mm256_loadu_si256(y32);
x0 = _mm256_xor_si256(x0, y0);
LEO_M256 x1 = _mm256_loadu_si256(x32 + 1);
LEO_M256 y1 = _mm256_loadu_si256(y32 + 1);
x1 = _mm256_xor_si256(x1, y1);
LEO_M256 x2 = _mm256_loadu_si256(x32 + 2);
LEO_M256 y2 = _mm256_loadu_si256(y32 + 2);
x2 = _mm256_xor_si256(x2, y2);
LEO_M256 x3 = _mm256_loadu_si256(x32 + 3);
LEO_M256 y3 = _mm256_loadu_si256(y32 + 3);
x3 = _mm256_xor_si256(x3, y3);
_mm256_storeu_si256(x32, x0);
_mm256_storeu_si256(x32 + 1, x1);
_mm256_storeu_si256(x32 + 2, x2);
_mm256_storeu_si256(x32 + 3, x3);
bytes -= 128, x32 += 4, y32 += 4;
}
// Handle multiples of 32 bytes
while (bytes >= 32)
{
// x[i] = x[i] xor y[i]
_mm256_storeu_si256(x32,
_mm256_xor_si256(
_mm256_loadu_si256(x32),
_mm256_loadu_si256(y32)));
bytes -= 32, ++x32, ++y32;
}
x16 = reinterpret_cast<LEO_M128 *>(x32);
y16 = reinterpret_cast<const LEO_M128 *>(y32);
}
else
# endif // LEO_TRY_AVX2
{
while (bytes >= 64)
{
LEO_M128 x0 = _mm_loadu_si128(x16);
LEO_M128 y0 = _mm_loadu_si128(y16);
x0 = _mm_xor_si128(x0, y0);
LEO_M128 x1 = _mm_loadu_si128(x16 + 1);
LEO_M128 y1 = _mm_loadu_si128(y16 + 1);
x1 = _mm_xor_si128(x1, y1);
LEO_M128 x2 = _mm_loadu_si128(x16 + 2);
LEO_M128 y2 = _mm_loadu_si128(y16 + 2);
x2 = _mm_xor_si128(x2, y2);
LEO_M128 x3 = _mm_loadu_si128(x16 + 3);
LEO_M128 y3 = _mm_loadu_si128(y16 + 3);
x3 = _mm_xor_si128(x3, y3);
_mm_storeu_si128(x16, x0);
_mm_storeu_si128(x16 + 1, x1);
_mm_storeu_si128(x16 + 2, x2);
_mm_storeu_si128(x16 + 3, x3);
bytes -= 64, x16 += 4, y16 += 4;
}
}
#endif // LEO_TARGET_MOBILE
// Handle multiples of 16 bytes
while (bytes >= 16)
{
// x[i] = x[i] xor y[i]
_mm_storeu_si128(x16,
_mm_xor_si128(
_mm_loadu_si128(x16),
_mm_loadu_si128(y16)));
bytes -= 16, ++x16, ++y16;
}
uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
// Handle a block of 8 bytes
const unsigned eight = bytes & 8;
if (eight)
{
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
*x8 ^= *y8;
}
// Handle a block of 4 bytes
const unsigned four = bytes & 4;
if (four)
{
uint32_t * LEO_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
const uint32_t * LEO_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
*x4 ^= *y4;
}
// Handle final bytes
const unsigned offset = eight + four;
switch (bytes & 3)
{
case 3: x1[offset + 2] ^= y1[offset + 2];
case 2: x1[offset + 1] ^= y1[offset + 1];
case 1: x1[offset] ^= y1[offset];
default:
break;
}
}
//------------------------------------------------------------------------------
// Formal Derivative
// Formal derivative of polynomial in the new basis, computed in place.
//
// cos:  coefficient array (modified in place).
// size: number of coefficients covered by the first pass.
//
// First pass: for each position, a run whose length is the lowest set bit of
// the position is XORed into the run immediately before it.
// Second pass: the blocks starting at size, 2*size, 4*size, ... (below
// kFieldSize) are each XORed into the first `size` coefficients.
static void formal_derivative(GFSymbol* cos, const unsigned size)
{
    for (unsigned pos = 1; pos < size; ++pos)
    {
        // Isolate the lowest set bit of pos
        const unsigned run = ((pos ^ (pos - 1)) + 1) >> 1;

        GFSymbol* target = cos + pos - run;
        const GFSymbol* source = cos + pos;

        if (run < 8)
        {
            // Short runs: plain loop
            for (unsigned k = 0; k < run; ++k)
                target[k] ^= source[k];
        }
        else
        {
            // If a large number of values are being XORed:
            xor_mem(target, source, run * sizeof(GFSymbol));
        }
    }

    for (unsigned offset = size; offset < kFieldSize; offset <<= 1)
    {
        xor_mem(cos, cos + offset, size * sizeof(GFSymbol));
    }
}
//------------------------------------------------------------------------------
// Fast Fourier Transform
// Skew table read by IFLT() and FLT() as skewVec[j + index - 1].
// Filled in by InitFieldOperations(), whose last step on this table replaces
// each entry with its GFLog value.
static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT
// IFFT in the proposed basis, computed in place.
//
// data:  array of `size` symbols.
// size:  number of symbols to transform.
// index: offset added when looking up the skew factor for each butterfly.
//
// Butterfly widths grow 1, 2, 4, ...; for each pair of adjacent halves the
// upper half is first XORed with the lower half, then the lower half
// receives upper * skew (skipped when the skew entry equals kFieldModulus).
static void IFLT(GFSymbol* data, const unsigned size, const unsigned index)
{
    for (unsigned width = 1; width < size; width <<= 1)
    {
        const unsigned stride = (width << 1);
        for (unsigned mid = width; mid < size; mid += stride)
        {
            GFSymbol* lower = data + mid - width;
            GFSymbol* upper = data + mid;

            if (width < 8)
            {
                for (unsigned k = 0; k < width; ++k)
                    upper[k] ^= lower[k];
            }
            else
            {
                // If a large number of values are being XORed:
                xor_mem(upper, lower, width * sizeof(GFSymbol));
            }

            const GFSymbol skew = skewVec[mid + index - 1];
            if (skew != kFieldModulus)
                muladd_mem(lower, upper, skew, width);
        }
    }
}
// FFT in the proposed basis, computed in place.
//
// data:  array of `size` symbols.
// size:  number of symbols to transform.
// index: offset added when looking up the skew factor for each butterfly.
//
// Butterfly widths shrink size/2, size/4, ..., 1; for each pair of adjacent
// halves the lower half first receives upper * skew (skipped when the skew
// entry equals kFieldModulus), then the upper half is XORed with the lower.
static void FLT(GFSymbol* data, const unsigned size, const unsigned index)
{
    for (unsigned width = (size >> 1); width > 0; width >>= 1)
    {
        const unsigned stride = (width << 1);
        for (unsigned mid = width; mid < size; mid += stride)
        {
            GFSymbol* lower = data + mid - width;
            GFSymbol* upper = data + mid;

            const GFSymbol skew = skewVec[mid + index - 1];
            if (skew != kFieldModulus)
                muladd_mem(lower, upper, skew, width);

            if (width < 8)
            {
                for (unsigned k = 0; k < width; ++k)
                    upper[k] ^= lower[k];
            }
            else
            {
                // If a large number of values are being XORed:
                xor_mem(upper, lower, width * sizeof(GFSymbol));
            }
        }
    }
}
//------------------------------------------------------------------------------
// FFT Initialization
// B[]: one entry per pair of codeword symbols; decode() multiplies by
// GFExp[kFieldModulus - B[i >> 1]] before formal_derivative() and by
// GFExp[B[i >> 1]] after it.  Filled in by InitFieldOperations().
static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative
// log_walsh[]: FWHT of the GFLog table with entry 0 forced to 0, computed
// once by InitFieldOperations() and read by decode().
static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial
// Initialize skewVec[], B[], log_walsh[].
//
// Reads GFLog/GFExp (via mulE), so the field tables must have been filled in
// before this is called.
static void InitFieldOperations()
{
    // temp[i] starts out as the single-bit value 1 << (i + 1)
    GFSymbol temp[kGFBits - 1];

    for (unsigned i = 1; i < kGFBits; ++i)
        temp[i - 1] = (GFSymbol)((unsigned)1 << i);

    // Build skewVec[] one level (m) at a time, updating temp[] as we go
    for (unsigned m = 0; m < (kGFBits - 1); ++m)
    {
        const unsigned step = (unsigned)1 << (m + 1);

        skewVec[((unsigned)1 << m) - 1] = 0;

        for (unsigned i = m; i < (kGFBits - 1); ++i)
        {
            const unsigned s = ((unsigned)1 << (i + 1));

            for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step)
                skewVec[j + s] = skewVec[j] ^ temp[i];
        }

        temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])];

        for (unsigned i = m + 1; i < (kGFBits - 1); ++i)
            temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus);
    }

    // Convert every skew entry to its logarithm.
    // skewVec[] is declared with kFieldModulus elements, so the loop is bounded
    // by kFieldModulus; iterating up to kFieldSize would touch one element past
    // the end of the array whenever kFieldSize is larger than kFieldModulus.
    for (unsigned i = 0; i < kFieldModulus; ++i)
        skewVec[i] = GFLog[skewVec[i]];

    // Turn temp[] into the running values used to fill B[]
    temp[0] = kFieldModulus - temp[0];

    for (unsigned i = 1; i < (kGFBits - 1); ++i)
        temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus;

    // B[]: each pass doubles the filled prefix, B[j + depart] = B[j] + temp[i]
    B[0] = 0;
    for (unsigned i = 0; i < (kGFBits - 1); ++i)
    {
        const unsigned depart = ((unsigned)1 << i);

        for (unsigned j = 0; j < depart; ++j)
            B[j + depart] = (B[j] + temp[i]) % kFieldModulus;
    }

    // log_walsh[] = FWHT of the log table, with entry 0 forced to 0
    for (unsigned i = 0; i < kFieldSize; ++i)
        log_walsh[i] = GFLog[i];
    log_walsh[0] = 0;

    FWHT(log_walsh, kGFBits);
}
//------------------------------------------------------------------------------
// Encoder
// Encoding alg for k/n<0.5: message is a power of two.
//
// data:     k message symbols.
// k:        message length in symbols.
// codeword: output buffer of kFieldSize symbols; the first k symbols end up
//           as a copy of the message.
//
// NOTE(review): a comment in test() reports that this routine does not work
// with any input yet.
static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword)
{
    // Inverse-transform the message in the first block of the codeword
    memcpy(codeword, data, sizeof(GFSymbol) * k);
    IFLT(codeword, k, 0);

    // Every further block of k symbols is the forward transform of that
    // result, evaluated at the block's own offset
    unsigned offset = k;
    while (offset < kFieldSize)
    {
        GFSymbol* block = &codeword[offset];
        memcpy(block, codeword, sizeof(GFSymbol) * k);
        FLT(block, k, offset);
        offset += k;
    }

    // Put the original message back into the first block
    memcpy(codeword, data, sizeof(GFSymbol) * k);
}
// Encoding alg for k/n>0.5: parity is a power of two.
//
// data:   message array (k symbols).
// k:      message length in symbols.
// parity: output parity array, kFieldSize - k symbols.
// mem:    scratch buffer (size >= n-k).
static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem)
{
    const unsigned parityCount = kFieldSize - k;

    // Start from an all-zero accumulator
    memset(parity, 0, sizeof(GFSymbol) * parityCount);

    // Inverse-transform each message block at its own offset and accumulate
    unsigned offset = parityCount;
    while (offset < kFieldSize)
    {
        memcpy(mem, &data[offset - parityCount], sizeof(GFSymbol) * parityCount);
        IFLT(mem, parityCount, offset);
        xor_mem(parity, mem, parityCount * sizeof(GFSymbol));
        offset += parityCount;
    }

    // Forward-transform the accumulated sum to obtain the parity symbols
    FLT(parity, parityCount, 0);
}
//------------------------------------------------------------------------------
// Decoder
// Erasure decoder: recovers the erased symbols of `codeword` in place.
//
// codeword: kFieldSize symbols; erased positions are overwritten with the
//           recovered values.
// k:        message length; currently not used by the body.
// erasure:  kFieldSize flags, true where the symbol was erased.
static void decode(GFSymbol* codeword, unsigned k, const bool* erasure)
{
    fwht_t locator[kFieldSize];

    // Compute the evaluations of the error locator polynomial
    for (unsigned i = 0; i < kFieldSize; ++i)
        locator[i] = erasure[i] ? 1 : 0;

    FWHT(locator, kGFBits);

    for (unsigned i = 0; i < kFieldSize; ++i)
    {
        const unsigned scaled = (unsigned)locator[i] * (unsigned)log_walsh[i];
        locator[i] = scaled % kFieldModulus;
    }

    FWHT(locator, kGFBits);

    // The whole field is processed.  The original note says k2 could be
    // replaced with k, but a follow-up note records that doing so does not
    // work as-is.
    const unsigned k2 = kFieldSize;

    // Zero the erased symbols and scale the received ones by the locator
    for (unsigned i = 0; i < kFieldSize; ++i)
        codeword[i] = erasure[i] ? 0 : mulE(codeword[i], locator[i]);

    IFLT(codeword, kFieldSize, 0);

    // formal derivative: scale each pair down, differentiate, scale back up
    for (unsigned i = 0; i < kFieldSize; i += 2)
    {
        const GFSymbol factor = kFieldModulus - B[i >> 1];
        codeword[i] = mulE(codeword[i], factor);
        codeword[i + 1] = mulE(codeword[i + 1], factor);
    }

    formal_derivative(codeword, k2);

    for (unsigned i = 0; i < k2; i += 2)
    {
        const GFSymbol factor = B[i >> 1];
        codeword[i] = mulE(codeword[i], factor);
        codeword[i + 1] = mulE(codeword[i + 1], factor);
    }

    FLT(codeword, k2, 0);

    // Only the erased positions receive the final correction
    for (unsigned i = 0; i < k2; ++i)
    {
        if (!erasure[i])
            continue;
        codeword[i] = mulE(codeword[i], kFieldModulus - locator[i]);
    }
}
//------------------------------------------------------------------------------
// Test Application
// Round-trip self test: encode a random message, erase kFieldSize - k random
// symbols, decode, and compare the recovered symbols with the originals.
//
// k:    message size in symbols.
// seed: seed passed to srand(); it is printed if the check fails.
void test(unsigned k, unsigned seed)
{
    srand(seed);

    //-----------Generating message----------

    // Message array
    GFSymbol data[kFieldSize] = {0};

    // Filled with random numbers
    for (unsigned i = kFieldSize - k; i < kFieldSize; ++i)
        data[i] = (GFSymbol)rand();

    //---------encoding----------

    // The message lives in the last k symbols of data[]; the parity is written
    // to the front of data[], with codeword[] serving as scratch space.
    GFSymbol codeword[kFieldSize];
    encodeH(&data[kFieldSize - k], k, data, codeword);
    //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change?

    memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize);

    //--------erasure simulation---------

    // Array indicating erasures
    bool erasure[kFieldSize] = {
        false
    };

    for (unsigned i = k; i < kFieldSize; ++i)
        erasure[i] = true;

    // permuting the erasure array
    for (unsigned i = kFieldSize - 1; i > 0; --i)
    {
        unsigned pos = rand() % (i + 1);

        if (i != pos)
        {
            bool tmp = erasure[i];
            erasure[i] = erasure[pos];
            erasure[pos] = tmp;
        }
    }

    // erasure codeword symbols
    for (unsigned i = 0; i < kFieldSize; ++i)
        if (erasure[i])
            codeword[i] = 0;

    //---------main processing----------
    decode(codeword, k, erasure);

    // Check the correctness of the result
    for (unsigned i = 0; i < kFieldSize; ++i)
    {
        if (erasure[i] == 1)
        {
            if (data[i] != codeword[i])
            {
                // seed is unsigned, so it is printed with %u
                printf("Decoding Error with seed = %u!\n", seed);
                LEO_DEBUG_BREAK;
                return;
            }
        }
    }

    //printf("Decoding is successful!\n");
}
//------------------------------------------------------------------------------
// Entrypoint
int main(int argc, char **argv)
{
// Initialize architecture-specific code
leo_architecture_init();
// Fill GFLog table and GFExp table
InitField();
// Compute factors used in erasure decoder
InitFieldOperations();
unsigned seed = (unsigned)time(NULL);
for (;;)
{
// test(int k), k: message size
/*
EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc,
s.t. the number of recovery pieces is a power of two
*/
test(kFieldSize / 2, seed);
++seed;
}
return 0;
}
} // namespace leopard

194
LeopardCommon.h Normal file
View File

@ -0,0 +1,194 @@
/*
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Leopard-RS nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
/*
TODO:
+ Refactor software
+ I think it should be split up into several C++ modules
+ Replace GFSymbol with a file data pointer
+ New 16-bit Muladd inner loops
+ Class to contain the (large) muladd tables
+ Preliminary benchmarks for large data!
+ New 8-bit Muladd inner loops
+ Benchmarks for smaller data!
+ Write detailed comments for all the routines
+ Look into getting EncodeL working so we can support smaller data (Ask Lin)
+ Look into using k instead of k2 to speed up decoder (Ask Lin)
+ Avoid performing FFT/IFFT intermediate calculations we're not going to use
+ Benchmarks, fun!
+ Add multi-threading to split up long parallelizable calculations
+ Final benchmarks!
+ Finish up documentation
+ Release version 1
Muladd implementation notes:
Specialize for 1-3 rows at a time since often times we're multiplying by
the same (skew) value repeatedly, as the ISA-L library does here:
https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258
Except we should be doing it for 16-bit Galois Field.
To implement that use the ALTMAP trick from Jerasure:
http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140
Except we should also support AVX2 since that is a 40% perf boost, so put
the high and low bytes 32 bytes instead of 16 bytes apart.
Also I think we should go ahead and precompute the multiply tables since
it avoids a bunch of memory lookups for each muladd, and only costs 8 MB.
*/
#include <stdint.h>
//------------------------------------------------------------------------------
// Debug
// Some bugs only repro in release mode, so this can be helpful
//#define LEO_DEBUG_IN_RELEASE
// Debug builds (_DEBUG, DEBUG, or LEO_DEBUG_IN_RELEASE defined):
//   LEO_DEBUG is defined,
//   LEO_DEBUG_BREAK stops execution via the compiler's break/trap intrinsic,
//   LEO_DEBUG_ASSERT(cond) triggers LEO_DEBUG_BREAK when cond is false.
// All other builds: both macros expand to an empty statement and `cond` is
// not evaluated.
// NOTE(review): because the release forms expand to a bare `;`, writing
// `if (x) LEO_DEBUG_BREAK; else ...` yields two statements before the `else`
// and will not compile in release builds.
#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE)
#define LEO_DEBUG
#ifdef _WIN32
// Selected by _WIN32: MSVC-style breakpoint intrinsic
#define LEO_DEBUG_BREAK __debugbreak()
#else
// Everything else: GCC/Clang trap builtin
#define LEO_DEBUG_BREAK __builtin_trap()
#endif
#define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } }
#else
#define LEO_DEBUG_BREAK ;
#define LEO_DEBUG_ASSERT(cond) ;
#endif
//------------------------------------------------------------------------------
// Platform/Architecture
#if defined(ANDROID) || defined(IOS)
#define LEO_TARGET_MOBILE
#endif // ANDROID
#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900)
#define LEO_TRY_AVX2 /* 256-bit */
#include <immintrin.h>
#define LEO_ALIGN_BYTES 32
#else // __AVX2__
#define LEO_ALIGN_BYTES 16
#endif // __AVX2__
#if !defined(LEO_TARGET_MOBILE)
// Note: MSVC currently only supports SSSE3 but not AVX2
#include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8
#include <emmintrin.h> // SSE2
#endif // LEO_TARGET_MOBILE
#if defined(HAVE_ARM_NEON_H)
#include <arm_neon.h>
#endif // HAVE_ARM_NEON_H
#if defined(LEO_TARGET_MOBILE)
#define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */
# if defined(HAVE_ARM_NEON_H)
// Compiler-specific 128-bit SIMD register keyword
#define LEO_M128 uint8x16_t
#define LEO_TRY_NEON
#else
#define LEO_M128 uint64_t
# endif
#else // LEO_TARGET_MOBILE
// Compiler-specific 128-bit SIMD register keyword
#define LEO_M128 __m128i
#endif // LEO_TARGET_MOBILE
#ifdef LEO_TRY_AVX2
// Compiler-specific 256-bit SIMD register keyword
#define LEO_M256 __m256i
#endif
// Compiler-specific C++11 restrict keyword
#define LEO_RESTRICT __restrict
// Compiler-specific force inline keyword
#ifdef _MSC_VER
#define LEO_FORCE_INLINE inline __forceinline
#else
#define LEO_FORCE_INLINE inline __attribute__((always_inline))
#endif
// Compiler-specific alignment keyword
// Note: Alignment only matters for ARM NEON where it should be 16
#ifdef _MSC_VER
#define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES))
#else // _MSC_VER
#define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES)))
#endif // _MSC_VER
namespace leopard {
//------------------------------------------------------------------------------
// Runtime CPU Architecture Check
// Initialize CPU architecture flags
void InitializeCPUArch();
#if defined(LEO_TRY_NEON)
# if defined(IOS) && defined(__ARM_NEON__)
// Does device support NEON?
static const bool CpuHasNeon = true;
static const bool CpuHasNeon64 = true;
# else
// Does device support NEON?
// Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
extern bool CpuHasNeon; // V6 / V7
extern bool CpuHasNeon64; // 64-bit
# endif
#endif
#if !defined(LEO_TARGET_MOBILE)
# if defined(LEO_TRY_AVX2)
// Does CPU support AVX2?
extern bool CpuHasAVX2;
# endif
// Does CPU support SSSE3?
extern bool CpuHasSSSE3;
#endif // LEO_TARGET_MOBILE
} // namespace leopard

View File

@ -1,8 +1,29 @@
/*
S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
"Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
http://ct.ee.ntust.edu.tw/it2016-2.pdf
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of LHC-RS nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include <string.h>
@ -23,7 +44,7 @@
+ New 8-bit Muladd inner loops
+ Benchmarks for smaller data!
+ Refactor software
+ Pick a name for the software better than LHC_RS
+ Pick a name for the software better than LEO_RS
+ I think it should be split up into several C++ modules
+ Write detailed comments for all the routines
+ Look into getting EncodeL working so we can support smaller data (Ask Lin)
@ -60,19 +81,19 @@
// Debug
// Some bugs only repro in release mode, so this can be helpful
//#define LHC_DEBUG_IN_RELEASE
//#define LEO_DEBUG_IN_RELEASE
#if defined(_DEBUG) || defined(DEBUG) || defined(LHC_DEBUG_IN_RELEASE)
#define LHC_DEBUG
#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE)
#define LEO_DEBUG
#ifdef _WIN32
#define LHC_DEBUG_BREAK __debugbreak()
#define LEO_DEBUG_BREAK __debugbreak()
#else
#define LHC_DEBUG_BREAK __builtin_trap()
#define LEO_DEBUG_BREAK __builtin_trap()
#endif
#define LHC_DEBUG_ASSERT(cond) { if (!(cond)) { LHC_DEBUG_BREAK; } }
#define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } }
#else
#define LHC_DEBUG_BREAK ;
#define LHC_DEBUG_ASSERT(cond) ;
#define LEO_DEBUG_BREAK ;
#define LEO_DEBUG_ASSERT(cond) ;
#endif
@ -80,67 +101,67 @@
// Platform/Architecture
#if defined(ANDROID) || defined(IOS)
#define LHC_TARGET_MOBILE
#define LEO_TARGET_MOBILE
#endif // ANDROID
#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900)
#define LHC_TRY_AVX2 /* 256-bit */
#define LEO_TRY_AVX2 /* 256-bit */
#include <immintrin.h>
#define LHC_ALIGN_BYTES 32
#define LEO_ALIGN_BYTES 32
#else // __AVX2__
#define LHC_ALIGN_BYTES 16
#define LEO_ALIGN_BYTES 16
#endif // __AVX2__
#if !defined(LHC_TARGET_MOBILE)
#if !defined(LEO_TARGET_MOBILE)
// Note: MSVC currently only supports SSSE3 but not AVX2
#include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8
#include <emmintrin.h> // SSE2
#endif // LHC_TARGET_MOBILE
#endif // LEO_TARGET_MOBILE
#if defined(HAVE_ARM_NEON_H)
#include <arm_neon.h>
#endif // HAVE_ARM_NEON_H
#if defined(LHC_TARGET_MOBILE)
#if defined(LEO_TARGET_MOBILE)
#define LHC_ALIGNED_ACCESSES /* Inputs must be aligned to LHC_ALIGN_BYTES */
#define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */
# if defined(HAVE_ARM_NEON_H)
// Compiler-specific 128-bit SIMD register keyword
#define LHC_M128 uint8x16_t
#define LHC_TRY_NEON
#define LEO_M128 uint8x16_t
#define LEO_TRY_NEON
#else
#define LHC_M128 uint64_t
#define LEO_M128 uint64_t
# endif
#else // LHC_TARGET_MOBILE
#else // LEO_TARGET_MOBILE
// Compiler-specific 128-bit SIMD register keyword
#define LHC_M128 __m128i
#define LEO_M128 __m128i
#endif // LHC_TARGET_MOBILE
#endif // LEO_TARGET_MOBILE
#ifdef LHC_TRY_AVX2
#ifdef LEO_TRY_AVX2
// Compiler-specific 256-bit SIMD register keyword
#define LHC_M256 __m256i
#define LEO_M256 __m256i
#endif
// Compiler-specific C++11 restrict keyword
#define LHC_RESTRICT __restrict
#define LEO_RESTRICT __restrict
// Compiler-specific force inline keyword
#ifdef _MSC_VER
#define LHC_FORCE_INLINE inline __forceinline
#define LEO_FORCE_INLINE inline __forceinline
#else
#define LHC_FORCE_INLINE inline __attribute__((always_inline))
#define LEO_FORCE_INLINE inline __attribute__((always_inline))
#endif
// Compiler-specific alignment keyword
// Note: Alignment only matters for ARM NEON where it should be 16
#ifdef _MSC_VER
#define LHC_ALIGNED __declspec(align(LHC_ALIGN_BYTES))
#define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES))
#else // _MSC_VER
#define LHC_ALIGNED __attribute__((aligned(LHC_ALIGN_BYTES)))
#define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES)))
#endif // _MSC_VER
@ -154,7 +175,7 @@
#include <cpu-features.h>
#endif
#if defined(LHC_TRY_NEON)
#if defined(LEO_TRY_NEON)
# if defined(IOS) && defined(__ARM_NEON__)
// Requires iPhone 5S or newer
static const bool CpuHasNeon = true;
@ -167,14 +188,14 @@
#endif
#if !defined(LHC_TARGET_MOBILE)
#if !defined(LEO_TARGET_MOBILE)
#ifdef _MSC_VER
#include <intrin.h> // __cpuid
#pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
#endif
#ifdef LHC_TRY_AVX2
#ifdef LEO_TRY_AVX2
static bool CpuHasAVX2 = false;
#endif
static bool CpuHasSSSE3 = false;
@ -219,12 +240,12 @@ static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type)
#endif
}
#endif // defined(LHC_TARGET_MOBILE)
#endif // defined(LEO_TARGET_MOBILE)
static void lhc_architecture_init()
static void leo_architecture_init()
{
#if defined(LHC_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES)
#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES)
AndroidCpuFamily family = android_getCpuFamily();
if (family == ANDROID_CPU_FAMILY_ARM)
{
@ -239,32 +260,32 @@ static void lhc_architecture_init()
}
#endif
#if !defined(LHC_TARGET_MOBILE)
#if !defined(LEO_TARGET_MOBILE)
unsigned int cpu_info[4];
_cpuid(cpu_info, 1);
CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0);
#if defined(LHC_TRY_AVX2)
#if defined(LEO_TRY_AVX2)
_cpuid(cpu_info, 7);
CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0);
#endif // LHC_TRY_AVX2
#endif // LEO_TRY_AVX2
#endif // LHC_TARGET_MOBILE
#endif // LEO_TARGET_MOBILE
}
//------------------------------------------------------------------------------
// SIMD-Safe Aligned Memory Allocations
static const unsigned kAlignmentBytes = LHC_ALIGN_BYTES;
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
LHC_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
{
return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
}
static LHC_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
{
uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
if (!data)
@ -275,7 +296,7 @@ static LHC_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
return data;
}
static LHC_FORCE_INLINE void SIMDSafeFree(void* ptr)
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
{
if (!ptr)
return;
@ -283,7 +304,7 @@ static LHC_FORCE_INLINE void SIMDSafeFree(void* ptr)
unsigned offset = data[-1];
if (offset >= kAlignmentBytes)
{
LHC_DEBUG_BREAK; // Should never happen
LEO_DEBUG_BREAK; // Should never happen
return;
}
data -= kAlignmentBytes - offset;
@ -294,9 +315,9 @@ static LHC_FORCE_INLINE void SIMDSafeFree(void* ptr)
//------------------------------------------------------------------------------
// Field
//#define LHC_SHORT_FIELD
//#define LEO_SHORT_FIELD
#ifdef LHC_SHORT_FIELD
#ifdef LEO_SHORT_FIELD
typedef uint8_t GFSymbol;
static const unsigned kGFBits = 8;
static const unsigned kGFPolynomial = 0x11D;
@ -386,7 +407,7 @@ static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b)
}
// vx[] += vy[] * z
static void muladd_mem(GFSymbol * LHC_RESTRICT vx, const GFSymbol * LHC_RESTRICT vy, GFSymbol z, unsigned symbolCount)
static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount)
{
for (unsigned i = 0; i < symbolCount; ++i)
{
@ -443,12 +464,12 @@ static GFSymbol mulE(GFSymbol a, GFSymbol b)
// Q is the maximum symbol value, e.g. 255 or 65535.
// Define this to enable the optimized version of FWHT()
#define LHC_FWHT_OPTIMIZED
#define LEO_FWHT_OPTIMIZED
typedef GFSymbol fwht_t;
// {a, b} = {a + b, a - b} (Mod Q)
static LHC_FORCE_INLINE void FWHT_2(fwht_t& LHC_RESTRICT a, fwht_t& LHC_RESTRICT b)
static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b)
{
const fwht_t sum = AddModQ(a, b);
const fwht_t dif = SubModQ(a, b);
@ -473,7 +494,7 @@ static LHC_FORCE_INLINE void FWHT_2(fwht_t& LHC_RESTRICT a, fwht_t& LHC_RESTRICT
at too high a complexity cost relative to minor perf improvement.
*/
#ifndef LHC_FWHT_OPTIMIZED
#ifndef LEO_FWHT_OPTIMIZED
// Reference implementation
static void FWHT(fwht_t* data, const unsigned bits)
@ -487,7 +508,7 @@ static void FWHT(fwht_t* data, const unsigned bits)
#else
static LHC_FORCE_INLINE void FWHT_4(fwht_t* data)
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data)
{
fwht_t t0 = data[0];
fwht_t t1 = data[1];
@ -503,7 +524,7 @@ static LHC_FORCE_INLINE void FWHT_4(fwht_t* data)
data[3] = t3;
}
static LHC_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s)
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s)
{
unsigned x = 0;
fwht_t t0 = data[x]; x += s;
@ -683,26 +704,26 @@ static void FWHT(fwht_t* data, const unsigned ldn)
//------------------------------------------------------------------------------
// Memory Buffer XOR
static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsigned bytes)
static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes)
{
LHC_M128 * LHC_RESTRICT x16 = reinterpret_cast<LHC_M128 *>(vx);
const LHC_M128 * LHC_RESTRICT y16 = reinterpret_cast<const LHC_M128 *>(vy);
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
#if defined(LHC_TARGET_MOBILE)
# if defined(LHC_TRY_NEON)
#if defined(LEO_TARGET_MOBILE)
# if defined(LEO_TRY_NEON)
// Handle multiples of 64 bytes
if (CpuHasNeon)
{
while (bytes >= 64)
{
LHC_M128 x0 = vld1q_u8(x16);
LHC_M128 x1 = vld1q_u8(x16 + 1);
LHC_M128 x2 = vld1q_u8(x16 + 2);
LHC_M128 x3 = vld1q_u8(x16 + 3);
LHC_M128 y0 = vld1q_u8(y16);
LHC_M128 y1 = vld1q_u8(y16 + 1);
LHC_M128 y2 = vld1q_u8(y16 + 2);
LHC_M128 y3 = vld1q_u8(y16 + 3);
LEO_M128 x0 = vld1q_u8(x16);
LEO_M128 x1 = vld1q_u8(x16 + 1);
LEO_M128 x2 = vld1q_u8(x16 + 2);
LEO_M128 x3 = vld1q_u8(x16 + 3);
LEO_M128 y0 = vld1q_u8(y16);
LEO_M128 y1 = vld1q_u8(y16 + 1);
LEO_M128 y2 = vld1q_u8(y16 + 2);
LEO_M128 y3 = vld1q_u8(y16 + 3);
vst1q_u8(x16, veorq_u8(x0, y0));
vst1q_u8(x16 + 1, veorq_u8(x1, y1));
@ -715,8 +736,8 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
// Handle multiples of 16 bytes
while (bytes >= 16)
{
LHC_M128 x0 = vld1q_u8(x16);
LHC_M128 y0 = vld1q_u8(y16);
LEO_M128 x0 = vld1q_u8(x16);
LEO_M128 y0 = vld1q_u8(y16);
vst1q_u8(x16, veorq_u8(x0, y0));
@ -724,38 +745,38 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
}
}
else
# endif // LHC_TRY_NEON
# endif // LEO_TRY_NEON
{
uint64_t * LHC_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
const uint64_t * LHC_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
const unsigned count = (unsigned)bytes / 8;
for (unsigned ii = 0; ii < count; ++ii)
x8[ii] ^= y8[ii];
x16 = reinterpret_cast<LHC_M128 *>(x8 + count);
y16 = reinterpret_cast<const LHC_M128 *>(y8 + count);
x16 = reinterpret_cast<LEO_M128 *>(x8 + count);
y16 = reinterpret_cast<const LEO_M128 *>(y8 + count);
}
#else // LHC_TARGET_MOBILE
# if defined(LHC_TRY_AVX2)
#else // LEO_TARGET_MOBILE
# if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
LHC_M256 * LHC_RESTRICT x32 = reinterpret_cast<LHC_M256 *>(x16);
const LHC_M256 * LHC_RESTRICT y32 = reinterpret_cast<const LHC_M256 *>(y16);
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(x16);
const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(y16);
while (bytes >= 128)
{
LHC_M256 x0 = _mm256_loadu_si256(x32);
LHC_M256 y0 = _mm256_loadu_si256(y32);
LEO_M256 x0 = _mm256_loadu_si256(x32);
LEO_M256 y0 = _mm256_loadu_si256(y32);
x0 = _mm256_xor_si256(x0, y0);
LHC_M256 x1 = _mm256_loadu_si256(x32 + 1);
LHC_M256 y1 = _mm256_loadu_si256(y32 + 1);
LEO_M256 x1 = _mm256_loadu_si256(x32 + 1);
LEO_M256 y1 = _mm256_loadu_si256(y32 + 1);
x1 = _mm256_xor_si256(x1, y1);
LHC_M256 x2 = _mm256_loadu_si256(x32 + 2);
LHC_M256 y2 = _mm256_loadu_si256(y32 + 2);
LEO_M256 x2 = _mm256_loadu_si256(x32 + 2);
LEO_M256 y2 = _mm256_loadu_si256(y32 + 2);
x2 = _mm256_xor_si256(x2, y2);
LHC_M256 x3 = _mm256_loadu_si256(x32 + 3);
LHC_M256 y3 = _mm256_loadu_si256(y32 + 3);
LEO_M256 x3 = _mm256_loadu_si256(x32 + 3);
LEO_M256 y3 = _mm256_loadu_si256(y32 + 3);
x3 = _mm256_xor_si256(x3, y3);
_mm256_storeu_si256(x32, x0);
@ -778,25 +799,25 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
bytes -= 32, ++x32, ++y32;
}
x16 = reinterpret_cast<LHC_M128 *>(x32);
y16 = reinterpret_cast<const LHC_M128 *>(y32);
x16 = reinterpret_cast<LEO_M128 *>(x32);
y16 = reinterpret_cast<const LEO_M128 *>(y32);
}
else
# endif // LHC_TRY_AVX2
# endif // LEO_TRY_AVX2
{
while (bytes >= 64)
{
LHC_M128 x0 = _mm_loadu_si128(x16);
LHC_M128 y0 = _mm_loadu_si128(y16);
LEO_M128 x0 = _mm_loadu_si128(x16);
LEO_M128 y0 = _mm_loadu_si128(y16);
x0 = _mm_xor_si128(x0, y0);
LHC_M128 x1 = _mm_loadu_si128(x16 + 1);
LHC_M128 y1 = _mm_loadu_si128(y16 + 1);
LEO_M128 x1 = _mm_loadu_si128(x16 + 1);
LEO_M128 y1 = _mm_loadu_si128(y16 + 1);
x1 = _mm_xor_si128(x1, y1);
LHC_M128 x2 = _mm_loadu_si128(x16 + 2);
LHC_M128 y2 = _mm_loadu_si128(y16 + 2);
LEO_M128 x2 = _mm_loadu_si128(x16 + 2);
LEO_M128 y2 = _mm_loadu_si128(y16 + 2);
x2 = _mm_xor_si128(x2, y2);
LHC_M128 x3 = _mm_loadu_si128(x16 + 3);
LHC_M128 y3 = _mm_loadu_si128(y16 + 3);
LEO_M128 x3 = _mm_loadu_si128(x16 + 3);
LEO_M128 y3 = _mm_loadu_si128(y16 + 3);
x3 = _mm_xor_si128(x3, y3);
_mm_storeu_si128(x16, x0);
@ -807,7 +828,7 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
bytes -= 64, x16 += 4, y16 += 4;
}
}
#endif // LHC_TARGET_MOBILE
#endif // LEO_TARGET_MOBILE
// Handle multiples of 16 bytes
while (bytes >= 16)
@ -821,15 +842,15 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
bytes -= 16, ++x16, ++y16;
}
uint8_t * LHC_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
const uint8_t * LHC_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
// Handle a block of 8 bytes
const unsigned eight = bytes & 8;
if (eight)
{
uint64_t * LHC_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
const uint64_t * LHC_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
*x8 ^= *y8;
}
@ -837,8 +858,8 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign
const unsigned four = bytes & 4;
if (four)
{
uint32_t * LHC_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
const uint32_t * LHC_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
uint32_t * LEO_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
const uint32_t * LEO_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
*x4 ^= *y4;
}
@ -1158,7 +1179,7 @@ void test(unsigned k, unsigned seed)
if (data[i] != codeword[i])
{
printf("Decoding Error with seed = %d!\n", seed);
LHC_DEBUG_BREAK;
LEO_DEBUG_BREAK;
return;
}
}
@ -1174,7 +1195,7 @@ void test(unsigned k, unsigned seed)
int main(int argc, char **argv)
{
// Initialize architecture-specific code
lhc_architecture_init();
leo_architecture_init();
// Fill GFLog table and GFExp table
InitField();

1220
LeopardDecoder.h Normal file

File diff suppressed because it is too large Load Diff

1220
LeopardEncoder.cpp Normal file

File diff suppressed because it is too large Load Diff

1220
LeopardEncoder.h Normal file

File diff suppressed because it is too large Load Diff

1220
LeopardFF16.cpp Normal file

File diff suppressed because it is too large Load Diff

1220
LeopardFF16.h Normal file

File diff suppressed because it is too large Load Diff

840
LeopardFF8.cpp Normal file
View File

@ -0,0 +1,840 @@
/*
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of LHC-RS nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include "LeopardFF8.h"
namespace leopard { namespace ff8 {
//------------------------------------------------------------------------------
// Datatypes and Constants
// LFSR polynomial that generates the field elements:
// 0x11D = x^8 + x^4 + x^3 + x^2 + 1.
// InitializeLogarithmTables() XORs it into the shift register state whenever
// the shifted state reaches kOrder.
static const unsigned kPolynomial = 0x11D;
// Basis used for generating logarithm tables.
// kBasis[i] is the element XORed in for bit i of the table index when
// InitializeLogarithmTables() converts the tables to this basis.
static const ffe_t kBasis[kBits] = {
1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis
// 1, 2, 4, 8, 16, 32, 64, 128 // Monomial basis (alternative, disabled)
};
//------------------------------------------------------------------------------
// Field Operations
// Modulus for field operations: AddMod()/SubMod() work modulo this value.
// InitializeLogarithmTables() also stores it as the logarithm of zero
// (LogLUT[0] ends up equal to kModulus).
static const ffe_t kModulus = 255;
// Returns a + b (mod kModulus), only partially reduced:
// the result may be kModulus itself rather than 0.
static inline ffe_t AddMod(const ffe_t a, const ffe_t b)
{
    const unsigned total = static_cast<unsigned>(a) + b;

    // Fold the carry out of the low kBits back into the sum
    const unsigned carry = total >> kBits;

    return static_cast<ffe_t>(total + carry);
}
// Returns a - b (mod kModulus), only partially reduced:
// the result may be kModulus itself rather than 0.
static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
{
    // Wraps around as an unsigned value when a < b
    const unsigned delta = static_cast<unsigned>(a) - b;

    // After a wrap-around the bits above kBits are all set, so adding them
    // back in adjusts the truncated result for the borrow
    const unsigned borrow = delta >> kBits;

    return static_cast<ffe_t>(delta + borrow);
}
//------------------------------------------------------------------------------
// Logarithm Tables
// LogLUT[x]: logarithm of field element x (LogLUT[0] == kModulus).
// ExpLUT[i]: field element whose logarithm is i.  ExpLUT[kModulus] mirrors
// ExpLUT[0], so a partially reduced sum from AddMod() can be used as an index.
// Both tables are filled in by InitializeLogarithmTables().
static ffe_t LogLUT[kOrder];
static ffe_t ExpLUT[kOrder];
// Initialize LogLUT[], ExpLUT[]
static void InitializeLogarithmTables()
{
// LFSR table generation:
unsigned state = 1;
for (unsigned i = 0; i < kModulus; ++i)
{
ExpLUT[state] = static_cast<ffe_t>(i);
state <<= 1;
if (state >= kOrder)
state ^= kPolynomial;
}
ExpLUT[0] = kModulus;
// Conversion to chosen basis:
LogLUT[0] = 0;
for (unsigned i = 0; i < kBits; ++i)
{
const ffe_t basis = kBasis[i];
const unsigned width = static_cast<unsigned>(1UL << i);
for (unsigned j = 0; j < width; ++j)
LogLUT[j + width] = LogLUT[j] ^ basis;
}
for (unsigned i = 0; i < kOrder; ++i)
LogLUT[i] = ExpLUT[LogLUT[i]];
for (unsigned i = 0; i < kOrder; ++i)
ExpLUT[LogLUT[i]] = i;
ExpLUT[kModulus] = ExpLUT[0];
}
//------------------------------------------------------------------------------
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
#if defined(LEO_FF8_FWHT_OPTIMIZED)
// {a, b} = {a + b, a - b} (Mod Q)
static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
{
const ffe_t sum = AddMod(a, b);
const ffe_t dif = SubMod(a, b);
a = sum;
b = dif;
}
// In-place 4-point transform over four contiguous elements
static LEO_FORCE_INLINE void FWHT_4(ffe_t* data)
{
    // Work on local copies so the butterflies operate on distinct objects
    ffe_t v0 = data[0];
    ffe_t v1 = data[1];
    ffe_t v2 = data[2];
    ffe_t v3 = data[3];

    // Stage 1: adjacent pairs
    FWHT_2(v0, v1);
    FWHT_2(v2, v3);

    // Stage 2: pairs at distance two
    FWHT_2(v0, v2);
    FWHT_2(v1, v3);

    data[0] = v0;
    data[1] = v1;
    data[2] = v2;
    data[3] = v3;
}
// In-place 4-point transform over the elements data[0], data[s], data[2s],
// data[3s]
static LEO_FORCE_INLINE void FWHT_4(ffe_t* data, unsigned s)
{
    // Offsets of the second, third and fourth element
    const unsigned i1 = s;
    const unsigned i2 = i1 + s;
    const unsigned i3 = i2 + s;

    ffe_t v0 = data[0];
    ffe_t v1 = data[i1];
    ffe_t v2 = data[i2];
    ffe_t v3 = data[i3];

    // Stage 1
    FWHT_2(v0, v1);
    FWHT_2(v2, v3);

    // Stage 2
    FWHT_2(v0, v2);
    FWHT_2(v1, v3);

    data[0] = v0;
    data[i1] = v1;
    data[i2] = v2;
    data[i3] = v3;
}
// In-place 8-point transform over eight contiguous elements
static inline void FWHT_8(ffe_t* data)
{
    // Work on local copies of all eight inputs
    ffe_t v0 = data[0];
    ffe_t v1 = data[1];
    ffe_t v2 = data[2];
    ffe_t v3 = data[3];
    ffe_t v4 = data[4];
    ffe_t v5 = data[5];
    ffe_t v6 = data[6];
    ffe_t v7 = data[7];

    // Stage 1: pairs at distance one
    FWHT_2(v0, v1);
    FWHT_2(v2, v3);
    FWHT_2(v4, v5);
    FWHT_2(v6, v7);

    // Stage 2: pairs at distance two
    FWHT_2(v0, v2);
    FWHT_2(v1, v3);
    FWHT_2(v4, v6);
    FWHT_2(v5, v7);

    // Stage 3: pairs at distance four
    FWHT_2(v0, v4);
    FWHT_2(v1, v5);
    FWHT_2(v2, v6);
    FWHT_2(v3, v7);

    data[0] = v0;
    data[1] = v1;
    data[2] = v2;
    data[3] = v3;
    data[4] = v4;
    data[5] = v5;
    data[6] = v6;
    data[7] = v7;
}
// Decimation in time (DIT) version.
// Transforms the 2^ldn elements of data[] in place using strided 4-point
// passes, finishing with one pass of contiguous 8-point or 4-point blocks.
static void FWHT(ffe_t* data, const unsigned ldn)
{
    const unsigned count = static_cast<unsigned>(1UL << ldn);

    // Fewer than two elements: nothing to do
    if (count < 2)
        return;

    // Exactly two elements: a single butterfly
    if (count == 2)
    {
        FWHT_2(data[0], data[1]);
        return;
    }

    // Strided 4-point passes, consuming two levels per iteration
    for (unsigned ldm = ldn; ldm > 3; ldm -= 2)
    {
        const unsigned blockSize = static_cast<unsigned>(1UL << ldm);
        const unsigned quarter = blockSize >> 2;

        for (unsigned blockStart = 0; blockStart < count; blockStart += blockSize)
        {
            for (unsigned k = 0; k < quarter; k++)
                FWHT_4(data + k + blockStart, quarter);
        }
    }

    // Final pass: three levels remain when ldn is odd, two when it is even
    const bool oddLevels = (ldn & 1) != 0;
    if (oddLevels)
    {
        for (unsigned offset = 0; offset < count; offset += 8)
            FWHT_8(data + offset);
    }
    else
    {
        for (unsigned offset = 0; offset < count; offset += 4)
            FWHT_4(data + offset);
    }
}
#else // LEO_FF8_FWHT_OPTIMIZED
// Reference implementation: plain radix-2 transform of the 2^bits elements
// of data[], in place, one level per outer iteration
void FWHT(ffe_t* data, const unsigned bits)
{
    const unsigned count = (unsigned)(1UL << bits);

    for (unsigned half = 1; half < count; half <<= 1)
    {
        const unsigned blockSize = half << 1;

        for (unsigned blockStart = 0; blockStart < count; blockStart += blockSize)
        {
            // Pair each element in the lower half of the block with its
            // partner in the upper half
            for (unsigned k = 0; k < half; ++k)
                FWHT_2(data[blockStart + k], data[blockStart + k + half]);
        }
    }
}
#endif // LEO_FF8_FWHT_OPTIMIZED
// Transform specialized for the finite field order:
// transforms a full table in place by forwarding to FWHT(data, kBits).
// NOTE(review): presumably kOrder == (1 << kBits), so this covers all kOrder
// entries; both constants are declared outside this file - confirm.
void FWHT(ffe_t data[kOrder])
{
FWHT(data, kBits);
}
//------------------------------------------------------------------------------
// XOR Memory
// vx[] ^= vy[]
//
// XORs `bytes` bytes of vy into vx, in place.
// `bytes` is expected to be a multiple of 64: data is processed in whole
// 64-byte blocks and any remainder smaller than 64 bytes is left untouched.
// All loads and stores are unaligned, so no alignment is required.
void xor_mem(
    void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy,
    unsigned bytes)
{
#if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
        LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(vx);
        const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(vy);

        // The size is checked before each 128-byte step: a do/while here
        // would touch 128 bytes even for a 64-byte request and then wrap the
        // unsigned byte counter around.
        while (bytes >= 128)
        {
            const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32),     _mm256_loadu_si256(y32));
            const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1));
            const LEO_M256 x2 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 2), _mm256_loadu_si256(y32 + 2));
            const LEO_M256 x3 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 3), _mm256_loadu_si256(y32 + 3));
            _mm256_storeu_si256(x32, x0);
            _mm256_storeu_si256(x32 + 1, x1);
            _mm256_storeu_si256(x32 + 2, x2);
            _mm256_storeu_si256(x32 + 3, x3);
            bytes -= 128, x32 += 4, y32 += 4;
        }

        // At most one 64-byte block remains
        if (bytes >= 64)
        {
            const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32),     _mm256_loadu_si256(y32));
            const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1));
            _mm256_storeu_si256(x32, x0);
            _mm256_storeu_si256(x32 + 1, x1);
        }
        return;
    }
#endif // LEO_TRY_AVX2

    LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
    const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);

    // Checked up front so that an empty request does not touch memory
    while (bytes >= 64)
    {
        const LEO_M128 x0 = _mm_xor_si128(_mm_loadu_si128(x16),     _mm_loadu_si128(y16));
        const LEO_M128 x1 = _mm_xor_si128(_mm_loadu_si128(x16 + 1), _mm_loadu_si128(y16 + 1));
        const LEO_M128 x2 = _mm_xor_si128(_mm_loadu_si128(x16 + 2), _mm_loadu_si128(y16 + 2));
        const LEO_M128 x3 = _mm_xor_si128(_mm_loadu_si128(x16 + 3), _mm_loadu_si128(y16 + 3));
        _mm_storeu_si128(x16, x0);
        _mm_storeu_si128(x16 + 1, x1);
        _mm_storeu_si128(x16 + 2, x2);
        _mm_storeu_si128(x16 + 3, x3);
        bytes -= 64, x16 += 4, y16 += 4;
    }
}
// vx_0[] ^= vy_0[], vx_1[] ^= vy_1[]
//
// Performs two independent in-place XORs of `bytes` bytes each in one pass.
// `bytes` is expected to be a multiple of 64: data is processed in whole
// 64-byte blocks and any remainder smaller than 64 bytes is left untouched.
// All loads and stores are unaligned, so no alignment is required.
void xor_mem2(
    void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
    void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
    unsigned bytes)
{
#if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
        LEO_M256 * LEO_RESTRICT       x32_0 = reinterpret_cast<LEO_M256 *>      (vx_0);
        const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<const LEO_M256 *>(vy_0);
        LEO_M256 * LEO_RESTRICT       x32_1 = reinterpret_cast<LEO_M256 *>      (vx_1);
        const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1);

        // The size is checked before each 128-byte step: a do/while here
        // would touch 128 bytes even for a 64-byte request and then wrap the
        // unsigned byte counter around.
        while (bytes >= 128)
        {
            const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0),     _mm256_loadu_si256(y32_0));
            const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
            const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2));
            const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3));
            const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1),     _mm256_loadu_si256(y32_1));
            const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
            const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2));
            const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3));
            _mm256_storeu_si256(x32_0, x0_0);
            _mm256_storeu_si256(x32_0 + 1, x1_0);
            _mm256_storeu_si256(x32_0 + 2, x2_0);
            _mm256_storeu_si256(x32_0 + 3, x3_0);
            _mm256_storeu_si256(x32_1, x0_1);
            _mm256_storeu_si256(x32_1 + 1, x1_1);
            _mm256_storeu_si256(x32_1 + 2, x2_1);
            _mm256_storeu_si256(x32_1 + 3, x3_1);
            x32_0 += 4, y32_0 += 4;
            x32_1 += 4, y32_1 += 4;
            bytes -= 128;
        }

        // At most one 64-byte block remains per buffer
        if (bytes >= 64)
        {
            const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0),     _mm256_loadu_si256(y32_0));
            const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
            const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1),     _mm256_loadu_si256(y32_1));
            const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
            _mm256_storeu_si256(x32_0, x0_0);
            _mm256_storeu_si256(x32_0 + 1, x1_0);
            _mm256_storeu_si256(x32_1, x0_1);
            _mm256_storeu_si256(x32_1 + 1, x1_1);
        }
        return;
    }
#endif // LEO_TRY_AVX2

    LEO_M128 * LEO_RESTRICT       x16_0 = reinterpret_cast<LEO_M128 *>      (vx_0);
    const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<const LEO_M128 *>(vy_0);
    LEO_M128 * LEO_RESTRICT       x16_1 = reinterpret_cast<LEO_M128 *>      (vx_1);
    const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1);

    // Checked up front so that an empty request does not touch memory
    while (bytes >= 64)
    {
        const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0),     _mm_loadu_si128(y16_0));
        const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1));
        const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2));
        const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3));
        const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1),     _mm_loadu_si128(y16_1));
        const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1));
        const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2));
        const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3));
        _mm_storeu_si128(x16_0, x0_0);
        _mm_storeu_si128(x16_0 + 1, x1_0);
        _mm_storeu_si128(x16_0 + 2, x2_0);
        _mm_storeu_si128(x16_0 + 3, x3_0);
        _mm_storeu_si128(x16_1, x0_1);
        _mm_storeu_si128(x16_1 + 1, x1_1);
        _mm_storeu_si128(x16_1 + 2, x2_1);
        _mm_storeu_si128(x16_1 + 3, x3_1);
        x16_0 += 4, y16_0 += 4;
        x16_1 += 4, y16_1 += 4;
        bytes -= 64;
    }
}
// vx_0[] ^= vy_0[], vx_1[] ^= vy_1[], vx_2[] ^= vy_2[]
//
// Performs three independent in-place XORs of `bytes` bytes each in one pass.
// `bytes` is expected to be a multiple of 64: data is processed in whole
// 64-byte blocks and any remainder smaller than 64 bytes is left untouched.
// All loads and stores are unaligned, so no alignment is required.
void xor_mem3(
    void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
    void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
    void * LEO_RESTRICT vx_2, const void * LEO_RESTRICT vy_2,
    unsigned bytes)
{
#if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
        LEO_M256 * LEO_RESTRICT       x32_0 = reinterpret_cast<LEO_M256 *>      (vx_0);
        const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<const LEO_M256 *>(vy_0);
        LEO_M256 * LEO_RESTRICT       x32_1 = reinterpret_cast<LEO_M256 *>      (vx_1);
        const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1);
        LEO_M256 * LEO_RESTRICT       x32_2 = reinterpret_cast<LEO_M256 *>      (vx_2);
        const LEO_M256 * LEO_RESTRICT y32_2 = reinterpret_cast<const LEO_M256 *>(vy_2);

        // The size is checked before each 128-byte step: a do/while here
        // would touch 128 bytes even for a 64-byte request and then wrap the
        // unsigned byte counter around.
        while (bytes >= 128)
        {
            const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0),     _mm256_loadu_si256(y32_0));
            const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
            const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2));
            const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3));
            const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1),     _mm256_loadu_si256(y32_1));
            const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
            const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2));
            const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3));
            const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2),     _mm256_loadu_si256(y32_2));
            const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1));
            const LEO_M256 x2_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 2), _mm256_loadu_si256(y32_2 + 2));
            const LEO_M256 x3_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 3), _mm256_loadu_si256(y32_2 + 3));
            _mm256_storeu_si256(x32_0, x0_0);
            _mm256_storeu_si256(x32_0 + 1, x1_0);
            _mm256_storeu_si256(x32_0 + 2, x2_0);
            _mm256_storeu_si256(x32_0 + 3, x3_0);
            _mm256_storeu_si256(x32_1, x0_1);
            _mm256_storeu_si256(x32_1 + 1, x1_1);
            _mm256_storeu_si256(x32_1 + 2, x2_1);
            _mm256_storeu_si256(x32_1 + 3, x3_1);
            _mm256_storeu_si256(x32_2, x0_2);
            _mm256_storeu_si256(x32_2 + 1, x1_2);
            _mm256_storeu_si256(x32_2 + 2, x2_2);
            _mm256_storeu_si256(x32_2 + 3, x3_2);
            x32_0 += 4, y32_0 += 4;
            x32_1 += 4, y32_1 += 4;
            x32_2 += 4, y32_2 += 4;
            bytes -= 128;
        }

        // At most one 64-byte block remains per buffer
        if (bytes >= 64)
        {
            const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0),     _mm256_loadu_si256(y32_0));
            const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1));
            const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1),     _mm256_loadu_si256(y32_1));
            const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1));
            const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2),     _mm256_loadu_si256(y32_2));
            const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1));
            _mm256_storeu_si256(x32_0, x0_0);
            _mm256_storeu_si256(x32_0 + 1, x1_0);
            _mm256_storeu_si256(x32_1, x0_1);
            _mm256_storeu_si256(x32_1 + 1, x1_1);
            _mm256_storeu_si256(x32_2, x0_2);
            _mm256_storeu_si256(x32_2 + 1, x1_2);
        }
        return;
    }
#endif // LEO_TRY_AVX2

    LEO_M128 * LEO_RESTRICT       x16_0 = reinterpret_cast<LEO_M128 *>      (vx_0);
    const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<const LEO_M128 *>(vy_0);
    LEO_M128 * LEO_RESTRICT       x16_1 = reinterpret_cast<LEO_M128 *>      (vx_1);
    const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1);
    LEO_M128 * LEO_RESTRICT       x16_2 = reinterpret_cast<LEO_M128 *>      (vx_2);
    const LEO_M128 * LEO_RESTRICT y16_2 = reinterpret_cast<const LEO_M128 *>(vy_2);

    // Checked up front so that an empty request does not touch memory
    while (bytes >= 64)
    {
        const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0),     _mm_loadu_si128(y16_0));
        const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1));
        const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2));
        const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3));
        const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1),     _mm_loadu_si128(y16_1));
        const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1));
        const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2));
        const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3));
        const LEO_M128 x0_2 = _mm_xor_si128(_mm_loadu_si128(x16_2),     _mm_loadu_si128(y16_2));
        const LEO_M128 x1_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 1), _mm_loadu_si128(y16_2 + 1));
        const LEO_M128 x2_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 2), _mm_loadu_si128(y16_2 + 2));
        const LEO_M128 x3_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 3), _mm_loadu_si128(y16_2 + 3));
        _mm_storeu_si128(x16_0, x0_0);
        _mm_storeu_si128(x16_0 + 1, x1_0);
        _mm_storeu_si128(x16_0 + 2, x2_0);
        _mm_storeu_si128(x16_0 + 3, x3_0);
        _mm_storeu_si128(x16_1, x0_1);
        _mm_storeu_si128(x16_1 + 1, x1_1);
        _mm_storeu_si128(x16_1 + 2, x2_1);
        _mm_storeu_si128(x16_1 + 3, x3_1);
        _mm_storeu_si128(x16_2, x0_2);
        _mm_storeu_si128(x16_2 + 1, x1_2);
        _mm_storeu_si128(x16_2 + 2, x2_2);
        _mm_storeu_si128(x16_2 + 3, x3_2);
        x16_0 += 4, y16_0 += 4;
        x16_1 += 4, y16_1 += 4;
        x16_2 += 4, y16_2 += 4;
        bytes -= 64;
    }
}
//------------------------------------------------------------------------------
// Multiplies
// We require memory to be aligned since the SIMD instructions benefit from
// or require aligned accesses to the table data.
//
// Multiply128LUT.Lo[y] holds the 16 products FFEMultiply(x, y) for x = 0..15
// (low nibble of an input byte); Multiply128LUT.Hi[y] holds
// FFEMultiply(x << 4, y) for the same x (high nibble).
// Filled in by InitializeMultiplyTables().
struct {
LEO_ALIGNED LEO_M128 Lo[256];
LEO_ALIGNED LEO_M128 Hi[256];
} Multiply128LUT;
#if defined(LEO_TRY_AVX2)
// 256-bit variant: each entry is the matching 128-bit table broadcast to both
// 128-bit halves of the register.
// InitializeMultiplyTables() only fills it when CpuHasAVX2 is set.
struct {
LEO_ALIGNED LEO_M256 Lo[256];
LEO_ALIGNED LEO_M256 Hi[256];
} Multiply256LUT;
#endif // LEO_TRY_AVX2
// Returns the field product a * b, computed through the logarithm tables
// as exp(log(a) + log(b))
static ffe_t FFEMultiply(ffe_t a, ffe_t b)
{
    // A zero factor always yields zero and is handled without the tables
    const bool hasZeroFactor = (a == 0) || (b == 0);
    if (hasZeroFactor)
        return 0;

    const ffe_t logOfProduct = AddMod(LogLUT[a], LogLUT[b]);
    return ExpLUT[logOfProduct];
}
bool InitializeMultiplyTables()
{
// Reuse aligned self test buffers to load table data
uint8_t* lo = m_SelfTestBuffers.A;
uint8_t* hi = m_SelfTestBuffers.B;
for (int y = 0; y < 256; ++y)
{
for (unsigned char x = 0; x < 16; ++x)
{
lo[x] = FFEMultiply(x, static_cast<uint8_t>(y));
hi[x] = FFEMultiply(x << 4, static_cast<uint8_t>(y));
}
const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo);
const LEO_M128 table_hi = _mm_loadu_si128((LEO_M128*)hi);
_mm_storeu_si128(Multiply128LUT.Lo + y, table_lo);
_mm_storeu_si128(Multiply128LUT.Hi + y, table_hi);
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
const LEO_M256 table_lo2 = _mm256_broadcastsi128_si256(table_lo);
const LEO_M256 table_hi2 = _mm256_broadcastsi128_si256(table_hi);
_mm256_storeu_si256(Multiply256LUT.Lo + y, table_lo2);
_mm256_storeu_si256(Multiply256LUT.Hi + y, table_hi2);
}
#endif // LEO_TRY_AVX2
}
return true;
}
// vx[] = vy[] * m
void mul_mem_set(
void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy,
ffe_t m, unsigned bytes)
{
if (m <= 1)
{
if (m == 1)
memcpy(vx, vy, bytes);
else
memset(vx, 0, bytes);
return;
}
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m);
const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m);
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
LEO_M256 * LEO_RESTRICT z32 = reinterpret_cast<LEO_M256 *>(vx);
const LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<const LEO_M256 *>(vy);
const unsigned count = bytes / 64;
for (unsigned i = 0; i < count; ++i)
{
LEO_M256 x0 = _mm256_loadu_si256(x32 + i * 2);
LEO_M256 l0 = _mm256_and_si256(x0, clr_mask);
x0 = _mm256_srli_epi64(x0, 4);
LEO_M256 h0 = _mm256_and_si256(x0, clr_mask);
l0 = _mm256_shuffle_epi8(table_lo_y, l0);
h0 = _mm256_shuffle_epi8(table_hi_y, h0);
_mm256_storeu_si256(z32 + i * 2, _mm256_xor_si256(l0, h0));
LEO_M256 x1 = _mm256_loadu_si256(x32 + i * 2 + 1);
LEO_M256 l1 = _mm256_and_si256(x1, clr_mask);
x1 = _mm256_srli_epi64(x1, 4);
LEO_M256 h1 = _mm256_and_si256(x1, clr_mask);
l1 = _mm256_shuffle_epi8(table_lo_y, l1);
h1 = _mm256_shuffle_epi8(table_hi_y, h1);
_mm256_storeu_si256(z32 + i * 2 + 1, _mm256_xor_si256(l1, h1));
}
return;
}
#endif // LEO_TRY_AVX2
const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m);
const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m);
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *> (vx);
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
do
{
LEO_M128 x3 = _mm_loadu_si128(y16 + 3);
LEO_M128 l3 = _mm_and_si128(x3, clr_mask);
x3 = _mm_srli_epi64(x3, 4);
LEO_M128 h3 = _mm_and_si128(x3, clr_mask);
l3 = _mm_shuffle_epi8(table_lo_y, l3);
h3 = _mm_shuffle_epi8(table_hi_y, h3);
LEO_M128 x2 = _mm_loadu_si128(y16 + 2);
LEO_M128 l2 = _mm_and_si128(x2, clr_mask);
x2 = _mm_srli_epi64(x2, 4);
LEO_M128 h2 = _mm_and_si128(x2, clr_mask);
l2 = _mm_shuffle_epi8(table_lo_y, l2);
h2 = _mm_shuffle_epi8(table_hi_y, h2);
LEO_M128 x1 = _mm_loadu_si128(y16 + 1);
LEO_M128 l1 = _mm_and_si128(x1, clr_mask);
x1 = _mm_srli_epi64(x1, 4);
LEO_M128 h1 = _mm_and_si128(x1, clr_mask);
l1 = _mm_shuffle_epi8(table_lo_y, l1);
h1 = _mm_shuffle_epi8(table_hi_y, h1);
LEO_M128 x0 = _mm_loadu_si128(y16);
LEO_M128 l0 = _mm_and_si128(x0, clr_mask);
x0 = _mm_srli_epi64(x0, 4);
LEO_M128 h0 = _mm_and_si128(x0, clr_mask);
l0 = _mm_shuffle_epi8(table_lo_y, l0);
h0 = _mm_shuffle_epi8(table_hi_y, h0);
_mm_storeu_si128(x16 + 3, _mm_xor_si128(l3, h3));
_mm_storeu_si128(x16 + 2, _mm_xor_si128(l2, h2));
_mm_storeu_si128(x16 + 1, _mm_xor_si128(l1, h1));
_mm_storeu_si128(x16, _mm_xor_si128(l0, h0));
x16 += 4, y16 += 4;
bytes -= 64;
} while (bytes > 0);
}
// vx0[] *= m, vx1[] *= m
void mul_mem2_inplace(
void * LEO_RESTRICT vx_0,
void * LEO_RESTRICT vx_1,
ffe_t m, unsigned bytes)
{
if (m <= 1)
{
if (m == 0)
{
memset(vx_0, 0, bytes);
memset(vx_1, 0, bytes);
}
return;
}
#if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m);
const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m);
const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(vx_0);
LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(vx_1);
do
{
LEO_M256 x0_0 = _mm256_loadu_si256(x32_0 + 1);
LEO_M256 l0_0 = _mm256_and_si256(x0_0, clr_mask);
x0_0 = _mm256_srli_epi64(x0_0, 4);
LEO_M256 h0_0 = _mm256_and_si256(x0_0, clr_mask);
l0_0 = _mm256_shuffle_epi8(table_lo_y, l0_0);
h0_0 = _mm256_shuffle_epi8(table_hi_y, h0_0);
l0_0 = _mm256_xor_si256(l0_0, h0_0);
LEO_M256 x1_0 = _mm256_loadu_si256(x32_0);
LEO_M256 l1_0 = _mm256_and_si256(x1_0, clr_mask);
x1_0 = _mm256_srli_epi64(x1_0, 4);
LEO_M256 h1_0 = _mm256_and_si256(x1_0, clr_mask);
l1_0 = _mm256_shuffle_epi8(table_lo_y, l1_0);
h1_0 = _mm256_shuffle_epi8(table_hi_y, h1_0);
l1_0 = _mm256_xor_si256(l1_0, h1_0);
LEO_M256 x0_1 = _mm256_loadu_si256(x32_1 + 1);
LEO_M256 l0_1 = _mm256_and_si256(x0_1, clr_mask);
x0_1 = _mm256_srli_epi64(x0_1, 4);
LEO_M256 h0_1 = _mm256_and_si256(x0_1, clr_mask);
l0_1 = _mm256_shuffle_epi8(table_lo_y, l0_1);
h0_1 = _mm256_shuffle_epi8(table_hi_y, h0_1);
l0_1 = _mm256_xor_si256(l0_1, h0_1);
LEO_M256 x1_1 = _mm256_loadu_si256(x32_1);
LEO_M256 l1_1 = _mm256_and_si256(x1_1, clr_mask);
x1_1 = _mm256_srli_epi64(x1_1, 4);
LEO_M256 h1_1 = _mm256_and_si256(x1_1, clr_mask);
l1_1 = _mm256_shuffle_epi8(table_lo_y, l1_1);
h1_1 = _mm256_shuffle_epi8(table_hi_y, h1_1);
l1_1 = _mm256_xor_si256(l1_1, h1_1);
_mm256_storeu_si256(x32_0 + 1, l0_0);
_mm256_storeu_si256(x32_0, l1_0);
_mm256_storeu_si256(x32_1 + 1, l0_1);
_mm256_storeu_si256(x32_1, l1_1);
x32_0 += 2;
x32_1 += 2;
bytes -= 64;
} while (bytes > 0);
return;
}
#endif // LEO_TRY_AVX2
const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m);
const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m);
const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(vx_0);
LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(vx_1);
do
{
LEO_M128 x3 = _mm_loadu_si128(x16_0 + 3);
LEO_M128 l3 = _mm_and_si128(x3, clr_mask);
x3 = _mm_srli_epi64(x3, 4);
LEO_M128 h3 = _mm_and_si128(x3, clr_mask);
l3 = _mm_shuffle_epi8(table_lo_y, l3);
h3 = _mm_shuffle_epi8(table_hi_y, h3);
LEO_M128 x2 = _mm_loadu_si128(x16_0 + 2);
LEO_M128 l2 = _mm_and_si128(x2, clr_mask);
x2 = _mm_srli_epi64(x2, 4);
LEO_M128 h2 = _mm_and_si128(x2, clr_mask);
l2 = _mm_shuffle_epi8(table_lo_y, l2);
h2 = _mm_shuffle_epi8(table_hi_y, h2);
LEO_M128 x1 = _mm_loadu_si128(x16_0 + 1);
LEO_M128 l1 = _mm_and_si128(x1, clr_mask);
x1 = _mm_srli_epi64(x1, 4);
LEO_M128 h1 = _mm_and_si128(x1, clr_mask);
l1 = _mm_shuffle_epi8(table_lo_y, l1);
h1 = _mm_shuffle_epi8(table_hi_y, h1);
LEO_M128 x0 = _mm_loadu_si128(x16_0);
LEO_M128 l0 = _mm_and_si128(x0, clr_mask);
x0 = _mm_srli_epi64(x0, 4);
LEO_M128 h0 = _mm_and_si128(x0, clr_mask);
l0 = _mm_shuffle_epi8(table_lo_y, l0);
h0 = _mm_shuffle_epi8(table_hi_y, h0);
_mm_storeu_si128(x16_0 + 3, _mm_xor_si128(l3, h3));
_mm_storeu_si128(x16_0 + 2, _mm_xor_si128(l2, h2));
_mm_storeu_si128(x16_0 + 1, _mm_xor_si128(l1, h1));
_mm_storeu_si128(x16_0, _mm_xor_si128(l0, h0));
// FIXME: Add second one here
x16_0 += 4;
x16_1 += 4;
bytes -= 64;
} while (bytes > 0);
}
//------------------------------------------------------------------------------
// FFT Operations
// Intended operation: x[] ^= y[] * m, y[] ^= x[]
// NOTE: Not implemented yet. The body is empty, so a call currently leaves
// both buffers unchanged and ignores m and bytes.
void mul_fft(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t m, unsigned bytes)
{
// TODO: Implement the operation described above
}
// Intended operation, for i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
// NOTE: Not implemented yet. The body is empty, so a call currently leaves
// all buffers unchanged and ignores m and bytes.
void mul_fft2(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
ffe_t m, unsigned bytes)
{
// TODO: Implement the operation described above
}
// Intended operation, for i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
// NOTE: Not implemented yet. The body is empty, so a call currently leaves
// all buffers unchanged and ignores m and bytes.
void mul_fft3(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
ffe_t m, unsigned bytes)
{
// TODO: Implement the operation described above
}
//------------------------------------------------------------------------------
// IFFT Operations
// Intended operation: y[] ^= x[], x[] ^= y[] * m
// NOTE: Not implemented yet. The body is empty, so a call currently leaves
// both buffers unchanged and ignores m and bytes.
void mul_ifft(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t m, unsigned bytes)
{
// TODO: Implement the operation described above
}
// Intended operation, for i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
// NOTE: Not implemented yet. The body is empty, so a call currently leaves
// all buffers unchanged and ignores m and bytes.
void mul_ifft2(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
ffe_t m, unsigned bytes)
{
// TODO: Implement the operation described above
}
// Intended operation, for i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
// NOTE: Not implemented yet. The body is empty, so a call currently leaves
// all buffers unchanged and ignores m and bytes.
void mul_ifft3(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
ffe_t m, unsigned bytes)
{
// TODO: Implement the operation described above
}
//------------------------------------------------------------------------------
// API
static bool IsInitialized = false;
bool Initialize()
{
if (IsInitialized)
return true;
if (!CpuHasSSSE3)
return false;
InitializeLogarithmTables();
IsInitialized = true;
return true;
}
}} // namespace leopard::ff8

157
LeopardFF8.h Normal file
View File

@ -0,0 +1,157 @@
/*
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Leopard-RS nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "LeopardCommon.h"
/*
8-bit Finite Field Math
This finite field contains 256 elements and so each element is one byte.
This library is designed for data that is a multiple of 64 bytes in size.
*/
namespace leopard { namespace ff8 {
//------------------------------------------------------------------------------
// Datatypes and Constants
// Finite field element type (one byte per element)
typedef uint8_t ffe_t;
// Number of bits per element
static const unsigned kBits = 8;
// Finite field order: Number of elements in the field
static const unsigned kOrder = 256;
//------------------------------------------------------------------------------
// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
// Define this to enable the optimized version of FWHT()
#define LEO_FF8_FWHT_OPTIMIZED
// Transform for a variable number of bits (up to kOrder)
void FWHT(ffe_t* data, const unsigned bits);
// Transform specialized for the finite field order
void FWHT(ffe_t data[kOrder]);
//------------------------------------------------------------------------------
// XOR Memory
// In all routines below, `bytes` is the length of each buffer in bytes.
// The library is designed for lengths that are a multiple of 64 bytes.
// Buffers are LEO_RESTRICT-qualified, so they are expected not to overlap.
// x[] ^= y[]
void xor_mem(
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
unsigned bytes);
// For i = {0, 1}: x_i[] ^= y_i[]
void xor_mem2(
void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
unsigned bytes);
// For i = {0, 1, 2}: x_i[] ^= y_i[]
void xor_mem3(
void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, const void * LEO_RESTRICT y_2,
unsigned bytes);
//------------------------------------------------------------------------------
// Multiplies
// m is the field element that every data byte is multiplied by.
// x[] = y[] * m
void mul_mem_set(
void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
ffe_t m, unsigned bytes);
// For i = {0, 1}: x_i[] *= m
void mul_mem2_inplace(
void * LEO_RESTRICT x_0,
void * LEO_RESTRICT x_1,
ffe_t m, unsigned bytes);
//------------------------------------------------------------------------------
// FFT Operations
// Each comment lists the two updates in the order they are meant to be applied.
// x[] ^= y[] * m, y[] ^= x[]
void mul_fft(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t m, unsigned bytes);
// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
void mul_fft2(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
ffe_t m, unsigned bytes);
// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
void mul_fft3(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
ffe_t m, unsigned bytes);
//------------------------------------------------------------------------------
// IFFT Operations
// Same pair of updates as the FFT operations, applied in the opposite order.
// y[] ^= x[], x[] ^= y[] * m
void mul_ifft(
void * LEO_RESTRICT x, void * LEO_RESTRICT y,
ffe_t m, unsigned bytes);
// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
void mul_ifft2(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
ffe_t m, unsigned bytes);
// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
void mul_ifft3(
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
ffe_t m, unsigned bytes);
//------------------------------------------------------------------------------
// API
// Must be called before using the routines above.
// Returns false if the self-test fails
bool Initialize();
}} // namespace leopard::ff8

29
License.md Normal file
View File

@ -0,0 +1,29 @@
BSD 3-Clause License
Copyright (c) 2017, Christopher A. Taylor
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,9 +1,91 @@
# Lin-Han-Chung RS Codes
This is an attempt at implementing a fast version of the algorithm described here:
# Leopard-RS
## Leopard Reed-Solomon Error Correction Codes in C
Leopard-RS is a portable, fast library for Forward Error Correction.
From a block of equally sized original data pieces, it generates recovery
symbols that can be used to recover lost original data.
* It requires that data pieces are all a fixed size, a multiple of 64 bytes.
* The original and recovery data must not exceed 65536 pieces.
#### Motivation:
It gets slower as O(N Log N) in the input data size, and its inner loops are
vectorized using the best approaches available on modern processors, using the
fastest finite fields (8-bit or 16-bit Galois fields) for bulk data.
It sets new speed records for MDS encoding and decoding of large data.
It is also the only open-source production ready software for this purpose
available today.
Example applications are data recovery software and data center replication.
#### Encoder API:
```
#include "leopard.h"
```
For full documentation please read `leopard.h`.
+ `leo_init()` : Initialize library.
+ `leo_encode_work_count()` : Calculate the number of work_data buffers to provide to leo_encode().
+ `leo_encode()`: Generate recovery data.
#### Decoder API:
```
#include "leopard.h"
```
For full documentation please read `leopard.h`.
+ `leo_init()` : Initialize library.
+ `leo_decode_work_count()` : Calculate the number of work_data buffers to provide to leo_decode().
+ `leo_decode()` : Recover lost original data.
#### Benchmarks:
```
TODO
```
#### Comparisons:
```
TODO
```
#### Background
This library implements an MDS erasure code introduced in this paper:
~~~
S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
"Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
~~~
Available here: [http://ct.ee.ntust.edu.tw/it2016-2.pdf](http://ct.ee.ntust.edu.tw/it2016-2.pdf)
The paper is available here: [http://ct.ee.ntust.edu.tw/it2016-2.pdf](http://ct.ee.ntust.edu.tw/it2016-2.pdf)
And also mirrored in the /docs/ folder.
The high-level summary is that instead of using complicated fields,
an additive FFT was introduced that works with familiar Galois fields for the first time.
This is actually a huge new result that will change how Reed-Solomon codecs will be written.
My contribution is extending the ALTMAP approach from Jerasure
for 16-bit Galois fields out to 64 bytes to enable AVX2 speedups,
and marry it with the row parallelism introduced by ISA-L.
#### Credits
The idea is the brain-child of S.-J. Lin. He is a super bright guy who should be recognized more widely!
This software was written entirely by myself ( Christopher A. Taylor mrcatid@gmail.com ). If you find it useful and would like to buy me a coffee, consider tipping.

172
leopard.cpp Normal file
View File

@ -0,0 +1,172 @@
/*
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Leopard-RS nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include "leopard.h"
#include "FecalEncoder.h"
#include "FecalDecoder.h"
extern "C" {
//------------------------------------------------------------------------------
// Initialization API
// Set once fecal_init_() has succeeded; checked by the create functions
static bool m_Initialized = false;

// Verifies that the caller was built against the same API version and
// initializes the GF(256) math library.
// Returns Fecal_Success, Fecal_InvalidInput on a version mismatch, or
// Fecal_Platform if gf256_init() reports failure.
FECAL_EXPORT int fecal_init_(int version)
{
    const bool versionMatches = (version == FECAL_VERSION);
    if (!versionMatches)
    {
        return Fecal_InvalidInput;
    }

    const auto fieldInitResult = gf256_init();
    if (fieldInitResult != 0)
    {
        return Fecal_Platform;
    }

    m_Initialized = true;
    return Fecal_Success;
}
//------------------------------------------------------------------------------
// Encoder API
// Allocates and initializes an encoder for the given input buffers.
// Returns an opaque encoder handle, or nullptr on invalid input, when the
// library has not been initialized, on allocation failure, or when the
// encoder fails to initialize.
FECAL_EXPORT FecalEncoder fecal_encoder_create(unsigned input_count, void* const * const input_data, uint64_t total_bytes)
{
    const bool argumentsInvalid =
        (input_count == 0) ||
        (input_data == nullptr) ||
        (total_bytes < input_count);
    if (argumentsInvalid)
    {
        FECAL_DEBUG_BREAK; // Invalid input
        return nullptr;
    }

    FECAL_DEBUG_ASSERT(m_Initialized); // Must call fecal_init() first
    if (!m_Initialized)
    {
        return nullptr;
    }

    fecal::Encoder* const created = new(std::nothrow) fecal::Encoder;
    if (created == nullptr)
    {
        FECAL_DEBUG_BREAK; // Out of memory
        return nullptr;
    }

    const auto initResult = created->Initialize(input_count, input_data, total_bytes);
    if (initResult != Fecal_Success)
    {
        delete created;
        return nullptr;
    }

    return reinterpret_cast<FecalEncoder>(created);
}
// Forwards to Encoder::Encode() for the given handle.
// Returns Fecal_InvalidInput if either argument is null, otherwise the
// encoder's result.
FECAL_EXPORT int fecal_encode(FecalEncoder encoder_v, FecalSymbol* symbol)
{
    fecal::Encoder* const impl = reinterpret_cast<fecal::Encoder*>(encoder_v);

    const bool argumentsValid = (impl != nullptr) && (symbol != nullptr);
    if (!argumentsValid)
    {
        return Fecal_InvalidInput;
    }

    return impl->Encode(*symbol);
}
// Destroys a codec object through its ICodec interface.
// A null handle is ignored.
FECAL_EXPORT void fecal_free(void* codec_v)
{
    if (codec_v == nullptr)
    {
        return;
    }

    delete reinterpret_cast<fecal::ICodec*>(codec_v);
}
//------------------------------------------------------------------------------
// Decoder API
// Allocates and initializes a decoder for the given input dimensions.
// Returns an opaque decoder handle, or nullptr on invalid input, when the
// library has not been initialized, on allocation failure, or when the
// decoder fails to initialize.
FECAL_EXPORT FecalDecoder fecal_decoder_create(unsigned input_count, uint64_t total_bytes)
{
    const bool argumentsInvalid =
        (input_count == 0) ||
        (total_bytes < input_count);
    if (argumentsInvalid)
    {
        FECAL_DEBUG_BREAK; // Invalid input
        return nullptr;
    }

    FECAL_DEBUG_ASSERT(m_Initialized); // Must call fecal_init() first
    if (!m_Initialized)
    {
        return nullptr;
    }

    fecal::Decoder* const created = new(std::nothrow) fecal::Decoder;
    if (created == nullptr)
    {
        FECAL_DEBUG_BREAK; // Out of memory
        return nullptr;
    }

    const auto initResult = created->Initialize(input_count, total_bytes);
    if (initResult != Fecal_Success)
    {
        delete created;
        return nullptr;
    }

    return reinterpret_cast<FecalDecoder>(created);
}
// Forwards to Decoder::AddOriginal() for the given handle.
// Returns Fecal_InvalidInput if either argument is null, otherwise the
// decoder's result.
FECAL_EXPORT int fecal_decoder_add_original(FecalDecoder decoder_v, const FecalSymbol* symbol)
{
    fecal::Decoder* const impl = reinterpret_cast<fecal::Decoder*>(decoder_v);

    const bool argumentsValid = (impl != nullptr) && (symbol != nullptr);
    if (!argumentsValid)
    {
        return Fecal_InvalidInput;
    }

    return impl->AddOriginal(*symbol);
}
// Forwards to Decoder::AddRecovery() for the given handle.
// Returns Fecal_InvalidInput if either argument is null, otherwise the
// decoder's result.
FECAL_EXPORT int fecal_decoder_add_recovery(FecalDecoder decoder_v, const FecalSymbol* symbol)
{
    fecal::Decoder* const impl = reinterpret_cast<fecal::Decoder*>(decoder_v);

    const bool argumentsValid = (impl != nullptr) && (symbol != nullptr);
    if (!argumentsValid)
    {
        return Fecal_InvalidInput;
    }

    return impl->AddRecovery(*symbol);
}
// Forwards to Decoder::Decode() for the given handle.
// Returns Fecal_InvalidInput if either argument is null, otherwise the
// decoder's result.
FECAL_EXPORT int fecal_decode(FecalDecoder decoder_v, RecoveredSymbols* symbols)
{
    fecal::Decoder* const impl = reinterpret_cast<fecal::Decoder*>(decoder_v);

    const bool argumentsValid = (impl != nullptr) && (symbols != nullptr);
    if (!argumentsValid)
    {
        return Fecal_InvalidInput;
    }

    return impl->Decode(*symbols);
}
// Forwards to Decoder::GetOriginal() for the given handle and input index.
// Returns Fecal_InvalidInput if the handle or the symbol pointer is null,
// otherwise the decoder's result.
FECAL_EXPORT int fecal_decoder_get(FecalDecoder decoder_v, unsigned input_index, FecalSymbol* symbol)
{
    fecal::Decoder* const impl = reinterpret_cast<fecal::Decoder*>(decoder_v);

    const bool argumentsValid = (impl != nullptr) && (symbol != nullptr);
    if (!argumentsValid)
    {
        return Fecal_InvalidInput;
    }

    return impl->GetOriginal(input_index, *symbol);
}
} // extern "C"

229
leopard.h Normal file
View File

@ -0,0 +1,229 @@
/*
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Leopard-RS nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CAT_LEOPARD_RS_H
#define CAT_LEOPARD_RS_H
/*
Leopard-RS: Reed-Solomon Error Correction Coding for Extremely Large Data
S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
"Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
http://ct.ee.ntust.edu.tw/it2016-2.pdf
*/
// Library version
#define LEO_VERSION 1
// Tweak if the functions are exported or statically linked
//#define LEO_DLL /* Defined when building/linking as DLL */
//#define LEO_BUILDING /* Defined by the library makefile */
#if defined(LEO_BUILDING)
# if defined(LEO_DLL)
#define LEO_EXPORT __declspec(dllexport)
# else
#define LEO_EXPORT
# endif
#else
# if defined(LEO_DLL)
#define LEO_EXPORT __declspec(dllimport)
# else
#define LEO_EXPORT extern
# endif
#endif
#ifdef __cplusplus
extern "C" {
#endif
//------------------------------------------------------------------------------
// Initialization API
/*
leo_init()
Perform static initialization for the library, verifying that the platform
is supported.
Returns 0 on success and other values on failure.
*/
LEO_EXPORT int leo_init_(int version);
#define leo_init() leo_init_(LEO_VERSION)
//------------------------------------------------------------------------------
// Shared Constants / Datatypes
// Result codes returned by the API.
// Zero means success; every failure code is negative.
typedef enum LeopardResultT
{
Leopard_Success = 0, // Operation succeeded
Leopard_TooMuchData = -1, // Buffer counts are too high
Leopard_InvalidBlockSize = -2, // Buffer size must be a multiple of 64 bytes
Leopard_InvalidInput = -3, // A function parameter was invalid
Leopard_Platform = -4, // Platform is unsupported
Leopard_OutOfMemory = -5, // Out of memory error occurred
Leopard_Unexpected = -6, // Unexpected error - Software bug?
} LeopardResult;
// Flags accepted by the `flags` parameter of leo_encode() and leo_decode()
typedef enum LeopardFlagsT
{
LeopardFlags_Defaults = 0, // Default settings
LeopardFlags_Multithreaded = 1, // Enable multiple threads
} LeopardFlags;
//------------------------------------------------------------------------------
// Encoder API
/*
leo_encode_work_count()
Calculate the number of work_data buffers to provide to leo_encode().
The sum of original_count + recovery_count must not exceed 65536.
Returns the work_count value to pass into leo_encode().
Returns 0 on invalid input.
*/
LEO_EXPORT unsigned leo_encode_work_count(
unsigned original_count,
unsigned recovery_count);
/*
leo_encode()
Generate recovery data.
original_count: Number of original_data[] buffers provided.
recovery_count: Number of desired recovery data buffers.
buffer_bytes: Number of bytes in each data buffer.
original_data: Array of pointers to original data buffers.
work_count: Number of work_data[] buffers, from leo_encode_work_count().
work_data: Array of pointers to work data buffers.
flags: Flags for encoding e.g. LeopardFlags_Multithreaded
The sum of original_count + recovery_count must not exceed 65536.
The buffer_bytes must be a multiple of 64.
Each buffer should have the same number of bytes.
Even the last piece must be rounded up to the block size.
Let buffer_bytes = The number of bytes in each buffer:
original_count = static_cast<unsigned>(
((uint64_t)total_bytes + buffer_bytes - 1) / buffer_bytes);
Or if the number of pieces is known:
buffer_bytes = static_cast<unsigned>(
((uint64_t)total_bytes + original_count - 1) / original_count);
Returns Leopard_Success on success.
The first set of recovery_count buffers in work_data will be the result.
Returns Leopard_TooMuchData if the data is too large.
Returns Leopard_InvalidBlockSize if the data is the wrong size.
Returns Leopard_InvalidInput on invalid input.
Returns other values on errors.
*/
LEO_EXPORT LeopardResult leo_encode(
unsigned buffer_bytes, // Number of bytes in each data buffer
unsigned original_count, // Number of original_data[] buffer pointers
unsigned recovery_count, // Number of recovery_data[] buffer pointers
unsigned work_count, // Number of work_data[] buffer pointers, from leo_encode_work_count()
void* const * const original_data, // Array of pointers to original data buffers
void** work_data, // Array of work buffers
unsigned flags); // Operation flags
//------------------------------------------------------------------------------
// Decoder API
/*
leo_decode_work_count()
Calculate the number of work_data buffers to provide to leo_decode().
The sum of original_count + recovery_count must not exceed 65536.
Returns the work_count value to pass into leo_decode().
Returns 0 on invalid input.
*/
LEO_EXPORT unsigned leo_decode_work_count(
unsigned original_count,
unsigned recovery_count);
/*
leo_decode()
Decode original data from recovery data.
buffer_bytes: Number of bytes in each data buffer.
original_count: Number of original_data[] buffers provided.
original_data: Array of pointers to original data buffers.
recovery_count: Number of recovery_data[] buffers provided.
recovery_data: Array of pointers to recovery data buffers.
work_count: Number of work_data[] buffers, from leo_decode_work_count().
work_data: Array of pointers to work data buffers.
flags: Flags for decoding e.g. LeopardFlags_Multithreaded
Lost original/recovery data should be set to NULL.
The sum of recovery_count + the number of non-NULL original data must be at
least original_count in order to perform recovery.
Returns Leopard_Success on success.
Returns other values on errors.
*/
LEO_EXPORT LeopardResult leo_decode(
unsigned buffer_bytes, // Number of bytes in each data buffer
unsigned original_count, // Number of original_data[] buffer pointers
unsigned recovery_count, // Number of recovery_data[] buffer pointers
unsigned work_count, // Number of buffer pointers in work_data[]
void* const * const original_data, // Array of original data buffers
void* const * const recovery_data, // Array of recovery data buffers
void** work_data, // Array of work data buffers
unsigned flags); // Operation flags
#ifdef __cplusplus
}
#endif
#endif // CAT_LEOPARD_RS_H

View File

@ -1,9 +1,11 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
# Visual Studio 15
VisualStudioVersion = 15.0.26127.3
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LHC_RS", "LHC_RS.vcxproj", "{32176592-2F30-4BD5-B645-EB11C8D3453E}"
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Leopard", "Leopard.vcxproj", "{32176592-2F30-4BD5-B645-EB11C8D3453E}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LeopardBenchmark", "..\tests\proj\Benchmark.vcxproj", "{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
@ -21,6 +23,14 @@ Global
{32176592-2F30-4BD5-B645-EB11C8D3453E}.Release|Win32.Build.0 = Release|Win32
{32176592-2F30-4BD5-B645-EB11C8D3453E}.Release|x64.ActiveCfg = Release|x64
{32176592-2F30-4BD5-B645-EB11C8D3453E}.Release|x64.Build.0 = Release|x64
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|Win32.ActiveCfg = Debug|Win32
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|Win32.Build.0 = Debug|Win32
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|x64.ActiveCfg = Debug|x64
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|x64.Build.0 = Debug|x64
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|Win32.ActiveCfg = Release|Win32
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|Win32.Build.0 = Release|Win32
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|x64.ActiveCfg = Release|x64
{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

193
proj/Leopard.vcxproj Normal file
View File

@ -0,0 +1,193 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\leopard.h" />
<ClInclude Include="..\LeopardCommon.h" />
<ClInclude Include="..\LeopardDecoder.h" />
<ClInclude Include="..\LeopardEncoder.h" />
<ClInclude Include="..\LeopardFF8.h" />
<ClInclude Include="..\LeopardFF16.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\leopard.cpp" />
<ClCompile Include="..\LeopardCommon.cpp" />
<ClCompile Include="..\LeopardDecoder.cpp" />
<ClCompile Include="..\LeopardEncoder.cpp" />
<ClCompile Include="..\LeopardFF8.cpp" />
<ClCompile Include="..\LeopardFF16.cpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{32176592-2F30-4BD5-B645-EB11C8D3453E}</ProjectGuid>
<RootNamespace>GF65536</RootNamespace>
<ProjectName>Leopard</ProjectName>
<WindowsTargetPlatformVersion>10.0.14393.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
<IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
<IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
<IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<OutDir>Output/$(ProjectName)/$(Configuration)/$(Platform)/</OutDir>
<IntDir>Obj/$(ProjectName)/$(Configuration)/$(Platform)/</IntDir>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<SDLCheck>true</SDLCheck>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>
</AdditionalLibraryDirectories>
</Link>
<PostBuildEvent>
<Command>
</Command>
</PostBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<SDLCheck>true</SDLCheck>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>
</AdditionalLibraryDirectories>
</Link>
<PostBuildEvent>
<Command>
</Command>
</PostBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>true</SDLCheck>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<OmitFramePointers>false</OmitFramePointers>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
<BufferSecurityCheck>true</BufferSecurityCheck>
<PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>
</AdditionalLibraryDirectories>
</Link>
<PostBuildEvent>
<Command>
</Command>
</PostBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
  <ClCompile>
    <WarningLevel>Level3</WarningLevel>
    <Optimization>MaxSpeed</Optimization>
    <FunctionLevelLinking>true</FunctionLevelLinking>
    <IntrinsicFunctions>true</IntrinsicFunctions>
    <SDLCheck>true</SDLCheck>
    <!-- Inlining and size/speed preference match the Release|Win32 configuration of this project -->
    <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
    <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
    <OmitFramePointers>false</OmitFramePointers>
    <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
    <BufferSecurityCheck>true</BufferSecurityCheck>
    <PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
  </ClCompile>
  <Link>
    <GenerateDebugInformation>true</GenerateDebugInformation>
    <EnableCOMDATFolding>true</EnableCOMDATFolding>
    <OptimizeReferences>true</OptimizeReferences>
    <AdditionalLibraryDirectories>
    </AdditionalLibraryDirectories>
  </Link>
  <PostBuildEvent>
    <Command>
    </Command>
  </PostBuildEvent>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>

View File

@ -0,0 +1,57 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
  <!-- Every header is listed under the "Header Files" filter declared in this file -->
  <ClInclude Include="..\leopard.h">
    <Filter>Header Files</Filter>
  </ClInclude>
  <ClInclude Include="..\LeopardCommon.h">
    <Filter>Header Files</Filter>
  </ClInclude>
  <ClInclude Include="..\LeopardDecoder.h">
    <Filter>Header Files</Filter>
  </ClInclude>
  <ClInclude Include="..\LeopardEncoder.h">
    <Filter>Header Files</Filter>
  </ClInclude>
  <ClInclude Include="..\LeopardFF16.h">
    <Filter>Header Files</Filter>
  </ClInclude>
  <ClInclude Include="..\LeopardFF8.h">
    <Filter>Header Files</Filter>
  </ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\LeopardDecoder.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\LeopardEncoder.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\leopard.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\LeopardCommon.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\LeopardFF16.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\LeopardFF8.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
</Project>

567
tests/benchmark.cpp Normal file
View File

@ -0,0 +1,567 @@
/*
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Leopard nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include "../LeopardCommon.h"
#include "../leopard.h"
#include <cerrno>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#ifndef _WIN32
#include <unistd.h> // nice()
#endif
using namespace std;
//#define TEST_DATA_ALL_SAME
//#define TEST_LOSE_FIRST_K_PACKETS
//------------------------------------------------------------------------------
// Windows
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#ifndef _WINSOCKAPI_
#define DID_DEFINE_WINSOCKAPI
#define _WINSOCKAPI_
#endif
#ifndef NOMINMAX
#define NOMINMAX
#endif
#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0601 /* Windows 7+ */
#endif
#include <windows.h>
#endif
#ifdef DID_DEFINE_WINSOCKAPI
#undef _WINSOCKAPI_
#undef DID_DEFINE_WINSOCKAPI
#endif
//------------------------------------------------------------------------------
// Threads
/*
    Ask the OS to run the benchmark at a higher scheduling priority so that
    the timing results are less disturbed by other work on the machine.

    Returns true if the OS accepted the request.  Failure is not fatal: the
    benchmark simply keeps running at its current priority.
*/
static bool SetCurrentThreadPriority()
{
#ifdef _WIN32
    return 0 != ::SetThreadPriority(::GetCurrentThread(), THREAD_PRIORITY_ABOVE_NORMAL);
#else
    // A negative increment requests a higher priority; a positive one would
    // lower it, which is the opposite of what the Windows branch asks for.
    // The request is expected to fail without sufficient privileges.
    //
    // nice() can legitimately return -1 as the new nice value, so errno is
    // the only reliable error indicator.
    errno = 0;
    const int niceness = nice(-2);
    return niceness != -1 || errno == 0;
#endif
}
//------------------------------------------------------------------------------
// Timing
/*
    Returns a timestamp in microseconds, suitable for measuring elapsed time
    by subtracting two readings.

    Windows: derived from the performance counter; returns 0 if the counter or
    its frequency cannot be queried.
    Other platforms: derived from std::chrono::steady_clock, which never goes
    backwards and does not overflow the way `1000000 * tv_sec` can when
    time_t is 32 bits wide.
*/
static uint64_t GetTimeUsec()
{
#ifdef _WIN32
    LARGE_INTEGER timeStamp = {};
    if (!::QueryPerformanceCounter(&timeStamp))
        return 0;

    // Microseconds per counter tick, computed on first use and then cached
    static double PerfFrequencyInverse = 0.;
    if (PerfFrequencyInverse == 0.)
    {
        LARGE_INTEGER freq = {};
        if (!::QueryPerformanceFrequency(&freq) || freq.QuadPart == 0)
            return 0;
        PerfFrequencyInverse = 1000000. / (double)freq.QuadPart;
    }

    return (uint64_t)(PerfFrequencyInverse * timeStamp.QuadPart);
#else
    const std::chrono::steady_clock::duration sinceEpoch =
        std::chrono::steady_clock::now().time_since_epoch();
    return (uint64_t)std::chrono::duration_cast<std::chrono::microseconds>(sinceEpoch).count();
#endif // _WIN32
}
//------------------------------------------------------------------------------
// PCG PRNG
// From http://www.pcg-random.org/
// Pseudo-random generator with 64 bits of state that yields 32 bits per call
class PCGRandom
{
public:
    // Restart the sequence.  `y` selects the increment (forced odd) and `x`
    // is mixed into the state between two warm-up steps.
    inline void Seed(uint64_t y, uint64_t x = 0)
    {
        Inc = (y << 1u) | 1u;
        State = 0;
        Next();
        State += x;
        Next();
    }

    // Advance the state by one step and return 32 bits derived from the
    // state as it was before the step.
    inline uint32_t Next()
    {
        const uint64_t previous = State;
        State = previous * UINT64_C(6364136223846793005) + Inc;

        // Fold the high bits down, then rotate right by the top five bits
        const uint32_t folded = (uint32_t)((previous ^ (previous >> 18)) >> 27);
        const uint32_t rotation = (uint32_t)(previous >> 59);
        const uint32_t counterRotation = (32u - rotation) & 31u;

        return (folded >> rotation) | (folded << counterRotation);
    }

    uint64_t State = 0, Inc = 0;
};
//------------------------------------------------------------------------------
// Self-Checking Packet
/*
    Fill `packet` (`bytes` long) with data that CheckPacket() can validate
    without any side information.

    Short packets (under 16 bytes, or every packet when TEST_DATA_ALL_SAME is
    defined) are filled with one random byte value repeated.

    Longer packets are laid out as:
        bytes 0..3 : checksum over the length and the payload
        bytes 4..7 : packet length
        bytes 8..  : random payload
*/
static void WriteRandomSelfCheckingPacket(PCGRandom& prng, void* packet, unsigned bytes)
{
    uint8_t* const out = static_cast<uint8_t*>(packet);

#ifdef TEST_DATA_ALL_SAME
    const bool repeatOneByte = (bytes != 0);
#else
    const bool repeatOneByte = (bytes < 16);
#endif

    if (repeatOneByte)
    {
        LEO_DEBUG_ASSERT(bytes >= 2);

        const uint8_t fill = (uint8_t)prng.Next();
        out[0] = fill;
        for (unsigned index = 1; index < bytes; ++index)
        {
            out[index] = fill;
        }
        return;
    }

    // The two 32-bit header words live at the front of the packet
    uint32_t* const header = reinterpret_cast<uint32_t*>(out);
    header[1] = bytes;

    uint32_t checksum = bytes;
    for (unsigned index = 8; index < bytes; ++index)
    {
        const uint8_t value = (uint8_t)prng.Next();
        out[index] = value;

        // Rotate left by 3 bits, then add the new payload byte
        checksum = ((checksum << 3) | (checksum >> (32 - 3))) + value;
    }

    header[0] = checksum;
}
/*
    Validate a packet produced by WriteRandomSelfCheckingPacket().

    Short packets (under 16 bytes, or every packet when TEST_DATA_ALL_SAME is
    defined) must be at least 2 bytes long and consist of one repeated value.
    Longer packets must carry their own length in bytes 4..7 and a matching
    checksum in bytes 0..3.

    Returns true if the packet is self-consistent.
*/
static bool CheckPacket(const void* packet, unsigned bytes)
{
    const uint8_t* const in = static_cast<const uint8_t*>(packet);

#ifdef TEST_DATA_ALL_SAME
    const bool repeatOneByte = (bytes != 0);
#else
    const bool repeatOneByte = (bytes < 16);
#endif

    if (repeatOneByte)
    {
        if (bytes < 2)
            return false;

        const uint8_t expected = in[0];
        for (unsigned index = 1; index < bytes; ++index)
        {
            if (in[index] != expected)
                return false;
        }
        return true;
    }

    const uint32_t* const header = reinterpret_cast<const uint32_t*>(in);

    // The stored length must agree with the length the caller expects
    if (header[1] != bytes)
        return false;

    // Recompute the checksum: rotate left by 3 bits, then add each payload byte
    uint32_t checksum = bytes;
    for (unsigned index = 8; index < bytes; ++index)
    {
        checksum = ((checksum << 3) | (checksum >> (32 - 3))) + in[index];
    }

    return header[0] == checksum;
}
//------------------------------------------------------------------------------
// FunctionTimer
/*
    Accumulates the time spent between matching BeginCall()/EndCall() pairs
    and the number of such pairs, then reports per-trial and per-call averages.
*/
class FunctionTimer
{
public:
    FunctionTimer(const std::string& name)
        : FunctionName(name)
    {
    }

    // Record the start of a timed region; regions must not be nested
    void BeginCall()
    {
        LEO_DEBUG_ASSERT(t0 == 0);
        t0 = GetTimeUsec();
    }

    // Close the region opened by BeginCall() and add it to the totals
    void EndCall()
    {
        LEO_DEBUG_ASSERT(t0 != 0);
        const uint64_t finished = GetTimeUsec();
        TotalUsec += finished - t0;
        ++Invokations;
        t0 = 0;
    }

    // Discard everything accumulated so far
    void Reset()
    {
        LEO_DEBUG_ASSERT(t0 == 0);
        TotalUsec = 0;
        Invokations = 0;
        t0 = 0;
    }

    // Write a one-line summary to stdout, averaged over `trials`
    void Print(unsigned trials)
    {
        const float callsPerTrial = Invokations / (float)trials;
        const double usecPerCall = TotalUsec / (double)Invokations;
        const float usecPerTrial = TotalUsec / (float)trials;

        std::cout << FunctionName << " called " << callsPerTrial
            << " times per trial (avg). " << usecPerCall
            << " usec avg for all invokations. " << usecPerTrial
            << " usec (avg) of " << trials << " trials" << std::endl;
    }

    uint64_t t0 = 0;          // Start of the region in progress, 0 when idle
    uint64_t Invokations = 0; // Completed BeginCall()/EndCall() pairs
    uint64_t TotalUsec = 0;   // Sum of all completed region durations
    std::string FunctionName;
};
//------------------------------------------------------------------------------
// Utility: Deck Shuffling function
/*
    Given a PRNG, generate a deck of cards in a random order.
    The deck will contain elements with values between 0 and count - 1.

    deck must have room for `count` entries.  Entries are 16 bits wide, so
    values of `count` above 65536 cannot be represented.  A count of 0 leaves
    the deck untouched.

    The deck is grown one card at a time: for each new index ii a position jj
    in [0, ii) is drawn, the card at jj moves to the new slot ii, and card ii
    takes its place at jj.  Several cards are placed per PRNG call by slicing
    the 32-bit random word into 8-bit or 16-bit pieces.

    NOTE(review): jj is always strictly less than ii, so card ii never stays
    at index ii (count == 2 always yields {1, 0}).  Left as-is because changing
    it would change every generated sequence.
*/
static void ShuffleDeck16(PCGRandom &prng, uint16_t * LEO_RESTRICT deck, uint32_t count)
{
    // An empty deck has nothing to place.  Without this guard deck[0] would be
    // written out of bounds and (count - ii) below would wrap to a huge value.
    if (count == 0)
        return;

    deck[0] = 0;

    // If we can unroll 4 times,
    if (count <= 256)
    {
        for (uint32_t ii = 1;;)
        {
            uint32_t jj, rv = prng.Next();

            // 8-bit unroll
            switch (count - ii)
            {
            default:
                // Four or more cards remain: place four and draw a new word
                jj = (uint8_t)rv % ii;
                deck[ii] = deck[jj];
                deck[jj] = ii;
                ++ii;
                jj = (uint8_t)(rv >> 8) % ii;
                deck[ii] = deck[jj];
                deck[jj] = ii;
                ++ii;
                jj = (uint8_t)(rv >> 16) % ii;
                deck[ii] = deck[jj];
                deck[jj] = ii;
                ++ii;
                jj = (uint8_t)(rv >> 24) % ii;
                deck[ii] = deck[jj];
                deck[jj] = ii;
                ++ii;
                break;

            case 3:
                jj = (uint8_t)rv % ii;
                deck[ii] = deck[jj];
                deck[jj] = ii;
                ++ii;
                // fall through
            case 2:
                jj = (uint8_t)(rv >> 8) % ii;
                deck[ii] = deck[jj];
                deck[jj] = ii;
                ++ii;
                // fall through
            case 1:
                jj = (uint8_t)(rv >> 16) % ii;
                deck[ii] = deck[jj];
                deck[jj] = ii;
                // fall through
            case 0:
                return;
            }
        }
    }
    else
    {
        // For each deck entry,
        for (uint32_t ii = 1;;)
        {
            uint32_t jj, rv = prng.Next();

            // 16-bit unroll
            switch (count - ii)
            {
            default:
                // Two or more cards remain: place two and draw a new word
                jj = (uint16_t)rv % ii;
                deck[ii] = deck[jj];
                deck[jj] = ii;
                ++ii;
                jj = (uint16_t)(rv >> 16) % ii;
                deck[ii] = deck[jj];
                deck[jj] = ii;
                ++ii;
                break;

            case 1:
                jj = (uint16_t)rv % ii;
                deck[ii] = deck[jj];
                deck[jj] = ii;
                // fall through
            case 0:
                return;
            }
        }
    }
}
//------------------------------------------------------------------------------
// SIMD-Safe Aligned Memory Allocations
// Alignment, in bytes, used by the aligned allocation helpers in this file
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;

// Round `offset` up to a multiple of kAlignmentBytes.
// The mask arithmetic only rounds correctly when kAlignmentBytes is a power of two.
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
{
    const unsigned mask = kAlignmentBytes - 1;
    return (offset + mask) & ~mask;
}
/*
    Allocate `size` zero-initialized bytes whose address is a multiple of
    kAlignmentBytes.  Returns nullptr if the underlying allocation fails.

    The block is over-allocated by kAlignmentBytes and the returned pointer is
    moved forward to the next aligned address.  The byte just before the
    returned pointer records the original misalignment so that SIMDSafeFree()
    can recover the address handed out by calloc().
*/
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
{
    uint8_t* const raw = (uint8_t*)calloc(1, kAlignmentBytes + size);
    if (raw == nullptr)
        return nullptr;

    const unsigned misalignment = (unsigned)((uintptr_t)raw % kAlignmentBytes);

    // Always advances by at least one byte, so aligned[-1] is inside the block
    uint8_t* const aligned = raw + (kAlignmentBytes - misalignment);
    aligned[-1] = (uint8_t)misalignment;

    return aligned;
}
/*
    Release a buffer obtained from SIMDSafeAllocate().  Passing nullptr is a
    no-op.

    The byte stored just before `ptr` gives the misalignment of the original
    allocation, which is used to step back to the address that must be passed
    to free().  If that byte is out of range the buffer is not freed.
*/
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
{
    if (ptr != nullptr)
    {
        uint8_t* const aligned = (uint8_t*)ptr;
        const unsigned misalignment = aligned[-1];

        if (misalignment < kAlignmentBytes)
        {
            free(aligned - (kAlignmentBytes - misalignment));
        }
        else
        {
            LEO_DEBUG_BREAK; // Should never happen
        }
    }
}
//------------------------------------------------------------------------------
// Tests
// Knobs for one benchmark run; the defaults can be overridden from the command line
struct TestParameters
{
    // Number of original data buffers (under 65536)
    unsigned original_count{ 200 };

    // Number of recovery buffers to generate (under 65536 - original_count)
    unsigned recovery_count{ 100 };

    // Size of every buffer in bytes (multiple of 64 bytes)
    unsigned buffer_bytes{ 64000 };

    // Number of original buffers to discard before decoding
    // (some fraction of original_count)
    unsigned loss_count{ 20 };

    // Seed for the test data generator
    unsigned seed{ 0 };

    // Selects LeopardFlags_Multithreaded instead of LeopardFlags_Defaults
    bool multithreaded{ true };
};
static void BasicTest(const TestParameters& params)
{
static const unsigned kTrials = 4;
std::vector<uint8_t*> original_data(params.original_count);
const unsigned encode_work_count = leo_encode_work_count(params.original_count, params.recovery_count);
const unsigned decode_work_count = leo_decode_work_count(params.original_count, params.recovery_count);
std::vector<uint8_t*> encode_work_data(encode_work_count);
std::vector<uint8_t*> decode_work_data(decode_work_count);
FunctionTimer t_mem_alloc("memory_allocation");
FunctionTimer t_leo_encode("leo_encode");
FunctionTimer t_leo_decode("leo_decode");
FunctionTimer t_mem_free("memory_free");
const uint64_t total_bytes = (uint64_t)params.buffer_bytes * params.original_count;
for (unsigned trial = 0; trial < kTrials; ++trial)
{
// Allocate memory:
t_mem_alloc.BeginCall();
for (unsigned i = 0, count = params.original_count; i < count; ++i)
original_data[i] = SIMDSafeAllocate(params.buffer_bytes);
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
encode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
decode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
t_mem_alloc.EndCall();
// Generate data:
PCGRandom prng;
prng.Seed(params.seed, trial);
for (unsigned i = 0; i < params.original_count; ++i)
WriteRandomSelfCheckingPacket(prng, original_data[i], params.buffer_bytes);
// Encode:
t_leo_encode.BeginCall();
LeopardResult encodeResult = leo_encode(
params.buffer_bytes,
params.original_count,
params.recovery_count,
encode_work_count,
(void**)&original_data[0],
(void**)&encode_work_data[0], // recovery data written here
params.multithreaded ? LeopardFlags_Multithreaded : LeopardFlags_Defaults
);
t_leo_encode.EndCall();
if (encodeResult != Leopard_Success)
{
cout << "Error: Leopard encode failed with result=" << encodeResult << endl;
LEO_DEBUG_BREAK;
return;
}
// Lose random original data:
std::vector<uint16_t> original_losses(params.original_count);
ShuffleDeck16(prng, &original_losses[0], params.original_count);
for (unsigned i = 0, count = params.loss_count; i < count; ++i)
{
const unsigned loss_index = original_losses[i];
delete[] original_data[loss_index];
original_data[loss_index] = nullptr;
}
// Lose random recovery data:
const unsigned recovery_loss_count = params.recovery_count - params.loss_count;
std::vector<uint16_t> recovery_losses(params.recovery_count);
ShuffleDeck16(prng, &recovery_losses[0], params.recovery_count);
for (unsigned i = 0, count = params.loss_count; i < count; ++i)
{
const unsigned loss_index = original_losses[i];
delete[] encode_work_data[loss_index];
encode_work_data[loss_index] = nullptr;
}
// Decode:
t_leo_decode.BeginCall();
LeopardResult decodeResult = leo_decode(
params.buffer_bytes,
params.original_count,
params.recovery_count,
decode_work_count,
(void**)&original_data[0],
(void**)&encode_work_data[0],
(void**)&decode_work_data[0],
params.multithreaded ? LeopardFlags_Multithreaded : LeopardFlags_Defaults);
t_leo_decode.EndCall();
if (decodeResult != Leopard_Success)
{
cout << "Error: Leopard decode failed with result=" << decodeResult << endl;
LEO_DEBUG_BREAK;
return;
}
// Free memory:
t_mem_free.BeginCall();
for (unsigned i = 0, count = params.original_count; i < count; ++i)
SIMDSafeFree(original_data[i]);
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
SIMDSafeFree(encode_work_data[i]);
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
SIMDSafeFree(decode_work_data[i]);
t_mem_free.EndCall();
}
t_mem_alloc.Print(kTrials);
t_leo_encode.Print(kTrials);
t_leo_decode.Print(kTrials);
t_mem_free.Print(kTrials);
float encode_input_MBPS = total_bytes * kTrials / (float)(t_leo_encode.TotalUsec);
float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count * kTrials / (float)(t_leo_encode.TotalUsec);
float decode_input_MBPS = total_bytes * kTrials / (float)(t_leo_decode.TotalUsec);
float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count * kTrials / (float)(t_leo_decode.TotalUsec);
cout << "Leopard Encoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << encode_input_MBPS << " MB/s, Output=" << encode_output_MBPS << " MB/s" << endl;
cout << "Leopard Decoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << decode_input_MBPS << " MB/s, Output=" << decode_output_MBPS << " MB/s" << endl << endl;
}
//------------------------------------------------------------------------------
// Entrypoint
/*
    Parse a non-negative decimal integer from a command-line argument.

    Returns true and stores the result in `value` only if the whole string is
    a number that fits in an unsigned int; otherwise returns false and leaves
    `value` untouched.  Unlike atoi(), garbage and negative input are
    reported instead of silently becoming 0 or a huge wrapped value.
*/
static bool ParseUnsignedArg(const char* text, unsigned& value)
{
    if (text == nullptr || *text == '\0')
        return false;

    char* end = nullptr;
    const long parsed = strtol(text, &end, 10);
    if (end == text || *end != '\0' || parsed < 0)
        return false;

    const unsigned narrowed = (unsigned)parsed;
    if ((unsigned long)narrowed != (unsigned long)parsed)
        return false; // Does not fit in an unsigned int

    value = narrowed;
    return true;
}

/*
    Benchmark entry point.

    Optional positional arguments, in order:
        original_count recovery_count buffer_bytes loss_count multithreaded
    Any argument that is omitted keeps its TestParameters default.

    Returns 0 on success and -1 if the library fails to initialize or an
    argument cannot be parsed.
*/
int main(int argc, char **argv)
{
    SetCurrentThreadPriority();

    FunctionTimer t_leo_init("leo_init");

    t_leo_init.BeginCall();
    if (0 != leo_init())
    {
        cout << "Failed to initialize" << endl;
        return -1;
    }
    t_leo_init.EndCall();
    t_leo_init.Print(1);

    TestParameters params;

    // The flag is parsed as a number like the other arguments: non-zero enables it
    unsigned multithreaded = params.multithreaded ? 1u : 0u;

    // argv[1 + i] overrides *targets[i]
    unsigned* const targets[] = {
        &params.original_count,
        &params.recovery_count,
        &params.buffer_bytes,
        &params.loss_count,
        &multithreaded
    };
    const int target_count = (int)(sizeof(targets) / sizeof(targets[0]));

    for (int i = 0; i < target_count && i + 1 < argc; ++i)
    {
        if (!ParseUnsignedArg(argv[i + 1], *targets[i]))
        {
            cout << "Invalid argument: " << argv[i + 1] << endl;
            cout << "Usage: " << argv[0] << " [original count] [recovery count] [buffer bytes] [loss count] [multithreaded 0/1]" << endl;
            return -1;
        }
    }

    params.multithreaded = (multithreaded != 0);

    cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;

    BasicTest(params);

    // Wait for a key press so the results stay visible in a console window
    getchar();

    return 0;
}

View File

@ -18,41 +18,38 @@
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\lhc_rs.cpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{32176592-2F30-4BD5-B645-EB11C8D3453E}</ProjectGuid>
<RootNamespace>GF65536</RootNamespace>
<ProjectName>LHC_RS</ProjectName>
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
<ProjectGuid>{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}</ProjectGuid>
<RootNamespace>Fecal</RootNamespace>
<ProjectName>LeopardBenchmark</ProjectName>
<WindowsTargetPlatformVersion>10.0.14393.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
@ -155,8 +152,8 @@
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>true</SDLCheck>
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
<FavorSizeOrSpeed>Size</FavorSizeOrSpeed>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<OmitFramePointers>false</OmitFramePointers>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
<BufferSecurityCheck>true</BufferSecurityCheck>
@ -174,6 +171,14 @@
</Command>
</PostBuildEvent>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="..\benchmark.cpp" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\proj\Leopard.vcxproj">
<Project>{32176592-2f30-4bd5-b645-eb11c8d3453e}</Project>
</ProjectReference>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>

View File

@ -15,7 +15,7 @@
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\lhc_rs.cpp">
<ClCompile Include="..\benchmark.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>