leopard/LeopardFF16.cpp

1221 lines
34 KiB
C++

/*
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of LHC-RS nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include <string.h>
#include <time.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
/*
TODO:
+ Write C API and unit tester
+ Limit input to multiples of 64 bytes
+ Replace GFSymbol with a file data pointer
+ New 16-bit Muladd inner loops
+ Class to contain the (large) muladd tables
+ Preliminary benchmarks for large data!
+ New 8-bit Muladd inner loops
+ Benchmarks for smaller data!
+ Refactor software
+ Pick a name for the software better than LEO_RS
+ I think it should be split up into several C++ modules
+ Write detailed comments for all the routines
+ Look into getting EncodeL working so we can support smaller data (Ask Lin)
+ Look into using k instead of k2 to speed up decoder (Ask Lin)
+ Avoid performing FFT/IFFT intermediate calculations we're not going to use
+ Benchmarks, fun!
+ Add multi-threading to split up long parallelizable calculations
+ Final benchmarks!
+ Finish up documentation
+ Release version 1
Muladd implementation notes:
Specialize for 1-3 rows at a time since often times we're multiplying by
the same (skew) value repeatedly, as the ISA-L library does here:
https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258
Except we should be doing it for 16-bit Galois Field.
To implement that use the ALTMAP trick from Jerasure:
http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140
Except we should also support AVX2 since that is a 40% perf boost, so put
the high and low bytes 32 bytes instead of 16 bytes apart.
Also I think we should go ahead and precompute the multiply tables since
it avoids a bunch of memory lookups for each muladd, and only costs 8 MB.
*/
//------------------------------------------------------------------------------
// Debug
// Some bugs only repro in release mode, so this can be helpful
//#define LEO_DEBUG_IN_RELEASE
#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE)
#define LEO_DEBUG
#ifdef _WIN32
#define LEO_DEBUG_BREAK __debugbreak()
#else
#define LEO_DEBUG_BREAK __builtin_trap()
#endif
#define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } }
#else
#define LEO_DEBUG_BREAK ;
#define LEO_DEBUG_ASSERT(cond) ;
#endif
//------------------------------------------------------------------------------
// Platform/Architecture
#if defined(ANDROID) || defined(IOS)
#define LEO_TARGET_MOBILE
#endif // ANDROID
#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900)
#define LEO_TRY_AVX2 /* 256-bit */
#include <immintrin.h>
#define LEO_ALIGN_BYTES 32
#else // __AVX2__
#define LEO_ALIGN_BYTES 16
#endif // __AVX2__
#if !defined(LEO_TARGET_MOBILE)
// Note: MSVC currently only supports SSSE3 but not AVX2
#include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8
#include <emmintrin.h> // SSE2
#endif // LEO_TARGET_MOBILE
#if defined(HAVE_ARM_NEON_H)
#include <arm_neon.h>
#endif // HAVE_ARM_NEON_H
#if defined(LEO_TARGET_MOBILE)
#define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */
# if defined(HAVE_ARM_NEON_H)
// Compiler-specific 128-bit SIMD register keyword
#define LEO_M128 uint8x16_t
#define LEO_TRY_NEON
#else
#define LEO_M128 uint64_t
# endif
#else // LEO_TARGET_MOBILE
// Compiler-specific 128-bit SIMD register keyword
#define LEO_M128 __m128i
#endif // LEO_TARGET_MOBILE
#ifdef LEO_TRY_AVX2
// Compiler-specific 256-bit SIMD register keyword
#define LEO_M256 __m256i
#endif
// Compiler-specific C++11 restrict keyword
#define LEO_RESTRICT __restrict
// Compiler-specific force inline keyword
#ifdef _MSC_VER
#define LEO_FORCE_INLINE inline __forceinline
#else
#define LEO_FORCE_INLINE inline __attribute__((always_inline))
#endif
// Compiler-specific alignment keyword
// Note: Alignment only matters for ARM NEON where it should be 16
#ifdef _MSC_VER
#define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES))
#else // _MSC_VER
#define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES)))
#endif // _MSC_VER
//------------------------------------------------------------------------------
// Runtime CPU Architecture Check
//
// Feature checks stolen shamelessly from
// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c
#if defined(HAVE_ANDROID_GETCPUFEATURES)
#include <cpu-features.h>
#endif
#if defined(LEO_TRY_NEON)
# if defined(IOS) && defined(__ARM_NEON__)
// Requires iPhone 5S or newer
static const bool CpuHasNeon = true;
static const bool CpuHasNeon64 = true;
# else
// Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
static bool CpuHasNeon = false; // V6 / V7
static bool CpuHasNeon64 = false; // 64-bit
# endif
#endif
#if !defined(LEO_TARGET_MOBILE)
#ifdef _MSC_VER
#include <intrin.h> // __cpuid
#pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
#endif
#ifdef LEO_TRY_AVX2
static bool CpuHasAVX2 = false;
#endif
static bool CpuHasSSSE3 = false;
#define CPUID_EBX_AVX2 0x00000020
#define CPUID_ECX_SSSE3 0x00000200
static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type)
{
#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
__cpuid((int *) cpu_info, cpu_info_type);
#else //if defined(HAVE_CPUID)
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
# ifdef __i386__
__asm__ __volatile__ ("pushfl; pushfl; "
"popl %0; "
"movl %0, %1; xorl %2, %0; "
"pushl %0; "
"popfl; pushfl; popl %0; popfl" :
"=&r" (cpu_info[0]), "=&r" (cpu_info[1]) :
"i" (0x200000));
if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) {
return; /* LCOV_EXCL_LINE */
}
# endif
# ifdef __i386__
__asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" :
"=a" (cpu_info[0]), "=&r" (cpu_info[1]),
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
"0" (cpu_info_type), "2" (0U));
# elif defined(__x86_64__)
__asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" :
"=a" (cpu_info[0]), "=&r" (cpu_info[1]),
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
"0" (cpu_info_type), "2" (0U));
# else
__asm__ __volatile__ ("cpuid" :
"=a" (cpu_info[0]), "=b" (cpu_info[1]),
"=c" (cpu_info[2]), "=d" (cpu_info[3]) :
"0" (cpu_info_type), "2" (0U));
# endif
#endif
}
#endif // defined(LEO_TARGET_MOBILE)
static void leo_architecture_init()
{
#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES)
AndroidCpuFamily family = android_getCpuFamily();
if (family == ANDROID_CPU_FAMILY_ARM)
{
if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON)
CpuHasNeon = true;
}
else if (family == ANDROID_CPU_FAMILY_ARM64)
{
CpuHasNeon = true;
if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD)
CpuHasNeon64 = true;
}
#endif
#if !defined(LEO_TARGET_MOBILE)
unsigned int cpu_info[4];
_cpuid(cpu_info, 1);
CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0);
#if defined(LEO_TRY_AVX2)
_cpuid(cpu_info, 7);
CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0);
#endif // LEO_TRY_AVX2
#endif // LEO_TARGET_MOBILE
}
//------------------------------------------------------------------------------
// SIMD-Safe Aligned Memory Allocations
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
{
return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
}
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
{
uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
if (!data)
return nullptr;
unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
data += kAlignmentBytes - offset;
data[-1] = (uint8_t)offset;
return data;
}
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
{
if (!ptr)
return;
uint8_t* data = (uint8_t*)ptr;
unsigned offset = data[-1];
if (offset >= kAlignmentBytes)
{
LEO_DEBUG_BREAK; // Should never happen
return;
}
data -= kAlignmentBytes - offset;
free(data);
}
//------------------------------------------------------------------------------
// Field
//#define LEO_SHORT_FIELD
#ifdef LEO_SHORT_FIELD
typedef uint8_t GFSymbol;
static const unsigned kGFBits = 8;
static const unsigned kGFPolynomial = 0x11D;
GFSymbol kGFBasis[kGFBits] = {
1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis
};
#else
typedef uint16_t GFSymbol;
static const unsigned kGFBits = 16;
static const unsigned kGFPolynomial = 0x1002D;
GFSymbol kGFBasis[kGFBits] = {
0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis
0xC582, 0xED2E, 0x914C, 0x4012,
0x6C98, 0x10D8, 0x6A72, 0xB900,
0xFDB8, 0xFB34, 0xFF38, 0x991E
};
#endif
/*
Cantor Basis introduced by:
D. G. Cantor, "On arithmetical algorithms over finite fields",
Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989.
*/
static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size
static const unsigned kFieldModulus = kFieldSize - 1;
static GFSymbol GFLog[kFieldSize];
static GFSymbol GFExp[kFieldSize];
// Initialize GFLog[], GFExp[]
static void InitField()
{
unsigned state = 1;
for (unsigned i = 0; i < kFieldModulus; ++i)
{
GFExp[state] = static_cast<GFSymbol>(i);
state <<= 1;
if (state >= kFieldSize)
state ^= kGFPolynomial;
}
GFExp[0] = kFieldModulus;
// Conversion to chosen basis:
GFLog[0] = 0;
for (unsigned i = 0; i < kGFBits; ++i)
{
const GFSymbol basis = kGFBasis[i];
const unsigned width = (unsigned)(1UL << i);
for (unsigned j = 0; j < width; ++j)
GFLog[j + width] = GFLog[j] ^ basis;
}
for (unsigned i = 0; i < kFieldSize; ++i)
GFLog[i] = GFExp[GFLog[i]];
for (unsigned i = 0; i < kFieldSize; ++i)
GFExp[GFLog[i]] = i;
GFExp[kFieldModulus] = GFExp[0];
}
//------------------------------------------------------------------------------
// Mod Q Field Operations
//
// Q is the maximum symbol value, e.g. 255 or 65535.
// z = x + y (mod Q)
static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b)
{
const unsigned sum = (unsigned)a + b;
// Partial reduction step, allowing for Q to be returned
return static_cast<GFSymbol>(sum + (sum >> kGFBits));
}
// z = x - y (mod Q)
static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b)
{
const unsigned dif = (unsigned)a - b;
// Partial reduction step, allowing for Q to be returned
return static_cast<GFSymbol>(dif + (dif >> kGFBits));
}
// vx[] += vy[] * z
static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount)
{
for (unsigned i = 0; i < symbolCount; ++i)
{
const GFSymbol a = vy[i];
if (a == 0)
continue;
GFSymbol sum1 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f], z));
GFSymbol value1 = GFExp[sum1];
if ((a & 0x0f) == 0)
{
value1 = 0;
}
GFSymbol sum2 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf0], z));
GFSymbol value2 = GFExp[sum2];
if ((a & 0xf0) == 0)
{
value2 = 0;
}
GFSymbol sum3 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f00], z));
GFSymbol value3 = GFExp[sum3];
if ((a & 0x0f00) == 0)
{
value3 = 0;
}
GFSymbol sum4 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf000], z));
GFSymbol value4 = GFExp[sum4];
if ((a & 0xf000) == 0)
{
value4 = 0;
}
vx[i] ^= value1;
vx[i] ^= value2;
vx[i] ^= value3;
vx[i] ^= value4;
}
}
// return a*GFExp[b] over GF(2^r)
static GFSymbol mulE(GFSymbol a, GFSymbol b)
{
if (a == 0)
return 0;
const GFSymbol sum = static_cast<GFSymbol>(AddModQ(GFLog[a], b));
return GFExp[sum];
}
//------------------------------------------------------------------------------
// Fast Walsh-Hadamard Transform (FWHT) Mod Q
//
// Q is the maximum symbol value, e.g. 255 or 65535.
// Define this to enable the optimized version of FWHT()
#define LEO_FWHT_OPTIMIZED
typedef GFSymbol fwht_t;
// {a, b} = {a + b, a - b} (Mod Q)
static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b)
{
const fwht_t sum = AddModQ(a, b);
const fwht_t dif = SubModQ(a, b);
a = sum;
b = dif;
}
/*
FWHT is a minor slice of the runtime and does not grow with data size,
but I did attempt a few additional optimizations that failed:
I've attempted to vectorize (with partial reductions) FWHT_4(data, s),
which is 70% of the algorithm, but it was slower. Left in _attic_.
I've attempted to avoid reductions in all or parts of the FWHT.
The final modular reduction ends up being slower than the savings.
Specifically I tried doing it for the whole FWHT and also I tried
doing it just for the FWHT_2 loop in the main routine, but both
approaches are slower than partial reductions.
Replacing word reads with wider reads does speed up the operation, but
at too high a complexity cost relative to minor perf improvement.
*/
#ifndef LEO_FWHT_OPTIMIZED
// Reference implementation
static void FWHT(fwht_t* data, const unsigned bits)
{
const unsigned size = (unsigned)(1UL << bits);
for (unsigned width = 1; width < size; width <<= 1)
for (unsigned i = 0; i < size; i += (width << 1))
for (unsigned j = i; j < (width + i); ++j)
FWHT_2(data[j], data[j + width]);
}
#else
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data)
{
fwht_t t0 = data[0];
fwht_t t1 = data[1];
fwht_t t2 = data[2];
fwht_t t3 = data[3];
FWHT_2(t0, t1);
FWHT_2(t2, t3);
FWHT_2(t0, t2);
FWHT_2(t1, t3);
data[0] = t0;
data[1] = t1;
data[2] = t2;
data[3] = t3;
}
static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s)
{
unsigned x = 0;
fwht_t t0 = data[x]; x += s;
fwht_t t1 = data[x]; x += s;
fwht_t t2 = data[x]; x += s;
fwht_t t3 = data[x];
FWHT_2(t0, t1);
FWHT_2(t2, t3);
FWHT_2(t0, t2);
FWHT_2(t1, t3);
unsigned y = 0;
data[y] = t0; y += s;
data[y] = t1; y += s;
data[y] = t2; y += s;
data[y] = t3;
}
static inline void FWHT_8(fwht_t* data)
{
fwht_t t0 = data[0];
fwht_t t1 = data[1];
fwht_t t2 = data[2];
fwht_t t3 = data[3];
fwht_t t4 = data[4];
fwht_t t5 = data[5];
fwht_t t6 = data[6];
fwht_t t7 = data[7];
FWHT_2(t0, t1);
FWHT_2(t2, t3);
FWHT_2(t4, t5);
FWHT_2(t6, t7);
FWHT_2(t0, t2);
FWHT_2(t1, t3);
FWHT_2(t4, t6);
FWHT_2(t5, t7);
FWHT_2(t0, t4);
FWHT_2(t1, t5);
FWHT_2(t2, t6);
FWHT_2(t3, t7);
data[0] = t0;
data[1] = t1;
data[2] = t2;
data[3] = t3;
data[4] = t4;
data[5] = t5;
data[6] = t6;
data[7] = t7;
}
static inline void FWHT_16(fwht_t* data)
{
fwht_t t0 = data[0];
fwht_t t1 = data[1];
fwht_t t2 = data[2];
fwht_t t3 = data[3];
fwht_t t4 = data[4];
fwht_t t5 = data[5];
fwht_t t6 = data[6];
fwht_t t7 = data[7];
fwht_t t8 = data[8];
fwht_t t9 = data[9];
fwht_t t10 = data[10];
fwht_t t11 = data[11];
fwht_t t12 = data[12];
fwht_t t13 = data[13];
fwht_t t14 = data[14];
fwht_t t15 = data[15];
FWHT_2(t0, t1);
FWHT_2(t2, t3);
FWHT_2(t4, t5);
FWHT_2(t6, t7);
FWHT_2(t8, t9);
FWHT_2(t10, t11);
FWHT_2(t12, t13);
FWHT_2(t14, t15);
FWHT_2(t0, t2);
FWHT_2(t1, t3);
FWHT_2(t4, t6);
FWHT_2(t5, t7);
FWHT_2(t8, t10);
FWHT_2(t9, t11);
FWHT_2(t12, t14);
FWHT_2(t13, t15);
FWHT_2(t0, t4);
FWHT_2(t1, t5);
FWHT_2(t2, t6);
FWHT_2(t3, t7);
FWHT_2(t8, t12);
FWHT_2(t9, t13);
FWHT_2(t10, t14);
FWHT_2(t11, t15);
FWHT_2(t0, t8);
FWHT_2(t1, t9);
FWHT_2(t2, t10);
FWHT_2(t3, t11);
FWHT_2(t4, t12);
FWHT_2(t5, t13);
FWHT_2(t6, t14);
FWHT_2(t7, t15);
data[0] = t0;
data[1] = t1;
data[2] = t2;
data[3] = t3;
data[4] = t4;
data[5] = t5;
data[6] = t6;
data[7] = t7;
data[8] = t8;
data[9] = t9;
data[10] = t10;
data[11] = t11;
data[12] = t12;
data[13] = t13;
data[14] = t14;
data[15] = t15;
}
static void FWHT_SmallData(fwht_t* data, unsigned ldn)
{
const unsigned n = (1UL << ldn);
if (n <= 2)
{
if (n == 2)
FWHT_2(data[0], data[1]);
return;
}
for (unsigned ldm = ldn; ldm > 3; ldm -= 2)
{
unsigned m = (1UL << ldm);
unsigned m4 = (m >> 2);
for (unsigned r = 0; r < n; r += m)
for (unsigned j = 0; j < m4; j++)
FWHT_4(data + j + r, m4);
}
if (ldn & 1)
{
for (unsigned i0 = 0; i0 < n; i0 += 8)
FWHT_8(data + i0);
}
else
{
for (unsigned i0 = 0; i0 < n; i0 += 4)
FWHT_4(data + i0);
}
}
// Decimation in time (DIT) version
static void FWHT(fwht_t* data, const unsigned ldn)
{
if (ldn <= 13)
{
FWHT_SmallData(data, ldn);
return;
}
FWHT_2(data[2], data[3]);
FWHT_4(data + 4);
FWHT_8(data + 8);
FWHT_16(data + 16);
for (unsigned ldm = 5; ldm < ldn; ++ldm)
FWHT(data + (unsigned)(1UL << ldm), ldm);
for (unsigned ldm = 0; ldm < ldn; ++ldm)
{
const unsigned mh = (1UL << ldm);
for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2)
FWHT_2(data[t1], data[t2]);
}
}
#endif
//------------------------------------------------------------------------------
// Memory Buffer XOR
static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes)
{
LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
#if defined(LEO_TARGET_MOBILE)
# if defined(LEO_TRY_NEON)
// Handle multiples of 64 bytes
if (CpuHasNeon)
{
while (bytes >= 64)
{
LEO_M128 x0 = vld1q_u8(x16);
LEO_M128 x1 = vld1q_u8(x16 + 1);
LEO_M128 x2 = vld1q_u8(x16 + 2);
LEO_M128 x3 = vld1q_u8(x16 + 3);
LEO_M128 y0 = vld1q_u8(y16);
LEO_M128 y1 = vld1q_u8(y16 + 1);
LEO_M128 y2 = vld1q_u8(y16 + 2);
LEO_M128 y3 = vld1q_u8(y16 + 3);
vst1q_u8(x16, veorq_u8(x0, y0));
vst1q_u8(x16 + 1, veorq_u8(x1, y1));
vst1q_u8(x16 + 2, veorq_u8(x2, y2));
vst1q_u8(x16 + 3, veorq_u8(x3, y3));
bytes -= 64, x16 += 4, y16 += 4;
}
// Handle multiples of 16 bytes
while (bytes >= 16)
{
LEO_M128 x0 = vld1q_u8(x16);
LEO_M128 y0 = vld1q_u8(y16);
vst1q_u8(x16, veorq_u8(x0, y0));
bytes -= 16, ++x16, ++y16;
}
}
else
# endif // LEO_TRY_NEON
{
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
const unsigned count = (unsigned)bytes / 8;
for (unsigned ii = 0; ii < count; ++ii)
x8[ii] ^= y8[ii];
x16 = reinterpret_cast<LEO_M128 *>(x8 + count);
y16 = reinterpret_cast<const LEO_M128 *>(y8 + count);
}
#else // LEO_TARGET_MOBILE
# if defined(LEO_TRY_AVX2)
if (CpuHasAVX2)
{
LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(x16);
const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(y16);
while (bytes >= 128)
{
LEO_M256 x0 = _mm256_loadu_si256(x32);
LEO_M256 y0 = _mm256_loadu_si256(y32);
x0 = _mm256_xor_si256(x0, y0);
LEO_M256 x1 = _mm256_loadu_si256(x32 + 1);
LEO_M256 y1 = _mm256_loadu_si256(y32 + 1);
x1 = _mm256_xor_si256(x1, y1);
LEO_M256 x2 = _mm256_loadu_si256(x32 + 2);
LEO_M256 y2 = _mm256_loadu_si256(y32 + 2);
x2 = _mm256_xor_si256(x2, y2);
LEO_M256 x3 = _mm256_loadu_si256(x32 + 3);
LEO_M256 y3 = _mm256_loadu_si256(y32 + 3);
x3 = _mm256_xor_si256(x3, y3);
_mm256_storeu_si256(x32, x0);
_mm256_storeu_si256(x32 + 1, x1);
_mm256_storeu_si256(x32 + 2, x2);
_mm256_storeu_si256(x32 + 3, x3);
bytes -= 128, x32 += 4, y32 += 4;
}
// Handle multiples of 32 bytes
while (bytes >= 32)
{
// x[i] = x[i] xor y[i]
_mm256_storeu_si256(x32,
_mm256_xor_si256(
_mm256_loadu_si256(x32),
_mm256_loadu_si256(y32)));
bytes -= 32, ++x32, ++y32;
}
x16 = reinterpret_cast<LEO_M128 *>(x32);
y16 = reinterpret_cast<const LEO_M128 *>(y32);
}
else
# endif // LEO_TRY_AVX2
{
while (bytes >= 64)
{
LEO_M128 x0 = _mm_loadu_si128(x16);
LEO_M128 y0 = _mm_loadu_si128(y16);
x0 = _mm_xor_si128(x0, y0);
LEO_M128 x1 = _mm_loadu_si128(x16 + 1);
LEO_M128 y1 = _mm_loadu_si128(y16 + 1);
x1 = _mm_xor_si128(x1, y1);
LEO_M128 x2 = _mm_loadu_si128(x16 + 2);
LEO_M128 y2 = _mm_loadu_si128(y16 + 2);
x2 = _mm_xor_si128(x2, y2);
LEO_M128 x3 = _mm_loadu_si128(x16 + 3);
LEO_M128 y3 = _mm_loadu_si128(y16 + 3);
x3 = _mm_xor_si128(x3, y3);
_mm_storeu_si128(x16, x0);
_mm_storeu_si128(x16 + 1, x1);
_mm_storeu_si128(x16 + 2, x2);
_mm_storeu_si128(x16 + 3, x3);
bytes -= 64, x16 += 4, y16 += 4;
}
}
#endif // LEO_TARGET_MOBILE
// Handle multiples of 16 bytes
while (bytes >= 16)
{
// x[i] = x[i] xor y[i]
_mm_storeu_si128(x16,
_mm_xor_si128(
_mm_loadu_si128(x16),
_mm_loadu_si128(y16)));
bytes -= 16, ++x16, ++y16;
}
uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
// Handle a block of 8 bytes
const unsigned eight = bytes & 8;
if (eight)
{
uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
*x8 ^= *y8;
}
// Handle a block of 4 bytes
const unsigned four = bytes & 4;
if (four)
{
uint32_t * LEO_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
const uint32_t * LEO_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
*x4 ^= *y4;
}
// Handle final bytes
const unsigned offset = eight + four;
switch (bytes & 3)
{
case 3: x1[offset + 2] ^= y1[offset + 2];
case 2: x1[offset + 1] ^= y1[offset + 1];
case 1: x1[offset] ^= y1[offset];
default:
break;
}
}
//------------------------------------------------------------------------------
// Formal Derivative
// Formal derivative of polynomial in the new basis
static void formal_derivative(GFSymbol* cos, const unsigned size)
{
for (unsigned i = 1; i < size; ++i)
{
const unsigned leng = ((i ^ (i - 1)) + 1) >> 1;
// If a large number of values are being XORed:
if (leng >= 8)
xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol));
else
for (unsigned j = i - leng; j < i; j++)
cos[j] ^= cos[j + leng];
}
for (unsigned i = size; i < kFieldSize; i <<= 1)
xor_mem(cos, cos + i, size * sizeof(GFSymbol));
}
//------------------------------------------------------------------------------
// Fast Fourier Transform
static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT
// IFFT in the proposed basis
static void IFLT(GFSymbol* data, const unsigned size, const unsigned index)
{
for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1)
{
for (unsigned j = depart_no; j < size; j += (depart_no << 1))
{
// If a large number of values are being XORed:
if (depart_no >= 8)
xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol));
else
for (unsigned i = j - depart_no; i < j; ++i)
data[i + depart_no] ^= data[i];
const GFSymbol skew = skewVec[j + index - 1];
if (skew != kFieldModulus)
muladd_mem(data + j - depart_no, data + j, skew, depart_no);
}
}
}
// FFT in the proposed basis
static void FLT(GFSymbol* data, const unsigned size, const unsigned index)
{
for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1)
{
for (unsigned j = depart_no; j < size; j += (depart_no << 1))
{
const GFSymbol skew = skewVec[j + index - 1];
if (skew != kFieldModulus)
muladd_mem(data + j - depart_no, data + j, skew, depart_no);
// If a large number of values are being XORed:
if (depart_no >= 8)
xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol));
else
for (unsigned i = j - depart_no; i < j; ++i)
data[i + depart_no] ^= data[i];
}
}
}
//------------------------------------------------------------------------------
// FFT Initialization
static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative
static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial
// Initialize skewVec[], B[], log_walsh[]
static void InitFieldOperations()
{
GFSymbol temp[kGFBits - 1];
for (unsigned i = 1; i < kGFBits; ++i)
temp[i - 1] = (GFSymbol)((unsigned)1 << i);
for (unsigned m = 0; m < (kGFBits - 1); ++m)
{
const unsigned step = (unsigned)1 << (m + 1);
skewVec[((unsigned)1 << m) - 1] = 0;
for (unsigned i = m; i < (kGFBits - 1); ++i)
{
const unsigned s = ((unsigned)1 << (i + 1));
for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step)
skewVec[j + s] = skewVec[j] ^ temp[i];
}
temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])];
for (unsigned i = m + 1; i < (kGFBits - 1); ++i)
temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus);
}
for (unsigned i = 0; i < kFieldSize; ++i)
skewVec[i] = GFLog[skewVec[i]];
temp[0] = kFieldModulus - temp[0];
for (unsigned i = 1; i < (kGFBits - 1); ++i)
temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus;
B[0] = 0;
for (unsigned i = 0; i < (kGFBits - 1); ++i)
{
const unsigned depart = ((unsigned)1 << i);
for (unsigned j = 0; j < depart; ++j)
B[j + depart] = (B[j] + temp[i]) % kFieldModulus;
}
for (unsigned i = 0; i < kFieldSize; ++i)
log_walsh[i] = GFLog[i];
log_walsh[0] = 0;
FWHT(log_walsh, kGFBits);
}
//------------------------------------------------------------------------------
// Encoder
// Encoding alg for k/n<0.5: message is a power of two
static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword)
{
memcpy(codeword, data, sizeof(GFSymbol) * k);
IFLT(codeword, k, 0);
for (unsigned i = k; i < kFieldSize; i += k)
{
memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k);
FLT(&codeword[i], k, i);
}
memcpy(codeword, data, sizeof(GFSymbol) * k);
}
// Encoding alg for k/n>0.5: parity is a power of two.
// data: message array. parity: parity array. mem: buffer(size>= n-k)
static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem)
{
const unsigned t = kFieldSize - k;
memset(parity, 0, sizeof(GFSymbol) * t);
for (unsigned i = t; i < kFieldSize; i += t)
{
memcpy(mem, &data[i - t], sizeof(GFSymbol) * t);
IFLT(mem, t, i);
xor_mem(parity, mem, t * sizeof(GFSymbol));
}
FLT(parity, t, 0);
}
//------------------------------------------------------------------------------
// Decoder
static void decode(GFSymbol* codeword, unsigned k, const bool* erasure)
{
fwht_t log_walsh2[kFieldSize];
// Compute the evaluations of the error locator polynomial
for (unsigned i = 0; i < kFieldSize; ++i)
log_walsh2[i] = erasure[i] ? 1 : 0;
FWHT(log_walsh2, kGFBits);
for (unsigned i = 0; i < kFieldSize; ++i)
log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus;
FWHT(log_walsh2, kGFBits);
// k2 can be replaced with k
const unsigned k2 = kFieldSize;
//const unsigned k2 = k; // cannot actually be replaced with k. what else need to change?
for (unsigned i = 0; i < kFieldSize; ++i)
{
if (erasure[i])
{
codeword[i] = 0;
}
else
{
codeword[i] = mulE(codeword[i], log_walsh2[i]);
}
}
IFLT(codeword, kFieldSize, 0);
// formal derivative
for (unsigned i = 0; i < kFieldSize; i += 2)
{
codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]);
codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]);
}
formal_derivative(codeword, k2);
for (unsigned i = 0; i < k2; i += 2)
{
codeword[i] = mulE(codeword[i], B[i >> 1]);
codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]);
}
FLT(codeword, k2, 0);
for (unsigned i = 0; i < k2; ++i)
{
if (erasure[i])
{
codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]);
}
}
}
//------------------------------------------------------------------------------
// Test Application
void test(unsigned k, unsigned seed)
{
srand(seed);
//-----------Generating message----------
// Message array
GFSymbol data[kFieldSize] = {0};
// Filled with random numbers
for (unsigned i = kFieldSize - k; i < kFieldSize; ++i)
data[i] = (GFSymbol)rand();
//---------encoding----------
GFSymbol codeword[kFieldSize];
encodeH(&data[kFieldSize - k], k, data, codeword);
//encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change?
memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize);
//--------erasure simulation---------
// Array indicating erasures
bool erasure[kFieldSize] = {
false
};
for (unsigned i = k; i < kFieldSize; ++i)
erasure[i] = true;
// permuting the erasure array
for (unsigned i = kFieldSize - 1; i > 0; --i)
{
unsigned pos = rand() % (i + 1);
if (i != pos)
{
bool tmp = erasure[i];
erasure[i] = erasure[pos];
erasure[pos] = tmp;
}
}
// erasure codeword symbols
for (unsigned i = 0; i < kFieldSize; ++i)
if (erasure[i])
codeword[i] = 0;
//---------main processing----------
decode(codeword, k, erasure);
// Check the correctness of the result
for (unsigned i = 0; i < kFieldSize; ++i)
{
if (erasure[i] == 1)
{
if (data[i] != codeword[i])
{
printf("Decoding Error with seed = %d!\n", seed);
LEO_DEBUG_BREAK;
return;
}
}
}
//printf("Decoding is successful!\n");
}
//------------------------------------------------------------------------------
// Entrypoint
int main(int argc, char **argv)
{
// Initialize architecture-specific code
leo_architecture_init();
// Fill GFLog table and GFExp table
InitField();
// Compute factors used in erasure decoder
InitFieldOperations();
unsigned seed = (unsigned)time(NULL);
for (;;)
{
// test(int k), k: message size
/*
EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc,
s.t. the number of recovery pieces is a power of two
*/
test(kFieldSize / 2, seed);
++seed;
}
return 0;
}