diff --git a/LeopardCommon.cpp b/LeopardCommon.cpp
new file mode 100644
index 0000000..82bdbcf
--- /dev/null
+++ b/LeopardCommon.cpp
@@ -0,0 +1,957 @@
+/*
+    Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+    * Neither the name of Leopard-RS nor the names of its contributors may be
+      used to endorse or promote products derived from this software without
+      specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "LeopardCommon.h"
+
+namespace leopard {
+
+
+//------------------------------------------------------------------------------
+// Runtime CPU Architecture Check
+//
+// Feature checks stolen shamelessly from
+// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c
+
+#if defined(HAVE_ANDROID_GETCPUFEATURES)
+    #include <cpu-features.h> // android_getCpuFamily, android_getCpuFeatures
+#endif
+
+#if defined(LEO_TRY_NEON)
+# if defined(IOS) && defined(__ARM_NEON__)
+// Requires iPhone 5S or newer
+# else
+// Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
+bool CpuHasNeon = false; // V6 / V7
+bool CpuHasNeon64 = false; // 64-bit
+# endif
+#endif
+
+
+#if !defined(LEO_TARGET_MOBILE)
+
+#ifdef _MSC_VER
+    #include <intrin.h> // __cpuid
+    #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
+#endif
+
+#ifdef LEO_TRY_AVX2
+bool CpuHasAVX2 = false;
+#endif
+bool CpuHasSSSE3 = false;
+
+#define CPUID_EBX_AVX2    0x00000020
+#define CPUID_ECX_SSSE3   0x00000200
+
+static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type)
+{
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
+    __cpuid((int *) cpu_info, cpu_info_type);
+#else //if defined(HAVE_CPUID)
+    cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
+# ifdef __i386__
+    __asm__ __volatile__ ("pushfl; pushfl; "
+                          "popl %0; "
+                          "movl %0, %1; xorl %2, %0; "
+                          "pushl %0; "
+                          "popfl; pushfl; popl %0; popfl" :
+                          "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) :
+                          "i" (0x200000));
+    if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) {
+        return; /* LCOV_EXCL_LINE */
+    }
+# endif
+# ifdef __i386__
+    __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" :
+                          "=a" (cpu_info[0]), "=&r" (cpu_info[1]),
+                          "=c" (cpu_info[2]), "=d" (cpu_info[3]) :
+                          "0" (cpu_info_type), "2" (0U));
+# elif defined(__x86_64__)
+    __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" :
+                          "=a" (cpu_info[0]), "=&r" (cpu_info[1]),
+                          "=c" (cpu_info[2]), "=d" (cpu_info[3]) :
+                          "0" (cpu_info_type), "2" (0U));
+# else
+    __asm__ __volatile__ ("cpuid" :
+                          "=a" (cpu_info[0]), "=b" (cpu_info[1]),
+                          "=c" (cpu_info[2]), "=d" (cpu_info[3]) :
+                          "0" (cpu_info_type), "2" (0U));
+# endif
+#endif
+}
+
+#endif // defined(LEO_TARGET_MOBILE)
+
+
+void InitializeCPUArch()
+{
+#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES)
+    AndroidCpuFamily family = android_getCpuFamily();
+    if (family == ANDROID_CPU_FAMILY_ARM)
+    {
+        if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON)
+            CpuHasNeon = true;
+    }
+    else if (family == ANDROID_CPU_FAMILY_ARM64)
+    {
+        CpuHasNeon = true;
+        if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD)
+            CpuHasNeon64 = true;
+    }
+#endif
+
+#if !defined(LEO_TARGET_MOBILE)
+    unsigned int cpu_info[4];
+
+    _cpuid(cpu_info, 1);
+    CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0);
+
+#if defined(LEO_TRY_AVX2)
+    _cpuid(cpu_info, 7);
+    CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0);
+#endif // LEO_TRY_AVX2
+
+#endif // LEO_TARGET_MOBILE
+}
+
+
+// vx[] += vy[] * z
+static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount)
+{
+    for (unsigned i = 0; i < symbolCount; ++i)
+    {
+        const GFSymbol a = vy[i];
+        if (a == 0)
+            continue;
+
+        GFSymbol sum1 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f], z));
+        GFSymbol value1 = GFExp[sum1];
+        if ((a & 0x0f) == 0)
+        {
+            value1 = 0;
+        }
+        GFSymbol sum2 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf0], z));
+        GFSymbol value2 = GFExp[sum2];
+        if ((a & 0xf0) == 0)
+        {
+            value2 = 0;
+        }
+        GFSymbol sum3 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f00], z));
+        GFSymbol value3 = GFExp[sum3];
+        if ((a & 0x0f00) == 0)
+        {
+            value3 = 0;
+        }
+        GFSymbol sum4 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf000], z));
+        GFSymbol value4 = GFExp[sum4];
+        if ((a & 0xf000) == 0)
+        {
+            value4 = 0;
+        }
+
+        vx[i] ^= value1;
+        vx[i] ^= value2;
+        vx[i] ^= value3;
+        vx[i] ^= value4;
+    }
+}
+
+// return a*GFExp[b] over GF(2^r)
+static GFSymbol mulE(GFSymbol a, GFSymbol b)
+{
+    if (a == 0)
+        return 0;
+
+    const GFSymbol sum = static_cast<GFSymbol>(AddModQ(GFLog[a], b));
+    return GFExp[sum];
+}
+
+
+//------------------------------------------------------------------------------
+// Fast Walsh-Hadamard Transform (FWHT) Mod Q
+//
+// Q is the maximum symbol value, e.g. 255 or 65535.
+
+// Define this to enable the optimized version of FWHT()
+#define LEO_FWHT_OPTIMIZED
+
+typedef GFSymbol fwht_t;
+
+// {a, b} = {a + b, a - b} (Mod Q)
+static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b)
+{
+    const fwht_t sum = AddModQ(a, b);
+    const fwht_t dif = SubModQ(a, b);
+    a = sum;
+    b = dif;
+}
+
+/*
+    FWHT is a minor slice of the runtime and does not grow with data size,
+    but I did attempt a few additional optimizations that failed:
+
+    I've attempted to vectorize (with partial reductions) FWHT_4(data, s),
+    which is 70% of the algorithm, but it was slower. Left in _attic_.
+
+    I've attempted to avoid reductions in all or parts of the FWHT.
+    The final modular reduction ends up being slower than the savings.
+    Specifically I tried doing it for the whole FWHT and also I tried
+    doing it just for the FWHT_2 loop in the main routine, but both
+    approaches are slower than partial reductions.
+
+    Replacing word reads with wider reads does speed up the operation, but
+    at too high a complexity cost relative to minor perf improvement.
+*/ + +#ifndef LEO_FWHT_OPTIMIZED + +// Reference implementation +static void FWHT(fwht_t* data, const unsigned bits) +{ + const unsigned size = (unsigned)(1UL << bits); + for (unsigned width = 1; width < size; width <<= 1) + for (unsigned i = 0; i < size; i += (width << 1)) + for (unsigned j = i; j < (width + i); ++j) + FWHT_2(data[j], data[j + width]); +} + +#else + +static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; +} + +static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) +{ + unsigned x = 0; + fwht_t t0 = data[x]; x += s; + fwht_t t1 = data[x]; x += s; + fwht_t t2 = data[x]; x += s; + fwht_t t3 = data[x]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + unsigned y = 0; + data[y] = t0; y += s; + data[y] = t1; y += s; + data[y] = t2; y += s; + data[y] = t3; +} + +static inline void FWHT_8(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + fwht_t t4 = data[4]; + fwht_t t5 = data[5]; + fwht_t t6 = data[6]; + fwht_t t7 = data[7]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t4, t5); + FWHT_2(t6, t7); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + FWHT_2(t4, t6); + FWHT_2(t5, t7); + FWHT_2(t0, t4); + FWHT_2(t1, t5); + FWHT_2(t2, t6); + FWHT_2(t3, t7); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; + data[4] = t4; + data[5] = t5; + data[6] = t6; + data[7] = t7; +} + +static inline void FWHT_16(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + fwht_t t4 = data[4]; + fwht_t t5 = data[5]; + fwht_t t6 = data[6]; + fwht_t t7 = data[7]; + fwht_t t8 = data[8]; + fwht_t t9 = data[9]; + fwht_t t10 = data[10]; + fwht_t t11 = data[11]; + fwht_t t12 = data[12]; + fwht_t t13 = data[13]; + fwht_t t14 = data[14]; + fwht_t t15 = data[15]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t4, t5); + FWHT_2(t6, t7); + FWHT_2(t8, t9); + FWHT_2(t10, t11); + FWHT_2(t12, t13); + FWHT_2(t14, t15); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + FWHT_2(t4, t6); + FWHT_2(t5, t7); + FWHT_2(t8, t10); + FWHT_2(t9, t11); + FWHT_2(t12, t14); + FWHT_2(t13, t15); + FWHT_2(t0, t4); + FWHT_2(t1, t5); + FWHT_2(t2, t6); + FWHT_2(t3, t7); + FWHT_2(t8, t12); + FWHT_2(t9, t13); + FWHT_2(t10, t14); + FWHT_2(t11, t15); + FWHT_2(t0, t8); + FWHT_2(t1, t9); + FWHT_2(t2, t10); + FWHT_2(t3, t11); + FWHT_2(t4, t12); + FWHT_2(t5, t13); + FWHT_2(t6, t14); + FWHT_2(t7, t15); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; + data[4] = t4; + data[5] = t5; + data[6] = t6; + data[7] = t7; + data[8] = t8; + data[9] = t9; + data[10] = t10; + data[11] = t11; + data[12] = t12; + data[13] = t13; + data[14] = t14; + data[15] = t15; +} + +static void FWHT_SmallData(fwht_t* data, unsigned ldn) +{ + const unsigned n = (1UL << ldn); + + if (n <= 2) + { + if (n == 2) + FWHT_2(data[0], data[1]); + return; + } + + for (unsigned ldm = ldn; ldm > 3; ldm -= 2) + { + unsigned m = (1UL << ldm); + unsigned m4 = (m >> 2); + for (unsigned r = 0; r < n; r += m) + for (unsigned j = 0; j < m4; j++) + FWHT_4(data + j + r, m4); + } + + if (ldn & 1) + { + for (unsigned i0 = 0; i0 < n; i0 += 8) + FWHT_8(data + i0); + } + else + { + for (unsigned i0 = 0; i0 < n; i0 += 4) + FWHT_4(data + i0); + } +} + +// Decimation in time (DIT) version +static void FWHT(fwht_t* data, const unsigned 
ldn)
+{
+    if (ldn <= 13)
+    {
+        FWHT_SmallData(data, ldn);
+        return;
+    }
+
+    FWHT_2(data[2], data[3]);
+    FWHT_4(data + 4);
+    FWHT_8(data + 8);
+    FWHT_16(data + 16);
+    for (unsigned ldm = 5; ldm < ldn; ++ldm)
+        FWHT(data + (unsigned)(1UL << ldm), ldm);
+
+    for (unsigned ldm = 0; ldm < ldn; ++ldm)
+    {
+        const unsigned mh = (1UL << ldm);
+        for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2)
+            FWHT_2(data[t1], data[t2]);
+    }
+}
+
+#endif
+
+
+//------------------------------------------------------------------------------
+// Memory Buffer XOR
+
+static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes)
+{
+    LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
+    const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
+
+#if defined(LEO_TARGET_MOBILE)
+# if defined(LEO_TRY_NEON)
+    // Handle multiples of 64 bytes
+    if (CpuHasNeon)
+    {
+        while (bytes >= 64)
+        {
+            LEO_M128 x0 = vld1q_u8(x16);
+            LEO_M128 x1 = vld1q_u8(x16 + 1);
+            LEO_M128 x2 = vld1q_u8(x16 + 2);
+            LEO_M128 x3 = vld1q_u8(x16 + 3);
+            LEO_M128 y0 = vld1q_u8(y16);
+            LEO_M128 y1 = vld1q_u8(y16 + 1);
+            LEO_M128 y2 = vld1q_u8(y16 + 2);
+            LEO_M128 y3 = vld1q_u8(y16 + 3);
+
+            vst1q_u8(x16, veorq_u8(x0, y0));
+            vst1q_u8(x16 + 1, veorq_u8(x1, y1));
+            vst1q_u8(x16 + 2, veorq_u8(x2, y2));
+            vst1q_u8(x16 + 3, veorq_u8(x3, y3));
+
+            bytes -= 64, x16 += 4, y16 += 4;
+        }
+
+        // Handle multiples of 16 bytes
+        while (bytes >= 16)
+        {
+            LEO_M128 x0 = vld1q_u8(x16);
+            LEO_M128 y0 = vld1q_u8(y16);
+
+            vst1q_u8(x16, veorq_u8(x0, y0));
+
+            bytes -= 16, ++x16, ++y16;
+        }
+    }
+    else
+# endif // LEO_TRY_NEON
+    {
+        uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
+        const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
+
+        const unsigned count = (unsigned)bytes / 8;
+        for (unsigned ii = 0; ii < count; ++ii)
+            x8[ii] ^= y8[ii];
+
+        x16 = reinterpret_cast<LEO_M128 *>(x8 + count);
+        y16 = reinterpret_cast<const LEO_M128 *>(y8 + count);
+
+        bytes &= 7; // whole words were consumed above; fixes the (bytes & 8) test below
+    }
+#else // LEO_TARGET_MOBILE
+# if defined(LEO_TRY_AVX2)
+    if (CpuHasAVX2)
+    {
+        LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(x16);
+        const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(y16);
+
+        while (bytes >= 128)
+        {
+            LEO_M256 x0 = _mm256_loadu_si256(x32);
+            LEO_M256 y0 = _mm256_loadu_si256(y32);
+            x0 = _mm256_xor_si256(x0, y0);
+            LEO_M256 x1 = _mm256_loadu_si256(x32 + 1);
+            LEO_M256 y1 = _mm256_loadu_si256(y32 + 1);
+            x1 = _mm256_xor_si256(x1, y1);
+            LEO_M256 x2 = _mm256_loadu_si256(x32 + 2);
+            LEO_M256 y2 = _mm256_loadu_si256(y32 + 2);
+            x2 = _mm256_xor_si256(x2, y2);
+            LEO_M256 x3 = _mm256_loadu_si256(x32 + 3);
+            LEO_M256 y3 = _mm256_loadu_si256(y32 + 3);
+            x3 = _mm256_xor_si256(x3, y3);
+
+            _mm256_storeu_si256(x32, x0);
+            _mm256_storeu_si256(x32 + 1, x1);
+            _mm256_storeu_si256(x32 + 2, x2);
+            _mm256_storeu_si256(x32 + 3, x3);
+
+            bytes -= 128, x32 += 4, y32 += 4;
+        }
+
+        // Handle multiples of 32 bytes
+        while (bytes >= 32)
+        {
+            // x[i] = x[i] xor y[i]
+            _mm256_storeu_si256(x32,
+                _mm256_xor_si256(
+                    _mm256_loadu_si256(x32),
+                    _mm256_loadu_si256(y32)));
+
+            bytes -= 32, ++x32, ++y32;
+        }
+
+        x16 = reinterpret_cast<LEO_M128 *>(x32);
+        y16 = reinterpret_cast<const LEO_M128 *>(y32);
+    }
+    else
+# endif // LEO_TRY_AVX2
+    {
+        while (bytes >= 64)
+        {
+            LEO_M128 x0 = _mm_loadu_si128(x16);
+            LEO_M128 y0 = _mm_loadu_si128(y16);
+            x0 = _mm_xor_si128(x0, y0);
+            LEO_M128 x1 = _mm_loadu_si128(x16 + 1);
+            LEO_M128 y1 = _mm_loadu_si128(y16 + 1);
+            x1 = _mm_xor_si128(x1, y1);
+            LEO_M128 x2 = _mm_loadu_si128(x16 + 2);
+            LEO_M128 y2 = _mm_loadu_si128(y16 + 2);
+            x2 = _mm_xor_si128(x2, y2);
+            LEO_M128 x3 = _mm_loadu_si128(x16 + 3);
+            LEO_M128 y3 = _mm_loadu_si128(y16 + 3);
+            x3 = _mm_xor_si128(x3, y3);
+
+            _mm_storeu_si128(x16, x0);
+            _mm_storeu_si128(x16 + 1, x1);
+            _mm_storeu_si128(x16 + 2, x2);
+            _mm_storeu_si128(x16 + 3, x3);
+
+            bytes -= 64, x16 += 4, y16 += 4;
+        }
+    }
+#endif // LEO_TARGET_MOBILE
+
+#if !defined(LEO_TARGET_MOBILE)
+    // Handle multiples of 16 bytes.
+    // This block uses SSE2 intrinsics, so it must be x86-only; the mobile
+    // paths above already reduce the remaining byte count below 16.
+    while (bytes >= 16)
+    {
+        // x[i] = x[i] xor y[i]
+        _mm_storeu_si128(x16,
+            _mm_xor_si128(
+                _mm_loadu_si128(x16),
+                _mm_loadu_si128(y16)));
+
+        bytes -= 16, ++x16, ++y16;
+    }
+#endif // LEO_TARGET_MOBILE
+
+    uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
+    const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
+
+    // Handle a block of 8 bytes
+    const unsigned eight = bytes & 8;
+    if (eight)
+    {
+        uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
+        const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
+        *x8 ^= *y8;
+    }
+
+    // Handle a block of 4 bytes
+    const unsigned four = bytes & 4;
+    if (four)
+    {
+        uint32_t * LEO_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
+        const uint32_t * LEO_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
+        *x4 ^= *y4;
+    }
+
+    // Handle final bytes
+    const unsigned offset = eight + four;
+    switch (bytes & 3)
+    {
+    case 3: x1[offset + 2] ^= y1[offset + 2]; // fall-thru
+    case 2: x1[offset + 1] ^= y1[offset + 1]; // fall-thru
+    case 1: x1[offset] ^= y1[offset];
+    default:
+        break;
+    }
+}
+
+
+//------------------------------------------------------------------------------
+// Formal Derivative
+
+// Formal derivative of polynomial in the new basis
+static void formal_derivative(GFSymbol* cos, const unsigned size)
+{
+    for (unsigned i = 1; i < size; ++i)
+    {
+        const unsigned leng = ((i ^ (i - 1)) + 1) >> 1;
+
+        // If a large number of values are being XORed:
+        if (leng >= 8)
+            xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol));
+        else
+            for (unsigned j = i - leng; j < i; j++)
+                cos[j] ^= cos[j + leng];
+    }
+
+    for (unsigned i = size; i < kFieldSize; i <<= 1)
+        xor_mem(cos, cos + i, size * sizeof(GFSymbol));
+}
+
+
+//------------------------------------------------------------------------------
+// Fast Fourier Transform
+
+static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT
+
+// IFFT in the proposed basis
+static void IFLT(GFSymbol* data, const unsigned size, const unsigned index)
+{
+    for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1)
+    {
+        for (unsigned j = depart_no; j < size; j += (depart_no << 1))
+        {
+            // If a large number of values are being XORed:
+            if (depart_no >= 8)
+                xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol));
+            else
+                for (unsigned i = j - depart_no; i < j; ++i)
+                    data[i + depart_no] ^= data[i];
+
+            const GFSymbol skew = skewVec[j + index - 1];
+
+            if (skew != kFieldModulus)
+                muladd_mem(data + j - depart_no, data + j, skew, depart_no);
+        }
+    }
+}
+
+// FFT in the proposed basis
+static void FLT(GFSymbol* data, const unsigned size, const unsigned index)
+{
+    for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1)
+    {
+        for (unsigned j = depart_no; j < size; j += (depart_no << 1))
+        {
+            const GFSymbol skew = skewVec[j + index - 1];
+
+            if (skew != kFieldModulus)
+                muladd_mem(data + j - depart_no, data + j, skew, depart_no);
+
+            // If a large number of values are being XORed:
+            if (depart_no >= 8)
+                xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol));
+            else
+                for (unsigned i = j - depart_no; i < j; ++i)
+                    data[i + depart_no] ^= data[i];
+        }
+    }
+}
+
+
+//------------------------------------------------------------------------------
+// FFT Initialization
+
+static GFSymbol B[kFieldSize >> 1]; // factors used in
formal derivative +static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial + +// Initialize skewVec[], B[], log_walsh[] +static void InitFieldOperations() +{ + GFSymbol temp[kGFBits - 1]; + + for (unsigned i = 1; i < kGFBits; ++i) + temp[i - 1] = (GFSymbol)((unsigned)1 << i); + + for (unsigned m = 0; m < (kGFBits - 1); ++m) + { + const unsigned step = (unsigned)1 << (m + 1); + + skewVec[((unsigned)1 << m) - 1] = 0; + + for (unsigned i = m; i < (kGFBits - 1); ++i) + { + const unsigned s = ((unsigned)1 << (i + 1)); + + for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) + skewVec[j + s] = skewVec[j] ^ temp[i]; + } + + temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; + + for (unsigned i = m + 1; i < (kGFBits - 1); ++i) + temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); + } + + for (unsigned i = 0; i < kFieldSize; ++i) + skewVec[i] = GFLog[skewVec[i]]; + + temp[0] = kFieldModulus - temp[0]; + + for (unsigned i = 1; i < (kGFBits - 1); ++i) + temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; + + B[0] = 0; + for (unsigned i = 0; i < (kGFBits - 1); ++i) + { + const unsigned depart = ((unsigned)1 << i); + + for (unsigned j = 0; j < depart; ++j) + B[j + depart] = (B[j] + temp[i]) % kFieldModulus; + } + + for (unsigned i = 0; i < kFieldSize; ++i) + log_walsh[i] = GFLog[i]; + + log_walsh[0] = 0; + + FWHT(log_walsh, kGFBits); +} + + +//------------------------------------------------------------------------------ +// Encoder + +// Encoding alg for k/n<0.5: message is a power of two +static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) +{ + memcpy(codeword, data, sizeof(GFSymbol) * k); + + IFLT(codeword, k, 0); + + for (unsigned i = k; i < kFieldSize; i += k) + { + memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); + + FLT(&codeword[i], k, i); + } + + memcpy(codeword, data, sizeof(GFSymbol) * k); +} + +// Encoding alg for k/n>0.5: parity is a power of two. +// data: message array. parity: parity array. mem: buffer(size>= n-k) +static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) +{ + const unsigned t = kFieldSize - k; + + memset(parity, 0, sizeof(GFSymbol) * t); + + for (unsigned i = t; i < kFieldSize; i += t) + { + memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); + + IFLT(mem, t, i); + + xor_mem(parity, mem, t * sizeof(GFSymbol)); + } + + FLT(parity, t, 0); +} + + +//------------------------------------------------------------------------------ +// Decoder + +static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) +{ + fwht_t log_walsh2[kFieldSize]; + + // Compute the evaluations of the error locator polynomial + for (unsigned i = 0; i < kFieldSize; ++i) + log_walsh2[i] = erasure[i] ? 1 : 0; + + FWHT(log_walsh2, kGFBits); + + for (unsigned i = 0; i < kFieldSize; ++i) + log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; + + FWHT(log_walsh2, kGFBits); + + // k2 can be replaced with k + const unsigned k2 = kFieldSize; + //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? 
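+    // (Editor's note) With k2 = kFieldSize, the IFLT/FLT passes and the
+    // formal derivative below all run over the full field-sized codeword,
+    // so this decoder's transform cost grows with the field size rather
+    // than with k. Shrinking those passes to k-point transforms appears to
+    // be what the open question above is about, but it would also require
+    // matching changes to the skewVec indexing and the formal derivative
+    // range; hence k2 stays at kFieldSize for now.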
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+    {
+        if (erasure[i])
+        {
+            codeword[i] = 0;
+        }
+        else
+        {
+            codeword[i] = mulE(codeword[i], log_walsh2[i]);
+        }
+    }
+
+    IFLT(codeword, kFieldSize, 0);
+
+    // formal derivative
+    for (unsigned i = 0; i < kFieldSize; i += 2)
+    {
+        codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]);
+        codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]);
+    }
+
+    formal_derivative(codeword, k2);
+
+    for (unsigned i = 0; i < k2; i += 2)
+    {
+        codeword[i] = mulE(codeword[i], B[i >> 1]);
+        codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]);
+    }
+
+    FLT(codeword, k2, 0);
+
+    for (unsigned i = 0; i < k2; ++i)
+    {
+        if (erasure[i])
+        {
+            codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]);
+        }
+    }
+}
+
+
+//------------------------------------------------------------------------------
+// Test Application
+
+void test(unsigned k, unsigned seed)
+{
+    srand(seed);
+
+    //-----------Generating message----------
+
+    // Message array
+    GFSymbol data[kFieldSize] = {0};
+
+    // Filled with random numbers
+    for (unsigned i = kFieldSize - k; i < kFieldSize; ++i)
+        data[i] = (GFSymbol)rand();
+
+
+    //---------encoding----------
+
+    GFSymbol codeword[kFieldSize];
+    encodeH(&data[kFieldSize - k], k, data, codeword);
+    //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change?
+
+    memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize);
+
+
+    //--------erasure simulation---------
+
+    // Array indicating erasures
+    bool erasure[kFieldSize] = {
+        false
+    };
+
+    for (unsigned i = k; i < kFieldSize; ++i)
+        erasure[i] = true;
+
+    // permuting the erasure array
+    for (unsigned i = kFieldSize - 1; i > 0; --i)
+    {
+        unsigned pos = rand() % (i + 1);
+
+        if (i != pos)
+        {
+            bool tmp = erasure[i];
+            erasure[i] = erasure[pos];
+            erasure[pos] = tmp;
+        }
+    }
+
+    // Erase codeword symbols
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        if (erasure[i])
+            codeword[i] = 0;
+
+
+    //---------main processing----------
+    decode(codeword, k, erasure);
+
+    // Check the correctness of the result
+    for (unsigned i = 0; i < kFieldSize; ++i)
+    {
+        if (erasure[i])
+        {
+            if (data[i] != codeword[i])
+            {
+                printf("Decoding Error with seed = %u!\n", seed);
+                LEO_DEBUG_BREAK;
+                return;
+            }
+        }
+    }
+
+    //printf("Decoding is successful!\n");
+}
+
+
+//------------------------------------------------------------------------------
+// Entrypoint
+
+int main(int argc, char **argv)
+{
+    // Initialize architecture-specific code
+    InitializeCPUArch();
+
+    // Fill GFLog table and GFExp table
+    InitField();
+
+    // Compute factors used in erasure decoder
+    InitFieldOperations();
+
+    unsigned seed = (unsigned)time(NULL);
+    for (;;)
+    {
+        // test(k, seed), k: message size
+        /*
+            EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc,
+            s.t. the number of recovery pieces is a power of two
+        */
+        test(kFieldSize / 2, seed);
+
+        ++seed;
+    }
+
+    return 0;
+}
+
+
+} // namespace leopard
diff --git a/LeopardCommon.h b/LeopardCommon.h
new file mode 100644
index 0000000..17425c0
--- /dev/null
+++ b/LeopardCommon.h
@@ -0,0 +1,194 @@
+/*
+    Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Leopard-RS nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +/* + TODO: + + Refactor software + + I think it should be split up into several C++ modules + + Replace GFSymbol with a file data pointer + + New 16-bit Muladd inner loops + + Class to contain the (large) muladd tables + + Preliminary benchmarks for large data! + + New 8-bit Muladd inner loops + + Benchmarks for smaller data! + + Write detailed comments for all the routines + + Look into getting EncodeL working so we can support smaller data (Ask Lin) + + Look into using k instead of k2 to speed up decoder (Ask Lin) + + Avoid performing FFT/IFFT intermediate calculations we're not going to use + + Benchmarks, fun! + + Add multi-threading to split up long parallelizable calculations + + Final benchmarks! + + Finish up documentation + + Release version 1 + + + Muladd implementation notes: + + Specialize for 1-3 rows at a time since often times we're multiplying by + the same (skew) value repeatedly, as the ISA-L library does here: + + https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 + + Except we should be doing it for 16-bit Galois Field. + To implement that use the ALTMAP trick from Jerasure: + + http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 + + Except we should also support AVX2 since that is a 40% perf boost, so put + the high and low bytes 32 bytes instead of 16 bytes apart. + + Also I think we should go ahead and precompute the multiply tables since + it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. 
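+
+    (Editor's note) One way to arrive at the 8 MB figure, assuming the
+    PSHUFB nibble-table scheme sketched above: a 16-bit symbol splits into
+    4 nibbles, and each nibble position needs low/high output tables of
+    16 bytes each (4 x 2 x 16 = 128 bytes per multiplier constant). With
+    2^16 = 65536 possible constants, 65536 x 128 bytes = 8 MB.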
+*/ + +#include + + +//------------------------------------------------------------------------------ +// Debug + +// Some bugs only repro in release mode, so this can be helpful +//#define LEO_DEBUG_IN_RELEASE + +#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) + #define LEO_DEBUG + #ifdef _WIN32 + #define LEO_DEBUG_BREAK __debugbreak() + #else + #define LEO_DEBUG_BREAK __builtin_trap() + #endif + #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } +#else + #define LEO_DEBUG_BREAK ; + #define LEO_DEBUG_ASSERT(cond) ; +#endif + + +//------------------------------------------------------------------------------ +// Platform/Architecture + +#if defined(ANDROID) || defined(IOS) + #define LEO_TARGET_MOBILE +#endif // ANDROID + +#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) + #define LEO_TRY_AVX2 /* 256-bit */ + #include + #define LEO_ALIGN_BYTES 32 +#else // __AVX2__ + #define LEO_ALIGN_BYTES 16 +#endif // __AVX2__ + +#if !defined(LEO_TARGET_MOBILE) + // Note: MSVC currently only supports SSSE3 but not AVX2 + #include // SSSE3: _mm_shuffle_epi8 + #include // SSE2 +#endif // LEO_TARGET_MOBILE + +#if defined(HAVE_ARM_NEON_H) + #include +#endif // HAVE_ARM_NEON_H + +#if defined(LEO_TARGET_MOBILE) + + #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ + +# if defined(HAVE_ARM_NEON_H) + // Compiler-specific 128-bit SIMD register keyword + #define LEO_M128 uint8x16_t + #define LEO_TRY_NEON +#else + #define LEO_M128 uint64_t +# endif + +#else // LEO_TARGET_MOBILE + + // Compiler-specific 128-bit SIMD register keyword + #define LEO_M128 __m128i + +#endif // LEO_TARGET_MOBILE + +#ifdef LEO_TRY_AVX2 + // Compiler-specific 256-bit SIMD register keyword + #define LEO_M256 __m256i +#endif + +// Compiler-specific C++11 restrict keyword +#define LEO_RESTRICT __restrict + +// Compiler-specific force inline keyword +#ifdef _MSC_VER + #define LEO_FORCE_INLINE inline __forceinline +#else + #define LEO_FORCE_INLINE inline __attribute__((always_inline)) +#endif + +// Compiler-specific alignment keyword +// Note: Alignment only matters for ARM NEON where it should be 16 +#ifdef _MSC_VER + #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) +#else // _MSC_VER + #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) +#endif // _MSC_VER + + +namespace leopard { + + +//------------------------------------------------------------------------------ +// Runtime CPU Architecture Check + +// Initialize CPU architecture flags +void InitializeCPUArch(); + +#if defined(LEO_TRY_NEON) +# if defined(IOS) && defined(__ARM_NEON__) +// Does device support NEON? +static const bool CpuHasNeon = true; +static const bool CpuHasNeon64 = true; +# else +// Does device support NEON? +// Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures +extern bool CpuHasNeon; // V6 / V7 +extern bool CpuHasNeon64; // 64-bit +# endif +#endif + +#if !defined(LEO_TARGET_MOBILE) +# if defined(LEO_TRY_AVX2) +// Does CPU support AVX2? +extern bool CpuHasAVX2; +# endif +// Does CPU support SSSE3? +extern bool CpuHasSSSE3; +#endif // LEO_TARGET_MOBILE + + +} // namespace leopard diff --git a/lhc_rs.cpp b/LeopardDecoder.cpp similarity index 81% rename from lhc_rs.cpp rename to LeopardDecoder.cpp index 76cb178..71d22e2 100644 --- a/lhc_rs.cpp +++ b/LeopardDecoder.cpp @@ -1,8 +1,29 @@ /* - S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung, - "Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes" - IEEE Trans. 
on Information Theory, pp. 6284-6299, November, 2016. - http://ct.ee.ntust.edu.tw/it2016-2.pdf + Copyright (c) 2017 Christopher A. Taylor. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of LHC-RS nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. */ #include @@ -23,7 +44,7 @@ + New 8-bit Muladd inner loops + Benchmarks for smaller data! + Refactor software - + Pick a name for the software better than LHC_RS + + Pick a name for the software better than LEO_RS + I think it should be split up into several C++ modules + Write detailed comments for all the routines + Look into getting EncodeL working so we can support smaller data (Ask Lin) @@ -60,19 +81,19 @@ // Debug // Some bugs only repro in release mode, so this can be helpful -//#define LHC_DEBUG_IN_RELEASE +//#define LEO_DEBUG_IN_RELEASE -#if defined(_DEBUG) || defined(DEBUG) || defined(LHC_DEBUG_IN_RELEASE) - #define LHC_DEBUG +#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) + #define LEO_DEBUG #ifdef _WIN32 - #define LHC_DEBUG_BREAK __debugbreak() + #define LEO_DEBUG_BREAK __debugbreak() #else - #define LHC_DEBUG_BREAK __builtin_trap() + #define LEO_DEBUG_BREAK __builtin_trap() #endif - #define LHC_DEBUG_ASSERT(cond) { if (!(cond)) { LHC_DEBUG_BREAK; } } + #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } #else - #define LHC_DEBUG_BREAK ; - #define LHC_DEBUG_ASSERT(cond) ; + #define LEO_DEBUG_BREAK ; + #define LEO_DEBUG_ASSERT(cond) ; #endif @@ -80,67 +101,67 @@ // Platform/Architecture #if defined(ANDROID) || defined(IOS) - #define LHC_TARGET_MOBILE + #define LEO_TARGET_MOBILE #endif // ANDROID #if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) - #define LHC_TRY_AVX2 /* 256-bit */ + #define LEO_TRY_AVX2 /* 256-bit */ #include - #define LHC_ALIGN_BYTES 32 + #define LEO_ALIGN_BYTES 32 #else // __AVX2__ - #define LHC_ALIGN_BYTES 16 + #define LEO_ALIGN_BYTES 16 #endif // __AVX2__ -#if !defined(LHC_TARGET_MOBILE) +#if !defined(LEO_TARGET_MOBILE) // Note: MSVC currently only supports SSSE3 but not AVX2 #include // SSSE3: _mm_shuffle_epi8 #include // SSE2 -#endif // LHC_TARGET_MOBILE +#endif // LEO_TARGET_MOBILE #if defined(HAVE_ARM_NEON_H) 
#include #endif // HAVE_ARM_NEON_H -#if defined(LHC_TARGET_MOBILE) +#if defined(LEO_TARGET_MOBILE) - #define LHC_ALIGNED_ACCESSES /* Inputs must be aligned to LHC_ALIGN_BYTES */ + #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ # if defined(HAVE_ARM_NEON_H) // Compiler-specific 128-bit SIMD register keyword - #define LHC_M128 uint8x16_t - #define LHC_TRY_NEON + #define LEO_M128 uint8x16_t + #define LEO_TRY_NEON #else - #define LHC_M128 uint64_t + #define LEO_M128 uint64_t # endif -#else // LHC_TARGET_MOBILE +#else // LEO_TARGET_MOBILE // Compiler-specific 128-bit SIMD register keyword - #define LHC_M128 __m128i + #define LEO_M128 __m128i -#endif // LHC_TARGET_MOBILE +#endif // LEO_TARGET_MOBILE -#ifdef LHC_TRY_AVX2 +#ifdef LEO_TRY_AVX2 // Compiler-specific 256-bit SIMD register keyword - #define LHC_M256 __m256i + #define LEO_M256 __m256i #endif // Compiler-specific C++11 restrict keyword -#define LHC_RESTRICT __restrict +#define LEO_RESTRICT __restrict // Compiler-specific force inline keyword #ifdef _MSC_VER - #define LHC_FORCE_INLINE inline __forceinline + #define LEO_FORCE_INLINE inline __forceinline #else - #define LHC_FORCE_INLINE inline __attribute__((always_inline)) + #define LEO_FORCE_INLINE inline __attribute__((always_inline)) #endif // Compiler-specific alignment keyword // Note: Alignment only matters for ARM NEON where it should be 16 #ifdef _MSC_VER - #define LHC_ALIGNED __declspec(align(LHC_ALIGN_BYTES)) + #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) #else // _MSC_VER - #define LHC_ALIGNED __attribute__((aligned(LHC_ALIGN_BYTES))) + #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) #endif // _MSC_VER @@ -154,7 +175,7 @@ #include #endif -#if defined(LHC_TRY_NEON) +#if defined(LEO_TRY_NEON) # if defined(IOS) && defined(__ARM_NEON__) // Requires iPhone 5S or newer static const bool CpuHasNeon = true; @@ -167,14 +188,14 @@ #endif -#if !defined(LHC_TARGET_MOBILE) +#if !defined(LEO_TARGET_MOBILE) #ifdef _MSC_VER #include // __cpuid #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX #endif -#ifdef LHC_TRY_AVX2 +#ifdef LEO_TRY_AVX2 static bool CpuHasAVX2 = false; #endif static bool CpuHasSSSE3 = false; @@ -219,12 +240,12 @@ static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) #endif } -#endif // defined(LHC_TARGET_MOBILE) +#endif // defined(LEO_TARGET_MOBILE) -static void lhc_architecture_init() +static void leo_architecture_init() { -#if defined(LHC_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) +#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) AndroidCpuFamily family = android_getCpuFamily(); if (family == ANDROID_CPU_FAMILY_ARM) { @@ -239,32 +260,32 @@ static void lhc_architecture_init() } #endif -#if !defined(LHC_TARGET_MOBILE) +#if !defined(LEO_TARGET_MOBILE) unsigned int cpu_info[4]; _cpuid(cpu_info, 1); CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); -#if defined(LHC_TRY_AVX2) +#if defined(LEO_TRY_AVX2) _cpuid(cpu_info, 7); CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); -#endif // LHC_TRY_AVX2 +#endif // LEO_TRY_AVX2 -#endif // LHC_TARGET_MOBILE +#endif // LEO_TARGET_MOBILE } //------------------------------------------------------------------------------ // SIMD-Safe Aligned Memory Allocations -static const unsigned kAlignmentBytes = LHC_ALIGN_BYTES; +static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; -LHC_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) +LEO_FORCE_INLINE unsigned 
NextAlignedOffset(unsigned offset) { return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); } -static LHC_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) +static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) { uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); if (!data) @@ -275,7 +296,7 @@ static LHC_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) return data; } -static LHC_FORCE_INLINE void SIMDSafeFree(void* ptr) +static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) { if (!ptr) return; @@ -283,7 +304,7 @@ static LHC_FORCE_INLINE void SIMDSafeFree(void* ptr) unsigned offset = data[-1]; if (offset >= kAlignmentBytes) { - LHC_DEBUG_BREAK; // Should never happen + LEO_DEBUG_BREAK; // Should never happen return; } data -= kAlignmentBytes - offset; @@ -294,9 +315,9 @@ static LHC_FORCE_INLINE void SIMDSafeFree(void* ptr) //------------------------------------------------------------------------------ // Field -//#define LHC_SHORT_FIELD +//#define LEO_SHORT_FIELD -#ifdef LHC_SHORT_FIELD +#ifdef LEO_SHORT_FIELD typedef uint8_t GFSymbol; static const unsigned kGFBits = 8; static const unsigned kGFPolynomial = 0x11D; @@ -386,7 +407,7 @@ static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) } // vx[] += vy[] * z -static void muladd_mem(GFSymbol * LHC_RESTRICT vx, const GFSymbol * LHC_RESTRICT vy, GFSymbol z, unsigned symbolCount) +static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) { for (unsigned i = 0; i < symbolCount; ++i) { @@ -443,12 +464,12 @@ static GFSymbol mulE(GFSymbol a, GFSymbol b) // Q is the maximum symbol value, e.g. 255 or 65535. // Define this to enable the optimized version of FWHT() -#define LHC_FWHT_OPTIMIZED +#define LEO_FWHT_OPTIMIZED typedef GFSymbol fwht_t; // {a, b} = {a + b, a - b} (Mod Q) -static LHC_FORCE_INLINE void FWHT_2(fwht_t& LHC_RESTRICT a, fwht_t& LHC_RESTRICT b) +static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) { const fwht_t sum = AddModQ(a, b); const fwht_t dif = SubModQ(a, b); @@ -473,7 +494,7 @@ static LHC_FORCE_INLINE void FWHT_2(fwht_t& LHC_RESTRICT a, fwht_t& LHC_RESTRICT at too high a complexity cost relative to minor perf improvement. 
*/ -#ifndef LHC_FWHT_OPTIMIZED +#ifndef LEO_FWHT_OPTIMIZED // Reference implementation static void FWHT(fwht_t* data, const unsigned bits) @@ -487,7 +508,7 @@ static void FWHT(fwht_t* data, const unsigned bits) #else -static LHC_FORCE_INLINE void FWHT_4(fwht_t* data) +static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) { fwht_t t0 = data[0]; fwht_t t1 = data[1]; @@ -503,7 +524,7 @@ static LHC_FORCE_INLINE void FWHT_4(fwht_t* data) data[3] = t3; } -static LHC_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) +static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) { unsigned x = 0; fwht_t t0 = data[x]; x += s; @@ -683,26 +704,26 @@ static void FWHT(fwht_t* data, const unsigned ldn) //------------------------------------------------------------------------------ // Memory Buffer XOR -static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsigned bytes) +static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) { - LHC_M128 * LHC_RESTRICT x16 = reinterpret_cast(vx); - const LHC_M128 * LHC_RESTRICT y16 = reinterpret_cast(vy); + LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); + const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); -#if defined(LHC_TARGET_MOBILE) -# if defined(LHC_TRY_NEON) +#if defined(LEO_TARGET_MOBILE) +# if defined(LEO_TRY_NEON) // Handle multiples of 64 bytes if (CpuHasNeon) { while (bytes >= 64) { - LHC_M128 x0 = vld1q_u8(x16); - LHC_M128 x1 = vld1q_u8(x16 + 1); - LHC_M128 x2 = vld1q_u8(x16 + 2); - LHC_M128 x3 = vld1q_u8(x16 + 3); - LHC_M128 y0 = vld1q_u8(y16); - LHC_M128 y1 = vld1q_u8(y16 + 1); - LHC_M128 y2 = vld1q_u8(y16 + 2); - LHC_M128 y3 = vld1q_u8(y16 + 3); + LEO_M128 x0 = vld1q_u8(x16); + LEO_M128 x1 = vld1q_u8(x16 + 1); + LEO_M128 x2 = vld1q_u8(x16 + 2); + LEO_M128 x3 = vld1q_u8(x16 + 3); + LEO_M128 y0 = vld1q_u8(y16); + LEO_M128 y1 = vld1q_u8(y16 + 1); + LEO_M128 y2 = vld1q_u8(y16 + 2); + LEO_M128 y3 = vld1q_u8(y16 + 3); vst1q_u8(x16, veorq_u8(x0, y0)); vst1q_u8(x16 + 1, veorq_u8(x1, y1)); @@ -715,8 +736,8 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign // Handle multiples of 16 bytes while (bytes >= 16) { - LHC_M128 x0 = vld1q_u8(x16); - LHC_M128 y0 = vld1q_u8(y16); + LEO_M128 x0 = vld1q_u8(x16); + LEO_M128 y0 = vld1q_u8(y16); vst1q_u8(x16, veorq_u8(x0, y0)); @@ -724,38 +745,38 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign } } else -# endif // LHC_TRY_NEON +# endif // LEO_TRY_NEON { - uint64_t * LHC_RESTRICT x8 = reinterpret_cast(x16); - const uint64_t * LHC_RESTRICT y8 = reinterpret_cast(y16); + uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); + const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); const unsigned count = (unsigned)bytes / 8; for (unsigned ii = 0; ii < count; ++ii) x8[ii] ^= y8[ii]; - x16 = reinterpret_cast(x8 + count); - y16 = reinterpret_cast(y8 + count); + x16 = reinterpret_cast(x8 + count); + y16 = reinterpret_cast(y8 + count); } -#else // LHC_TARGET_MOBILE -# if defined(LHC_TRY_AVX2) +#else // LEO_TARGET_MOBILE +# if defined(LEO_TRY_AVX2) if (CpuHasAVX2) { - LHC_M256 * LHC_RESTRICT x32 = reinterpret_cast(x16); - const LHC_M256 * LHC_RESTRICT y32 = reinterpret_cast(y16); + LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); + const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); while (bytes >= 128) { - LHC_M256 x0 = _mm256_loadu_si256(x32); - LHC_M256 y0 = _mm256_loadu_si256(y32); + LEO_M256 x0 = _mm256_loadu_si256(x32); + LEO_M256 y0 = _mm256_loadu_si256(y32); x0 = _mm256_xor_si256(x0, y0); 
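            // (Editor's note) This loop is unrolled 4x to XOR 128 bytes per
            // iteration. The unaligned loadu/storeu forms are used throughout
            // the x86 paths, so callers need not align their buffers there;
            // only the mobile build defines LEO_ALIGNED_ACCESSES and requires
            // LEO_ALIGN_BYTES alignment.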
- LHC_M256 x1 = _mm256_loadu_si256(x32 + 1); - LHC_M256 y1 = _mm256_loadu_si256(y32 + 1); + LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); + LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); x1 = _mm256_xor_si256(x1, y1); - LHC_M256 x2 = _mm256_loadu_si256(x32 + 2); - LHC_M256 y2 = _mm256_loadu_si256(y32 + 2); + LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); + LEO_M256 y2 = _mm256_loadu_si256(y32 + 2); x2 = _mm256_xor_si256(x2, y2); - LHC_M256 x3 = _mm256_loadu_si256(x32 + 3); - LHC_M256 y3 = _mm256_loadu_si256(y32 + 3); + LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); + LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); x3 = _mm256_xor_si256(x3, y3); _mm256_storeu_si256(x32, x0); @@ -778,25 +799,25 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign bytes -= 32, ++x32, ++y32; } - x16 = reinterpret_cast(x32); - y16 = reinterpret_cast(y32); + x16 = reinterpret_cast(x32); + y16 = reinterpret_cast(y32); } else -# endif // LHC_TRY_AVX2 +# endif // LEO_TRY_AVX2 { while (bytes >= 64) { - LHC_M128 x0 = _mm_loadu_si128(x16); - LHC_M128 y0 = _mm_loadu_si128(y16); + LEO_M128 x0 = _mm_loadu_si128(x16); + LEO_M128 y0 = _mm_loadu_si128(y16); x0 = _mm_xor_si128(x0, y0); - LHC_M128 x1 = _mm_loadu_si128(x16 + 1); - LHC_M128 y1 = _mm_loadu_si128(y16 + 1); + LEO_M128 x1 = _mm_loadu_si128(x16 + 1); + LEO_M128 y1 = _mm_loadu_si128(y16 + 1); x1 = _mm_xor_si128(x1, y1); - LHC_M128 x2 = _mm_loadu_si128(x16 + 2); - LHC_M128 y2 = _mm_loadu_si128(y16 + 2); + LEO_M128 x2 = _mm_loadu_si128(x16 + 2); + LEO_M128 y2 = _mm_loadu_si128(y16 + 2); x2 = _mm_xor_si128(x2, y2); - LHC_M128 x3 = _mm_loadu_si128(x16 + 3); - LHC_M128 y3 = _mm_loadu_si128(y16 + 3); + LEO_M128 x3 = _mm_loadu_si128(x16 + 3); + LEO_M128 y3 = _mm_loadu_si128(y16 + 3); x3 = _mm_xor_si128(x3, y3); _mm_storeu_si128(x16, x0); @@ -807,7 +828,7 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign bytes -= 64, x16 += 4, y16 += 4; } } -#endif // LHC_TARGET_MOBILE +#endif // LEO_TARGET_MOBILE // Handle multiples of 16 bytes while (bytes >= 16) @@ -821,15 +842,15 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign bytes -= 16, ++x16, ++y16; } - uint8_t * LHC_RESTRICT x1 = reinterpret_cast(x16); - const uint8_t * LHC_RESTRICT y1 = reinterpret_cast(y16); + uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); + const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); // Handle a block of 8 bytes const unsigned eight = bytes & 8; if (eight) { - uint64_t * LHC_RESTRICT x8 = reinterpret_cast(x1); - const uint64_t * LHC_RESTRICT y8 = reinterpret_cast(y1); + uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); + const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); *x8 ^= *y8; } @@ -837,8 +858,8 @@ static void xor_mem(void * LHC_RESTRICT vx, const void * LHC_RESTRICT vy, unsign const unsigned four = bytes & 4; if (four) { - uint32_t * LHC_RESTRICT x4 = reinterpret_cast(x1 + eight); - const uint32_t * LHC_RESTRICT y4 = reinterpret_cast(y1 + eight); + uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); + const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); *x4 ^= *y4; } @@ -1158,7 +1179,7 @@ void test(unsigned k, unsigned seed) if (data[i] != codeword[i]) { printf("Decoding Error with seed = %d!\n", seed); - LHC_DEBUG_BREAK; + LEO_DEBUG_BREAK; return; } } @@ -1174,7 +1195,7 @@ void test(unsigned k, unsigned seed) int main(int argc, char **argv) { // Initialize architecture-specific code - lhc_architecture_init(); + leo_architecture_init(); // Fill GFLog table and GFExp table InitField(); 
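Editor's note: the log/exp multiply that mulE() relies on is easier to see in isolation. Below is a small, self-contained sketch (not part of this patch; the names Exp8, Log8, and SlowMul are illustrative) that builds plain exponential/log tables for GF(2^8) using the same 0x11D polynomial as the LEO_SHORT_FIELD configuration, then cross-checks Exp8[Log8[a] + Log8[b]] against a schoolbook carry-less multiply. It deliberately skips the Cantor-basis remapping that InitField() performs, so it demonstrates only the underlying technique.

#include <cstdint>
#include <cstdio>

static uint8_t Exp8[255 * 2]; // doubled so Log8[a] + Log8[b] needs no mod 255
static uint8_t Log8[256];

static void InitTables()
{
    unsigned state = 1;
    for (unsigned i = 0; i < 255; ++i)
    {
        Exp8[i] = Exp8[i + 255] = (uint8_t)state;
        Log8[state] = (uint8_t)i;
        state <<= 1;                // multiply by x
        if (state >= 256)
            state ^= 0x11D;         // reduce by the field polynomial
    }
}

// Schoolbook carry-less multiply plus reduction, for cross-checking
static uint8_t SlowMul(uint8_t a, uint8_t b)
{
    unsigned p = 0;
    for (unsigned x = a; b != 0; b >>= 1, x <<= 1)
        if (b & 1)
            p ^= x;
    for (int bit = 14; bit >= 8; --bit)
        if (p & (1u << bit))
            p ^= 0x11D << (bit - 8);
    return (uint8_t)p;
}

int main()
{
    InitTables();
    for (unsigned a = 1; a < 256; ++a)
        for (unsigned b = 1; b < 256; ++b)
            if (Exp8[Log8[a] + Log8[b]] != SlowMul((uint8_t)a, (uint8_t)b))
            {
                printf("Mismatch at %u * %u\n", a, b);
                return 1;
            }
    printf("Log/exp multiply matches schoolbook multiply for all pairs.\n");
    return 0;
}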
diff --git a/LeopardDecoder.h b/LeopardDecoder.h new file mode 100644 index 0000000..71d22e2 --- /dev/null +++ b/LeopardDecoder.h @@ -0,0 +1,1220 @@ +/* + Copyright (c) 2017 Christopher A. Taylor. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of LHC-RS nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include + + +/* + TODO: + + Write C API and unit tester + + Limit input to multiples of 64 bytes + + Replace GFSymbol with a file data pointer + + New 16-bit Muladd inner loops + + Class to contain the (large) muladd tables + + Preliminary benchmarks for large data! + + New 8-bit Muladd inner loops + + Benchmarks for smaller data! + + Refactor software + + Pick a name for the software better than LEO_RS + + I think it should be split up into several C++ modules + + Write detailed comments for all the routines + + Look into getting EncodeL working so we can support smaller data (Ask Lin) + + Look into using k instead of k2 to speed up decoder (Ask Lin) + + Avoid performing FFT/IFFT intermediate calculations we're not going to use + + Benchmarks, fun! + + Add multi-threading to split up long parallelizable calculations + + Final benchmarks! + + Finish up documentation + + Release version 1 + + + Muladd implementation notes: + + Specialize for 1-3 rows at a time since often times we're multiplying by + the same (skew) value repeatedly, as the ISA-L library does here: + + https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 + + Except we should be doing it for 16-bit Galois Field. + To implement that use the ALTMAP trick from Jerasure: + + http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 + + Except we should also support AVX2 since that is a 40% perf boost, so put + the high and low bytes 32 bytes instead of 16 bytes apart. + + Also I think we should go ahead and precompute the multiply tables since + it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. 
+*/ + + +//------------------------------------------------------------------------------ +// Debug + +// Some bugs only repro in release mode, so this can be helpful +//#define LEO_DEBUG_IN_RELEASE + +#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) + #define LEO_DEBUG + #ifdef _WIN32 + #define LEO_DEBUG_BREAK __debugbreak() + #else + #define LEO_DEBUG_BREAK __builtin_trap() + #endif + #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } +#else + #define LEO_DEBUG_BREAK ; + #define LEO_DEBUG_ASSERT(cond) ; +#endif + + +//------------------------------------------------------------------------------ +// Platform/Architecture + +#if defined(ANDROID) || defined(IOS) + #define LEO_TARGET_MOBILE +#endif // ANDROID + +#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) + #define LEO_TRY_AVX2 /* 256-bit */ + #include + #define LEO_ALIGN_BYTES 32 +#else // __AVX2__ + #define LEO_ALIGN_BYTES 16 +#endif // __AVX2__ + +#if !defined(LEO_TARGET_MOBILE) + // Note: MSVC currently only supports SSSE3 but not AVX2 + #include // SSSE3: _mm_shuffle_epi8 + #include // SSE2 +#endif // LEO_TARGET_MOBILE + +#if defined(HAVE_ARM_NEON_H) + #include +#endif // HAVE_ARM_NEON_H + +#if defined(LEO_TARGET_MOBILE) + + #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ + +# if defined(HAVE_ARM_NEON_H) + // Compiler-specific 128-bit SIMD register keyword + #define LEO_M128 uint8x16_t + #define LEO_TRY_NEON +#else + #define LEO_M128 uint64_t +# endif + +#else // LEO_TARGET_MOBILE + + // Compiler-specific 128-bit SIMD register keyword + #define LEO_M128 __m128i + +#endif // LEO_TARGET_MOBILE + +#ifdef LEO_TRY_AVX2 + // Compiler-specific 256-bit SIMD register keyword + #define LEO_M256 __m256i +#endif + +// Compiler-specific C++11 restrict keyword +#define LEO_RESTRICT __restrict + +// Compiler-specific force inline keyword +#ifdef _MSC_VER + #define LEO_FORCE_INLINE inline __forceinline +#else + #define LEO_FORCE_INLINE inline __attribute__((always_inline)) +#endif + +// Compiler-specific alignment keyword +// Note: Alignment only matters for ARM NEON where it should be 16 +#ifdef _MSC_VER + #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) +#else // _MSC_VER + #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) +#endif // _MSC_VER + + +//------------------------------------------------------------------------------ +// Runtime CPU Architecture Check +// +// Feature checks stolen shamelessly from +// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c + +#if defined(HAVE_ANDROID_GETCPUFEATURES) + #include +#endif + +#if defined(LEO_TRY_NEON) +# if defined(IOS) && defined(__ARM_NEON__) + // Requires iPhone 5S or newer + static const bool CpuHasNeon = true; + static const bool CpuHasNeon64 = true; +# else + // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures + static bool CpuHasNeon = false; // V6 / V7 + static bool CpuHasNeon64 = false; // 64-bit +# endif +#endif + + +#if !defined(LEO_TARGET_MOBILE) + +#ifdef _MSC_VER + #include // __cpuid + #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX +#endif + +#ifdef LEO_TRY_AVX2 +static bool CpuHasAVX2 = false; +#endif +static bool CpuHasSSSE3 = false; + +#define CPUID_EBX_AVX2 0x00000020 +#define CPUID_ECX_SSSE3 0x00000200 + +static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) +{ +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) + __cpuid((int 
*) cpu_info, cpu_info_type); +#else //if defined(HAVE_CPUID) + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; +# ifdef __i386__ + __asm__ __volatile__ ("pushfl; pushfl; " + "popl %0; " + "movl %0, %1; xorl %2, %0; " + "pushl %0; " + "popfl; pushfl; popl %0; popfl" : + "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : + "i" (0x200000)); + if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { + return; /* LCOV_EXCL_LINE */ + } +# endif +# ifdef __i386__ + __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : + "=a" (cpu_info[0]), "=&r" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# elif defined(__x86_64__) + __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : + "=a" (cpu_info[0]), "=&r" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# else + __asm__ __volatile__ ("cpuid" : + "=a" (cpu_info[0]), "=b" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# endif +#endif +} + +#endif // defined(LEO_TARGET_MOBILE) + + +static void leo_architecture_init() +{ +#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) + AndroidCpuFamily family = android_getCpuFamily(); + if (family == ANDROID_CPU_FAMILY_ARM) + { + if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) + CpuHasNeon = true; + } + else if (family == ANDROID_CPU_FAMILY_ARM64) + { + CpuHasNeon = true; + if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) + CpuHasNeon64 = true; + } +#endif + +#if !defined(LEO_TARGET_MOBILE) + unsigned int cpu_info[4]; + + _cpuid(cpu_info, 1); + CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); + +#if defined(LEO_TRY_AVX2) + _cpuid(cpu_info, 7); + CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); +#endif // LEO_TRY_AVX2 + +#endif // LEO_TARGET_MOBILE +} + + +//------------------------------------------------------------------------------ +// SIMD-Safe Aligned Memory Allocations + +static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; + +LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) +{ + return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); +} + +static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) +{ + uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); + if (!data) + return nullptr; + unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); + data += kAlignmentBytes - offset; + data[-1] = (uint8_t)offset; + return data; +} + +static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) +{ + if (!ptr) + return; + uint8_t* data = (uint8_t*)ptr; + unsigned offset = data[-1]; + if (offset >= kAlignmentBytes) + { + LEO_DEBUG_BREAK; // Should never happen + return; + } + data -= kAlignmentBytes - offset; + free(data); +} + + +//------------------------------------------------------------------------------ +// Field + +//#define LEO_SHORT_FIELD + +#ifdef LEO_SHORT_FIELD +typedef uint8_t GFSymbol; +static const unsigned kGFBits = 8; +static const unsigned kGFPolynomial = 0x11D; +GFSymbol kGFBasis[kGFBits] = { + 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis +}; +#else +typedef uint16_t GFSymbol; +static const unsigned kGFBits = 16; +static const unsigned kGFPolynomial = 0x1002D; +GFSymbol kGFBasis[kGFBits] = { + 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis + 0xC582, 0xED2E, 0x914C, 0x4012, + 0x6C98, 0x10D8, 0x6A72, 0xB900, + 0xFDB8, 0xFB34, 0xFF38, 0x991E +}; +#endif + +/* + Cantor Basis introduced by: + D. G. 
Cantor, "On arithmetical algorithms over finite fields",
+    Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989.
+*/
+
+static const unsigned kFieldSize = (unsigned)1 << kGFBits; // Field size
+static const unsigned kFieldModulus = kFieldSize - 1;
+
+static GFSymbol GFLog[kFieldSize];
+static GFSymbol GFExp[kFieldSize];
+
+// Initialize GFLog[], GFExp[]
+static void InitField()
+{
+    unsigned state = 1;
+    for (unsigned i = 0; i < kFieldModulus; ++i)
+    {
+        GFExp[state] = static_cast<GFSymbol>(i);
+        state <<= 1;
+        if (state >= kFieldSize)
+            state ^= kGFPolynomial;
+    }
+    GFExp[0] = kFieldModulus;
+
+    // Conversion to chosen basis:
+
+    GFLog[0] = 0;
+    for (unsigned i = 0; i < kGFBits; ++i)
+    {
+        const GFSymbol basis = kGFBasis[i];
+        const unsigned width = (unsigned)(1UL << i);
+
+        for (unsigned j = 0; j < width; ++j)
+            GFLog[j + width] = GFLog[j] ^ basis;
+    }
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        GFLog[i] = GFExp[GFLog[i]];
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        GFExp[GFLog[i]] = static_cast<GFSymbol>(i);
+
+    GFExp[kFieldModulus] = GFExp[0];
+}
+
+
+//------------------------------------------------------------------------------
+// Mod Q Field Operations
+//
+// Q is the maximum symbol value, e.g. 255 or 65535.
+
+// z = x + y (mod Q)
+static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b)
+{
+    const unsigned sum = (unsigned)a + b;
+
+    // Partial reduction step, allowing for Q to be returned
+    return static_cast<GFSymbol>(sum + (sum >> kGFBits));
+}
+
+// z = x - y (mod Q)
+static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b)
+{
+    const unsigned dif = (unsigned)a - b;
+
+    // Partial reduction step, allowing for Q to be returned
+    return static_cast<GFSymbol>(dif + (dif >> kGFBits));
+}
+
+// vx[] += vy[] * z
+static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount)
+{
+    for (unsigned i = 0; i < symbolCount; ++i)
+    {
+        const GFSymbol a = vy[i];
+        if (a == 0)
+            continue;
+
+        // Multiplication distributes over XOR, so each 4-bit slice of the
+        // symbol is multiplied separately and the partial products XORed.
+        GFSymbol sum1 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f], z));
+        GFSymbol value1 = GFExp[sum1];
+        if ((a & 0x0f) == 0)
+        {
+            value1 = 0;
+        }
+        GFSymbol sum2 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf0], z));
+        GFSymbol value2 = GFExp[sum2];
+        if ((a & 0xf0) == 0)
+        {
+            value2 = 0;
+        }
+        GFSymbol sum3 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f00], z));
+        GFSymbol value3 = GFExp[sum3];
+        if ((a & 0x0f00) == 0)
+        {
+            value3 = 0;
+        }
+        GFSymbol sum4 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf000], z));
+        GFSymbol value4 = GFExp[sum4];
+        if ((a & 0xf000) == 0)
+        {
+            value4 = 0;
+        }
+
+        vx[i] ^= value1;
+        vx[i] ^= value2;
+        vx[i] ^= value3;
+        vx[i] ^= value4;
+    }
+}
+
+// return a*GFExp[b] over GF(2^r)
+static GFSymbol mulE(GFSymbol a, GFSymbol b)
+{
+    if (a == 0)
+        return 0;
+
+    const GFSymbol sum = static_cast<GFSymbol>(AddModQ(GFLog[a], b));
+    return GFExp[sum];
+}
+
+
+//------------------------------------------------------------------------------
+// Fast Walsh-Hadamard Transform (FWHT) Mod Q
+//
+// Q is the maximum symbol value, e.g. 255 or 65535.
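+//
+// Why decode() can apply the same FWHT in both directions with no
+// normalization (a sketch of the reasoning, not a claim from the original
+// notes): over the integers the transform satisfies
+// FWHT(FWHT(x)) = 2^kGFBits * x, and since Q = 2^kGFBits - 1 we have
+// 2^kGFBits == 1 (mod Q), so the full-size transform is its own inverse
+// mod Q. A single butterfly doubles instead of identity, e.g. with
+// Q = 65535: FWHT_2(5, 9) -> {14, 65531}, and applying FWHT_2 again
+// gives {10, 18} = {2*5, 2*9} (mod Q).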
+ +// Define this to enable the optimized version of FWHT() +#define LEO_FWHT_OPTIMIZED + +typedef GFSymbol fwht_t; + +// {a, b} = {a + b, a - b} (Mod Q) +static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) +{ + const fwht_t sum = AddModQ(a, b); + const fwht_t dif = SubModQ(a, b); + a = sum; + b = dif; +} + +/* + FWHT is a minor slice of the runtime and does not grow with data size, + but I did attempt a few additional optimizations that failed: + + I've attempted to vectorize (with partial reductions) FWHT_4(data, s), + which is 70% of the algorithm, but it was slower. Left in _attic_. + + I've attempted to avoid reductions in all or parts of the FWHT. + The final modular reduction ends up being slower than the savings. + Specifically I tried doing it for the whole FWHT and also I tried + doing it just for the FWHT_2 loop in the main routine, but both + approaches are slower than partial reductions. + + Replacing word reads with wider reads does speed up the operation, but + at too high a complexity cost relative to minor perf improvement. +*/ + +#ifndef LEO_FWHT_OPTIMIZED + +// Reference implementation +static void FWHT(fwht_t* data, const unsigned bits) +{ + const unsigned size = (unsigned)(1UL << bits); + for (unsigned width = 1; width < size; width <<= 1) + for (unsigned i = 0; i < size; i += (width << 1)) + for (unsigned j = i; j < (width + i); ++j) + FWHT_2(data[j], data[j + width]); +} + +#else + +static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; +} + +static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) +{ + unsigned x = 0; + fwht_t t0 = data[x]; x += s; + fwht_t t1 = data[x]; x += s; + fwht_t t2 = data[x]; x += s; + fwht_t t3 = data[x]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + unsigned y = 0; + data[y] = t0; y += s; + data[y] = t1; y += s; + data[y] = t2; y += s; + data[y] = t3; +} + +static inline void FWHT_8(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + fwht_t t4 = data[4]; + fwht_t t5 = data[5]; + fwht_t t6 = data[6]; + fwht_t t7 = data[7]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t4, t5); + FWHT_2(t6, t7); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + FWHT_2(t4, t6); + FWHT_2(t5, t7); + FWHT_2(t0, t4); + FWHT_2(t1, t5); + FWHT_2(t2, t6); + FWHT_2(t3, t7); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; + data[4] = t4; + data[5] = t5; + data[6] = t6; + data[7] = t7; +} + +static inline void FWHT_16(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + fwht_t t4 = data[4]; + fwht_t t5 = data[5]; + fwht_t t6 = data[6]; + fwht_t t7 = data[7]; + fwht_t t8 = data[8]; + fwht_t t9 = data[9]; + fwht_t t10 = data[10]; + fwht_t t11 = data[11]; + fwht_t t12 = data[12]; + fwht_t t13 = data[13]; + fwht_t t14 = data[14]; + fwht_t t15 = data[15]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t4, t5); + FWHT_2(t6, t7); + FWHT_2(t8, t9); + FWHT_2(t10, t11); + FWHT_2(t12, t13); + FWHT_2(t14, t15); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + FWHT_2(t4, t6); + FWHT_2(t5, t7); + FWHT_2(t8, t10); + FWHT_2(t9, t11); + FWHT_2(t12, t14); + FWHT_2(t13, t15); + FWHT_2(t0, t4); + FWHT_2(t1, t5); + FWHT_2(t2, t6); + FWHT_2(t3, t7); + FWHT_2(t8, t12); + FWHT_2(t9, t13); + 
FWHT_2(t10, t14); + FWHT_2(t11, t15); + FWHT_2(t0, t8); + FWHT_2(t1, t9); + FWHT_2(t2, t10); + FWHT_2(t3, t11); + FWHT_2(t4, t12); + FWHT_2(t5, t13); + FWHT_2(t6, t14); + FWHT_2(t7, t15); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; + data[4] = t4; + data[5] = t5; + data[6] = t6; + data[7] = t7; + data[8] = t8; + data[9] = t9; + data[10] = t10; + data[11] = t11; + data[12] = t12; + data[13] = t13; + data[14] = t14; + data[15] = t15; +} + +static void FWHT_SmallData(fwht_t* data, unsigned ldn) +{ + const unsigned n = (1UL << ldn); + + if (n <= 2) + { + if (n == 2) + FWHT_2(data[0], data[1]); + return; + } + + for (unsigned ldm = ldn; ldm > 3; ldm -= 2) + { + unsigned m = (1UL << ldm); + unsigned m4 = (m >> 2); + for (unsigned r = 0; r < n; r += m) + for (unsigned j = 0; j < m4; j++) + FWHT_4(data + j + r, m4); + } + + if (ldn & 1) + { + for (unsigned i0 = 0; i0 < n; i0 += 8) + FWHT_8(data + i0); + } + else + { + for (unsigned i0 = 0; i0 < n; i0 += 4) + FWHT_4(data + i0); + } +} + +// Decimation in time (DIT) version +static void FWHT(fwht_t* data, const unsigned ldn) +{ + if (ldn <= 13) + { + FWHT_SmallData(data, ldn); + return; + } + + FWHT_2(data[2], data[3]); + FWHT_4(data + 4); + FWHT_8(data + 8); + FWHT_16(data + 16); + for (unsigned ldm = 5; ldm < ldn; ++ldm) + FWHT(data + (unsigned)(1UL << ldm), ldm); + + for (unsigned ldm = 0; ldm < ldn; ++ldm) + { + const unsigned mh = (1UL << ldm); + for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) + FWHT_2(data[t1], data[t2]); + } +} + +#endif + + +//------------------------------------------------------------------------------ +// Memory Buffer XOR + +static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) +{ + LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); + const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); + +#if defined(LEO_TARGET_MOBILE) +# if defined(LEO_TRY_NEON) + // Handle multiples of 64 bytes + if (CpuHasNeon) + { + while (bytes >= 64) + { + LEO_M128 x0 = vld1q_u8(x16); + LEO_M128 x1 = vld1q_u8(x16 + 1); + LEO_M128 x2 = vld1q_u8(x16 + 2); + LEO_M128 x3 = vld1q_u8(x16 + 3); + LEO_M128 y0 = vld1q_u8(y16); + LEO_M128 y1 = vld1q_u8(y16 + 1); + LEO_M128 y2 = vld1q_u8(y16 + 2); + LEO_M128 y3 = vld1q_u8(y16 + 3); + + vst1q_u8(x16, veorq_u8(x0, y0)); + vst1q_u8(x16 + 1, veorq_u8(x1, y1)); + vst1q_u8(x16 + 2, veorq_u8(x2, y2)); + vst1q_u8(x16 + 3, veorq_u8(x3, y3)); + + bytes -= 64, x16 += 4, y16 += 4; + } + + // Handle multiples of 16 bytes + while (bytes >= 16) + { + LEO_M128 x0 = vld1q_u8(x16); + LEO_M128 y0 = vld1q_u8(y16); + + vst1q_u8(x16, veorq_u8(x0, y0)); + + bytes -= 16, ++x16, ++y16; + } + } + else +# endif // LEO_TRY_NEON + { + uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); + const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); + + const unsigned count = (unsigned)bytes / 8; + for (unsigned ii = 0; ii < count; ++ii) + x8[ii] ^= y8[ii]; + + x16 = reinterpret_cast(x8 + count); + y16 = reinterpret_cast(y8 + count); + } +#else // LEO_TARGET_MOBILE +# if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) + { + LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); + const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); + + while (bytes >= 128) + { + LEO_M256 x0 = _mm256_loadu_si256(x32); + LEO_M256 y0 = _mm256_loadu_si256(y32); + x0 = _mm256_xor_si256(x0, y0); + LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); + LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); + x1 = _mm256_xor_si256(x1, y1); + LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); + LEO_M256 y2 = _mm256_loadu_si256(y32 + 
2); + x2 = _mm256_xor_si256(x2, y2); + LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); + LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); + x3 = _mm256_xor_si256(x3, y3); + + _mm256_storeu_si256(x32, x0); + _mm256_storeu_si256(x32 + 1, x1); + _mm256_storeu_si256(x32 + 2, x2); + _mm256_storeu_si256(x32 + 3, x3); + + bytes -= 128, x32 += 4, y32 += 4; + } + + // Handle multiples of 32 bytes + while (bytes >= 32) + { + // x[i] = x[i] xor y[i] + _mm256_storeu_si256(x32, + _mm256_xor_si256( + _mm256_loadu_si256(x32), + _mm256_loadu_si256(y32))); + + bytes -= 32, ++x32, ++y32; + } + + x16 = reinterpret_cast(x32); + y16 = reinterpret_cast(y32); + } + else +# endif // LEO_TRY_AVX2 + { + while (bytes >= 64) + { + LEO_M128 x0 = _mm_loadu_si128(x16); + LEO_M128 y0 = _mm_loadu_si128(y16); + x0 = _mm_xor_si128(x0, y0); + LEO_M128 x1 = _mm_loadu_si128(x16 + 1); + LEO_M128 y1 = _mm_loadu_si128(y16 + 1); + x1 = _mm_xor_si128(x1, y1); + LEO_M128 x2 = _mm_loadu_si128(x16 + 2); + LEO_M128 y2 = _mm_loadu_si128(y16 + 2); + x2 = _mm_xor_si128(x2, y2); + LEO_M128 x3 = _mm_loadu_si128(x16 + 3); + LEO_M128 y3 = _mm_loadu_si128(y16 + 3); + x3 = _mm_xor_si128(x3, y3); + + _mm_storeu_si128(x16, x0); + _mm_storeu_si128(x16 + 1, x1); + _mm_storeu_si128(x16 + 2, x2); + _mm_storeu_si128(x16 + 3, x3); + + bytes -= 64, x16 += 4, y16 += 4; + } + } +#endif // LEO_TARGET_MOBILE + + // Handle multiples of 16 bytes + while (bytes >= 16) + { + // x[i] = x[i] xor y[i] + _mm_storeu_si128(x16, + _mm_xor_si128( + _mm_loadu_si128(x16), + _mm_loadu_si128(y16))); + + bytes -= 16, ++x16, ++y16; + } + + uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); + const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); + + // Handle a block of 8 bytes + const unsigned eight = bytes & 8; + if (eight) + { + uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); + const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); + *x8 ^= *y8; + } + + // Handle a block of 4 bytes + const unsigned four = bytes & 4; + if (four) + { + uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); + const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); + *x4 ^= *y4; + } + + // Handle final bytes + const unsigned offset = eight + four; + switch (bytes & 3) + { + case 3: x1[offset + 2] ^= y1[offset + 2]; + case 2: x1[offset + 1] ^= y1[offset + 1]; + case 1: x1[offset] ^= y1[offset]; + default: + break; + } +} + + +//------------------------------------------------------------------------------ +// Formal Derivative + +// Formal derivative of polynomial in the new basis +static void formal_derivative(GFSymbol* cos, const unsigned size) +{ + for (unsigned i = 1; i < size; ++i) + { + const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; + + // If a large number of values are being XORed: + if (leng >= 8) + xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); + else + for (unsigned j = i - leng; j < i; j++) + cos[j] ^= cos[j + leng]; + } + + for (unsigned i = size; i < kFieldSize; i <<= 1) + xor_mem(cos, cos + i, size * sizeof(GFSymbol)); +} + + +//------------------------------------------------------------------------------ +// Fast Fourier Transform + +static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT + +// IFFT in the proposed basis +static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) +{ + for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) + { + for (unsigned j = depart_no; j < size; j += (depart_no << 1)) + { + // If a large number of values are being XORed: + if (depart_no >= 8) + xor_mem(data + j, data + j - 
depart_no, depart_no * sizeof(GFSymbol)); + else + for (unsigned i = j - depart_no; i < j; ++i) + data[i + depart_no] ^= data[i]; + + const GFSymbol skew = skewVec[j + index - 1]; + + if (skew != kFieldModulus) + muladd_mem(data + j - depart_no, data + j, skew, depart_no); + } + } +} + +// FFT in the proposed basis +static void FLT(GFSymbol* data, const unsigned size, const unsigned index) +{ + for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) + { + for (unsigned j = depart_no; j < size; j += (depart_no << 1)) + { + const GFSymbol skew = skewVec[j + index - 1]; + + if (skew != kFieldModulus) + muladd_mem(data + j - depart_no, data + j, skew, depart_no); + + // If a large number of values are being XORed: + if (depart_no >= 8) + xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); + else + for (unsigned i = j - depart_no; i < j; ++i) + data[i + depart_no] ^= data[i]; + } + } +} + + +//------------------------------------------------------------------------------ +// FFT Initialization + +static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative +static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial + +// Initialize skewVec[], B[], log_walsh[] +static void InitFieldOperations() +{ + GFSymbol temp[kGFBits - 1]; + + for (unsigned i = 1; i < kGFBits; ++i) + temp[i - 1] = (GFSymbol)((unsigned)1 << i); + + for (unsigned m = 0; m < (kGFBits - 1); ++m) + { + const unsigned step = (unsigned)1 << (m + 1); + + skewVec[((unsigned)1 << m) - 1] = 0; + + for (unsigned i = m; i < (kGFBits - 1); ++i) + { + const unsigned s = ((unsigned)1 << (i + 1)); + + for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) + skewVec[j + s] = skewVec[j] ^ temp[i]; + } + + temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; + + for (unsigned i = m + 1; i < (kGFBits - 1); ++i) + temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); + } + + for (unsigned i = 0; i < kFieldSize; ++i) + skewVec[i] = GFLog[skewVec[i]]; + + temp[0] = kFieldModulus - temp[0]; + + for (unsigned i = 1; i < (kGFBits - 1); ++i) + temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; + + B[0] = 0; + for (unsigned i = 0; i < (kGFBits - 1); ++i) + { + const unsigned depart = ((unsigned)1 << i); + + for (unsigned j = 0; j < depart; ++j) + B[j + depart] = (B[j] + temp[i]) % kFieldModulus; + } + + for (unsigned i = 0; i < kFieldSize; ++i) + log_walsh[i] = GFLog[i]; + + log_walsh[0] = 0; + + FWHT(log_walsh, kGFBits); +} + + +//------------------------------------------------------------------------------ +// Encoder + +// Encoding alg for k/n<0.5: message is a power of two +static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) +{ + memcpy(codeword, data, sizeof(GFSymbol) * k); + + IFLT(codeword, k, 0); + + for (unsigned i = k; i < kFieldSize; i += k) + { + memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); + + FLT(&codeword[i], k, i); + } + + memcpy(codeword, data, sizeof(GFSymbol) * k); +} + +// Encoding alg for k/n>0.5: parity is a power of two. +// data: message array. parity: parity array. 
mem: buffer (size >= n-k)
+static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem)
+{
+    const unsigned t = kFieldSize - k;
+
+    memset(parity, 0, sizeof(GFSymbol) * t);
+
+    for (unsigned i = t; i < kFieldSize; i += t)
+    {
+        memcpy(mem, &data[i - t], sizeof(GFSymbol) * t);
+
+        IFLT(mem, t, i);
+
+        xor_mem(parity, mem, t * sizeof(GFSymbol));
+    }
+
+    FLT(parity, t, 0);
+}
+
+
+//------------------------------------------------------------------------------
+// Decoder
+
+static void decode(GFSymbol* codeword, unsigned k, const bool* erasure)
+{
+    fwht_t log_walsh2[kFieldSize];
+
+    // Compute the evaluations of the error locator polynomial
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        log_walsh2[i] = erasure[i] ? 1 : 0;
+
+    FWHT(log_walsh2, kGFBits);
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus;
+
+    FWHT(log_walsh2, kGFBits);
+
+    // k2 should ideally be k, but simply substituting k here breaks decoding;
+    // left at kFieldSize until the root cause is understood (see TODO: Ask Lin)
+    const unsigned k2 = kFieldSize;
+    //const unsigned k2 = k; // cannot actually be replaced with k -- what else needs to change?
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+    {
+        if (erasure[i])
+        {
+            codeword[i] = 0;
+        }
+        else
+        {
+            codeword[i] = mulE(codeword[i], log_walsh2[i]);
+        }
+    }
+
+    IFLT(codeword, kFieldSize, 0);
+
+    // formal derivative
+    for (unsigned i = 0; i < kFieldSize; i += 2)
+    {
+        codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]);
+        codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]);
+    }
+
+    formal_derivative(codeword, k2);
+
+    for (unsigned i = 0; i < k2; i += 2)
+    {
+        codeword[i] = mulE(codeword[i], B[i >> 1]);
+        codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]);
+    }
+
+    FLT(codeword, k2, 0);
+
+    for (unsigned i = 0; i < k2; ++i)
+    {
+        if (erasure[i])
+        {
+            codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]);
+        }
+    }
+}
+
+
+//------------------------------------------------------------------------------
+// Test Application
+
+void test(unsigned k, unsigned seed)
+{
+    srand(seed);
+
+    //-----------Generating message----------
+
+    // Message array
+    GFSymbol data[kFieldSize] = {0};
+
+    // Fill the message with random numbers
+    for (unsigned i = kFieldSize - k; i < kFieldSize; ++i)
+        data[i] = (GFSymbol)rand();
+
+
+    //---------encoding----------
+
+    GFSymbol codeword[kFieldSize];
+    encodeH(&data[kFieldSize - k], k, data, codeword);
+    //encodeL(data, k, codeword); // does not seem to work with any input -- what else needs to change?
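+    /*
+        Layout sketch (reading the encodeH() call above): the k message
+        symbols live in data[kFieldSize - k .. kFieldSize - 1], encodeH()
+        writes the t = kFieldSize - k parity symbols into data[0 .. t - 1],
+        and codeword[] is only used as the scratch buffer `mem`. The memcpy
+        below then assembles the full codeword as parity followed by message.
+    */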
+ + memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); + + + //--------erasure simulation--------- + + // Array indicating erasures + bool erasure[kFieldSize] = { + false + }; + + for (unsigned i = k; i < kFieldSize; ++i) + erasure[i] = true; + + // permuting the erasure array + for (unsigned i = kFieldSize - 1; i > 0; --i) + { + unsigned pos = rand() % (i + 1); + + if (i != pos) + { + bool tmp = erasure[i]; + erasure[i] = erasure[pos]; + erasure[pos] = tmp; + } + } + + // erasure codeword symbols + for (unsigned i = 0; i < kFieldSize; ++i) + if (erasure[i]) + codeword[i] = 0; + + + //---------main processing---------- + decode(codeword, k, erasure); + + // Check the correctness of the result + for (unsigned i = 0; i < kFieldSize; ++i) + { + if (erasure[i] == 1) + { + if (data[i] != codeword[i]) + { + printf("Decoding Error with seed = %d!\n", seed); + LEO_DEBUG_BREAK; + return; + } + } + } + + //printf("Decoding is successful!\n"); +} + + +//------------------------------------------------------------------------------ +// Entrypoint + +int main(int argc, char **argv) +{ + // Initialize architecture-specific code + leo_architecture_init(); + + // Fill GFLog table and GFExp table + InitField(); + + // Compute factors used in erasure decoder + InitFieldOperations(); + + unsigned seed = (unsigned)time(NULL); + for (;;) + { + // test(int k), k: message size + /* + EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, + s.t. the number of recovery pieces is a power of two + */ + test(kFieldSize / 2, seed); + + ++seed; + } + + return 0; +} diff --git a/LeopardEncoder.cpp b/LeopardEncoder.cpp new file mode 100644 index 0000000..71d22e2 --- /dev/null +++ b/LeopardEncoder.cpp @@ -0,0 +1,1220 @@ +/* + Copyright (c) 2017 Christopher A. Taylor. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of LHC-RS nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include +#include +#include +#include +#include + + +/* + TODO: + + Write C API and unit tester + + Limit input to multiples of 64 bytes + + Replace GFSymbol with a file data pointer + + New 16-bit Muladd inner loops + + Class to contain the (large) muladd tables + + Preliminary benchmarks for large data! + + New 8-bit Muladd inner loops + + Benchmarks for smaller data! + + Refactor software + + Pick a name for the software better than LEO_RS + + I think it should be split up into several C++ modules + + Write detailed comments for all the routines + + Look into getting EncodeL working so we can support smaller data (Ask Lin) + + Look into using k instead of k2 to speed up decoder (Ask Lin) + + Avoid performing FFT/IFFT intermediate calculations we're not going to use + + Benchmarks, fun! + + Add multi-threading to split up long parallelizable calculations + + Final benchmarks! + + Finish up documentation + + Release version 1 + + + Muladd implementation notes: + + Specialize for 1-3 rows at a time since often times we're multiplying by + the same (skew) value repeatedly, as the ISA-L library does here: + + https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 + + Except we should be doing it for 16-bit Galois Field. + To implement that use the ALTMAP trick from Jerasure: + + http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 + + Except we should also support AVX2 since that is a 40% perf boost, so put + the high and low bytes 32 bytes instead of 16 bytes apart. + + Also I think we should go ahead and precompute the multiply tables since + it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. +*/ + + +//------------------------------------------------------------------------------ +// Debug + +// Some bugs only repro in release mode, so this can be helpful +//#define LEO_DEBUG_IN_RELEASE + +#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) + #define LEO_DEBUG + #ifdef _WIN32 + #define LEO_DEBUG_BREAK __debugbreak() + #else + #define LEO_DEBUG_BREAK __builtin_trap() + #endif + #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } +#else + #define LEO_DEBUG_BREAK ; + #define LEO_DEBUG_ASSERT(cond) ; +#endif + + +//------------------------------------------------------------------------------ +// Platform/Architecture + +#if defined(ANDROID) || defined(IOS) + #define LEO_TARGET_MOBILE +#endif // ANDROID + +#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) + #define LEO_TRY_AVX2 /* 256-bit */ + #include + #define LEO_ALIGN_BYTES 32 +#else // __AVX2__ + #define LEO_ALIGN_BYTES 16 +#endif // __AVX2__ + +#if !defined(LEO_TARGET_MOBILE) + // Note: MSVC currently only supports SSSE3 but not AVX2 + #include // SSSE3: _mm_shuffle_epi8 + #include // SSE2 +#endif // LEO_TARGET_MOBILE + +#if defined(HAVE_ARM_NEON_H) + #include +#endif // HAVE_ARM_NEON_H + +#if defined(LEO_TARGET_MOBILE) + + #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ + +# if defined(HAVE_ARM_NEON_H) + // Compiler-specific 128-bit SIMD register keyword + #define LEO_M128 uint8x16_t + #define LEO_TRY_NEON +#else + #define LEO_M128 uint64_t +# endif + +#else // LEO_TARGET_MOBILE + + // Compiler-specific 128-bit SIMD register keyword + #define LEO_M128 __m128i + +#endif // LEO_TARGET_MOBILE + +#ifdef LEO_TRY_AVX2 + // Compiler-specific 256-bit SIMD register keyword + #define LEO_M256 __m256i +#endif + +// Compiler-specific C++11 restrict keyword +#define LEO_RESTRICT __restrict 
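+
+// Aliasing note (sketch): LEO_RESTRICT promises the compiler that
+// differently-restricted pointer parameters never overlap, which is what
+// lets the bulk loops in xor_mem() and muladd_mem() below keep values in
+// registers and auto-vectorize. A hypothetical signature, for illustration
+// only:
+//
+//     void xor_block(uint8_t * LEO_RESTRICT x, const uint8_t * LEO_RESTRICT y, unsigned bytes);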
+ +// Compiler-specific force inline keyword +#ifdef _MSC_VER + #define LEO_FORCE_INLINE inline __forceinline +#else + #define LEO_FORCE_INLINE inline __attribute__((always_inline)) +#endif + +// Compiler-specific alignment keyword +// Note: Alignment only matters for ARM NEON where it should be 16 +#ifdef _MSC_VER + #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) +#else // _MSC_VER + #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) +#endif // _MSC_VER + + +//------------------------------------------------------------------------------ +// Runtime CPU Architecture Check +// +// Feature checks stolen shamelessly from +// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c + +#if defined(HAVE_ANDROID_GETCPUFEATURES) + #include +#endif + +#if defined(LEO_TRY_NEON) +# if defined(IOS) && defined(__ARM_NEON__) + // Requires iPhone 5S or newer + static const bool CpuHasNeon = true; + static const bool CpuHasNeon64 = true; +# else + // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures + static bool CpuHasNeon = false; // V6 / V7 + static bool CpuHasNeon64 = false; // 64-bit +# endif +#endif + + +#if !defined(LEO_TARGET_MOBILE) + +#ifdef _MSC_VER + #include // __cpuid + #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX +#endif + +#ifdef LEO_TRY_AVX2 +static bool CpuHasAVX2 = false; +#endif +static bool CpuHasSSSE3 = false; + +#define CPUID_EBX_AVX2 0x00000020 +#define CPUID_ECX_SSSE3 0x00000200 + +static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) +{ +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) + __cpuid((int *) cpu_info, cpu_info_type); +#else //if defined(HAVE_CPUID) + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; +# ifdef __i386__ + __asm__ __volatile__ ("pushfl; pushfl; " + "popl %0; " + "movl %0, %1; xorl %2, %0; " + "pushl %0; " + "popfl; pushfl; popl %0; popfl" : + "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : + "i" (0x200000)); + if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { + return; /* LCOV_EXCL_LINE */ + } +# endif +# ifdef __i386__ + __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : + "=a" (cpu_info[0]), "=&r" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# elif defined(__x86_64__) + __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : + "=a" (cpu_info[0]), "=&r" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# else + __asm__ __volatile__ ("cpuid" : + "=a" (cpu_info[0]), "=b" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# endif +#endif +} + +#endif // defined(LEO_TARGET_MOBILE) + + +static void leo_architecture_init() +{ +#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) + AndroidCpuFamily family = android_getCpuFamily(); + if (family == ANDROID_CPU_FAMILY_ARM) + { + if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) + CpuHasNeon = true; + } + else if (family == ANDROID_CPU_FAMILY_ARM64) + { + CpuHasNeon = true; + if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) + CpuHasNeon64 = true; + } +#endif + +#if !defined(LEO_TARGET_MOBILE) + unsigned int cpu_info[4]; + + _cpuid(cpu_info, 1); + CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); + +#if defined(LEO_TRY_AVX2) + _cpuid(cpu_info, 7); + CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); +#endif // LEO_TRY_AVX2 + +#endif // 
LEO_TARGET_MOBILE +} + + +//------------------------------------------------------------------------------ +// SIMD-Safe Aligned Memory Allocations + +static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; + +LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) +{ + return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); +} + +static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) +{ + uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); + if (!data) + return nullptr; + unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); + data += kAlignmentBytes - offset; + data[-1] = (uint8_t)offset; + return data; +} + +static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) +{ + if (!ptr) + return; + uint8_t* data = (uint8_t*)ptr; + unsigned offset = data[-1]; + if (offset >= kAlignmentBytes) + { + LEO_DEBUG_BREAK; // Should never happen + return; + } + data -= kAlignmentBytes - offset; + free(data); +} + + +//------------------------------------------------------------------------------ +// Field + +//#define LEO_SHORT_FIELD + +#ifdef LEO_SHORT_FIELD +typedef uint8_t GFSymbol; +static const unsigned kGFBits = 8; +static const unsigned kGFPolynomial = 0x11D; +GFSymbol kGFBasis[kGFBits] = { + 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis +}; +#else +typedef uint16_t GFSymbol; +static const unsigned kGFBits = 16; +static const unsigned kGFPolynomial = 0x1002D; +GFSymbol kGFBasis[kGFBits] = { + 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis + 0xC582, 0xED2E, 0x914C, 0x4012, + 0x6C98, 0x10D8, 0x6A72, 0xB900, + 0xFDB8, 0xFB34, 0xFF38, 0x991E +}; +#endif + +/* + Cantor Basis introduced by: + D. G. Cantor, "On arithmetical algorithms over finite fields", + Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. +*/ + +static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size +static const unsigned kFieldModulus = kFieldSize - 1; + +static GFSymbol GFLog[kFieldSize]; +static GFSymbol GFExp[kFieldSize]; + +// Initialize GFLog[], GFExp[] +static void InitField() +{ + unsigned state = 1; + for (unsigned i = 0; i < kFieldModulus; ++i) + { + GFExp[state] = static_cast(i); + state <<= 1; + if (state >= kFieldSize) + state ^= kGFPolynomial; + } + GFExp[0] = kFieldModulus; + + // Conversion to chosen basis: + + GFLog[0] = 0; + for (unsigned i = 0; i < kGFBits; ++i) + { + const GFSymbol basis = kGFBasis[i]; + const unsigned width = (unsigned)(1UL << i); + + for (unsigned j = 0; j < width; ++j) + GFLog[j + width] = GFLog[j] ^ basis; + } + + for (unsigned i = 0; i < kFieldSize; ++i) + GFLog[i] = GFExp[GFLog[i]]; + + for (unsigned i = 0; i < kFieldSize; ++i) + GFExp[GFLog[i]] = i; + + GFExp[kFieldModulus] = GFExp[0]; +} + + +//------------------------------------------------------------------------------ +// Mod Q Field Operations +// +// Q is the maximum symbol value, e.g. 255 or 65535. 
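+//
+// Worked example of the partial reduction below (illustrative values): with
+// kGFBits = 16, AddModQ(65534, 3) forms sum = 0x10001, then
+// sum + (sum >> 16) = 0x10002, which truncates to GFSymbol 2, matching
+// (65534 + 3) mod 65535. The reduction is only partial because Q itself can
+// come back unreduced: AddModQ(65534, 1) returns 65535 = Q, congruent to 0
+// but not canonicalized -- which is why InitField() sets
+// GFExp[kFieldModulus] = GFExp[0].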
+ +// z = x + y (mod Q) +static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b) +{ + const unsigned sum = (unsigned)a + b; + + // Partial reduction step, allowing for Q to be returned + return static_cast(sum + (sum >> kGFBits)); +} + +// z = x - y (mod Q) +static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) +{ + const unsigned dif = (unsigned)a - b; + + // Partial reduction step, allowing for Q to be returned + return static_cast(dif + (dif >> kGFBits)); +} + +// vx[] += vy[] * z +static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) +{ + for (unsigned i = 0; i < symbolCount; ++i) + { + const GFSymbol a = vy[i]; + if (a == 0) + continue; + + GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); + GFSymbol value1 = GFExp[sum1]; + if ((a & 0x0f) == 0) + { + value1 = 0; + } + GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); + GFSymbol value2 = GFExp[sum2]; + if ((a & 0xf0) == 0) + { + value2 = 0; + } + GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); + GFSymbol value3 = GFExp[sum3]; + if ((a & 0x0f00) == 0) + { + value3 = 0; + } + GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); + GFSymbol value4 = GFExp[sum4]; + if ((a & 0xf000) == 0) + { + value4 = 0; + } + + vx[i] ^= value1; + vx[i] ^= value2; + vx[i] ^= value3; + vx[i] ^= value4; + } +} + +// return a*GFExp[b] over GF(2^r) +static GFSymbol mulE(GFSymbol a, GFSymbol b) +{ + if (a == 0) + return 0; + + const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); + return GFExp[sum]; +} + + +//------------------------------------------------------------------------------ +// Fast Walsh-Hadamard Transform (FWHT) Mod Q +// +// Q is the maximum symbol value, e.g. 255 or 65535. + +// Define this to enable the optimized version of FWHT() +#define LEO_FWHT_OPTIMIZED + +typedef GFSymbol fwht_t; + +// {a, b} = {a + b, a - b} (Mod Q) +static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) +{ + const fwht_t sum = AddModQ(a, b); + const fwht_t dif = SubModQ(a, b); + a = sum; + b = dif; +} + +/* + FWHT is a minor slice of the runtime and does not grow with data size, + but I did attempt a few additional optimizations that failed: + + I've attempted to vectorize (with partial reductions) FWHT_4(data, s), + which is 70% of the algorithm, but it was slower. Left in _attic_. + + I've attempted to avoid reductions in all or parts of the FWHT. + The final modular reduction ends up being slower than the savings. + Specifically I tried doing it for the whole FWHT and also I tried + doing it just for the FWHT_2 loop in the main routine, but both + approaches are slower than partial reductions. + + Replacing word reads with wider reads does speed up the operation, but + at too high a complexity cost relative to minor perf improvement. 
+*/ + +#ifndef LEO_FWHT_OPTIMIZED + +// Reference implementation +static void FWHT(fwht_t* data, const unsigned bits) +{ + const unsigned size = (unsigned)(1UL << bits); + for (unsigned width = 1; width < size; width <<= 1) + for (unsigned i = 0; i < size; i += (width << 1)) + for (unsigned j = i; j < (width + i); ++j) + FWHT_2(data[j], data[j + width]); +} + +#else + +static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; +} + +static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) +{ + unsigned x = 0; + fwht_t t0 = data[x]; x += s; + fwht_t t1 = data[x]; x += s; + fwht_t t2 = data[x]; x += s; + fwht_t t3 = data[x]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + unsigned y = 0; + data[y] = t0; y += s; + data[y] = t1; y += s; + data[y] = t2; y += s; + data[y] = t3; +} + +static inline void FWHT_8(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + fwht_t t4 = data[4]; + fwht_t t5 = data[5]; + fwht_t t6 = data[6]; + fwht_t t7 = data[7]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t4, t5); + FWHT_2(t6, t7); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + FWHT_2(t4, t6); + FWHT_2(t5, t7); + FWHT_2(t0, t4); + FWHT_2(t1, t5); + FWHT_2(t2, t6); + FWHT_2(t3, t7); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; + data[4] = t4; + data[5] = t5; + data[6] = t6; + data[7] = t7; +} + +static inline void FWHT_16(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + fwht_t t4 = data[4]; + fwht_t t5 = data[5]; + fwht_t t6 = data[6]; + fwht_t t7 = data[7]; + fwht_t t8 = data[8]; + fwht_t t9 = data[9]; + fwht_t t10 = data[10]; + fwht_t t11 = data[11]; + fwht_t t12 = data[12]; + fwht_t t13 = data[13]; + fwht_t t14 = data[14]; + fwht_t t15 = data[15]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t4, t5); + FWHT_2(t6, t7); + FWHT_2(t8, t9); + FWHT_2(t10, t11); + FWHT_2(t12, t13); + FWHT_2(t14, t15); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + FWHT_2(t4, t6); + FWHT_2(t5, t7); + FWHT_2(t8, t10); + FWHT_2(t9, t11); + FWHT_2(t12, t14); + FWHT_2(t13, t15); + FWHT_2(t0, t4); + FWHT_2(t1, t5); + FWHT_2(t2, t6); + FWHT_2(t3, t7); + FWHT_2(t8, t12); + FWHT_2(t9, t13); + FWHT_2(t10, t14); + FWHT_2(t11, t15); + FWHT_2(t0, t8); + FWHT_2(t1, t9); + FWHT_2(t2, t10); + FWHT_2(t3, t11); + FWHT_2(t4, t12); + FWHT_2(t5, t13); + FWHT_2(t6, t14); + FWHT_2(t7, t15); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; + data[4] = t4; + data[5] = t5; + data[6] = t6; + data[7] = t7; + data[8] = t8; + data[9] = t9; + data[10] = t10; + data[11] = t11; + data[12] = t12; + data[13] = t13; + data[14] = t14; + data[15] = t15; +} + +static void FWHT_SmallData(fwht_t* data, unsigned ldn) +{ + const unsigned n = (1UL << ldn); + + if (n <= 2) + { + if (n == 2) + FWHT_2(data[0], data[1]); + return; + } + + for (unsigned ldm = ldn; ldm > 3; ldm -= 2) + { + unsigned m = (1UL << ldm); + unsigned m4 = (m >> 2); + for (unsigned r = 0; r < n; r += m) + for (unsigned j = 0; j < m4; j++) + FWHT_4(data + j + r, m4); + } + + if (ldn & 1) + { + for (unsigned i0 = 0; i0 < n; i0 += 8) + FWHT_8(data + i0); + } + else + { + for (unsigned i0 = 0; i0 < n; i0 += 4) + FWHT_4(data + i0); + } +} + +// Decimation in time (DIT) version +static void FWHT(fwht_t* data, const unsigned 
ldn) +{ + if (ldn <= 13) + { + FWHT_SmallData(data, ldn); + return; + } + + FWHT_2(data[2], data[3]); + FWHT_4(data + 4); + FWHT_8(data + 8); + FWHT_16(data + 16); + for (unsigned ldm = 5; ldm < ldn; ++ldm) + FWHT(data + (unsigned)(1UL << ldm), ldm); + + for (unsigned ldm = 0; ldm < ldn; ++ldm) + { + const unsigned mh = (1UL << ldm); + for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) + FWHT_2(data[t1], data[t2]); + } +} + +#endif + + +//------------------------------------------------------------------------------ +// Memory Buffer XOR + +static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) +{ + LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); + const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); + +#if defined(LEO_TARGET_MOBILE) +# if defined(LEO_TRY_NEON) + // Handle multiples of 64 bytes + if (CpuHasNeon) + { + while (bytes >= 64) + { + LEO_M128 x0 = vld1q_u8(x16); + LEO_M128 x1 = vld1q_u8(x16 + 1); + LEO_M128 x2 = vld1q_u8(x16 + 2); + LEO_M128 x3 = vld1q_u8(x16 + 3); + LEO_M128 y0 = vld1q_u8(y16); + LEO_M128 y1 = vld1q_u8(y16 + 1); + LEO_M128 y2 = vld1q_u8(y16 + 2); + LEO_M128 y3 = vld1q_u8(y16 + 3); + + vst1q_u8(x16, veorq_u8(x0, y0)); + vst1q_u8(x16 + 1, veorq_u8(x1, y1)); + vst1q_u8(x16 + 2, veorq_u8(x2, y2)); + vst1q_u8(x16 + 3, veorq_u8(x3, y3)); + + bytes -= 64, x16 += 4, y16 += 4; + } + + // Handle multiples of 16 bytes + while (bytes >= 16) + { + LEO_M128 x0 = vld1q_u8(x16); + LEO_M128 y0 = vld1q_u8(y16); + + vst1q_u8(x16, veorq_u8(x0, y0)); + + bytes -= 16, ++x16, ++y16; + } + } + else +# endif // LEO_TRY_NEON + { + uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); + const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); + + const unsigned count = (unsigned)bytes / 8; + for (unsigned ii = 0; ii < count; ++ii) + x8[ii] ^= y8[ii]; + + x16 = reinterpret_cast(x8 + count); + y16 = reinterpret_cast(y8 + count); + } +#else // LEO_TARGET_MOBILE +# if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) + { + LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); + const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); + + while (bytes >= 128) + { + LEO_M256 x0 = _mm256_loadu_si256(x32); + LEO_M256 y0 = _mm256_loadu_si256(y32); + x0 = _mm256_xor_si256(x0, y0); + LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); + LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); + x1 = _mm256_xor_si256(x1, y1); + LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); + LEO_M256 y2 = _mm256_loadu_si256(y32 + 2); + x2 = _mm256_xor_si256(x2, y2); + LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); + LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); + x3 = _mm256_xor_si256(x3, y3); + + _mm256_storeu_si256(x32, x0); + _mm256_storeu_si256(x32 + 1, x1); + _mm256_storeu_si256(x32 + 2, x2); + _mm256_storeu_si256(x32 + 3, x3); + + bytes -= 128, x32 += 4, y32 += 4; + } + + // Handle multiples of 32 bytes + while (bytes >= 32) + { + // x[i] = x[i] xor y[i] + _mm256_storeu_si256(x32, + _mm256_xor_si256( + _mm256_loadu_si256(x32), + _mm256_loadu_si256(y32))); + + bytes -= 32, ++x32, ++y32; + } + + x16 = reinterpret_cast(x32); + y16 = reinterpret_cast(y32); + } + else +# endif // LEO_TRY_AVX2 + { + while (bytes >= 64) + { + LEO_M128 x0 = _mm_loadu_si128(x16); + LEO_M128 y0 = _mm_loadu_si128(y16); + x0 = _mm_xor_si128(x0, y0); + LEO_M128 x1 = _mm_loadu_si128(x16 + 1); + LEO_M128 y1 = _mm_loadu_si128(y16 + 1); + x1 = _mm_xor_si128(x1, y1); + LEO_M128 x2 = _mm_loadu_si128(x16 + 2); + LEO_M128 y2 = _mm_loadu_si128(y16 + 2); + x2 = _mm_xor_si128(x2, y2); + LEO_M128 x3 = _mm_loadu_si128(x16 + 3); + LEO_M128 y3 = 
_mm_loadu_si128(y16 + 3); + x3 = _mm_xor_si128(x3, y3); + + _mm_storeu_si128(x16, x0); + _mm_storeu_si128(x16 + 1, x1); + _mm_storeu_si128(x16 + 2, x2); + _mm_storeu_si128(x16 + 3, x3); + + bytes -= 64, x16 += 4, y16 += 4; + } + } +#endif // LEO_TARGET_MOBILE + + // Handle multiples of 16 bytes + while (bytes >= 16) + { + // x[i] = x[i] xor y[i] + _mm_storeu_si128(x16, + _mm_xor_si128( + _mm_loadu_si128(x16), + _mm_loadu_si128(y16))); + + bytes -= 16, ++x16, ++y16; + } + + uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); + const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); + + // Handle a block of 8 bytes + const unsigned eight = bytes & 8; + if (eight) + { + uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); + const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); + *x8 ^= *y8; + } + + // Handle a block of 4 bytes + const unsigned four = bytes & 4; + if (four) + { + uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); + const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); + *x4 ^= *y4; + } + + // Handle final bytes + const unsigned offset = eight + four; + switch (bytes & 3) + { + case 3: x1[offset + 2] ^= y1[offset + 2]; + case 2: x1[offset + 1] ^= y1[offset + 1]; + case 1: x1[offset] ^= y1[offset]; + default: + break; + } +} + + +//------------------------------------------------------------------------------ +// Formal Derivative + +// Formal derivative of polynomial in the new basis +static void formal_derivative(GFSymbol* cos, const unsigned size) +{ + for (unsigned i = 1; i < size; ++i) + { + const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; + + // If a large number of values are being XORed: + if (leng >= 8) + xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); + else + for (unsigned j = i - leng; j < i; j++) + cos[j] ^= cos[j + leng]; + } + + for (unsigned i = size; i < kFieldSize; i <<= 1) + xor_mem(cos, cos + i, size * sizeof(GFSymbol)); +} + + +//------------------------------------------------------------------------------ +// Fast Fourier Transform + +static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT + +// IFFT in the proposed basis +static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) +{ + for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) + { + for (unsigned j = depart_no; j < size; j += (depart_no << 1)) + { + // If a large number of values are being XORed: + if (depart_no >= 8) + xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); + else + for (unsigned i = j - depart_no; i < j; ++i) + data[i + depart_no] ^= data[i]; + + const GFSymbol skew = skewVec[j + index - 1]; + + if (skew != kFieldModulus) + muladd_mem(data + j - depart_no, data + j, skew, depart_no); + } + } +} + +// FFT in the proposed basis +static void FLT(GFSymbol* data, const unsigned size, const unsigned index) +{ + for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) + { + for (unsigned j = depart_no; j < size; j += (depart_no << 1)) + { + const GFSymbol skew = skewVec[j + index - 1]; + + if (skew != kFieldModulus) + muladd_mem(data + j - depart_no, data + j, skew, depart_no); + + // If a large number of values are being XORed: + if (depart_no >= 8) + xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); + else + for (unsigned i = j - depart_no; i < j; ++i) + data[i + depart_no] ^= data[i]; + } + } +} + + +//------------------------------------------------------------------------------ +// FFT Initialization + +static GFSymbol B[kFieldSize >> 1]; // factors used in 
formal derivative +static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial + +// Initialize skewVec[], B[], log_walsh[] +static void InitFieldOperations() +{ + GFSymbol temp[kGFBits - 1]; + + for (unsigned i = 1; i < kGFBits; ++i) + temp[i - 1] = (GFSymbol)((unsigned)1 << i); + + for (unsigned m = 0; m < (kGFBits - 1); ++m) + { + const unsigned step = (unsigned)1 << (m + 1); + + skewVec[((unsigned)1 << m) - 1] = 0; + + for (unsigned i = m; i < (kGFBits - 1); ++i) + { + const unsigned s = ((unsigned)1 << (i + 1)); + + for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) + skewVec[j + s] = skewVec[j] ^ temp[i]; + } + + temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; + + for (unsigned i = m + 1; i < (kGFBits - 1); ++i) + temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); + } + + for (unsigned i = 0; i < kFieldSize; ++i) + skewVec[i] = GFLog[skewVec[i]]; + + temp[0] = kFieldModulus - temp[0]; + + for (unsigned i = 1; i < (kGFBits - 1); ++i) + temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; + + B[0] = 0; + for (unsigned i = 0; i < (kGFBits - 1); ++i) + { + const unsigned depart = ((unsigned)1 << i); + + for (unsigned j = 0; j < depart; ++j) + B[j + depart] = (B[j] + temp[i]) % kFieldModulus; + } + + for (unsigned i = 0; i < kFieldSize; ++i) + log_walsh[i] = GFLog[i]; + + log_walsh[0] = 0; + + FWHT(log_walsh, kGFBits); +} + + +//------------------------------------------------------------------------------ +// Encoder + +// Encoding alg for k/n<0.5: message is a power of two +static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) +{ + memcpy(codeword, data, sizeof(GFSymbol) * k); + + IFLT(codeword, k, 0); + + for (unsigned i = k; i < kFieldSize; i += k) + { + memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); + + FLT(&codeword[i], k, i); + } + + memcpy(codeword, data, sizeof(GFSymbol) * k); +} + +// Encoding alg for k/n>0.5: parity is a power of two. +// data: message array. parity: parity array. mem: buffer(size>= n-k) +static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) +{ + const unsigned t = kFieldSize - k; + + memset(parity, 0, sizeof(GFSymbol) * t); + + for (unsigned i = t; i < kFieldSize; i += t) + { + memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); + + IFLT(mem, t, i); + + xor_mem(parity, mem, t * sizeof(GFSymbol)); + } + + FLT(parity, t, 0); +} + + +//------------------------------------------------------------------------------ +// Decoder + +static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) +{ + fwht_t log_walsh2[kFieldSize]; + + // Compute the evaluations of the error locator polynomial + for (unsigned i = 0; i < kFieldSize; ++i) + log_walsh2[i] = erasure[i] ? 1 : 0; + + FWHT(log_walsh2, kGFBits); + + for (unsigned i = 0; i < kFieldSize; ++i) + log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; + + FWHT(log_walsh2, kGFBits); + + // k2 can be replaced with k + const unsigned k2 = kFieldSize; + //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? 
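+    /*
+        Decoder pipeline (a sketch of the math, as read from the code below):
+        log_walsh2[] now holds, in the log domain, the evaluations of the
+        error locator polynomial at every field point, obtained as a mod-Q
+        convolution of the erasure indicator with log_walsh[] via the two
+        FWHTs above. The surviving symbols are scaled by these evaluations
+        (mulE), an IFFT takes the product into the transform domain, the
+        formal derivative is applied, and an FFT brings it back; finally each
+        erased position is divided by its locator evaluation (multiplication
+        by kFieldModulus - log_walsh2[i] in the log domain) to recover the
+        original symbol.
+    */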
+ + for (unsigned i = 0; i < kFieldSize; ++i) + { + if (erasure[i]) + { + codeword[i] = 0; + } + else + { + codeword[i] = mulE(codeword[i], log_walsh2[i]); + } + } + + IFLT(codeword, kFieldSize, 0); + + // formal derivative + for (unsigned i = 0; i < kFieldSize; i += 2) + { + codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); + codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); + } + + formal_derivative(codeword, k2); + + for (unsigned i = 0; i < k2; i += 2) + { + codeword[i] = mulE(codeword[i], B[i >> 1]); + codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); + } + + FLT(codeword, k2, 0); + + for (unsigned i = 0; i < k2; ++i) + { + if (erasure[i]) + { + codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); + } + } +} + + +//------------------------------------------------------------------------------ +// Test Application + +void test(unsigned k, unsigned seed) +{ + srand(seed); + + //-----------Generating message---------- + + // Message array + GFSymbol data[kFieldSize] = {0}; + + // Filled with random numbers + for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) + data[i] = (GFSymbol)rand(); + + + //---------encoding---------- + + GFSymbol codeword[kFieldSize]; + encodeH(&data[kFieldSize - k], k, data, codeword); + //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? + + memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); + + + //--------erasure simulation--------- + + // Array indicating erasures + bool erasure[kFieldSize] = { + false + }; + + for (unsigned i = k; i < kFieldSize; ++i) + erasure[i] = true; + + // permuting the erasure array + for (unsigned i = kFieldSize - 1; i > 0; --i) + { + unsigned pos = rand() % (i + 1); + + if (i != pos) + { + bool tmp = erasure[i]; + erasure[i] = erasure[pos]; + erasure[pos] = tmp; + } + } + + // erasure codeword symbols + for (unsigned i = 0; i < kFieldSize; ++i) + if (erasure[i]) + codeword[i] = 0; + + + //---------main processing---------- + decode(codeword, k, erasure); + + // Check the correctness of the result + for (unsigned i = 0; i < kFieldSize; ++i) + { + if (erasure[i] == 1) + { + if (data[i] != codeword[i]) + { + printf("Decoding Error with seed = %d!\n", seed); + LEO_DEBUG_BREAK; + return; + } + } + } + + //printf("Decoding is successful!\n"); +} + + +//------------------------------------------------------------------------------ +// Entrypoint + +int main(int argc, char **argv) +{ + // Initialize architecture-specific code + leo_architecture_init(); + + // Fill GFLog table and GFExp table + InitField(); + + // Compute factors used in erasure decoder + InitFieldOperations(); + + unsigned seed = (unsigned)time(NULL); + for (;;) + { + // test(int k), k: message size + /* + EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, + s.t. the number of recovery pieces is a power of two + */ + test(kFieldSize / 2, seed); + + ++seed; + } + + return 0; +} diff --git a/LeopardEncoder.h b/LeopardEncoder.h new file mode 100644 index 0000000..71d22e2 --- /dev/null +++ b/LeopardEncoder.h @@ -0,0 +1,1220 @@ +/* + Copyright (c) 2017 Christopher A. Taylor. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of LHC-RS nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include + + +/* + TODO: + + Write C API and unit tester + + Limit input to multiples of 64 bytes + + Replace GFSymbol with a file data pointer + + New 16-bit Muladd inner loops + + Class to contain the (large) muladd tables + + Preliminary benchmarks for large data! + + New 8-bit Muladd inner loops + + Benchmarks for smaller data! + + Refactor software + + Pick a name for the software better than LEO_RS + + I think it should be split up into several C++ modules + + Write detailed comments for all the routines + + Look into getting EncodeL working so we can support smaller data (Ask Lin) + + Look into using k instead of k2 to speed up decoder (Ask Lin) + + Avoid performing FFT/IFFT intermediate calculations we're not going to use + + Benchmarks, fun! + + Add multi-threading to split up long parallelizable calculations + + Final benchmarks! + + Finish up documentation + + Release version 1 + + + Muladd implementation notes: + + Specialize for 1-3 rows at a time since often times we're multiplying by + the same (skew) value repeatedly, as the ISA-L library does here: + + https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 + + Except we should be doing it for 16-bit Galois Field. + To implement that use the ALTMAP trick from Jerasure: + + http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 + + Except we should also support AVX2 since that is a 40% perf boost, so put + the high and low bytes 32 bytes instead of 16 bytes apart. + + Also I think we should go ahead and precompute the multiply tables since + it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. 
+*/ + + +//------------------------------------------------------------------------------ +// Debug + +// Some bugs only repro in release mode, so this can be helpful +//#define LEO_DEBUG_IN_RELEASE + +#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) + #define LEO_DEBUG + #ifdef _WIN32 + #define LEO_DEBUG_BREAK __debugbreak() + #else + #define LEO_DEBUG_BREAK __builtin_trap() + #endif + #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } +#else + #define LEO_DEBUG_BREAK ; + #define LEO_DEBUG_ASSERT(cond) ; +#endif + + +//------------------------------------------------------------------------------ +// Platform/Architecture + +#if defined(ANDROID) || defined(IOS) + #define LEO_TARGET_MOBILE +#endif // ANDROID + +#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) + #define LEO_TRY_AVX2 /* 256-bit */ + #include + #define LEO_ALIGN_BYTES 32 +#else // __AVX2__ + #define LEO_ALIGN_BYTES 16 +#endif // __AVX2__ + +#if !defined(LEO_TARGET_MOBILE) + // Note: MSVC currently only supports SSSE3 but not AVX2 + #include // SSSE3: _mm_shuffle_epi8 + #include // SSE2 +#endif // LEO_TARGET_MOBILE + +#if defined(HAVE_ARM_NEON_H) + #include +#endif // HAVE_ARM_NEON_H + +#if defined(LEO_TARGET_MOBILE) + + #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ + +# if defined(HAVE_ARM_NEON_H) + // Compiler-specific 128-bit SIMD register keyword + #define LEO_M128 uint8x16_t + #define LEO_TRY_NEON +#else + #define LEO_M128 uint64_t +# endif + +#else // LEO_TARGET_MOBILE + + // Compiler-specific 128-bit SIMD register keyword + #define LEO_M128 __m128i + +#endif // LEO_TARGET_MOBILE + +#ifdef LEO_TRY_AVX2 + // Compiler-specific 256-bit SIMD register keyword + #define LEO_M256 __m256i +#endif + +// Compiler-specific C++11 restrict keyword +#define LEO_RESTRICT __restrict + +// Compiler-specific force inline keyword +#ifdef _MSC_VER + #define LEO_FORCE_INLINE inline __forceinline +#else + #define LEO_FORCE_INLINE inline __attribute__((always_inline)) +#endif + +// Compiler-specific alignment keyword +// Note: Alignment only matters for ARM NEON where it should be 16 +#ifdef _MSC_VER + #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) +#else // _MSC_VER + #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) +#endif // _MSC_VER + + +//------------------------------------------------------------------------------ +// Runtime CPU Architecture Check +// +// Feature checks stolen shamelessly from +// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c + +#if defined(HAVE_ANDROID_GETCPUFEATURES) + #include +#endif + +#if defined(LEO_TRY_NEON) +# if defined(IOS) && defined(__ARM_NEON__) + // Requires iPhone 5S or newer + static const bool CpuHasNeon = true; + static const bool CpuHasNeon64 = true; +# else + // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures + static bool CpuHasNeon = false; // V6 / V7 + static bool CpuHasNeon64 = false; // 64-bit +# endif +#endif + + +#if !defined(LEO_TARGET_MOBILE) + +#ifdef _MSC_VER + #include // __cpuid + #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX +#endif + +#ifdef LEO_TRY_AVX2 +static bool CpuHasAVX2 = false; +#endif +static bool CpuHasSSSE3 = false; + +#define CPUID_EBX_AVX2 0x00000020 +#define CPUID_ECX_SSSE3 0x00000200 + +static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) +{ +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) + __cpuid((int 
*) cpu_info, cpu_info_type); +#else //if defined(HAVE_CPUID) + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; +# ifdef __i386__ + __asm__ __volatile__ ("pushfl; pushfl; " + "popl %0; " + "movl %0, %1; xorl %2, %0; " + "pushl %0; " + "popfl; pushfl; popl %0; popfl" : + "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : + "i" (0x200000)); + if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { + return; /* LCOV_EXCL_LINE */ + } +# endif +# ifdef __i386__ + __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : + "=a" (cpu_info[0]), "=&r" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# elif defined(__x86_64__) + __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : + "=a" (cpu_info[0]), "=&r" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# else + __asm__ __volatile__ ("cpuid" : + "=a" (cpu_info[0]), "=b" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# endif +#endif +} + +#endif // defined(LEO_TARGET_MOBILE) + + +static void leo_architecture_init() +{ +#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) + AndroidCpuFamily family = android_getCpuFamily(); + if (family == ANDROID_CPU_FAMILY_ARM) + { + if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) + CpuHasNeon = true; + } + else if (family == ANDROID_CPU_FAMILY_ARM64) + { + CpuHasNeon = true; + if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) + CpuHasNeon64 = true; + } +#endif + +#if !defined(LEO_TARGET_MOBILE) + unsigned int cpu_info[4]; + + _cpuid(cpu_info, 1); + CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); + +#if defined(LEO_TRY_AVX2) + _cpuid(cpu_info, 7); + CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); +#endif // LEO_TRY_AVX2 + +#endif // LEO_TARGET_MOBILE +} + + +//------------------------------------------------------------------------------ +// SIMD-Safe Aligned Memory Allocations + +static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; + +LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) +{ + return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); +} + +static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) +{ + uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); + if (!data) + return nullptr; + unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); + data += kAlignmentBytes - offset; + data[-1] = (uint8_t)offset; + return data; +} + +static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) +{ + if (!ptr) + return; + uint8_t* data = (uint8_t*)ptr; + unsigned offset = data[-1]; + if (offset >= kAlignmentBytes) + { + LEO_DEBUG_BREAK; // Should never happen + return; + } + data -= kAlignmentBytes - offset; + free(data); +} + + +//------------------------------------------------------------------------------ +// Field + +//#define LEO_SHORT_FIELD + +#ifdef LEO_SHORT_FIELD +typedef uint8_t GFSymbol; +static const unsigned kGFBits = 8; +static const unsigned kGFPolynomial = 0x11D; +GFSymbol kGFBasis[kGFBits] = { + 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis +}; +#else +typedef uint16_t GFSymbol; +static const unsigned kGFBits = 16; +static const unsigned kGFPolynomial = 0x1002D; +GFSymbol kGFBasis[kGFBits] = { + 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis + 0xC582, 0xED2E, 0x914C, 0x4012, + 0x6C98, 0x10D8, 0x6A72, 0xB900, + 0xFDB8, 0xFB34, 0xFF38, 0x991E +}; +#endif + +/* + Cantor Basis introduced by: + D. G. 
Cantor, "On arithmetical algorithms over finite fields", + Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. +*/ + +static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size +static const unsigned kFieldModulus = kFieldSize - 1; + +static GFSymbol GFLog[kFieldSize]; +static GFSymbol GFExp[kFieldSize]; + +// Initialize GFLog[], GFExp[] +static void InitField() +{ + unsigned state = 1; + for (unsigned i = 0; i < kFieldModulus; ++i) + { + GFExp[state] = static_cast(i); + state <<= 1; + if (state >= kFieldSize) + state ^= kGFPolynomial; + } + GFExp[0] = kFieldModulus; + + // Conversion to chosen basis: + + GFLog[0] = 0; + for (unsigned i = 0; i < kGFBits; ++i) + { + const GFSymbol basis = kGFBasis[i]; + const unsigned width = (unsigned)(1UL << i); + + for (unsigned j = 0; j < width; ++j) + GFLog[j + width] = GFLog[j] ^ basis; + } + + for (unsigned i = 0; i < kFieldSize; ++i) + GFLog[i] = GFExp[GFLog[i]]; + + for (unsigned i = 0; i < kFieldSize; ++i) + GFExp[GFLog[i]] = i; + + GFExp[kFieldModulus] = GFExp[0]; +} + + +//------------------------------------------------------------------------------ +// Mod Q Field Operations +// +// Q is the maximum symbol value, e.g. 255 or 65535. + +// z = x + y (mod Q) +static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b) +{ + const unsigned sum = (unsigned)a + b; + + // Partial reduction step, allowing for Q to be returned + return static_cast(sum + (sum >> kGFBits)); +} + +// z = x - y (mod Q) +static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) +{ + const unsigned dif = (unsigned)a - b; + + // Partial reduction step, allowing for Q to be returned + return static_cast(dif + (dif >> kGFBits)); +} + +// vx[] += vy[] * z +static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) +{ + for (unsigned i = 0; i < symbolCount; ++i) + { + const GFSymbol a = vy[i]; + if (a == 0) + continue; + + GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); + GFSymbol value1 = GFExp[sum1]; + if ((a & 0x0f) == 0) + { + value1 = 0; + } + GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); + GFSymbol value2 = GFExp[sum2]; + if ((a & 0xf0) == 0) + { + value2 = 0; + } + GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); + GFSymbol value3 = GFExp[sum3]; + if ((a & 0x0f00) == 0) + { + value3 = 0; + } + GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); + GFSymbol value4 = GFExp[sum4]; + if ((a & 0xf000) == 0) + { + value4 = 0; + } + + vx[i] ^= value1; + vx[i] ^= value2; + vx[i] ^= value3; + vx[i] ^= value4; + } +} + +// return a*GFExp[b] over GF(2^r) +static GFSymbol mulE(GFSymbol a, GFSymbol b) +{ + if (a == 0) + return 0; + + const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); + return GFExp[sum]; +} + + +//------------------------------------------------------------------------------ +// Fast Walsh-Hadamard Transform (FWHT) Mod Q +// +// Q is the maximum symbol value, e.g. 255 or 65535. 
+ +// Define this to enable the optimized version of FWHT() +#define LEO_FWHT_OPTIMIZED + +typedef GFSymbol fwht_t; + +// {a, b} = {a + b, a - b} (Mod Q) +static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) +{ + const fwht_t sum = AddModQ(a, b); + const fwht_t dif = SubModQ(a, b); + a = sum; + b = dif; +} + +/* + FWHT is a minor slice of the runtime and does not grow with data size, + but I did attempt a few additional optimizations that failed: + + I've attempted to vectorize (with partial reductions) FWHT_4(data, s), + which is 70% of the algorithm, but it was slower. Left in _attic_. + + I've attempted to avoid reductions in all or parts of the FWHT. + The final modular reduction ends up being slower than the savings. + Specifically I tried doing it for the whole FWHT and also I tried + doing it just for the FWHT_2 loop in the main routine, but both + approaches are slower than partial reductions. + + Replacing word reads with wider reads does speed up the operation, but + at too high a complexity cost relative to minor perf improvement. +*/ + +#ifndef LEO_FWHT_OPTIMIZED + +// Reference implementation +static void FWHT(fwht_t* data, const unsigned bits) +{ + const unsigned size = (unsigned)(1UL << bits); + for (unsigned width = 1; width < size; width <<= 1) + for (unsigned i = 0; i < size; i += (width << 1)) + for (unsigned j = i; j < (width + i); ++j) + FWHT_2(data[j], data[j + width]); +} + +#else + +static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; +} + +static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) +{ + unsigned x = 0; + fwht_t t0 = data[x]; x += s; + fwht_t t1 = data[x]; x += s; + fwht_t t2 = data[x]; x += s; + fwht_t t3 = data[x]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + unsigned y = 0; + data[y] = t0; y += s; + data[y] = t1; y += s; + data[y] = t2; y += s; + data[y] = t3; +} + +static inline void FWHT_8(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + fwht_t t4 = data[4]; + fwht_t t5 = data[5]; + fwht_t t6 = data[6]; + fwht_t t7 = data[7]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t4, t5); + FWHT_2(t6, t7); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + FWHT_2(t4, t6); + FWHT_2(t5, t7); + FWHT_2(t0, t4); + FWHT_2(t1, t5); + FWHT_2(t2, t6); + FWHT_2(t3, t7); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; + data[4] = t4; + data[5] = t5; + data[6] = t6; + data[7] = t7; +} + +static inline void FWHT_16(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + fwht_t t4 = data[4]; + fwht_t t5 = data[5]; + fwht_t t6 = data[6]; + fwht_t t7 = data[7]; + fwht_t t8 = data[8]; + fwht_t t9 = data[9]; + fwht_t t10 = data[10]; + fwht_t t11 = data[11]; + fwht_t t12 = data[12]; + fwht_t t13 = data[13]; + fwht_t t14 = data[14]; + fwht_t t15 = data[15]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t4, t5); + FWHT_2(t6, t7); + FWHT_2(t8, t9); + FWHT_2(t10, t11); + FWHT_2(t12, t13); + FWHT_2(t14, t15); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + FWHT_2(t4, t6); + FWHT_2(t5, t7); + FWHT_2(t8, t10); + FWHT_2(t9, t11); + FWHT_2(t12, t14); + FWHT_2(t13, t15); + FWHT_2(t0, t4); + FWHT_2(t1, t5); + FWHT_2(t2, t6); + FWHT_2(t3, t7); + FWHT_2(t8, t12); + FWHT_2(t9, t13); + 
FWHT_2(t10, t14); + FWHT_2(t11, t15); + FWHT_2(t0, t8); + FWHT_2(t1, t9); + FWHT_2(t2, t10); + FWHT_2(t3, t11); + FWHT_2(t4, t12); + FWHT_2(t5, t13); + FWHT_2(t6, t14); + FWHT_2(t7, t15); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; + data[4] = t4; + data[5] = t5; + data[6] = t6; + data[7] = t7; + data[8] = t8; + data[9] = t9; + data[10] = t10; + data[11] = t11; + data[12] = t12; + data[13] = t13; + data[14] = t14; + data[15] = t15; +} + +static void FWHT_SmallData(fwht_t* data, unsigned ldn) +{ + const unsigned n = (1UL << ldn); + + if (n <= 2) + { + if (n == 2) + FWHT_2(data[0], data[1]); + return; + } + + for (unsigned ldm = ldn; ldm > 3; ldm -= 2) + { + unsigned m = (1UL << ldm); + unsigned m4 = (m >> 2); + for (unsigned r = 0; r < n; r += m) + for (unsigned j = 0; j < m4; j++) + FWHT_4(data + j + r, m4); + } + + if (ldn & 1) + { + for (unsigned i0 = 0; i0 < n; i0 += 8) + FWHT_8(data + i0); + } + else + { + for (unsigned i0 = 0; i0 < n; i0 += 4) + FWHT_4(data + i0); + } +} + +// Decimation in time (DIT) version +static void FWHT(fwht_t* data, const unsigned ldn) +{ + if (ldn <= 13) + { + FWHT_SmallData(data, ldn); + return; + } + + FWHT_2(data[2], data[3]); + FWHT_4(data + 4); + FWHT_8(data + 8); + FWHT_16(data + 16); + for (unsigned ldm = 5; ldm < ldn; ++ldm) + FWHT(data + (unsigned)(1UL << ldm), ldm); + + for (unsigned ldm = 0; ldm < ldn; ++ldm) + { + const unsigned mh = (1UL << ldm); + for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) + FWHT_2(data[t1], data[t2]); + } +} + +#endif + + +//------------------------------------------------------------------------------ +// Memory Buffer XOR + +static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) +{ + LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); + const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); + +#if defined(LEO_TARGET_MOBILE) +# if defined(LEO_TRY_NEON) + // Handle multiples of 64 bytes + if (CpuHasNeon) + { + while (bytes >= 64) + { + LEO_M128 x0 = vld1q_u8(x16); + LEO_M128 x1 = vld1q_u8(x16 + 1); + LEO_M128 x2 = vld1q_u8(x16 + 2); + LEO_M128 x3 = vld1q_u8(x16 + 3); + LEO_M128 y0 = vld1q_u8(y16); + LEO_M128 y1 = vld1q_u8(y16 + 1); + LEO_M128 y2 = vld1q_u8(y16 + 2); + LEO_M128 y3 = vld1q_u8(y16 + 3); + + vst1q_u8(x16, veorq_u8(x0, y0)); + vst1q_u8(x16 + 1, veorq_u8(x1, y1)); + vst1q_u8(x16 + 2, veorq_u8(x2, y2)); + vst1q_u8(x16 + 3, veorq_u8(x3, y3)); + + bytes -= 64, x16 += 4, y16 += 4; + } + + // Handle multiples of 16 bytes + while (bytes >= 16) + { + LEO_M128 x0 = vld1q_u8(x16); + LEO_M128 y0 = vld1q_u8(y16); + + vst1q_u8(x16, veorq_u8(x0, y0)); + + bytes -= 16, ++x16, ++y16; + } + } + else +# endif // LEO_TRY_NEON + { + uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); + const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); + + const unsigned count = (unsigned)bytes / 8; + for (unsigned ii = 0; ii < count; ++ii) + x8[ii] ^= y8[ii]; + + x16 = reinterpret_cast(x8 + count); + y16 = reinterpret_cast(y8 + count); + } +#else // LEO_TARGET_MOBILE +# if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) + { + LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); + const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); + + while (bytes >= 128) + { + LEO_M256 x0 = _mm256_loadu_si256(x32); + LEO_M256 y0 = _mm256_loadu_si256(y32); + x0 = _mm256_xor_si256(x0, y0); + LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); + LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); + x1 = _mm256_xor_si256(x1, y1); + LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); + LEO_M256 y2 = _mm256_loadu_si256(y32 + 
2); + x2 = _mm256_xor_si256(x2, y2); + LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); + LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); + x3 = _mm256_xor_si256(x3, y3); + + _mm256_storeu_si256(x32, x0); + _mm256_storeu_si256(x32 + 1, x1); + _mm256_storeu_si256(x32 + 2, x2); + _mm256_storeu_si256(x32 + 3, x3); + + bytes -= 128, x32 += 4, y32 += 4; + } + + // Handle multiples of 32 bytes + while (bytes >= 32) + { + // x[i] = x[i] xor y[i] + _mm256_storeu_si256(x32, + _mm256_xor_si256( + _mm256_loadu_si256(x32), + _mm256_loadu_si256(y32))); + + bytes -= 32, ++x32, ++y32; + } + + x16 = reinterpret_cast(x32); + y16 = reinterpret_cast(y32); + } + else +# endif // LEO_TRY_AVX2 + { + while (bytes >= 64) + { + LEO_M128 x0 = _mm_loadu_si128(x16); + LEO_M128 y0 = _mm_loadu_si128(y16); + x0 = _mm_xor_si128(x0, y0); + LEO_M128 x1 = _mm_loadu_si128(x16 + 1); + LEO_M128 y1 = _mm_loadu_si128(y16 + 1); + x1 = _mm_xor_si128(x1, y1); + LEO_M128 x2 = _mm_loadu_si128(x16 + 2); + LEO_M128 y2 = _mm_loadu_si128(y16 + 2); + x2 = _mm_xor_si128(x2, y2); + LEO_M128 x3 = _mm_loadu_si128(x16 + 3); + LEO_M128 y3 = _mm_loadu_si128(y16 + 3); + x3 = _mm_xor_si128(x3, y3); + + _mm_storeu_si128(x16, x0); + _mm_storeu_si128(x16 + 1, x1); + _mm_storeu_si128(x16 + 2, x2); + _mm_storeu_si128(x16 + 3, x3); + + bytes -= 64, x16 += 4, y16 += 4; + } + } +#endif // LEO_TARGET_MOBILE + + // Handle multiples of 16 bytes + while (bytes >= 16) + { + // x[i] = x[i] xor y[i] + _mm_storeu_si128(x16, + _mm_xor_si128( + _mm_loadu_si128(x16), + _mm_loadu_si128(y16))); + + bytes -= 16, ++x16, ++y16; + } + + uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); + const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); + + // Handle a block of 8 bytes + const unsigned eight = bytes & 8; + if (eight) + { + uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); + const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); + *x8 ^= *y8; + } + + // Handle a block of 4 bytes + const unsigned four = bytes & 4; + if (four) + { + uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); + const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); + *x4 ^= *y4; + } + + // Handle final bytes + const unsigned offset = eight + four; + switch (bytes & 3) + { + case 3: x1[offset + 2] ^= y1[offset + 2]; + case 2: x1[offset + 1] ^= y1[offset + 1]; + case 1: x1[offset] ^= y1[offset]; + default: + break; + } +} + + +//------------------------------------------------------------------------------ +// Formal Derivative + +// Formal derivative of polynomial in the new basis +static void formal_derivative(GFSymbol* cos, const unsigned size) +{ + for (unsigned i = 1; i < size; ++i) + { + const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; + + // If a large number of values are being XORed: + if (leng >= 8) + xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); + else + for (unsigned j = i - leng; j < i; j++) + cos[j] ^= cos[j + leng]; + } + + for (unsigned i = size; i < kFieldSize; i <<= 1) + xor_mem(cos, cos + i, size * sizeof(GFSymbol)); +} + + +//------------------------------------------------------------------------------ +// Fast Fourier Transform + +static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT + +// IFFT in the proposed basis +static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) +{ + for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) + { + for (unsigned j = depart_no; j < size; j += (depart_no << 1)) + { + // If a large number of values are being XORed: + if (depart_no >= 8) + xor_mem(data + j, data + j - 
depart_no, depart_no * sizeof(GFSymbol)); + else + for (unsigned i = j - depart_no; i < j; ++i) + data[i + depart_no] ^= data[i]; + + const GFSymbol skew = skewVec[j + index - 1]; + + if (skew != kFieldModulus) + muladd_mem(data + j - depart_no, data + j, skew, depart_no); + } + } +} + +// FFT in the proposed basis +static void FLT(GFSymbol* data, const unsigned size, const unsigned index) +{ + for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) + { + for (unsigned j = depart_no; j < size; j += (depart_no << 1)) + { + const GFSymbol skew = skewVec[j + index - 1]; + + if (skew != kFieldModulus) + muladd_mem(data + j - depart_no, data + j, skew, depart_no); + + // If a large number of values are being XORed: + if (depart_no >= 8) + xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); + else + for (unsigned i = j - depart_no; i < j; ++i) + data[i + depart_no] ^= data[i]; + } + } +} + + +//------------------------------------------------------------------------------ +// FFT Initialization + +static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative +static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial + +// Initialize skewVec[], B[], log_walsh[] +static void InitFieldOperations() +{ + GFSymbol temp[kGFBits - 1]; + + for (unsigned i = 1; i < kGFBits; ++i) + temp[i - 1] = (GFSymbol)((unsigned)1 << i); + + for (unsigned m = 0; m < (kGFBits - 1); ++m) + { + const unsigned step = (unsigned)1 << (m + 1); + + skewVec[((unsigned)1 << m) - 1] = 0; + + for (unsigned i = m; i < (kGFBits - 1); ++i) + { + const unsigned s = ((unsigned)1 << (i + 1)); + + for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) + skewVec[j + s] = skewVec[j] ^ temp[i]; + } + + temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; + + for (unsigned i = m + 1; i < (kGFBits - 1); ++i) + temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); + } + + for (unsigned i = 0; i < kFieldSize; ++i) + skewVec[i] = GFLog[skewVec[i]]; + + temp[0] = kFieldModulus - temp[0]; + + for (unsigned i = 1; i < (kGFBits - 1); ++i) + temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; + + B[0] = 0; + for (unsigned i = 0; i < (kGFBits - 1); ++i) + { + const unsigned depart = ((unsigned)1 << i); + + for (unsigned j = 0; j < depart; ++j) + B[j + depart] = (B[j] + temp[i]) % kFieldModulus; + } + + for (unsigned i = 0; i < kFieldSize; ++i) + log_walsh[i] = GFLog[i]; + + log_walsh[0] = 0; + + FWHT(log_walsh, kGFBits); +} + + +//------------------------------------------------------------------------------ +// Encoder + +// Encoding alg for k/n<0.5: message is a power of two +static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) +{ + memcpy(codeword, data, sizeof(GFSymbol) * k); + + IFLT(codeword, k, 0); + + for (unsigned i = k; i < kFieldSize; i += k) + { + memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); + + FLT(&codeword[i], k, i); + } + + memcpy(codeword, data, sizeof(GFSymbol) * k); +} + +// Encoding alg for k/n>0.5: parity is a power of two. +// data: message array. parity: parity array. 
mem: buffer(size>= n-k) +static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) +{ + const unsigned t = kFieldSize - k; + + memset(parity, 0, sizeof(GFSymbol) * t); + + for (unsigned i = t; i < kFieldSize; i += t) + { + memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); + + IFLT(mem, t, i); + + xor_mem(parity, mem, t * sizeof(GFSymbol)); + } + + FLT(parity, t, 0); +} + + +//------------------------------------------------------------------------------ +// Decoder + +static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) +{ + fwht_t log_walsh2[kFieldSize]; + + // Compute the evaluations of the error locator polynomial + for (unsigned i = 0; i < kFieldSize; ++i) + log_walsh2[i] = erasure[i] ? 1 : 0; + + FWHT(log_walsh2, kGFBits); + + for (unsigned i = 0; i < kFieldSize; ++i) + log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; + + FWHT(log_walsh2, kGFBits); + + // k2 can be replaced with k + const unsigned k2 = kFieldSize; + //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? + + for (unsigned i = 0; i < kFieldSize; ++i) + { + if (erasure[i]) + { + codeword[i] = 0; + } + else + { + codeword[i] = mulE(codeword[i], log_walsh2[i]); + } + } + + IFLT(codeword, kFieldSize, 0); + + // formal derivative + for (unsigned i = 0; i < kFieldSize; i += 2) + { + codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); + codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); + } + + formal_derivative(codeword, k2); + + for (unsigned i = 0; i < k2; i += 2) + { + codeword[i] = mulE(codeword[i], B[i >> 1]); + codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); + } + + FLT(codeword, k2, 0); + + for (unsigned i = 0; i < k2; ++i) + { + if (erasure[i]) + { + codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); + } + } +} + + +//------------------------------------------------------------------------------ +// Test Application + +void test(unsigned k, unsigned seed) +{ + srand(seed); + + //-----------Generating message---------- + + // Message array + GFSymbol data[kFieldSize] = {0}; + + // Filled with random numbers + for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) + data[i] = (GFSymbol)rand(); + + + //---------encoding---------- + + GFSymbol codeword[kFieldSize]; + encodeH(&data[kFieldSize - k], k, data, codeword); + //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? 
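+
+    // Editor's note on the encodeH() call above (a reading of the code, not
+    // original commentary): data[] carries the k message symbols in its last
+    // k entries, encodeH() writes the t = kFieldSize - k parity symbols into
+    // the first t entries of data[], and codeword[] serves only as scratch
+    // space here; the memcpy below then assembles the full systematic codeword.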
+ + memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); + + + //--------erasure simulation--------- + + // Array indicating erasures + bool erasure[kFieldSize] = { + false + }; + + for (unsigned i = k; i < kFieldSize; ++i) + erasure[i] = true; + + // permuting the erasure array + for (unsigned i = kFieldSize - 1; i > 0; --i) + { + unsigned pos = rand() % (i + 1); + + if (i != pos) + { + bool tmp = erasure[i]; + erasure[i] = erasure[pos]; + erasure[pos] = tmp; + } + } + + // erasure codeword symbols + for (unsigned i = 0; i < kFieldSize; ++i) + if (erasure[i]) + codeword[i] = 0; + + + //---------main processing---------- + decode(codeword, k, erasure); + + // Check the correctness of the result + for (unsigned i = 0; i < kFieldSize; ++i) + { + if (erasure[i] == 1) + { + if (data[i] != codeword[i]) + { + printf("Decoding Error with seed = %d!\n", seed); + LEO_DEBUG_BREAK; + return; + } + } + } + + //printf("Decoding is successful!\n"); +} + + +//------------------------------------------------------------------------------ +// Entrypoint + +int main(int argc, char **argv) +{ + // Initialize architecture-specific code + leo_architecture_init(); + + // Fill GFLog table and GFExp table + InitField(); + + // Compute factors used in erasure decoder + InitFieldOperations(); + + unsigned seed = (unsigned)time(NULL); + for (;;) + { + // test(int k), k: message size + /* + EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, + s.t. the number of recovery pieces is a power of two + */ + test(kFieldSize / 2, seed); + + ++seed; + } + + return 0; +} diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp new file mode 100644 index 0000000..71d22e2 --- /dev/null +++ b/LeopardFF16.cpp @@ -0,0 +1,1220 @@ +/* + Copyright (c) 2017 Christopher A. Taylor. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of LHC-RS nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+*/
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <time.h>
+
+
+/*
+    TODO:
+    + Write C API and unit tester
+    + Limit input to multiples of 64 bytes
+    + Replace GFSymbol with a file data pointer
+    + New 16-bit Muladd inner loops
+    + Class to contain the (large) muladd tables
+    + Preliminary benchmarks for large data!
+    + New 8-bit Muladd inner loops
+    + Benchmarks for smaller data!
+    + Refactor software
+    + Pick a name for the software better than LEO_RS
+    + I think it should be split up into several C++ modules
+    + Write detailed comments for all the routines
+    + Look into getting EncodeL working so we can support smaller data (Ask Lin)
+    + Look into using k instead of k2 to speed up decoder (Ask Lin)
+    + Avoid performing FFT/IFFT intermediate calculations we're not going to use
+    + Benchmarks, fun!
+    + Add multi-threading to split up long parallelizable calculations
+    + Final benchmarks!
+    + Finish up documentation
+    + Release version 1
+
+
+    Muladd implementation notes:
+
+    Specialize for 1-3 rows at a time since oftentimes we're multiplying by
+    the same (skew) value repeatedly, as the ISA-L library does here:
+
+    https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258
+
+    Except we should be doing it for a 16-bit Galois field.
+    To implement that, use the ALTMAP trick from Jerasure:
+
+    http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140
+
+    Except we should also support AVX2, since that is a 40% perf boost, so put
+    the high and low bytes 32 bytes apart instead of 16 bytes apart.
+
+    Also I think we should go ahead and precompute the multiply tables, since
+    it avoids a bunch of memory lookups for each muladd, and only costs 8 MB.
+*/
+
+
+//------------------------------------------------------------------------------
+// Debug
+
+// Some bugs only repro in release mode, so this can be helpful
+//#define LEO_DEBUG_IN_RELEASE
+
+#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE)
+    #define LEO_DEBUG
+    #ifdef _WIN32
+        #define LEO_DEBUG_BREAK __debugbreak()
+    #else
+        #define LEO_DEBUG_BREAK __builtin_trap()
+    #endif
+    #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } }
+#else
+    #define LEO_DEBUG_BREAK ;
+    #define LEO_DEBUG_ASSERT(cond) ;
+#endif
+
+
+//------------------------------------------------------------------------------
+// Platform/Architecture
+
+#if defined(ANDROID) || defined(IOS)
+    #define LEO_TARGET_MOBILE
+#endif // ANDROID
+
+#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900)
+    #define LEO_TRY_AVX2 /* 256-bit */
+    #include <immintrin.h>
+    #define LEO_ALIGN_BYTES 32
+#else // __AVX2__
+    #define LEO_ALIGN_BYTES 16
+#endif // __AVX2__
+
+#if !defined(LEO_TARGET_MOBILE)
+    // Note: MSVC currently only supports SSSE3 but not AVX2
+    #include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8
+    #include <emmintrin.h> // SSE2
+#endif // LEO_TARGET_MOBILE
+
+#if defined(HAVE_ARM_NEON_H)
+    #include <arm_neon.h>
+#endif // HAVE_ARM_NEON_H
+
+#if defined(LEO_TARGET_MOBILE)
+
+    #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */
+
+# if defined(HAVE_ARM_NEON_H)
+    // Compiler-specific 128-bit SIMD register keyword
+    #define LEO_M128 uint8x16_t
+    #define LEO_TRY_NEON
+#else
+    #define LEO_M128 uint64_t
+# endif
+
+#else // LEO_TARGET_MOBILE
+
+    // Compiler-specific 128-bit SIMD register keyword
+    #define LEO_M128 __m128i
+
+#endif // LEO_TARGET_MOBILE
+
+#ifdef LEO_TRY_AVX2
+    // Compiler-specific 256-bit SIMD register keyword
+    #define LEO_M256 __m256i
+#endif
+
+// Compiler-specific C++11 restrict keyword
+#define LEO_RESTRICT __restrict
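+
+// Editor's note (illustrative, not original text): __restrict promises the
+// compiler that the annotated pointers never alias, so in routines such as
+// xor_mem() below, loads through vy can be scheduled freely across stores
+// through vx.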
+ +// Compiler-specific force inline keyword +#ifdef _MSC_VER + #define LEO_FORCE_INLINE inline __forceinline +#else + #define LEO_FORCE_INLINE inline __attribute__((always_inline)) +#endif + +// Compiler-specific alignment keyword +// Note: Alignment only matters for ARM NEON where it should be 16 +#ifdef _MSC_VER + #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) +#else // _MSC_VER + #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) +#endif // _MSC_VER + + +//------------------------------------------------------------------------------ +// Runtime CPU Architecture Check +// +// Feature checks stolen shamelessly from +// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c + +#if defined(HAVE_ANDROID_GETCPUFEATURES) + #include +#endif + +#if defined(LEO_TRY_NEON) +# if defined(IOS) && defined(__ARM_NEON__) + // Requires iPhone 5S or newer + static const bool CpuHasNeon = true; + static const bool CpuHasNeon64 = true; +# else + // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures + static bool CpuHasNeon = false; // V6 / V7 + static bool CpuHasNeon64 = false; // 64-bit +# endif +#endif + + +#if !defined(LEO_TARGET_MOBILE) + +#ifdef _MSC_VER + #include // __cpuid + #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX +#endif + +#ifdef LEO_TRY_AVX2 +static bool CpuHasAVX2 = false; +#endif +static bool CpuHasSSSE3 = false; + +#define CPUID_EBX_AVX2 0x00000020 +#define CPUID_ECX_SSSE3 0x00000200 + +static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) +{ +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) + __cpuid((int *) cpu_info, cpu_info_type); +#else //if defined(HAVE_CPUID) + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; +# ifdef __i386__ + __asm__ __volatile__ ("pushfl; pushfl; " + "popl %0; " + "movl %0, %1; xorl %2, %0; " + "pushl %0; " + "popfl; pushfl; popl %0; popfl" : + "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : + "i" (0x200000)); + if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { + return; /* LCOV_EXCL_LINE */ + } +# endif +# ifdef __i386__ + __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : + "=a" (cpu_info[0]), "=&r" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# elif defined(__x86_64__) + __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : + "=a" (cpu_info[0]), "=&r" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# else + __asm__ __volatile__ ("cpuid" : + "=a" (cpu_info[0]), "=b" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# endif +#endif +} + +#endif // defined(LEO_TARGET_MOBILE) + + +static void leo_architecture_init() +{ +#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) + AndroidCpuFamily family = android_getCpuFamily(); + if (family == ANDROID_CPU_FAMILY_ARM) + { + if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) + CpuHasNeon = true; + } + else if (family == ANDROID_CPU_FAMILY_ARM64) + { + CpuHasNeon = true; + if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) + CpuHasNeon64 = true; + } +#endif + +#if !defined(LEO_TARGET_MOBILE) + unsigned int cpu_info[4]; + + _cpuid(cpu_info, 1); + CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); + +#if defined(LEO_TRY_AVX2) + _cpuid(cpu_info, 7); + CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); +#endif // LEO_TRY_AVX2 + +#endif // 
LEO_TARGET_MOBILE +} + + +//------------------------------------------------------------------------------ +// SIMD-Safe Aligned Memory Allocations + +static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; + +LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) +{ + return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); +} + +static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) +{ + uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); + if (!data) + return nullptr; + unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); + data += kAlignmentBytes - offset; + data[-1] = (uint8_t)offset; + return data; +} + +static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) +{ + if (!ptr) + return; + uint8_t* data = (uint8_t*)ptr; + unsigned offset = data[-1]; + if (offset >= kAlignmentBytes) + { + LEO_DEBUG_BREAK; // Should never happen + return; + } + data -= kAlignmentBytes - offset; + free(data); +} + + +//------------------------------------------------------------------------------ +// Field + +//#define LEO_SHORT_FIELD + +#ifdef LEO_SHORT_FIELD +typedef uint8_t GFSymbol; +static const unsigned kGFBits = 8; +static const unsigned kGFPolynomial = 0x11D; +GFSymbol kGFBasis[kGFBits] = { + 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis +}; +#else +typedef uint16_t GFSymbol; +static const unsigned kGFBits = 16; +static const unsigned kGFPolynomial = 0x1002D; +GFSymbol kGFBasis[kGFBits] = { + 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis + 0xC582, 0xED2E, 0x914C, 0x4012, + 0x6C98, 0x10D8, 0x6A72, 0xB900, + 0xFDB8, 0xFB34, 0xFF38, 0x991E +}; +#endif + +/* + Cantor Basis introduced by: + D. G. Cantor, "On arithmetical algorithms over finite fields", + Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. +*/ + +static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size +static const unsigned kFieldModulus = kFieldSize - 1; + +static GFSymbol GFLog[kFieldSize]; +static GFSymbol GFExp[kFieldSize]; + +// Initialize GFLog[], GFExp[] +static void InitField() +{ + unsigned state = 1; + for (unsigned i = 0; i < kFieldModulus; ++i) + { + GFExp[state] = static_cast(i); + state <<= 1; + if (state >= kFieldSize) + state ^= kGFPolynomial; + } + GFExp[0] = kFieldModulus; + + // Conversion to chosen basis: + + GFLog[0] = 0; + for (unsigned i = 0; i < kGFBits; ++i) + { + const GFSymbol basis = kGFBasis[i]; + const unsigned width = (unsigned)(1UL << i); + + for (unsigned j = 0; j < width; ++j) + GFLog[j + width] = GFLog[j] ^ basis; + } + + for (unsigned i = 0; i < kFieldSize; ++i) + GFLog[i] = GFExp[GFLog[i]]; + + for (unsigned i = 0; i < kFieldSize; ++i) + GFExp[GFLog[i]] = i; + + GFExp[kFieldModulus] = GFExp[0]; +} + + +//------------------------------------------------------------------------------ +// Mod Q Field Operations +// +// Q is the maximum symbol value, e.g. 255 or 65535. 
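+
+/*
+    Editor's sketch (added for clarity): arithmetic here is mod Q = 2^r - 1
+    rather than mod 2^r because products are formed in the log domain, i.e.
+    a * b = GFExp[(GFLog[a] + GFLog[b]) mod Q], and exponents of the field
+    generator wrap with the multiplicative group order 2^r - 1.
+*/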
+
+// z = x + y (mod Q)
+static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b)
+{
+    const unsigned sum = (unsigned)a + b;
+
+    // Partial reduction step, allowing for Q to be returned
+    return static_cast<GFSymbol>(sum + (sum >> kGFBits));
+}
+
+// z = x - y (mod Q)
+static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b)
+{
+    const unsigned dif = (unsigned)a - b;
+
+    // Partial reduction step, allowing for Q to be returned
+    return static_cast<GFSymbol>(dif + (dif >> kGFBits));
+}
+
+// vx[] += vy[] * z
+static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount)
+{
+    for (unsigned i = 0; i < symbolCount; ++i)
+    {
+        const GFSymbol a = vy[i];
+        if (a == 0)
+            continue;
+
+        GFSymbol sum1 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f], z));
+        GFSymbol value1 = GFExp[sum1];
+        if ((a & 0x0f) == 0)
+        {
+            value1 = 0;
+        }
+        GFSymbol sum2 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf0], z));
+        GFSymbol value2 = GFExp[sum2];
+        if ((a & 0xf0) == 0)
+        {
+            value2 = 0;
+        }
+        GFSymbol sum3 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f00], z));
+        GFSymbol value3 = GFExp[sum3];
+        if ((a & 0x0f00) == 0)
+        {
+            value3 = 0;
+        }
+        GFSymbol sum4 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf000], z));
+        GFSymbol value4 = GFExp[sum4];
+        if ((a & 0xf000) == 0)
+        {
+            value4 = 0;
+        }
+
+        vx[i] ^= value1;
+        vx[i] ^= value2;
+        vx[i] ^= value3;
+        vx[i] ^= value4;
+    }
+}
+
+// return a*GFExp[b] over GF(2^r)
+static GFSymbol mulE(GFSymbol a, GFSymbol b)
+{
+    if (a == 0)
+        return 0;
+
+    const GFSymbol sum = static_cast<GFSymbol>(AddModQ(GFLog[a], b));
+    return GFExp[sum];
+}
+
+
+//------------------------------------------------------------------------------
+// Fast Walsh-Hadamard Transform (FWHT) Mod Q
+//
+// Q is the maximum symbol value, e.g. 255 or 65535.
+
+// Define this to enable the optimized version of FWHT()
+#define LEO_FWHT_OPTIMIZED
+
+typedef GFSymbol fwht_t;
+
+// {a, b} = {a + b, a - b} (Mod Q)
+static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b)
+{
+    const fwht_t sum = AddModQ(a, b);
+    const fwht_t dif = SubModQ(a, b);
+    a = sum;
+    b = dif;
+}
+
+/*
+    FWHT is a minor slice of the runtime and does not grow with data size,
+    but I did attempt a few additional optimizations that failed:
+
+    I've attempted to vectorize (with partial reductions) FWHT_4(data, s),
+    which is 70% of the algorithm, but it was slower. Left in _attic_.
+
+    I've attempted to avoid reductions in all or parts of the FWHT.
+    The final modular reduction ends up being slower than the savings.
+    Specifically I tried doing it for the whole FWHT and also I tried
+    doing it just for the FWHT_2 loop in the main routine, but both
+    approaches are slower than partial reductions.
+
+    Replacing word reads with wider reads does speed up the operation, but
+    at too high a complexity cost relative to minor perf improvement.
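+
+    (Editor's note, added for clarity: "partial reductions" above refers to
+    AddModQ/SubModQ possibly returning Q itself instead of 0. InitField()
+    makes the two residues interchangeable by setting GFExp[Q] = GFExp[0],
+    so both decode to the same field element.)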
+*/ + +#ifndef LEO_FWHT_OPTIMIZED + +// Reference implementation +static void FWHT(fwht_t* data, const unsigned bits) +{ + const unsigned size = (unsigned)(1UL << bits); + for (unsigned width = 1; width < size; width <<= 1) + for (unsigned i = 0; i < size; i += (width << 1)) + for (unsigned j = i; j < (width + i); ++j) + FWHT_2(data[j], data[j + width]); +} + +#else + +static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; +} + +static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) +{ + unsigned x = 0; + fwht_t t0 = data[x]; x += s; + fwht_t t1 = data[x]; x += s; + fwht_t t2 = data[x]; x += s; + fwht_t t3 = data[x]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + unsigned y = 0; + data[y] = t0; y += s; + data[y] = t1; y += s; + data[y] = t2; y += s; + data[y] = t3; +} + +static inline void FWHT_8(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + fwht_t t4 = data[4]; + fwht_t t5 = data[5]; + fwht_t t6 = data[6]; + fwht_t t7 = data[7]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t4, t5); + FWHT_2(t6, t7); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + FWHT_2(t4, t6); + FWHT_2(t5, t7); + FWHT_2(t0, t4); + FWHT_2(t1, t5); + FWHT_2(t2, t6); + FWHT_2(t3, t7); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; + data[4] = t4; + data[5] = t5; + data[6] = t6; + data[7] = t7; +} + +static inline void FWHT_16(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + fwht_t t4 = data[4]; + fwht_t t5 = data[5]; + fwht_t t6 = data[6]; + fwht_t t7 = data[7]; + fwht_t t8 = data[8]; + fwht_t t9 = data[9]; + fwht_t t10 = data[10]; + fwht_t t11 = data[11]; + fwht_t t12 = data[12]; + fwht_t t13 = data[13]; + fwht_t t14 = data[14]; + fwht_t t15 = data[15]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t4, t5); + FWHT_2(t6, t7); + FWHT_2(t8, t9); + FWHT_2(t10, t11); + FWHT_2(t12, t13); + FWHT_2(t14, t15); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + FWHT_2(t4, t6); + FWHT_2(t5, t7); + FWHT_2(t8, t10); + FWHT_2(t9, t11); + FWHT_2(t12, t14); + FWHT_2(t13, t15); + FWHT_2(t0, t4); + FWHT_2(t1, t5); + FWHT_2(t2, t6); + FWHT_2(t3, t7); + FWHT_2(t8, t12); + FWHT_2(t9, t13); + FWHT_2(t10, t14); + FWHT_2(t11, t15); + FWHT_2(t0, t8); + FWHT_2(t1, t9); + FWHT_2(t2, t10); + FWHT_2(t3, t11); + FWHT_2(t4, t12); + FWHT_2(t5, t13); + FWHT_2(t6, t14); + FWHT_2(t7, t15); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; + data[4] = t4; + data[5] = t5; + data[6] = t6; + data[7] = t7; + data[8] = t8; + data[9] = t9; + data[10] = t10; + data[11] = t11; + data[12] = t12; + data[13] = t13; + data[14] = t14; + data[15] = t15; +} + +static void FWHT_SmallData(fwht_t* data, unsigned ldn) +{ + const unsigned n = (1UL << ldn); + + if (n <= 2) + { + if (n == 2) + FWHT_2(data[0], data[1]); + return; + } + + for (unsigned ldm = ldn; ldm > 3; ldm -= 2) + { + unsigned m = (1UL << ldm); + unsigned m4 = (m >> 2); + for (unsigned r = 0; r < n; r += m) + for (unsigned j = 0; j < m4; j++) + FWHT_4(data + j + r, m4); + } + + if (ldn & 1) + { + for (unsigned i0 = 0; i0 < n; i0 += 8) + FWHT_8(data + i0); + } + else + { + for (unsigned i0 = 0; i0 < n; i0 += 4) + FWHT_4(data + i0); + } +} + +// Decimation in time (DIT) version +static void FWHT(fwht_t* data, const unsigned 
ldn) +{ + if (ldn <= 13) + { + FWHT_SmallData(data, ldn); + return; + } + + FWHT_2(data[2], data[3]); + FWHT_4(data + 4); + FWHT_8(data + 8); + FWHT_16(data + 16); + for (unsigned ldm = 5; ldm < ldn; ++ldm) + FWHT(data + (unsigned)(1UL << ldm), ldm); + + for (unsigned ldm = 0; ldm < ldn; ++ldm) + { + const unsigned mh = (1UL << ldm); + for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) + FWHT_2(data[t1], data[t2]); + } +} + +#endif + + +//------------------------------------------------------------------------------ +// Memory Buffer XOR + +static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) +{ + LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); + const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); + +#if defined(LEO_TARGET_MOBILE) +# if defined(LEO_TRY_NEON) + // Handle multiples of 64 bytes + if (CpuHasNeon) + { + while (bytes >= 64) + { + LEO_M128 x0 = vld1q_u8(x16); + LEO_M128 x1 = vld1q_u8(x16 + 1); + LEO_M128 x2 = vld1q_u8(x16 + 2); + LEO_M128 x3 = vld1q_u8(x16 + 3); + LEO_M128 y0 = vld1q_u8(y16); + LEO_M128 y1 = vld1q_u8(y16 + 1); + LEO_M128 y2 = vld1q_u8(y16 + 2); + LEO_M128 y3 = vld1q_u8(y16 + 3); + + vst1q_u8(x16, veorq_u8(x0, y0)); + vst1q_u8(x16 + 1, veorq_u8(x1, y1)); + vst1q_u8(x16 + 2, veorq_u8(x2, y2)); + vst1q_u8(x16 + 3, veorq_u8(x3, y3)); + + bytes -= 64, x16 += 4, y16 += 4; + } + + // Handle multiples of 16 bytes + while (bytes >= 16) + { + LEO_M128 x0 = vld1q_u8(x16); + LEO_M128 y0 = vld1q_u8(y16); + + vst1q_u8(x16, veorq_u8(x0, y0)); + + bytes -= 16, ++x16, ++y16; + } + } + else +# endif // LEO_TRY_NEON + { + uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); + const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); + + const unsigned count = (unsigned)bytes / 8; + for (unsigned ii = 0; ii < count; ++ii) + x8[ii] ^= y8[ii]; + + x16 = reinterpret_cast(x8 + count); + y16 = reinterpret_cast(y8 + count); + } +#else // LEO_TARGET_MOBILE +# if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) + { + LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); + const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); + + while (bytes >= 128) + { + LEO_M256 x0 = _mm256_loadu_si256(x32); + LEO_M256 y0 = _mm256_loadu_si256(y32); + x0 = _mm256_xor_si256(x0, y0); + LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); + LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); + x1 = _mm256_xor_si256(x1, y1); + LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); + LEO_M256 y2 = _mm256_loadu_si256(y32 + 2); + x2 = _mm256_xor_si256(x2, y2); + LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); + LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); + x3 = _mm256_xor_si256(x3, y3); + + _mm256_storeu_si256(x32, x0); + _mm256_storeu_si256(x32 + 1, x1); + _mm256_storeu_si256(x32 + 2, x2); + _mm256_storeu_si256(x32 + 3, x3); + + bytes -= 128, x32 += 4, y32 += 4; + } + + // Handle multiples of 32 bytes + while (bytes >= 32) + { + // x[i] = x[i] xor y[i] + _mm256_storeu_si256(x32, + _mm256_xor_si256( + _mm256_loadu_si256(x32), + _mm256_loadu_si256(y32))); + + bytes -= 32, ++x32, ++y32; + } + + x16 = reinterpret_cast(x32); + y16 = reinterpret_cast(y32); + } + else +# endif // LEO_TRY_AVX2 + { + while (bytes >= 64) + { + LEO_M128 x0 = _mm_loadu_si128(x16); + LEO_M128 y0 = _mm_loadu_si128(y16); + x0 = _mm_xor_si128(x0, y0); + LEO_M128 x1 = _mm_loadu_si128(x16 + 1); + LEO_M128 y1 = _mm_loadu_si128(y16 + 1); + x1 = _mm_xor_si128(x1, y1); + LEO_M128 x2 = _mm_loadu_si128(x16 + 2); + LEO_M128 y2 = _mm_loadu_si128(y16 + 2); + x2 = _mm_xor_si128(x2, y2); + LEO_M128 x3 = _mm_loadu_si128(x16 + 3); + LEO_M128 y3 = 
_mm_loadu_si128(y16 + 3); + x3 = _mm_xor_si128(x3, y3); + + _mm_storeu_si128(x16, x0); + _mm_storeu_si128(x16 + 1, x1); + _mm_storeu_si128(x16 + 2, x2); + _mm_storeu_si128(x16 + 3, x3); + + bytes -= 64, x16 += 4, y16 += 4; + } + } +#endif // LEO_TARGET_MOBILE + + // Handle multiples of 16 bytes + while (bytes >= 16) + { + // x[i] = x[i] xor y[i] + _mm_storeu_si128(x16, + _mm_xor_si128( + _mm_loadu_si128(x16), + _mm_loadu_si128(y16))); + + bytes -= 16, ++x16, ++y16; + } + + uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); + const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); + + // Handle a block of 8 bytes + const unsigned eight = bytes & 8; + if (eight) + { + uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); + const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); + *x8 ^= *y8; + } + + // Handle a block of 4 bytes + const unsigned four = bytes & 4; + if (four) + { + uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); + const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); + *x4 ^= *y4; + } + + // Handle final bytes + const unsigned offset = eight + four; + switch (bytes & 3) + { + case 3: x1[offset + 2] ^= y1[offset + 2]; + case 2: x1[offset + 1] ^= y1[offset + 1]; + case 1: x1[offset] ^= y1[offset]; + default: + break; + } +} + + +//------------------------------------------------------------------------------ +// Formal Derivative + +// Formal derivative of polynomial in the new basis +static void formal_derivative(GFSymbol* cos, const unsigned size) +{ + for (unsigned i = 1; i < size; ++i) + { + const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; + + // If a large number of values are being XORed: + if (leng >= 8) + xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); + else + for (unsigned j = i - leng; j < i; j++) + cos[j] ^= cos[j + leng]; + } + + for (unsigned i = size; i < kFieldSize; i <<= 1) + xor_mem(cos, cos + i, size * sizeof(GFSymbol)); +} + + +//------------------------------------------------------------------------------ +// Fast Fourier Transform + +static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT + +// IFFT in the proposed basis +static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) +{ + for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) + { + for (unsigned j = depart_no; j < size; j += (depart_no << 1)) + { + // If a large number of values are being XORed: + if (depart_no >= 8) + xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); + else + for (unsigned i = j - depart_no; i < j; ++i) + data[i + depart_no] ^= data[i]; + + const GFSymbol skew = skewVec[j + index - 1]; + + if (skew != kFieldModulus) + muladd_mem(data + j - depart_no, data + j, skew, depart_no); + } + } +} + +// FFT in the proposed basis +static void FLT(GFSymbol* data, const unsigned size, const unsigned index) +{ + for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) + { + for (unsigned j = depart_no; j < size; j += (depart_no << 1)) + { + const GFSymbol skew = skewVec[j + index - 1]; + + if (skew != kFieldModulus) + muladd_mem(data + j - depart_no, data + j, skew, depart_no); + + // If a large number of values are being XORed: + if (depart_no >= 8) + xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); + else + for (unsigned i = j - depart_no; i < j; ++i) + data[i + depart_no] ^= data[i]; + } + } +} + + +//------------------------------------------------------------------------------ +// FFT Initialization + +static GFSymbol B[kFieldSize >> 1]; // factors used in 
formal derivative +static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial + +// Initialize skewVec[], B[], log_walsh[] +static void InitFieldOperations() +{ + GFSymbol temp[kGFBits - 1]; + + for (unsigned i = 1; i < kGFBits; ++i) + temp[i - 1] = (GFSymbol)((unsigned)1 << i); + + for (unsigned m = 0; m < (kGFBits - 1); ++m) + { + const unsigned step = (unsigned)1 << (m + 1); + + skewVec[((unsigned)1 << m) - 1] = 0; + + for (unsigned i = m; i < (kGFBits - 1); ++i) + { + const unsigned s = ((unsigned)1 << (i + 1)); + + for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) + skewVec[j + s] = skewVec[j] ^ temp[i]; + } + + temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; + + for (unsigned i = m + 1; i < (kGFBits - 1); ++i) + temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); + } + + for (unsigned i = 0; i < kFieldSize; ++i) + skewVec[i] = GFLog[skewVec[i]]; + + temp[0] = kFieldModulus - temp[0]; + + for (unsigned i = 1; i < (kGFBits - 1); ++i) + temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; + + B[0] = 0; + for (unsigned i = 0; i < (kGFBits - 1); ++i) + { + const unsigned depart = ((unsigned)1 << i); + + for (unsigned j = 0; j < depart; ++j) + B[j + depart] = (B[j] + temp[i]) % kFieldModulus; + } + + for (unsigned i = 0; i < kFieldSize; ++i) + log_walsh[i] = GFLog[i]; + + log_walsh[0] = 0; + + FWHT(log_walsh, kGFBits); +} + + +//------------------------------------------------------------------------------ +// Encoder + +// Encoding alg for k/n<0.5: message is a power of two +static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) +{ + memcpy(codeword, data, sizeof(GFSymbol) * k); + + IFLT(codeword, k, 0); + + for (unsigned i = k; i < kFieldSize; i += k) + { + memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); + + FLT(&codeword[i], k, i); + } + + memcpy(codeword, data, sizeof(GFSymbol) * k); +} + +// Encoding alg for k/n>0.5: parity is a power of two. +// data: message array. parity: parity array. mem: buffer(size>= n-k) +static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) +{ + const unsigned t = kFieldSize - k; + + memset(parity, 0, sizeof(GFSymbol) * t); + + for (unsigned i = t; i < kFieldSize; i += t) + { + memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); + + IFLT(mem, t, i); + + xor_mem(parity, mem, t * sizeof(GFSymbol)); + } + + FLT(parity, t, 0); +} + + +//------------------------------------------------------------------------------ +// Decoder + +static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) +{ + fwht_t log_walsh2[kFieldSize]; + + // Compute the evaluations of the error locator polynomial + for (unsigned i = 0; i < kFieldSize; ++i) + log_walsh2[i] = erasure[i] ? 1 : 0; + + FWHT(log_walsh2, kGFBits); + + for (unsigned i = 0; i < kFieldSize; ++i) + log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; + + FWHT(log_walsh2, kGFBits); + + // k2 can be replaced with k + const unsigned k2 = kFieldSize; + //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? 
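+
+    // Editor's outline of the steps below (added for clarity): scale the
+    // surviving symbols by the error locator evaluations in log_walsh2[],
+    // inverse-transform, take the formal derivative (with the B[] twist
+    // applied before and after), transform back, and finally divide the
+    // erased positions by the error locator to recover them.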
+ + for (unsigned i = 0; i < kFieldSize; ++i) + { + if (erasure[i]) + { + codeword[i] = 0; + } + else + { + codeword[i] = mulE(codeword[i], log_walsh2[i]); + } + } + + IFLT(codeword, kFieldSize, 0); + + // formal derivative + for (unsigned i = 0; i < kFieldSize; i += 2) + { + codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); + codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); + } + + formal_derivative(codeword, k2); + + for (unsigned i = 0; i < k2; i += 2) + { + codeword[i] = mulE(codeword[i], B[i >> 1]); + codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); + } + + FLT(codeword, k2, 0); + + for (unsigned i = 0; i < k2; ++i) + { + if (erasure[i]) + { + codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); + } + } +} + + +//------------------------------------------------------------------------------ +// Test Application + +void test(unsigned k, unsigned seed) +{ + srand(seed); + + //-----------Generating message---------- + + // Message array + GFSymbol data[kFieldSize] = {0}; + + // Filled with random numbers + for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) + data[i] = (GFSymbol)rand(); + + + //---------encoding---------- + + GFSymbol codeword[kFieldSize]; + encodeH(&data[kFieldSize - k], k, data, codeword); + //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? + + memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); + + + //--------erasure simulation--------- + + // Array indicating erasures + bool erasure[kFieldSize] = { + false + }; + + for (unsigned i = k; i < kFieldSize; ++i) + erasure[i] = true; + + // permuting the erasure array + for (unsigned i = kFieldSize - 1; i > 0; --i) + { + unsigned pos = rand() % (i + 1); + + if (i != pos) + { + bool tmp = erasure[i]; + erasure[i] = erasure[pos]; + erasure[pos] = tmp; + } + } + + // erasure codeword symbols + for (unsigned i = 0; i < kFieldSize; ++i) + if (erasure[i]) + codeword[i] = 0; + + + //---------main processing---------- + decode(codeword, k, erasure); + + // Check the correctness of the result + for (unsigned i = 0; i < kFieldSize; ++i) + { + if (erasure[i] == 1) + { + if (data[i] != codeword[i]) + { + printf("Decoding Error with seed = %d!\n", seed); + LEO_DEBUG_BREAK; + return; + } + } + } + + //printf("Decoding is successful!\n"); +} + + +//------------------------------------------------------------------------------ +// Entrypoint + +int main(int argc, char **argv) +{ + // Initialize architecture-specific code + leo_architecture_init(); + + // Fill GFLog table and GFExp table + InitField(); + + // Compute factors used in erasure decoder + InitFieldOperations(); + + unsigned seed = (unsigned)time(NULL); + for (;;) + { + // test(int k), k: message size + /* + EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, + s.t. the number of recovery pieces is a power of two + */ + test(kFieldSize / 2, seed); + + ++seed; + } + + return 0; +} diff --git a/LeopardFF16.h b/LeopardFF16.h new file mode 100644 index 0000000..71d22e2 --- /dev/null +++ b/LeopardFF16.h @@ -0,0 +1,1220 @@ +/* + Copyright (c) 2017 Christopher A. Taylor. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of LHC-RS nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include + + +/* + TODO: + + Write C API and unit tester + + Limit input to multiples of 64 bytes + + Replace GFSymbol with a file data pointer + + New 16-bit Muladd inner loops + + Class to contain the (large) muladd tables + + Preliminary benchmarks for large data! + + New 8-bit Muladd inner loops + + Benchmarks for smaller data! + + Refactor software + + Pick a name for the software better than LEO_RS + + I think it should be split up into several C++ modules + + Write detailed comments for all the routines + + Look into getting EncodeL working so we can support smaller data (Ask Lin) + + Look into using k instead of k2 to speed up decoder (Ask Lin) + + Avoid performing FFT/IFFT intermediate calculations we're not going to use + + Benchmarks, fun! + + Add multi-threading to split up long parallelizable calculations + + Final benchmarks! + + Finish up documentation + + Release version 1 + + + Muladd implementation notes: + + Specialize for 1-3 rows at a time since often times we're multiplying by + the same (skew) value repeatedly, as the ISA-L library does here: + + https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 + + Except we should be doing it for 16-bit Galois Field. + To implement that use the ALTMAP trick from Jerasure: + + http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 + + Except we should also support AVX2 since that is a 40% perf boost, so put + the high and low bytes 32 bytes instead of 16 bytes apart. + + Also I think we should go ahead and precompute the multiply tables since + it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. 
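+
+    Editor's check on the "8 MB" figure above (an assumption about the
+    intended table layout, matching the 4-bit nibble split that muladd_mem
+    already uses): per 16-bit multiplier, 4 nibble tables * 16 entries *
+    2 bytes = 128 bytes, and 65536 multipliers * 128 bytes = 8 MB.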
+*/ + + +//------------------------------------------------------------------------------ +// Debug + +// Some bugs only repro in release mode, so this can be helpful +//#define LEO_DEBUG_IN_RELEASE + +#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) + #define LEO_DEBUG + #ifdef _WIN32 + #define LEO_DEBUG_BREAK __debugbreak() + #else + #define LEO_DEBUG_BREAK __builtin_trap() + #endif + #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } +#else + #define LEO_DEBUG_BREAK ; + #define LEO_DEBUG_ASSERT(cond) ; +#endif + + +//------------------------------------------------------------------------------ +// Platform/Architecture + +#if defined(ANDROID) || defined(IOS) + #define LEO_TARGET_MOBILE +#endif // ANDROID + +#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) + #define LEO_TRY_AVX2 /* 256-bit */ + #include + #define LEO_ALIGN_BYTES 32 +#else // __AVX2__ + #define LEO_ALIGN_BYTES 16 +#endif // __AVX2__ + +#if !defined(LEO_TARGET_MOBILE) + // Note: MSVC currently only supports SSSE3 but not AVX2 + #include // SSSE3: _mm_shuffle_epi8 + #include // SSE2 +#endif // LEO_TARGET_MOBILE + +#if defined(HAVE_ARM_NEON_H) + #include +#endif // HAVE_ARM_NEON_H + +#if defined(LEO_TARGET_MOBILE) + + #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ + +# if defined(HAVE_ARM_NEON_H) + // Compiler-specific 128-bit SIMD register keyword + #define LEO_M128 uint8x16_t + #define LEO_TRY_NEON +#else + #define LEO_M128 uint64_t +# endif + +#else // LEO_TARGET_MOBILE + + // Compiler-specific 128-bit SIMD register keyword + #define LEO_M128 __m128i + +#endif // LEO_TARGET_MOBILE + +#ifdef LEO_TRY_AVX2 + // Compiler-specific 256-bit SIMD register keyword + #define LEO_M256 __m256i +#endif + +// Compiler-specific C++11 restrict keyword +#define LEO_RESTRICT __restrict + +// Compiler-specific force inline keyword +#ifdef _MSC_VER + #define LEO_FORCE_INLINE inline __forceinline +#else + #define LEO_FORCE_INLINE inline __attribute__((always_inline)) +#endif + +// Compiler-specific alignment keyword +// Note: Alignment only matters for ARM NEON where it should be 16 +#ifdef _MSC_VER + #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) +#else // _MSC_VER + #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) +#endif // _MSC_VER + + +//------------------------------------------------------------------------------ +// Runtime CPU Architecture Check +// +// Feature checks stolen shamelessly from +// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c + +#if defined(HAVE_ANDROID_GETCPUFEATURES) + #include +#endif + +#if defined(LEO_TRY_NEON) +# if defined(IOS) && defined(__ARM_NEON__) + // Requires iPhone 5S or newer + static const bool CpuHasNeon = true; + static const bool CpuHasNeon64 = true; +# else + // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures + static bool CpuHasNeon = false; // V6 / V7 + static bool CpuHasNeon64 = false; // 64-bit +# endif +#endif + + +#if !defined(LEO_TARGET_MOBILE) + +#ifdef _MSC_VER + #include // __cpuid + #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX +#endif + +#ifdef LEO_TRY_AVX2 +static bool CpuHasAVX2 = false; +#endif +static bool CpuHasSSSE3 = false; + +#define CPUID_EBX_AVX2 0x00000020 +#define CPUID_ECX_SSSE3 0x00000200 + +static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) +{ +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) + __cpuid((int 
*) cpu_info, cpu_info_type); +#else //if defined(HAVE_CPUID) + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; +# ifdef __i386__ + __asm__ __volatile__ ("pushfl; pushfl; " + "popl %0; " + "movl %0, %1; xorl %2, %0; " + "pushl %0; " + "popfl; pushfl; popl %0; popfl" : + "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : + "i" (0x200000)); + if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { + return; /* LCOV_EXCL_LINE */ + } +# endif +# ifdef __i386__ + __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : + "=a" (cpu_info[0]), "=&r" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# elif defined(__x86_64__) + __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : + "=a" (cpu_info[0]), "=&r" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# else + __asm__ __volatile__ ("cpuid" : + "=a" (cpu_info[0]), "=b" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (cpu_info_type), "2" (0U)); +# endif +#endif +} + +#endif // defined(LEO_TARGET_MOBILE) + + +static void leo_architecture_init() +{ +#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) + AndroidCpuFamily family = android_getCpuFamily(); + if (family == ANDROID_CPU_FAMILY_ARM) + { + if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) + CpuHasNeon = true; + } + else if (family == ANDROID_CPU_FAMILY_ARM64) + { + CpuHasNeon = true; + if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) + CpuHasNeon64 = true; + } +#endif + +#if !defined(LEO_TARGET_MOBILE) + unsigned int cpu_info[4]; + + _cpuid(cpu_info, 1); + CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); + +#if defined(LEO_TRY_AVX2) + _cpuid(cpu_info, 7); + CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); +#endif // LEO_TRY_AVX2 + +#endif // LEO_TARGET_MOBILE +} + + +//------------------------------------------------------------------------------ +// SIMD-Safe Aligned Memory Allocations + +static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; + +LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) +{ + return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); +} + +static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) +{ + uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); + if (!data) + return nullptr; + unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); + data += kAlignmentBytes - offset; + data[-1] = (uint8_t)offset; + return data; +} + +static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) +{ + if (!ptr) + return; + uint8_t* data = (uint8_t*)ptr; + unsigned offset = data[-1]; + if (offset >= kAlignmentBytes) + { + LEO_DEBUG_BREAK; // Should never happen + return; + } + data -= kAlignmentBytes - offset; + free(data); +} + + +//------------------------------------------------------------------------------ +// Field + +//#define LEO_SHORT_FIELD + +#ifdef LEO_SHORT_FIELD +typedef uint8_t GFSymbol; +static const unsigned kGFBits = 8; +static const unsigned kGFPolynomial = 0x11D; +GFSymbol kGFBasis[kGFBits] = { + 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis +}; +#else +typedef uint16_t GFSymbol; +static const unsigned kGFBits = 16; +static const unsigned kGFPolynomial = 0x1002D; +GFSymbol kGFBasis[kGFBits] = { + 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis + 0xC582, 0xED2E, 0x914C, 0x4012, + 0x6C98, 0x10D8, 0x6A72, 0xB900, + 0xFDB8, 0xFB34, 0xFF38, 0x991E +}; +#endif + +/* + Cantor Basis introduced by: + D. G. 
Cantor, "On arithmetical algorithms over finite fields",
+    Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989.
+*/
+
+static const unsigned kFieldSize = (unsigned)1 << kGFBits; // Field size
+static const unsigned kFieldModulus = kFieldSize - 1;
+
+static GFSymbol GFLog[kFieldSize];
+static GFSymbol GFExp[kFieldSize];
+
+// Initialize GFLog[], GFExp[]
+static void InitField()
+{
+    // Generate the field elements with an LFSR, recording each element's
+    // discrete logarithm along the way
+    unsigned state = 1;
+    for (unsigned i = 0; i < kFieldModulus; ++i)
+    {
+        GFExp[state] = static_cast<GFSymbol>(i);
+        state <<= 1;
+        if (state >= kFieldSize)
+            state ^= kGFPolynomial;
+    }
+    GFExp[0] = kFieldModulus;
+
+    // Conversion to chosen basis:
+
+    GFLog[0] = 0;
+    for (unsigned i = 0; i < kGFBits; ++i)
+    {
+        const GFSymbol basis = kGFBasis[i];
+        const unsigned width = (unsigned)(1UL << i);
+
+        for (unsigned j = 0; j < width; ++j)
+            GFLog[j + width] = GFLog[j] ^ basis;
+    }
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        GFLog[i] = GFExp[GFLog[i]];
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        GFExp[GFLog[i]] = i;
+
+    GFExp[kFieldModulus] = GFExp[0];
+}
+
+
+//------------------------------------------------------------------------------
+// Mod Q Field Operations
+//
+// Q is the maximum symbol value, e.g. 255 or 65535.
+
+// z = x + y (mod Q)
+static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b)
+{
+    const unsigned sum = (unsigned)a + b;
+
+    // Partial reduction step, allowing for Q to be returned
+    return static_cast<GFSymbol>(sum + (sum >> kGFBits));
+}
+
+// z = x - y (mod Q)
+static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b)
+{
+    const unsigned dif = (unsigned)a - b;
+
+    // Partial reduction step, allowing for Q to be returned
+    return static_cast<GFSymbol>(dif + (dif >> kGFBits));
+}
+
+// vx[] += vy[] * z
+static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount)
+{
+    for (unsigned i = 0; i < symbolCount; ++i)
+    {
+        const GFSymbol a = vy[i];
+        if (a == 0)
+            continue;
+
+        // Multiply each 4-bit nibble of the symbol separately in the log
+        // domain; a zero nibble has no logarithm, so its contribution is
+        // forced to zero
+        GFSymbol sum1 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f], z));
+        GFSymbol value1 = GFExp[sum1];
+        if ((a & 0x0f) == 0)
+        {
+            value1 = 0;
+        }
+        GFSymbol sum2 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf0], z));
+        GFSymbol value2 = GFExp[sum2];
+        if ((a & 0xf0) == 0)
+        {
+            value2 = 0;
+        }
+        GFSymbol sum3 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f00], z));
+        GFSymbol value3 = GFExp[sum3];
+        if ((a & 0x0f00) == 0)
+        {
+            value3 = 0;
+        }
+        GFSymbol sum4 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf000], z));
+        GFSymbol value4 = GFExp[sum4];
+        if ((a & 0xf000) == 0)
+        {
+            value4 = 0;
+        }
+
+        vx[i] ^= value1;
+        vx[i] ^= value2;
+        vx[i] ^= value3;
+        vx[i] ^= value4;
+    }
+}
+
+// return a*GFExp[b] over GF(2^r)
+static GFSymbol mulE(GFSymbol a, GFSymbol b)
+{
+    if (a == 0)
+        return 0;
+
+    const GFSymbol sum = static_cast<GFSymbol>(AddModQ(GFLog[a], b));
+    return GFExp[sum];
+}
+
+
+//------------------------------------------------------------------------------
+// Fast Walsh-Hadamard Transform (FWHT) Mod Q
+//
+// Q is the maximum symbol value, e.g. 255 or 65535.
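+
+/*
+    Worked example of the AddModQ()/SubModQ() partial reduction used by
+    FWHT_2() below (a sketch for intuition, shown in the 8-bit field where
+    Q = 255): AddModQ(250, 10) computes sum = 260 = 0x104, then folds the
+    carry back in: 260 + (260 >> 8) = 261, which truncates to 5 = 260 mod 255.
+    The one non-canonical value it can return is Q itself, e.g.
+    AddModQ(255, 255) = 255, which is congruent to 0 mod Q; InitField()
+    covers this case by aliasing GFExp[kFieldModulus] = GFExp[0].
+*/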
+ +// Define this to enable the optimized version of FWHT() +#define LEO_FWHT_OPTIMIZED + +typedef GFSymbol fwht_t; + +// {a, b} = {a + b, a - b} (Mod Q) +static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) +{ + const fwht_t sum = AddModQ(a, b); + const fwht_t dif = SubModQ(a, b); + a = sum; + b = dif; +} + +/* + FWHT is a minor slice of the runtime and does not grow with data size, + but I did attempt a few additional optimizations that failed: + + I've attempted to vectorize (with partial reductions) FWHT_4(data, s), + which is 70% of the algorithm, but it was slower. Left in _attic_. + + I've attempted to avoid reductions in all or parts of the FWHT. + The final modular reduction ends up being slower than the savings. + Specifically I tried doing it for the whole FWHT and also I tried + doing it just for the FWHT_2 loop in the main routine, but both + approaches are slower than partial reductions. + + Replacing word reads with wider reads does speed up the operation, but + at too high a complexity cost relative to minor perf improvement. +*/ + +#ifndef LEO_FWHT_OPTIMIZED + +// Reference implementation +static void FWHT(fwht_t* data, const unsigned bits) +{ + const unsigned size = (unsigned)(1UL << bits); + for (unsigned width = 1; width < size; width <<= 1) + for (unsigned i = 0; i < size; i += (width << 1)) + for (unsigned j = i; j < (width + i); ++j) + FWHT_2(data[j], data[j + width]); +} + +#else + +static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; +} + +static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) +{ + unsigned x = 0; + fwht_t t0 = data[x]; x += s; + fwht_t t1 = data[x]; x += s; + fwht_t t2 = data[x]; x += s; + fwht_t t3 = data[x]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + unsigned y = 0; + data[y] = t0; y += s; + data[y] = t1; y += s; + data[y] = t2; y += s; + data[y] = t3; +} + +static inline void FWHT_8(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + fwht_t t4 = data[4]; + fwht_t t5 = data[5]; + fwht_t t6 = data[6]; + fwht_t t7 = data[7]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t4, t5); + FWHT_2(t6, t7); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + FWHT_2(t4, t6); + FWHT_2(t5, t7); + FWHT_2(t0, t4); + FWHT_2(t1, t5); + FWHT_2(t2, t6); + FWHT_2(t3, t7); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; + data[4] = t4; + data[5] = t5; + data[6] = t6; + data[7] = t7; +} + +static inline void FWHT_16(fwht_t* data) +{ + fwht_t t0 = data[0]; + fwht_t t1 = data[1]; + fwht_t t2 = data[2]; + fwht_t t3 = data[3]; + fwht_t t4 = data[4]; + fwht_t t5 = data[5]; + fwht_t t6 = data[6]; + fwht_t t7 = data[7]; + fwht_t t8 = data[8]; + fwht_t t9 = data[9]; + fwht_t t10 = data[10]; + fwht_t t11 = data[11]; + fwht_t t12 = data[12]; + fwht_t t13 = data[13]; + fwht_t t14 = data[14]; + fwht_t t15 = data[15]; + FWHT_2(t0, t1); + FWHT_2(t2, t3); + FWHT_2(t4, t5); + FWHT_2(t6, t7); + FWHT_2(t8, t9); + FWHT_2(t10, t11); + FWHT_2(t12, t13); + FWHT_2(t14, t15); + FWHT_2(t0, t2); + FWHT_2(t1, t3); + FWHT_2(t4, t6); + FWHT_2(t5, t7); + FWHT_2(t8, t10); + FWHT_2(t9, t11); + FWHT_2(t12, t14); + FWHT_2(t13, t15); + FWHT_2(t0, t4); + FWHT_2(t1, t5); + FWHT_2(t2, t6); + FWHT_2(t3, t7); + FWHT_2(t8, t12); + FWHT_2(t9, t13); + 
FWHT_2(t10, t14); + FWHT_2(t11, t15); + FWHT_2(t0, t8); + FWHT_2(t1, t9); + FWHT_2(t2, t10); + FWHT_2(t3, t11); + FWHT_2(t4, t12); + FWHT_2(t5, t13); + FWHT_2(t6, t14); + FWHT_2(t7, t15); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; + data[4] = t4; + data[5] = t5; + data[6] = t6; + data[7] = t7; + data[8] = t8; + data[9] = t9; + data[10] = t10; + data[11] = t11; + data[12] = t12; + data[13] = t13; + data[14] = t14; + data[15] = t15; +} + +static void FWHT_SmallData(fwht_t* data, unsigned ldn) +{ + const unsigned n = (1UL << ldn); + + if (n <= 2) + { + if (n == 2) + FWHT_2(data[0], data[1]); + return; + } + + for (unsigned ldm = ldn; ldm > 3; ldm -= 2) + { + unsigned m = (1UL << ldm); + unsigned m4 = (m >> 2); + for (unsigned r = 0; r < n; r += m) + for (unsigned j = 0; j < m4; j++) + FWHT_4(data + j + r, m4); + } + + if (ldn & 1) + { + for (unsigned i0 = 0; i0 < n; i0 += 8) + FWHT_8(data + i0); + } + else + { + for (unsigned i0 = 0; i0 < n; i0 += 4) + FWHT_4(data + i0); + } +} + +// Decimation in time (DIT) version +static void FWHT(fwht_t* data, const unsigned ldn) +{ + if (ldn <= 13) + { + FWHT_SmallData(data, ldn); + return; + } + + FWHT_2(data[2], data[3]); + FWHT_4(data + 4); + FWHT_8(data + 8); + FWHT_16(data + 16); + for (unsigned ldm = 5; ldm < ldn; ++ldm) + FWHT(data + (unsigned)(1UL << ldm), ldm); + + for (unsigned ldm = 0; ldm < ldn; ++ldm) + { + const unsigned mh = (1UL << ldm); + for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) + FWHT_2(data[t1], data[t2]); + } +} + +#endif + + +//------------------------------------------------------------------------------ +// Memory Buffer XOR + +static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) +{ + LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); + const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); + +#if defined(LEO_TARGET_MOBILE) +# if defined(LEO_TRY_NEON) + // Handle multiples of 64 bytes + if (CpuHasNeon) + { + while (bytes >= 64) + { + LEO_M128 x0 = vld1q_u8(x16); + LEO_M128 x1 = vld1q_u8(x16 + 1); + LEO_M128 x2 = vld1q_u8(x16 + 2); + LEO_M128 x3 = vld1q_u8(x16 + 3); + LEO_M128 y0 = vld1q_u8(y16); + LEO_M128 y1 = vld1q_u8(y16 + 1); + LEO_M128 y2 = vld1q_u8(y16 + 2); + LEO_M128 y3 = vld1q_u8(y16 + 3); + + vst1q_u8(x16, veorq_u8(x0, y0)); + vst1q_u8(x16 + 1, veorq_u8(x1, y1)); + vst1q_u8(x16 + 2, veorq_u8(x2, y2)); + vst1q_u8(x16 + 3, veorq_u8(x3, y3)); + + bytes -= 64, x16 += 4, y16 += 4; + } + + // Handle multiples of 16 bytes + while (bytes >= 16) + { + LEO_M128 x0 = vld1q_u8(x16); + LEO_M128 y0 = vld1q_u8(y16); + + vst1q_u8(x16, veorq_u8(x0, y0)); + + bytes -= 16, ++x16, ++y16; + } + } + else +# endif // LEO_TRY_NEON + { + uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); + const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); + + const unsigned count = (unsigned)bytes / 8; + for (unsigned ii = 0; ii < count; ++ii) + x8[ii] ^= y8[ii]; + + x16 = reinterpret_cast(x8 + count); + y16 = reinterpret_cast(y8 + count); + } +#else // LEO_TARGET_MOBILE +# if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) + { + LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); + const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); + + while (bytes >= 128) + { + LEO_M256 x0 = _mm256_loadu_si256(x32); + LEO_M256 y0 = _mm256_loadu_si256(y32); + x0 = _mm256_xor_si256(x0, y0); + LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); + LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); + x1 = _mm256_xor_si256(x1, y1); + LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); + LEO_M256 y2 = _mm256_loadu_si256(y32 + 
2); + x2 = _mm256_xor_si256(x2, y2); + LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); + LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); + x3 = _mm256_xor_si256(x3, y3); + + _mm256_storeu_si256(x32, x0); + _mm256_storeu_si256(x32 + 1, x1); + _mm256_storeu_si256(x32 + 2, x2); + _mm256_storeu_si256(x32 + 3, x3); + + bytes -= 128, x32 += 4, y32 += 4; + } + + // Handle multiples of 32 bytes + while (bytes >= 32) + { + // x[i] = x[i] xor y[i] + _mm256_storeu_si256(x32, + _mm256_xor_si256( + _mm256_loadu_si256(x32), + _mm256_loadu_si256(y32))); + + bytes -= 32, ++x32, ++y32; + } + + x16 = reinterpret_cast(x32); + y16 = reinterpret_cast(y32); + } + else +# endif // LEO_TRY_AVX2 + { + while (bytes >= 64) + { + LEO_M128 x0 = _mm_loadu_si128(x16); + LEO_M128 y0 = _mm_loadu_si128(y16); + x0 = _mm_xor_si128(x0, y0); + LEO_M128 x1 = _mm_loadu_si128(x16 + 1); + LEO_M128 y1 = _mm_loadu_si128(y16 + 1); + x1 = _mm_xor_si128(x1, y1); + LEO_M128 x2 = _mm_loadu_si128(x16 + 2); + LEO_M128 y2 = _mm_loadu_si128(y16 + 2); + x2 = _mm_xor_si128(x2, y2); + LEO_M128 x3 = _mm_loadu_si128(x16 + 3); + LEO_M128 y3 = _mm_loadu_si128(y16 + 3); + x3 = _mm_xor_si128(x3, y3); + + _mm_storeu_si128(x16, x0); + _mm_storeu_si128(x16 + 1, x1); + _mm_storeu_si128(x16 + 2, x2); + _mm_storeu_si128(x16 + 3, x3); + + bytes -= 64, x16 += 4, y16 += 4; + } + } +#endif // LEO_TARGET_MOBILE + + // Handle multiples of 16 bytes + while (bytes >= 16) + { + // x[i] = x[i] xor y[i] + _mm_storeu_si128(x16, + _mm_xor_si128( + _mm_loadu_si128(x16), + _mm_loadu_si128(y16))); + + bytes -= 16, ++x16, ++y16; + } + + uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); + const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); + + // Handle a block of 8 bytes + const unsigned eight = bytes & 8; + if (eight) + { + uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); + const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); + *x8 ^= *y8; + } + + // Handle a block of 4 bytes + const unsigned four = bytes & 4; + if (four) + { + uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); + const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); + *x4 ^= *y4; + } + + // Handle final bytes + const unsigned offset = eight + four; + switch (bytes & 3) + { + case 3: x1[offset + 2] ^= y1[offset + 2]; + case 2: x1[offset + 1] ^= y1[offset + 1]; + case 1: x1[offset] ^= y1[offset]; + default: + break; + } +} + + +//------------------------------------------------------------------------------ +// Formal Derivative + +// Formal derivative of polynomial in the new basis +static void formal_derivative(GFSymbol* cos, const unsigned size) +{ + for (unsigned i = 1; i < size; ++i) + { + const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; + + // If a large number of values are being XORed: + if (leng >= 8) + xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); + else + for (unsigned j = i - leng; j < i; j++) + cos[j] ^= cos[j + leng]; + } + + for (unsigned i = size; i < kFieldSize; i <<= 1) + xor_mem(cos, cos + i, size * sizeof(GFSymbol)); +} + + +//------------------------------------------------------------------------------ +// Fast Fourier Transform + +static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT + +// IFFT in the proposed basis +static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) +{ + for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) + { + for (unsigned j = depart_no; j < size; j += (depart_no << 1)) + { + // If a large number of values are being XORed: + if (depart_no >= 8) + xor_mem(data + j, data + j - 
depart_no, depart_no * sizeof(GFSymbol)); + else + for (unsigned i = j - depart_no; i < j; ++i) + data[i + depart_no] ^= data[i]; + + const GFSymbol skew = skewVec[j + index - 1]; + + if (skew != kFieldModulus) + muladd_mem(data + j - depart_no, data + j, skew, depart_no); + } + } +} + +// FFT in the proposed basis +static void FLT(GFSymbol* data, const unsigned size, const unsigned index) +{ + for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) + { + for (unsigned j = depart_no; j < size; j += (depart_no << 1)) + { + const GFSymbol skew = skewVec[j + index - 1]; + + if (skew != kFieldModulus) + muladd_mem(data + j - depart_no, data + j, skew, depart_no); + + // If a large number of values are being XORed: + if (depart_no >= 8) + xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); + else + for (unsigned i = j - depart_no; i < j; ++i) + data[i + depart_no] ^= data[i]; + } + } +} + + +//------------------------------------------------------------------------------ +// FFT Initialization + +static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative +static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial + +// Initialize skewVec[], B[], log_walsh[] +static void InitFieldOperations() +{ + GFSymbol temp[kGFBits - 1]; + + for (unsigned i = 1; i < kGFBits; ++i) + temp[i - 1] = (GFSymbol)((unsigned)1 << i); + + for (unsigned m = 0; m < (kGFBits - 1); ++m) + { + const unsigned step = (unsigned)1 << (m + 1); + + skewVec[((unsigned)1 << m) - 1] = 0; + + for (unsigned i = m; i < (kGFBits - 1); ++i) + { + const unsigned s = ((unsigned)1 << (i + 1)); + + for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) + skewVec[j + s] = skewVec[j] ^ temp[i]; + } + + temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; + + for (unsigned i = m + 1; i < (kGFBits - 1); ++i) + temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); + } + + for (unsigned i = 0; i < kFieldSize; ++i) + skewVec[i] = GFLog[skewVec[i]]; + + temp[0] = kFieldModulus - temp[0]; + + for (unsigned i = 1; i < (kGFBits - 1); ++i) + temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; + + B[0] = 0; + for (unsigned i = 0; i < (kGFBits - 1); ++i) + { + const unsigned depart = ((unsigned)1 << i); + + for (unsigned j = 0; j < depart; ++j) + B[j + depart] = (B[j] + temp[i]) % kFieldModulus; + } + + for (unsigned i = 0; i < kFieldSize; ++i) + log_walsh[i] = GFLog[i]; + + log_walsh[0] = 0; + + FWHT(log_walsh, kGFBits); +} + + +//------------------------------------------------------------------------------ +// Encoder + +// Encoding alg for k/n<0.5: message is a power of two +static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) +{ + memcpy(codeword, data, sizeof(GFSymbol) * k); + + IFLT(codeword, k, 0); + + for (unsigned i = k; i < kFieldSize; i += k) + { + memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); + + FLT(&codeword[i], k, i); + } + + memcpy(codeword, data, sizeof(GFSymbol) * k); +} + +// Encoding alg for k/n>0.5: parity is a power of two. +// data: message array. parity: parity array. 
mem: scratch buffer (size >= n - k)
+static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem)
+{
+    const unsigned t = kFieldSize - k;
+
+    memset(parity, 0, sizeof(GFSymbol) * t);
+
+    for (unsigned i = t; i < kFieldSize; i += t)
+    {
+        memcpy(mem, &data[i - t], sizeof(GFSymbol) * t);
+
+        IFLT(mem, t, i);
+
+        xor_mem(parity, mem, t * sizeof(GFSymbol));
+    }
+
+    FLT(parity, t, 0);
+}
+
+
+//------------------------------------------------------------------------------
+// Decoder
+
+static void decode(GFSymbol* codeword, unsigned k, const bool* erasure)
+{
+    fwht_t log_walsh2[kFieldSize];
+
+    // Compute the evaluations of the error locator polynomial
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        log_walsh2[i] = erasure[i] ? 1 : 0;
+
+    FWHT(log_walsh2, kGFBits);
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus;
+
+    FWHT(log_walsh2, kGFBits);
+
+    // Ideally k2 would be k, to shrink the transforms below, but simply
+    // setting k2 = k breaks decoding; see the TODO list (Ask Lin)
+    const unsigned k2 = kFieldSize;
+    //const unsigned k2 = k; // cannot actually be replaced with k. What else needs to change?
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+    {
+        if (erasure[i])
+        {
+            codeword[i] = 0;
+        }
+        else
+        {
+            codeword[i] = mulE(codeword[i], log_walsh2[i]);
+        }
+    }
+
+    IFLT(codeword, kFieldSize, 0);
+
+    // formal derivative
+    for (unsigned i = 0; i < kFieldSize; i += 2)
+    {
+        codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]);
+        codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]);
+    }
+
+    formal_derivative(codeword, k2);
+
+    for (unsigned i = 0; i < k2; i += 2)
+    {
+        codeword[i] = mulE(codeword[i], B[i >> 1]);
+        codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]);
+    }
+
+    FLT(codeword, k2, 0);
+
+    for (unsigned i = 0; i < k2; ++i)
+    {
+        if (erasure[i])
+        {
+            codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]);
+        }
+    }
+}
+
+
+//------------------------------------------------------------------------------
+// Test Application
+
+void test(unsigned k, unsigned seed)
+{
+    srand(seed);
+
+    //-----------Generating message----------
+
+    // Message array
+    GFSymbol data[kFieldSize] = {0};
+
+    // Filled with random numbers
+    for (unsigned i = kFieldSize - k; i < kFieldSize; ++i)
+        data[i] = (GFSymbol)rand();
+
+
+    //---------encoding----------
+
+    GFSymbol codeword[kFieldSize];
+    encodeH(&data[kFieldSize - k], k, data, codeword);
+    //encodeL(data, k, codeword); // does not seem to work with any input? What else needs to change?
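+    // Note: encodeH() above has written the parity symbols into
+    // data[0..(n-k)), using codeword[] only as scratch space; the memcpy
+    // below then assembles the full codeword = [parity | message] before
+    // erasures are applied.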
+ + memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); + + + //--------erasure simulation--------- + + // Array indicating erasures + bool erasure[kFieldSize] = { + false + }; + + for (unsigned i = k; i < kFieldSize; ++i) + erasure[i] = true; + + // permuting the erasure array + for (unsigned i = kFieldSize - 1; i > 0; --i) + { + unsigned pos = rand() % (i + 1); + + if (i != pos) + { + bool tmp = erasure[i]; + erasure[i] = erasure[pos]; + erasure[pos] = tmp; + } + } + + // erasure codeword symbols + for (unsigned i = 0; i < kFieldSize; ++i) + if (erasure[i]) + codeword[i] = 0; + + + //---------main processing---------- + decode(codeword, k, erasure); + + // Check the correctness of the result + for (unsigned i = 0; i < kFieldSize; ++i) + { + if (erasure[i] == 1) + { + if (data[i] != codeword[i]) + { + printf("Decoding Error with seed = %d!\n", seed); + LEO_DEBUG_BREAK; + return; + } + } + } + + //printf("Decoding is successful!\n"); +} + + +//------------------------------------------------------------------------------ +// Entrypoint + +int main(int argc, char **argv) +{ + // Initialize architecture-specific code + leo_architecture_init(); + + // Fill GFLog table and GFExp table + InitField(); + + // Compute factors used in erasure decoder + InitFieldOperations(); + + unsigned seed = (unsigned)time(NULL); + for (;;) + { + // test(int k), k: message size + /* + EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, + s.t. the number of recovery pieces is a power of two + */ + test(kFieldSize / 2, seed); + + ++seed; + } + + return 0; +} diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp new file mode 100644 index 0000000..030a555 --- /dev/null +++ b/LeopardFF8.cpp @@ -0,0 +1,840 @@ +/* + Copyright (c) 2017 Christopher A. Taylor. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of LHC-RS nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+*/
+
+#include "LeopardFF8.h"
+
+namespace leopard { namespace ff8 {
+
+
+//------------------------------------------------------------------------------
+// Datatypes and Constants
+
+// LFSR Polynomial that generates the field elements
+static const unsigned kPolynomial = 0x11D;
+
+// Basis used for generating logarithm tables
+static const ffe_t kBasis[kBits] = {
+    1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis
+    // 1, 2, 4, 8, 16, 32, 64, 128 // Monomial basis
+};
+
+
+//------------------------------------------------------------------------------
+// Field Operations
+
+// Modulus for field operations
+static const ffe_t kModulus = 255;
+
+// z = x + y (mod kModulus)
+static inline ffe_t AddMod(const ffe_t a, const ffe_t b)
+{
+    const unsigned sum = (unsigned)a + b;
+
+    // Partial reduction step, allowing for kModulus to be returned
+    return static_cast<ffe_t>(sum + (sum >> kBits));
+}
+
+// z = x - y (mod kModulus)
+static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
+{
+    const unsigned dif = (unsigned)a - b;
+
+    // Partial reduction step, allowing for kModulus to be returned
+    return static_cast<ffe_t>(dif + (dif >> kBits));
+}
+
+
+//------------------------------------------------------------------------------
+// Logarithm Tables
+
+static ffe_t LogLUT[kOrder];
+static ffe_t ExpLUT[kOrder];
+
+
+// Initialize LogLUT[], ExpLUT[]
+static void InitializeLogarithmTables()
+{
+    // LFSR table generation:
+
+    unsigned state = 1;
+    for (unsigned i = 0; i < kModulus; ++i)
+    {
+        ExpLUT[state] = static_cast<ffe_t>(i);
+        state <<= 1;
+        if (state >= kOrder)
+            state ^= kPolynomial;
+    }
+    ExpLUT[0] = kModulus;
+
+    // Conversion to chosen basis:
+
+    LogLUT[0] = 0;
+    for (unsigned i = 0; i < kBits; ++i)
+    {
+        const ffe_t basis = kBasis[i];
+        const unsigned width = static_cast<unsigned>(1UL << i);
+
+        for (unsigned j = 0; j < width; ++j)
+            LogLUT[j + width] = LogLUT[j] ^ basis;
+    }
+
+    for (unsigned i = 0; i < kOrder; ++i)
+        LogLUT[i] = ExpLUT[LogLUT[i]];
+
+    for (unsigned i = 0; i < kOrder; ++i)
+        ExpLUT[LogLUT[i]] = i;
+
+    ExpLUT[kModulus] = ExpLUT[0];
+}
+
+
+//------------------------------------------------------------------------------
+// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
+
+// {a, b} = {a + b, a - b} (Mod Q)
+// Note: Defined outside the #if below since the reference FWHT() needs it too
+static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
+{
+    const ffe_t sum = AddMod(a, b);
+    const ffe_t dif = SubMod(a, b);
+    a = sum;
+    b = dif;
+}
+
+#if defined(LEO_FF8_FWHT_OPTIMIZED)
+
+static LEO_FORCE_INLINE void FWHT_4(ffe_t* data)
+{
+    ffe_t t0 = data[0];
+    ffe_t t1 = data[1];
+    ffe_t t2 = data[2];
+    ffe_t t3 = data[3];
+    FWHT_2(t0, t1);
+    FWHT_2(t2, t3);
+    FWHT_2(t0, t2);
+    FWHT_2(t1, t3);
+    data[0] = t0;
+    data[1] = t1;
+    data[2] = t2;
+    data[3] = t3;
+}
+
+static LEO_FORCE_INLINE void FWHT_4(ffe_t* data, unsigned s)
+{
+    unsigned x = 0;
+    ffe_t t0 = data[x]; x += s;
+    ffe_t t1 = data[x]; x += s;
+    ffe_t t2 = data[x]; x += s;
+    ffe_t t3 = data[x];
+    FWHT_2(t0, t1);
+    FWHT_2(t2, t3);
+    FWHT_2(t0, t2);
+    FWHT_2(t1, t3);
+    unsigned y = 0;
+    data[y] = t0; y += s;
+    data[y] = t1; y += s;
+    data[y] = t2; y += s;
+    data[y] = t3;
+}
+
+static inline void FWHT_8(ffe_t* data)
+{
+    ffe_t t0 = data[0];
+    ffe_t t1 = data[1];
+    ffe_t t2 = data[2];
+    ffe_t t3 = data[3];
+    ffe_t t4 = data[4];
+    ffe_t t5 = data[5];
+    ffe_t t6 = data[6];
+    ffe_t t7 = data[7];
+    FWHT_2(t0, t1);
+    FWHT_2(t2, t3);
+    FWHT_2(t4, t5);
+    FWHT_2(t6, t7);
+    FWHT_2(t0, t2);
+    FWHT_2(t1, t3);
+    FWHT_2(t4, t6);
+    FWHT_2(t5, t7);
+    FWHT_2(t0, t4);
+    FWHT_2(t1, t5);
+    FWHT_2(t2, t6);
FWHT_2(t3, t7); + data[0] = t0; + data[1] = t1; + data[2] = t2; + data[3] = t3; + data[4] = t4; + data[5] = t5; + data[6] = t6; + data[7] = t7; +} + +// Decimation in time (DIT) version +static void FWHT(ffe_t* data, const unsigned ldn) +{ + const unsigned n = (1UL << ldn); + + if (n <= 2) + { + if (n == 2) + FWHT_2(data[0], data[1]); + return; + } + + for (unsigned ldm = ldn; ldm > 3; ldm -= 2) + { + unsigned m = (1UL << ldm); + unsigned m4 = (m >> 2); + for (unsigned r = 0; r < n; r += m) + for (unsigned j = 0; j < m4; j++) + FWHT_4(data + j + r, m4); + } + + if (ldn & 1) + { + for (unsigned i0 = 0; i0 < n; i0 += 8) + FWHT_8(data + i0); + } + else + { + for (unsigned i0 = 0; i0 < n; i0 += 4) + FWHT_4(data + i0); + } +} + +#else // LEO_FF8_FWHT_OPTIMIZED + +// Reference implementation +void FWHT(ffe_t* data, const unsigned bits) +{ + const unsigned size = (unsigned)(1UL << bits); + for (unsigned width = 1; width < size; width <<= 1) + for (unsigned i = 0; i < size; i += (width << 1)) + for (unsigned j = i; j < (width + i); ++j) + FWHT_2(data[j], data[j + width]); +} + +#endif // LEO_FF8_FWHT_OPTIMIZED + +// Transform specialized for the finite field order +void FWHT(ffe_t data[kOrder]) +{ + FWHT(data, kBits); +} + + +//------------------------------------------------------------------------------ +// XOR Memory + +void xor_mem( + void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, + unsigned bytes) +{ +#if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) + { + LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(vx); + const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(vy); + do + { + const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32)); + const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1)); + const LEO_M256 x2 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 2), _mm256_loadu_si256(y32 + 2)); + const LEO_M256 x3 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 3), _mm256_loadu_si256(y32 + 3)); + _mm256_storeu_si256(x32, x0); + _mm256_storeu_si256(x32 + 1, x1); + _mm256_storeu_si256(x32 + 2, x2); + _mm256_storeu_si256(x32 + 3, x3); + bytes -= 128, x32 += 4, y32 += 4; + } while (bytes >= 128); + if (bytes > 0) + { + const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32)); + const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1)); + _mm256_storeu_si256(x32, x0); + _mm256_storeu_si256(x32 + 1, x1); + } + return; + } +#endif // LEO_TRY_AVX2 + LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); + const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); + do + { + const LEO_M128 x0 = _mm_xor_si128(_mm_loadu_si128(x16), _mm_loadu_si128(y16)); + const LEO_M128 x1 = _mm_xor_si128(_mm_loadu_si128(x16 + 1), _mm_loadu_si128(y16 + 1)); + const LEO_M128 x2 = _mm_xor_si128(_mm_loadu_si128(x16 + 2), _mm_loadu_si128(y16 + 2)); + const LEO_M128 x3 = _mm_xor_si128(_mm_loadu_si128(x16 + 3), _mm_loadu_si128(y16 + 3)); + _mm_storeu_si128(x16, x0); + _mm_storeu_si128(x16 + 1, x1); + _mm_storeu_si128(x16 + 2, x2); + _mm_storeu_si128(x16 + 3, x3); + bytes -= 64, x16 += 4, y16 += 4; + } while (bytes > 0); +} + +void xor_mem2( + void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0, + void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1, + unsigned bytes) +{ +#if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) + { + LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast (vx_0); + const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast(vy_0); + LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast (vx_1); 
+ const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast(vy_1); + do + { + const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); + const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); + const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2)); + const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3)); + const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); + const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); + const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2)); + const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3)); + _mm256_storeu_si256(x32_0, x0_0); + _mm256_storeu_si256(x32_0 + 1, x1_0); + _mm256_storeu_si256(x32_0 + 2, x2_0); + _mm256_storeu_si256(x32_0 + 3, x3_0); + _mm256_storeu_si256(x32_1, x0_1); + _mm256_storeu_si256(x32_1 + 1, x1_1); + _mm256_storeu_si256(x32_1 + 2, x2_1); + _mm256_storeu_si256(x32_1 + 3, x3_1); + x32_0 += 4, y32_0 += 4; + x32_1 += 4, y32_1 += 4; + bytes -= 128; + } while (bytes >= 128); + if (bytes > 0) + { + const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); + const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); + const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); + const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); + _mm256_storeu_si256(x32_0, x0_0); + _mm256_storeu_si256(x32_0 + 1, x1_0); + _mm256_storeu_si256(x32_1, x0_1); + _mm256_storeu_si256(x32_1 + 1, x1_1); + } + return; + } +#endif // LEO_TRY_AVX2 + LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast (vx_0); + const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast(vy_0); + LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast (vx_1); + const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast(vy_1); + do + { + const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0)); + const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1)); + const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2)); + const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3)); + const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1)); + const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1)); + const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2)); + const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3)); + _mm_storeu_si128(x16_0, x0_0); + _mm_storeu_si128(x16_0 + 1, x1_0); + _mm_storeu_si128(x16_0 + 2, x2_0); + _mm_storeu_si128(x16_0 + 3, x3_0); + _mm_storeu_si128(x16_1, x0_1); + _mm_storeu_si128(x16_1 + 1, x1_1); + _mm_storeu_si128(x16_1 + 2, x2_1); + _mm_storeu_si128(x16_1 + 3, x3_1); + x16_0 += 4, y16_0 += 4; + x16_1 += 4, y16_1 += 4; + bytes -= 64; + } while (bytes > 0); +} + +void xor_mem3( + void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0, + void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1, + void * LEO_RESTRICT vx_2, const void * LEO_RESTRICT vy_2, + unsigned bytes) +{ +#if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) 
+ { + LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast (vx_0); + const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast(vy_0); + LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast (vx_1); + const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast(vy_1); + LEO_M256 * LEO_RESTRICT x32_2 = reinterpret_cast (vx_2); + const LEO_M256 * LEO_RESTRICT y32_2 = reinterpret_cast(vy_2); + do + { + const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); + const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); + const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2)); + const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3)); + const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); + const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); + const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2)); + const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3)); + const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2)); + const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1)); + const LEO_M256 x2_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 2), _mm256_loadu_si256(y32_2 + 2)); + const LEO_M256 x3_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 3), _mm256_loadu_si256(y32_2 + 3)); + _mm256_storeu_si256(x32_0, x0_0); + _mm256_storeu_si256(x32_0 + 1, x1_0); + _mm256_storeu_si256(x32_0 + 2, x2_0); + _mm256_storeu_si256(x32_0 + 3, x3_0); + _mm256_storeu_si256(x32_1, x0_1); + _mm256_storeu_si256(x32_1 + 1, x1_1); + _mm256_storeu_si256(x32_1 + 2, x2_1); + _mm256_storeu_si256(x32_1 + 3, x3_1); + _mm256_storeu_si256(x32_2, x0_2); + _mm256_storeu_si256(x32_2 + 1, x1_2); + _mm256_storeu_si256(x32_2 + 2, x2_2); + _mm256_storeu_si256(x32_2 + 3, x3_2); + x32_0 += 4, y32_0 += 4; + x32_1 += 4, y32_1 += 4; + x32_2 += 4, y32_2 += 4; + bytes -= 128; + } while (bytes >= 128); + if (bytes > 0) + { + const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); + const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); + const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); + const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); + const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2)); + const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1)); + _mm256_storeu_si256(x32_0, x0_0); + _mm256_storeu_si256(x32_0 + 1, x1_0); + _mm256_storeu_si256(x32_1, x0_1); + _mm256_storeu_si256(x32_1 + 1, x1_1); + _mm256_storeu_si256(x32_2, x0_2); + _mm256_storeu_si256(x32_2 + 1, x1_2); + } + return; + } +#endif // LEO_TRY_AVX2 + LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast (vx_0); + const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast(vy_0); + LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast (vx_1); + const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast(vy_1); + LEO_M128 * LEO_RESTRICT x16_2 = reinterpret_cast (vx_2); + const LEO_M128 * LEO_RESTRICT y16_2 = reinterpret_cast(vy_2); + do + { + const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0)); + 
const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1)); + const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2)); + const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3)); + const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1)); + const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1)); + const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2)); + const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3)); + const LEO_M128 x0_2 = _mm_xor_si128(_mm_loadu_si128(x16_2), _mm_loadu_si128(y16_2)); + const LEO_M128 x1_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 1), _mm_loadu_si128(y16_2 + 1)); + const LEO_M128 x2_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 2), _mm_loadu_si128(y16_2 + 2)); + const LEO_M128 x3_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 3), _mm_loadu_si128(y16_2 + 3)); + _mm_storeu_si128(x16_0, x0_0); + _mm_storeu_si128(x16_0 + 1, x1_0); + _mm_storeu_si128(x16_0 + 2, x2_0); + _mm_storeu_si128(x16_0 + 3, x3_0); + _mm_storeu_si128(x16_1, x0_1); + _mm_storeu_si128(x16_1 + 1, x1_1); + _mm_storeu_si128(x16_1 + 2, x2_1); + _mm_storeu_si128(x16_1 + 3, x3_1); + _mm_storeu_si128(x16_2, x0_2); + _mm_storeu_si128(x16_2 + 1, x1_2); + _mm_storeu_si128(x16_2 + 2, x2_2); + _mm_storeu_si128(x16_2 + 3, x3_2); + x16_0 += 4, y16_0 += 4; + x16_1 += 4, y16_1 += 4; + x16_2 += 4, y16_2 += 4; + bytes -= 64; + } while (bytes > 0); +} + + +//------------------------------------------------------------------------------ +// Multiplies + +// We require memory to be aligned since the SIMD instructions benefit from +// or require aligned accesses to the table data. 
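+
+// Illustrative scalar model (a sketch for exposition, not called by the
+// library paths below) of the per-byte operation that the PSHUFB-based loops
+// evaluate 16 or 32 bytes at a time: multiplication by a fixed field element
+// m is GF(2)-linear, so the product of a byte decomposes into a low-nibble
+// table lookup XORed with a high-nibble table lookup. The function name and
+// parameters here are hypothetical.
+static inline uint8_t RefMulViaNibbleTables(
+    uint8_t x,
+    const uint8_t table_lo[16],  // table_lo[n] = FFEMultiply(n, m)
+    const uint8_t table_hi[16])  // table_hi[n] = FFEMultiply(n << 4, m)
+{
+    return table_lo[x & 0x0f] ^ table_hi[x >> 4];
+}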
+static struct {
+    LEO_ALIGNED LEO_M128 Lo[256];
+    LEO_ALIGNED LEO_M128 Hi[256];
+} Multiply128LUT;
+#if defined(LEO_TRY_AVX2)
+static struct {
+    LEO_ALIGNED LEO_M256 Lo[256];
+    LEO_ALIGNED LEO_M256 Hi[256];
+} Multiply256LUT;
+#endif // LEO_TRY_AVX2
+
+// Returns a * b
+static ffe_t FFEMultiply(ffe_t a, ffe_t b)
+{
+    if (a == 0 || b == 0)
+        return 0;
+    return ExpLUT[AddMod(LogLUT[a], LogLUT[b])];
+}
+
+bool InitializeMultiplyTables()
+{
+    // Stage each 16-entry nibble table in aligned scratch buffers before
+    // loading it into the SIMD tables
+    LEO_ALIGNED uint8_t lo[16];
+    LEO_ALIGNED uint8_t hi[16];
+
+    for (int y = 0; y < 256; ++y)
+    {
+        for (unsigned char x = 0; x < 16; ++x)
+        {
+            lo[x] = FFEMultiply(x, static_cast<ffe_t>(y));
+            hi[x] = FFEMultiply(static_cast<ffe_t>(x << 4), static_cast<ffe_t>(y));
+        }
+
+        const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo);
+        const LEO_M128 table_hi = _mm_loadu_si128((LEO_M128*)hi);
+        _mm_storeu_si128(Multiply128LUT.Lo + y, table_lo);
+        _mm_storeu_si128(Multiply128LUT.Hi + y, table_hi);
+#if defined(LEO_TRY_AVX2)
+        if (CpuHasAVX2)
+        {
+            const LEO_M256 table_lo2 = _mm256_broadcastsi128_si256(table_lo);
+            const LEO_M256 table_hi2 = _mm256_broadcastsi128_si256(table_hi);
+            _mm256_storeu_si256(Multiply256LUT.Lo + y, table_lo2);
+            _mm256_storeu_si256(Multiply256LUT.Hi + y, table_hi2);
+        }
+#endif // LEO_TRY_AVX2
+    }
+
+    return true;
+}
+
+// vx[] = vy[] * m
+void mul_mem_set(
+    void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy,
+    ffe_t m, unsigned bytes)
+{
+    if (m <= 1)
+    {
+        if (m == 1)
+            memcpy(vx, vy, bytes);
+        else
+            memset(vx, 0, bytes);
+        return;
+    }
+
+#if defined(LEO_TRY_AVX2)
+    if (CpuHasAVX2)
+    {
+        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m);
+        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m);
+
+        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
+
+        LEO_M256 * LEO_RESTRICT z32 = reinterpret_cast<LEO_M256 *>(vx);
+        const LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<const LEO_M256 *>(vy);
+
+        const unsigned count = bytes / 64;
+        for (unsigned i = 0; i < count; ++i)
+        {
+            LEO_M256 x0 = _mm256_loadu_si256(x32 + i * 2);
+            LEO_M256 l0 = _mm256_and_si256(x0, clr_mask);
+            x0 = _mm256_srli_epi64(x0, 4);
+            LEO_M256 h0 = _mm256_and_si256(x0, clr_mask);
+            l0 = _mm256_shuffle_epi8(table_lo_y, l0);
+            h0 = _mm256_shuffle_epi8(table_hi_y, h0);
+            _mm256_storeu_si256(z32 + i * 2, _mm256_xor_si256(l0, h0));
+
+            LEO_M256 x1 = _mm256_loadu_si256(x32 + i * 2 + 1);
+            LEO_M256 l1 = _mm256_and_si256(x1, clr_mask);
+            x1 = _mm256_srli_epi64(x1, 4);
+            LEO_M256 h1 = _mm256_and_si256(x1, clr_mask);
+            l1 = _mm256_shuffle_epi8(table_lo_y, l1);
+            h1 = _mm256_shuffle_epi8(table_hi_y, h1);
+            _mm256_storeu_si256(z32 + i * 2 + 1, _mm256_xor_si256(l1, h1));
+        }
+        return;
+    }
+#endif // LEO_TRY_AVX2
+
+    const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m);
+    const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m);
+
+    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
+
+    LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
+    const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
+
+    do
+    {
+        LEO_M128 x3 = _mm_loadu_si128(y16 + 3);
+        LEO_M128 l3 = _mm_and_si128(x3, clr_mask);
+        x3 = _mm_srli_epi64(x3, 4);
+        LEO_M128 h3 = _mm_and_si128(x3, clr_mask);
+        l3 = _mm_shuffle_epi8(table_lo_y, l3);
+        h3 = _mm_shuffle_epi8(table_hi_y, h3);
+
+        LEO_M128 x2 = _mm_loadu_si128(y16 + 2);
+        LEO_M128 l2 = _mm_and_si128(x2, clr_mask);
+        x2 = _mm_srli_epi64(x2, 4);
+        LEO_M128 h2 = _mm_and_si128(x2, clr_mask);
+        l2 = _mm_shuffle_epi8(table_lo_y, l2);
+        h2 = _mm_shuffle_epi8(table_hi_y, h2);
+
+        LEO_M128 x1 = _mm_loadu_si128(y16 + 1);
+        LEO_M128 l1 = _mm_and_si128(x1, clr_mask);
+        x1 = _mm_srli_epi64(x1, 4);
+        LEO_M128 h1 = _mm_and_si128(x1, clr_mask);
+        l1 = _mm_shuffle_epi8(table_lo_y, l1);
+        h1 = _mm_shuffle_epi8(table_hi_y, h1);
+
+        LEO_M128 x0 = _mm_loadu_si128(y16);
+        LEO_M128 l0 = _mm_and_si128(x0, clr_mask);
+        x0 = _mm_srli_epi64(x0, 4);
+        LEO_M128 h0 = _mm_and_si128(x0, clr_mask);
+        l0 = _mm_shuffle_epi8(table_lo_y, l0);
+        h0 = _mm_shuffle_epi8(table_hi_y, h0);
+
+        _mm_storeu_si128(x16 + 3, _mm_xor_si128(l3, h3));
+        _mm_storeu_si128(x16 + 2, _mm_xor_si128(l2, h2));
+        _mm_storeu_si128(x16 + 1, _mm_xor_si128(l1, h1));
+        _mm_storeu_si128(x16, _mm_xor_si128(l0, h0));
+
+        x16 += 4, y16 += 4;
+        bytes -= 64;
+    } while (bytes > 0);
+}
+
+// vx0[] *= m, vx1[] *= m
+void mul_mem2_inplace(
+    void * LEO_RESTRICT vx_0,
+    void * LEO_RESTRICT vx_1,
+    ffe_t m, unsigned bytes)
+{
+    if (m <= 1)
+    {
+        if (m == 0)
+        {
+            memset(vx_0, 0, bytes);
+            memset(vx_1, 0, bytes);
+        }
+        return;
+    }
+
+#if defined(LEO_TRY_AVX2)
+    if (CpuHasAVX2)
+    {
+        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m);
+        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m);
+
+        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
+
+        LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(vx_0);
+        LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(vx_1);
+
+        do
+        {
+            LEO_M256 x0_0 = _mm256_loadu_si256(x32_0 + 1);
+            LEO_M256 l0_0 = _mm256_and_si256(x0_0, clr_mask);
+            x0_0 = _mm256_srli_epi64(x0_0, 4);
+            LEO_M256 h0_0 = _mm256_and_si256(x0_0, clr_mask);
+            l0_0 = _mm256_shuffle_epi8(table_lo_y, l0_0);
+            h0_0 = _mm256_shuffle_epi8(table_hi_y, h0_0);
+            l0_0 = _mm256_xor_si256(l0_0, h0_0);
+
+            LEO_M256 x1_0 = _mm256_loadu_si256(x32_0);
+            LEO_M256 l1_0 = _mm256_and_si256(x1_0, clr_mask);
+            x1_0 = _mm256_srli_epi64(x1_0, 4);
+            LEO_M256 h1_0 = _mm256_and_si256(x1_0, clr_mask);
+            l1_0 = _mm256_shuffle_epi8(table_lo_y, l1_0);
+            h1_0 = _mm256_shuffle_epi8(table_hi_y, h1_0);
+            l1_0 = _mm256_xor_si256(l1_0, h1_0);
+
+            LEO_M256 x0_1 = _mm256_loadu_si256(x32_1 + 1);
+            LEO_M256 l0_1 = _mm256_and_si256(x0_1, clr_mask);
+            x0_1 = _mm256_srli_epi64(x0_1, 4);
+            LEO_M256 h0_1 = _mm256_and_si256(x0_1, clr_mask);
+            l0_1 = _mm256_shuffle_epi8(table_lo_y, l0_1);
+            h0_1 = _mm256_shuffle_epi8(table_hi_y, h0_1);
+            l0_1 = _mm256_xor_si256(l0_1, h0_1);
+
+            LEO_M256 x1_1 = _mm256_loadu_si256(x32_1);
+            LEO_M256 l1_1 = _mm256_and_si256(x1_1, clr_mask);
+            x1_1 = _mm256_srli_epi64(x1_1, 4);
+            LEO_M256 h1_1 = _mm256_and_si256(x1_1, clr_mask);
+            l1_1 = _mm256_shuffle_epi8(table_lo_y, l1_1);
+            h1_1 = _mm256_shuffle_epi8(table_hi_y, h1_1);
+            l1_1 = _mm256_xor_si256(l1_1, h1_1);
+
+            _mm256_storeu_si256(x32_0 + 1, l0_0);
+            _mm256_storeu_si256(x32_0, l1_0);
+            _mm256_storeu_si256(x32_1 + 1, l0_1);
+            _mm256_storeu_si256(x32_1, l1_1);
+
+            x32_0 += 2;
+            x32_1 += 2;
+            bytes -= 64;
+        } while (bytes > 0);
+        return;
+    }
+#endif // LEO_TRY_AVX2
+
+    const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m);
+    const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m);
+
+    const LEO_M128 clr_mask = _mm_set1_epi8(0x0f);
+
+    LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(vx_0);
+    LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(vx_1);
+
+    do
+    {
+        LEO_M128 x3 = _mm_loadu_si128(x16_0 + 3);
+        LEO_M128 l3 = _mm_and_si128(x3, clr_mask);
+        x3 = _mm_srli_epi64(x3, 4);
+        LEO_M128 h3 = _mm_and_si128(x3, clr_mask);
+        l3 = _mm_shuffle_epi8(table_lo_y, l3);
+        h3 = _mm_shuffle_epi8(table_hi_y, h3);
+
+        LEO_M128 x2 = _mm_loadu_si128(x16_0 + 2);
+        LEO_M128 l2 = _mm_and_si128(x2,
clr_mask);
+        x2 = _mm_srli_epi64(x2, 4);
+        LEO_M128 h2 = _mm_and_si128(x2, clr_mask);
+        l2 = _mm_shuffle_epi8(table_lo_y, l2);
+        h2 = _mm_shuffle_epi8(table_hi_y, h2);
+
+        LEO_M128 x1 = _mm_loadu_si128(x16_0 + 1);
+        LEO_M128 l1 = _mm_and_si128(x1, clr_mask);
+        x1 = _mm_srli_epi64(x1, 4);
+        LEO_M128 h1 = _mm_and_si128(x1, clr_mask);
+        l1 = _mm_shuffle_epi8(table_lo_y, l1);
+        h1 = _mm_shuffle_epi8(table_hi_y, h1);
+
+        LEO_M128 x0 = _mm_loadu_si128(x16_0);
+        LEO_M128 l0 = _mm_and_si128(x0, clr_mask);
+        x0 = _mm_srli_epi64(x0, 4);
+        LEO_M128 h0 = _mm_and_si128(x0, clr_mask);
+        l0 = _mm_shuffle_epi8(table_lo_y, l0);
+        h0 = _mm_shuffle_epi8(table_hi_y, h0);
+
+        _mm_storeu_si128(x16_0 + 3, _mm_xor_si128(l3, h3));
+        _mm_storeu_si128(x16_0 + 2, _mm_xor_si128(l2, h2));
+        _mm_storeu_si128(x16_0 + 1, _mm_xor_si128(l1, h1));
+        _mm_storeu_si128(x16_0, _mm_xor_si128(l0, h0));
+
+        // Second buffer, processed with the same tables (this resolves the
+        // earlier FIXME: the second buffer was previously left unmultiplied
+        // on this path, unlike the AVX2 path above)
+        LEO_M128 b3 = _mm_loadu_si128(x16_1 + 3);
+        LEO_M128 lb3 = _mm_and_si128(b3, clr_mask);
+        b3 = _mm_srli_epi64(b3, 4);
+        LEO_M128 hb3 = _mm_and_si128(b3, clr_mask);
+        lb3 = _mm_shuffle_epi8(table_lo_y, lb3);
+        hb3 = _mm_shuffle_epi8(table_hi_y, hb3);
+
+        LEO_M128 b2 = _mm_loadu_si128(x16_1 + 2);
+        LEO_M128 lb2 = _mm_and_si128(b2, clr_mask);
+        b2 = _mm_srli_epi64(b2, 4);
+        LEO_M128 hb2 = _mm_and_si128(b2, clr_mask);
+        lb2 = _mm_shuffle_epi8(table_lo_y, lb2);
+        hb2 = _mm_shuffle_epi8(table_hi_y, hb2);
+
+        LEO_M128 b1 = _mm_loadu_si128(x16_1 + 1);
+        LEO_M128 lb1 = _mm_and_si128(b1, clr_mask);
+        b1 = _mm_srli_epi64(b1, 4);
+        LEO_M128 hb1 = _mm_and_si128(b1, clr_mask);
+        lb1 = _mm_shuffle_epi8(table_lo_y, lb1);
+        hb1 = _mm_shuffle_epi8(table_hi_y, hb1);
+
+        LEO_M128 b0 = _mm_loadu_si128(x16_1);
+        LEO_M128 lb0 = _mm_and_si128(b0, clr_mask);
+        b0 = _mm_srli_epi64(b0, 4);
+        LEO_M128 hb0 = _mm_and_si128(b0, clr_mask);
+        lb0 = _mm_shuffle_epi8(table_lo_y, lb0);
+        hb0 = _mm_shuffle_epi8(table_hi_y, hb0);
+
+        _mm_storeu_si128(x16_1 + 3, _mm_xor_si128(lb3, hb3));
+        _mm_storeu_si128(x16_1 + 2, _mm_xor_si128(lb2, hb2));
+        _mm_storeu_si128(x16_1 + 1, _mm_xor_si128(lb1, hb1));
+        _mm_storeu_si128(x16_1, _mm_xor_si128(lb0, hb0));
+
+        x16_0 += 4;
+        x16_1 += 4;
+        bytes -= 64;
+    } while (bytes > 0);
+}
+
+
+//------------------------------------------------------------------------------
+// FFT Operations
+
+// x[] ^= y[] * m, y[] ^= x[]
+void mul_fft(
+    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
+    ffe_t m, unsigned bytes)
+{
+
+}
+
+// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
+void mul_fft2(
+    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
+    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
+    ffe_t m, unsigned bytes)
+{
+
+}
+
+// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
+void mul_fft3(
+    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
+    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
+    void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
+    ffe_t m, unsigned bytes)
+{
+
+}
+
+
+//------------------------------------------------------------------------------
+// IFFT Operations
+
+// y[] ^= x[], x[] ^= y[] * m
+void mul_ifft(
+    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
+    ffe_t m, unsigned bytes)
+{
+
+}
+
+// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
+void mul_ifft2(
+    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
+    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
+    ffe_t m, unsigned bytes)
+{
+
+}
+
+// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m
+void mul_ifft3(
+    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
+    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
+    void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2,
+    ffe_t m, unsigned bytes)
+{
+
+}
+
+
+//------------------------------------------------------------------------------
+// API
+
+static bool IsInitialized = false;
+
+bool Initialize()
+{
+    if (IsInitialized)
+        return true;
+
+    if (!CpuHasSSSE3)
+        return false;
+
+    InitializeLogarithmTables();
+
+    // The multiply tables depend on the logarithm tables and must be filled
+    // in before mul_mem_set() and friends are used
+    if (!InitializeMultiplyTables())
+        return false;
+
+    IsInitialized = true;
+    return true;
+}
+
+
+}} // namespace leopard::ff8
diff --git a/LeopardFF8.h b/LeopardFF8.h
new file mode 100644
index 0000000..1ef933b
--- /dev/null
+++ b/LeopardFF8.h
@@ -0,0 +1,157 @@
+/*
+    Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+    * Neither the name of Leopard-RS nor the names of its contributors may be
+      used to endorse or promote products derived from this software without
+      specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include "LeopardCommon.h"
+
+/*
+    8-bit Finite Field Math
+
+    This finite field contains 256 elements and so each element is one byte.
+    This library is designed for data that is a multiple of 64 bytes in size.
+*/
+
+namespace leopard { namespace ff8 {
+
+
+//------------------------------------------------------------------------------
+// Datatypes and Constants
+
+// Finite field element type
+typedef uint8_t ffe_t;
+
+// Number of bits per element
+static const unsigned kBits = 8;
+
+// Finite field order: Number of elements in the field
+static const unsigned kOrder = 256;
+
+
+//------------------------------------------------------------------------------
+// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
+
+// Define this to enable the optimized version of FWHT()
+#define LEO_FF8_FWHT_OPTIMIZED
+
+// Transform for a variable number of bits (data size up to kOrder)
+void FWHT(ffe_t* data, const unsigned bits);
+
+// Transform specialized for the finite field order
+void FWHT(ffe_t data[kOrder]);
+
+
+//------------------------------------------------------------------------------
+// XOR Memory
+
+// x[] ^= y[]
+void xor_mem(
+    void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
+    unsigned bytes);
+
+// For i = {0, 1}: x_i[] ^= y_i[]
+void xor_mem2(
+    void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
+    void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
+    unsigned bytes);
+
+// For i = {0, 1, 2}: x_i[] ^= y_i[]
+void xor_mem3(
+    void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
+    void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
+    void * LEO_RESTRICT x_2, const void * LEO_RESTRICT y_2,
+    unsigned bytes);
+
+
+//------------------------------------------------------------------------------
+// Multiplies
+
+// x[] = y[] * m
+void mul_mem_set(
+    void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
+    ffe_t m, unsigned bytes);
+
+// For i = {0, 1}: x_i[] *= m
+void mul_mem2_inplace(
+    void * LEO_RESTRICT x_0,
+    void * LEO_RESTRICT x_1,
+    ffe_t m, unsigned bytes);
+
+
+//------------------------------------------------------------------------------
+// FFT Operations
+
+// x[] ^= y[] * m, y[] ^= x[]
+void mul_fft(
+    void * LEO_RESTRICT x, void * LEO_RESTRICT y,
+    ffe_t m, unsigned bytes);
+
+// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
+void mul_fft2(
+    void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
+    void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1,
+    ffe_t m, unsigned bytes);
+
+// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[]
+void mul_fft3(
+    void * LEO_RESTRICT x_0, void *
LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, + ffe_t m, unsigned bytes); + + +//------------------------------------------------------------------------------ +// IFFT Operations + +// y[] ^= x[], x[] ^= y[] * m +void mul_ifft( + void * LEO_RESTRICT x, void * LEO_RESTRICT y, + ffe_t m, unsigned bytes); + +// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m +void mul_ifft2( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + ffe_t m, unsigned bytes); + +// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m +void mul_ifft3( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, + ffe_t m, unsigned bytes); + + +//------------------------------------------------------------------------------ +// API + +// Returns false if the self-test fails +bool Initialize(); + + +}} // namespace leopard::ff8 diff --git a/License.md b/License.md new file mode 100644 index 0000000..77d5436 --- /dev/null +++ b/License.md @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2017, Christopher A. Taylor +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index 28befd3..9a91334 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,91 @@ -# Lin-Han-Chung RS Codes -This is an attempt at implementing a fast version of the algorithm described here: +# Leopard-RS +## Leopard Reed-Solomon Error Correction Codes in C + +Leopard-RS is a portable, fast library for Forward Error Correction. +From a block of equally sized original data pieces, it generates recovery +symbols that can be used to recover lost original data. + +* It requires that data pieces are all a fixed size, a multiple of 64 bytes. +* The original and recovery data must not exceed 65536 pieces. 
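+
+For example, the piece size and piece count can be derived from the total
+data length as in the sketch below. This is illustrative only: `total_bytes`,
+`desired_pieces`, and the helper function are assumptions for the example,
+not part of the library API.
+
+```
+#include <cstdint>
+
+// Sketch: choose block parameters that satisfy the constraints above.
+// Assumes total_bytes > 0 and desired_pieces > 0.
+struct BlockParams
+{
+    unsigned buffer_bytes;    // Size of each piece: a multiple of 64 bytes
+    unsigned original_count;  // Number of original pieces
+};
+
+static BlockParams ChooseBlockParams(uint64_t total_bytes, unsigned desired_pieces)
+{
+    // Bytes per piece, rounded up so that desired_pieces covers all the data
+    uint64_t bytes_per_piece = (total_bytes + desired_pieces - 1) / desired_pieces;
+
+    // Round up to the next multiple of 64 bytes, as the library requires
+    bytes_per_piece = (bytes_per_piece + 63) & ~(uint64_t)63;
+
+    // Recompute the piece count; the last piece is zero-padded to full size
+    const unsigned original_count = (unsigned)(
+        (total_bytes + bytes_per_piece - 1) / bytes_per_piece);
+
+    return BlockParams{ (unsigned)bytes_per_piece, original_count };
+}
+```
+
+The same arithmetic appears in the `leo_encode()` documentation in `leopard.h`.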
+
+
+#### Motivation:
+
+Encoding and decoding run in O(N log N) time in the input data size, and the
+inner loops are vectorized using the best approaches available on modern
+processors, using the fastest finite fields (8-bit or 16-bit Galois fields)
+for bulk data.
+
+It sets new speed records for MDS encoding and decoding of large data.
+It is also the only open-source, production-ready software for this purpose
+available today.
+
+Example applications are data recovery software and data center replication.
+
+
+#### Encoder API:
+
+```
+#include "leopard.h"
+```
+
+For full documentation please read `leopard.h`.
+
++ `leo_init()` : Initialize library.
++ `leo_encode_work_count()` : Calculate the number of work_data buffers to provide to leo_encode().
++ `leo_encode()` : Generate recovery data.
+
+
+#### Decoder API:
+
+```
+#include "leopard.h"
+```
+
+For full documentation please read `leopard.h`.
+
++ `leo_init()` : Initialize library.
++ `leo_decode_work_count()` : Calculate the number of work_data buffers to provide to leo_decode().
++ `leo_decode()` : Recover lost original data.
+
+
+#### Benchmarks:
+
+```
+TODO
+```
+
+
+#### Comparisons:
+
+```
+TODO
+```
+
+
+#### Background
+
+This library implements an MDS erasure code introduced in this paper:
 ~~~
 S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
 "Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
 IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
 ~~~
-Available here: [http://ct.ee.ntust.edu.tw/it2016-2.pdf](http://ct.ee.ntust.edu.tw/it2016-2.pdf)
+
+The paper is available here: [http://ct.ee.ntust.edu.tw/it2016-2.pdf](http://ct.ee.ntust.edu.tw/it2016-2.pdf)
+It is also mirrored in the /docs/ folder.
+
+The high-level summary is that instead of using more complicated fields,
+the paper introduces an additive FFT that works with familiar Galois fields
+for the first time. This is a major new result that will change how
+Reed-Solomon codecs are written.
+
+My contribution is extending the ALTMAP approach from Jerasure
+for 16-bit Galois fields out to 64 bytes to enable AVX2 speedups,
+and marrying it with the row parallelism introduced by ISA-L.
+
+
+#### Credits
+
+The idea is the brain-child of S.-J. Lin. He is a super bright guy who should be recognized more widely!
+
+This software was written entirely by myself ( Christopher A. Taylor mrcatid@gmail.com ). If you find it useful and would like to buy me a coffee, consider tipping.
diff --git a/leopard.cpp b/leopard.cpp
new file mode 100644
index 0000000..5c694fd
--- /dev/null
+++ b/leopard.cpp
@@ -0,0 +1,172 @@
+/*
+ Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Leopard-RS nor the names of its contributors may be
+ used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "leopard.h"
+#include "FecalEncoder.h"
+#include "FecalDecoder.h"
+
+extern "C" {
+
+
+//------------------------------------------------------------------------------
+// Initialization API
+
+static bool m_Initialized = false;
+
+FECAL_EXPORT int fecal_init_(int version)
+{
+    if (version != FECAL_VERSION)
+        return Fecal_InvalidInput;
+
+    if (0 != gf256_init())
+        return Fecal_Platform;
+
+    m_Initialized = true;
+    return Fecal_Success;
+}
+
+
+//------------------------------------------------------------------------------
+// Encoder API
+
+FECAL_EXPORT FecalEncoder fecal_encoder_create(unsigned input_count, void* const * const input_data, uint64_t total_bytes)
+{
+    if (input_count <= 0 || !input_data || total_bytes < input_count)
+    {
+        FECAL_DEBUG_BREAK; // Invalid input
+        return nullptr;
+    }
+
+    FECAL_DEBUG_ASSERT(m_Initialized); // Must call fecal_init() first
+    if (!m_Initialized)
+        return nullptr;
+
+    fecal::Encoder* encoder = new(std::nothrow) fecal::Encoder;
+    if (!encoder)
+    {
+        FECAL_DEBUG_BREAK; // Out of memory
+        return nullptr;
+    }
+
+    if (Fecal_Success != encoder->Initialize(input_count, input_data, total_bytes))
+    {
+        delete encoder;
+        return nullptr;
+    }
+
+    return reinterpret_cast<FecalEncoder>( encoder );
+}
+
+FECAL_EXPORT int fecal_encode(FecalEncoder encoder_v, FecalSymbol* symbol)
+{
+    fecal::Encoder* encoder = reinterpret_cast<fecal::Encoder*>( encoder_v );
+    if (!encoder || !symbol)
+        return Fecal_InvalidInput;
+
+    return encoder->Encode(*symbol);
+}
+
+FECAL_EXPORT void fecal_free(void* codec_v)
+{
+    if (codec_v)
+    {
+        fecal::ICodec* icodec = reinterpret_cast<fecal::ICodec*>( codec_v );
+        delete icodec;
+    }
+}
+
+
+//------------------------------------------------------------------------------
+// Decoder API
+
+FECAL_EXPORT FecalDecoder fecal_decoder_create(unsigned input_count, uint64_t total_bytes)
+{
+    if (input_count <= 0 || total_bytes < input_count)
+    {
+        FECAL_DEBUG_BREAK; // Invalid input
+        return nullptr;
+    }
+
+    FECAL_DEBUG_ASSERT(m_Initialized); // Must call fecal_init() first
+    if (!m_Initialized)
+        return nullptr;
+
+    fecal::Decoder* decoder = new(std::nothrow) fecal::Decoder;
+    if (!decoder)
+    {
+        FECAL_DEBUG_BREAK; // Out of memory
+        return nullptr;
+    }
+
+    if (Fecal_Success != decoder->Initialize(input_count, total_bytes))
+    {
+        delete decoder;
+        return nullptr;
+    }
+
+    return reinterpret_cast<FecalDecoder>( decoder );
+}
+
+FECAL_EXPORT int fecal_decoder_add_original(FecalDecoder decoder_v, const FecalSymbol* symbol)
+{
+    fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
+    if (!decoder || !symbol)
+        return Fecal_InvalidInput;
+
+    return decoder->AddOriginal(*symbol);
+}
+
+FECAL_EXPORT int fecal_decoder_add_recovery(FecalDecoder decoder_v, const FecalSymbol* symbol)
+{
+    fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
+    if (!decoder || !symbol)
+        return Fecal_InvalidInput;
+
+    return decoder->AddRecovery(*symbol);
+}
+
+FECAL_EXPORT int fecal_decode(FecalDecoder decoder_v, RecoveredSymbols* symbols)
+{
+    fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
+    if (!decoder || !symbols)
+        return Fecal_InvalidInput;
+
+    return decoder->Decode(*symbols);
+}
+
+FECAL_EXPORT int fecal_decoder_get(FecalDecoder decoder_v, unsigned input_index, FecalSymbol* symbol)
+{
+    fecal::Decoder* decoder = reinterpret_cast<fecal::Decoder*>( decoder_v );
+    if (!decoder || !symbol)
+        return Fecal_InvalidInput;
+
+    return decoder->GetOriginal(input_index, *symbol);
+}
+
+
+} // extern "C"
diff --git a/leopard.h b/leopard.h
new file mode 100644
index 0000000..8c0e85f
--- /dev/null
+++ b/leopard.h
@@ -0,0 +1,229 @@
+/*
+ Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Leopard-RS nor the names of its contributors may be
+ used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CAT_LEOPARD_RS_H
+#define CAT_LEOPARD_RS_H
+
+/*
+ Leopard-RS: Reed-Solomon Error Correction Coding for Extremely Large Data
+
+ S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
+ "Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes"
+ IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
+ http://ct.ee.ntust.edu.tw/it2016-2.pdf
+*/
+
+// Library version
+#define LEO_VERSION 1
+
+// Tweak if the functions are exported or statically linked
+//#define LEO_DLL /* Defined when building/linking as DLL */
+//#define LEO_BUILDING /* Defined by the library makefile */
+
+#if defined(LEO_BUILDING)
+# if defined(LEO_DLL)
+    #define LEO_EXPORT __declspec(dllexport)
+# else
+    #define LEO_EXPORT
+# endif
+#else
+# if defined(LEO_DLL)
+    #define LEO_EXPORT __declspec(dllimport)
+# else
+    #define LEO_EXPORT extern
+# endif
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+//------------------------------------------------------------------------------
+// Initialization API
+
+/*
+ leo_init()
+
+ Perform static initialization for the library, verifying that the platform
+ is supported.
+
+ Returns 0 on success and other values on failure.
+*/
+
+LEO_EXPORT int leo_init_(int version);
+#define leo_init() leo_init_(LEO_VERSION)
+
+
+//------------------------------------------------------------------------------
+// Shared Constants / Datatypes
+
+// Results
+typedef enum LeopardResultT
+{
+    Leopard_Success = 0, // Operation succeeded
+
+    Leopard_TooMuchData = -1, // Buffer counts are too high
+    Leopard_InvalidBlockSize = -2, // Buffer size must be a multiple of 64 bytes
+    Leopard_InvalidInput = -3, // A function parameter was invalid
+    Leopard_Platform = -4, // Platform is unsupported
+    Leopard_OutOfMemory = -5, // Out of memory error occurred
+    Leopard_Unexpected = -6, // Unexpected error - Software bug?
+} LeopardResult;
+
+// Flags
+typedef enum LeopardFlagsT
+{
+    LeopardFlags_Defaults = 0, // Default settings
+
+    LeopardFlags_Multithreaded = 1, // Enable multiple threads
+} LeopardFlags;
+
+
+//------------------------------------------------------------------------------
+// Encoder API
+
+/*
+ leo_encode_work_count()
+
+ Calculate the number of work_data buffers to provide to leo_encode().
+
+ The sum of original_count + recovery_count must not exceed 65536.
+
+ Returns the work_count value to pass into leo_encode().
+ Returns 0 on invalid input.
+*/
+
+LEO_EXPORT unsigned leo_encode_work_count(
+    unsigned original_count,
+    unsigned recovery_count);
+
+/*
+ leo_encode()
+
+ Generate recovery data.
+
+ original_count: Number of original_data[] buffers provided.
+ recovery_count: Number of desired recovery data buffers.
+ buffer_bytes: Number of bytes in each data buffer.
+ original_data: Array of pointers to original data buffers.
+ work_count: Number of work_data[] buffers, from leo_encode_work_count().
+ work_data: Array of pointers to work data buffers.
+ flags: Flags for encoding, e.g. LeopardFlags_Multithreaded
+
+ The sum of original_count + recovery_count must not exceed 65536.
+ The buffer_bytes must be a multiple of 64.
+ Each buffer should have the same number of bytes.
+ Even the last piece must be rounded up to the block size.
+
+ Let buffer_bytes be the number of bytes in each buffer:
+
+    original_count = static_cast<unsigned>(
+        ((uint64_t)total_bytes + buffer_bytes - 1) / buffer_bytes);
+
+ Or if the number of pieces is known:
+
+    buffer_bytes = static_cast<unsigned>(
+        ((uint64_t)total_bytes + original_count - 1) / original_count);
+
+ Note that buffer_bytes must then still be rounded up to a multiple of 64.
+
+ Returns Leopard_Success on success.
+ The first set of recovery_count buffers in work_data will be the result.
+
+ Returns Leopard_TooMuchData if the data is too large.
+ Returns Leopard_InvalidBlockSize if the data is the wrong size.
+ Returns Leopard_InvalidInput on invalid input.
+ Returns other values on errors.
+*/
+LEO_EXPORT LeopardResult leo_encode(
+    unsigned buffer_bytes, // Number of bytes in each data buffer
+    unsigned original_count, // Number of original_data[] buffer pointers
+    unsigned recovery_count, // Number of recovery_data[] buffer pointers
+    unsigned work_count, // Number of work_data[] buffer pointers, from leo_encode_work_count()
+    void* const * const original_data, // Array of pointers to original data buffers
+    void** work_data, // Array of work buffers
+    unsigned flags); // Operation flags
+
+
+//------------------------------------------------------------------------------
+// Decoder API
+
+/*
+ leo_decode_work_count()
+
+ Calculate the number of work_data buffers to provide to leo_decode().
+
+ The sum of original_count + recovery_count must not exceed 65536.
+
+ Returns the work_count value to pass into leo_decode().
+ Returns 0 on invalid input.
+*/
+
+LEO_EXPORT unsigned leo_decode_work_count(
+    unsigned original_count,
+    unsigned recovery_count);
+
+/*
+ leo_decode()
+
+ Decode original data from recovery data.
+
+ buffer_bytes: Number of bytes in each data buffer.
+ original_count: Number of original_data[] buffers provided.
+ original_data: Array of pointers to original data buffers.
+ recovery_count: Number of recovery_data[] buffers provided.
+ recovery_data: Array of pointers to recovery data buffers.
+ work_count: Number of work_data[] buffers, from leo_decode_work_count().
+ work_data: Array of pointers to work data buffers.
+ flags: Flags for decoding, e.g. LeopardFlags_Multithreaded
+
+ Lost original/recovery data should be set to NULL.
+
+ The sum of recovery_count + the number of non-NULL original data must be at
+ least original_count in order to perform recovery.
+
+ Returns Leopard_Success on success.
+ Returns other values on errors.
+*/
+LEO_EXPORT LeopardResult leo_decode(
+    unsigned buffer_bytes, // Number of bytes in each data buffer
+    unsigned original_count, // Number of original_data[] buffer pointers
+    unsigned recovery_count, // Number of recovery_data[] buffer pointers
+    unsigned work_count, // Number of buffer pointers in work_data[]
+    void* const * const original_data, // Array of original data buffers
+    void* const * const recovery_data, // Array of recovery data buffers
+    void** work_data, // Array of work data buffers
+    unsigned flags); // Operation flags
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif // CAT_LEOPARD_RS_H
diff --git a/msvc/LHC_RS.sln b/proj/Leopard.sln
similarity index 53%
rename from msvc/LHC_RS.sln
rename to proj/Leopard.sln
index 4a05416..bafad8e 100644
--- a/msvc/LHC_RS.sln
+++ b/proj/Leopard.sln
@@ -1,9 +1,11 @@
 Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 14
-VisualStudioVersion = 14.0.25420.1
+# Visual Studio 15
+VisualStudioVersion = 15.0.26127.3
 MinimumVisualStudioVersion = 10.0.40219.1
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LHC_RS", "LHC_RS.vcxproj", "{32176592-2F30-4BD5-B645-EB11C8D3453E}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Leopard", "Leopard.vcxproj", "{32176592-2F30-4BD5-B645-EB11C8D3453E}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LeopardBenchmark", "..\tests\proj\Benchmark.vcxproj", "{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}"
 EndProject
 Global
 GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -21,6 +23,14 @@ Global
 {32176592-2F30-4BD5-B645-EB11C8D3453E}.Release|Win32.Build.0 = Release|Win32
 {32176592-2F30-4BD5-B645-EB11C8D3453E}.Release|x64.ActiveCfg = Release|x64
 {32176592-2F30-4BD5-B645-EB11C8D3453E}.Release|x64.Build.0 = Release|x64
+ {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|Win32.ActiveCfg = Debug|Win32
+ {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|Win32.Build.0 = Debug|Win32
+ {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|x64.ActiveCfg = Debug|x64
+ {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Debug|x64.Build.0 = Debug|x64
+ {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|Win32.ActiveCfg = Release|Win32
+ {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|Win32.Build.0 = Release|Win32
+ {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|x64.ActiveCfg = Release|x64
+ {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|x64.Build.0 = Release|x64
 EndGlobalSection
 GlobalSection(SolutionProperties) = preSolution
 HideSolutionNode = FALSE
diff --git a/proj/Leopard.vcxproj b/proj/Leopard.vcxproj
new file mode 100644
index 0000000..da9a8ad
--- /dev/null
+++ b/proj/Leopard.vcxproj
@@ -0,0
+1,193 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + + + + + + + + + + + + + + + + {32176592-2F30-4BD5-B645-EB11C8D3453E} + GF65536 + Leopard + 10.0.14393.0 + + + + StaticLibrary + true + MultiByte + v141 + + + StaticLibrary + true + MultiByte + v141 + + + StaticLibrary + false + true + MultiByte + v141 + + + StaticLibrary + false + true + MultiByte + v141 + + + + + + + + + + + + + + + + + + + Output/$(ProjectName)/$(Configuration)/$(Platform)/ + Obj/$(ProjectName)/$(Configuration)/$(Platform)/ + + + Output/$(ProjectName)/$(Configuration)/$(Platform)/ + Obj/$(ProjectName)/$(Configuration)/$(Platform)/ + + + Output/$(ProjectName)/$(Configuration)/$(Platform)/ + Obj/$(ProjectName)/$(Configuration)/$(Platform)/ + + + Output/$(ProjectName)/$(Configuration)/$(Platform)/ + Obj/$(ProjectName)/$(Configuration)/$(Platform)/ + + + + Level3 + Disabled + true + MultiThreadedDebug + _MBCS;%(PreprocessorDefinitions) + + + true + + + + + + + + + + + Level3 + Disabled + true + MultiThreadedDebug + _MBCS;%(PreprocessorDefinitions) + + + true + + + + + + + + + + + Level3 + MaxSpeed + true + true + true + AnySuitable + Speed + false + MultiThreaded + true + _MBCS;%(PreprocessorDefinitions) + + + true + true + true + + + + + + + + + + + Level3 + MaxSpeed + true + true + true + OnlyExplicitInline + Size + false + MultiThreaded + true + _MBCS;%(PreprocessorDefinitions) + + + true + true + true + + + + + + + + + + + + \ No newline at end of file diff --git a/proj/Leopard.vcxproj.filters b/proj/Leopard.vcxproj.filters new file mode 100644 index 0000000..079edb1 --- /dev/null +++ b/proj/Leopard.vcxproj.filters @@ -0,0 +1,57 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Header Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + \ No newline at end of file diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp new file mode 100644 index 0000000..2b0719f --- /dev/null +++ b/tests/benchmark.cpp @@ -0,0 +1,567 @@ +/* + Copyright (c) 2017 Christopher A. Taylor. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Leopard nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "../LeopardCommon.h"
+#include "../leopard.h"
+
+#include <cstdlib>
+#include <string>
+#include <vector>
+#include <iostream>
+using namespace std;
+
+//#define TEST_DATA_ALL_SAME
+//#define TEST_LOSE_FIRST_K_PACKETS
+
+
+//------------------------------------------------------------------------------
+// Windows
+
+#ifdef _WIN32
+    #define WIN32_LEAN_AND_MEAN
+
+    #ifndef _WINSOCKAPI_
+        #define DID_DEFINE_WINSOCKAPI
+        #define _WINSOCKAPI_
+    #endif
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #ifndef _WIN32_WINNT
+        #define _WIN32_WINNT 0x0601 /* Windows 7+ */
+    #endif
+
+    #include <windows.h>
+#endif
+
+#ifdef DID_DEFINE_WINSOCKAPI
+    #undef _WINSOCKAPI_
+    #undef DID_DEFINE_WINSOCKAPI
+#endif
+
+#ifndef _WIN32
+    #include <sys/time.h> // gettimeofday
+    #include <unistd.h> // nice
+#endif
+
+
+//------------------------------------------------------------------------------
+// Threads
+
+static bool SetCurrentThreadPriority()
+{
+#ifdef _WIN32
+    return 0 != ::SetThreadPriority(::GetCurrentThread(), THREAD_PRIORITY_ABOVE_NORMAL);
+#else
+    return -1 != nice(2);
+#endif
+}
+
+
+//------------------------------------------------------------------------------
+// Timing
+
+static uint64_t GetTimeUsec()
+{
+#ifdef _WIN32
+    LARGE_INTEGER timeStamp = {};
+    if (!::QueryPerformanceCounter(&timeStamp))
+        return 0;
+    static double PerfFrequencyInverse = 0.;
+    if (PerfFrequencyInverse == 0.)
+    {
+        LARGE_INTEGER freq = {};
+        if (!::QueryPerformanceFrequency(&freq) || freq.QuadPart == 0)
+            return 0;
+        PerfFrequencyInverse = 1000000.
/ (double)freq.QuadPart;
+    }
+    return (uint64_t)(PerfFrequencyInverse * timeStamp.QuadPart);
+#else
+    struct timeval tv;
+    gettimeofday(&tv, nullptr);
+    return 1000000 * (uint64_t)tv.tv_sec + tv.tv_usec;
+#endif // _WIN32
+}
+
+
+//------------------------------------------------------------------------------
+// PCG PRNG
+// From http://www.pcg-random.org/
+
+class PCGRandom
+{
+public:
+    inline void Seed(uint64_t y, uint64_t x = 0)
+    {
+        State = 0;
+        Inc = (y << 1u) | 1u;
+        Next();
+        State += x;
+        Next();
+    }
+
+    inline uint32_t Next()
+    {
+        const uint64_t oldstate = State;
+        State = oldstate * UINT64_C(6364136223846793005) + Inc;
+        const uint32_t xorshifted = (uint32_t)(((oldstate >> 18) ^ oldstate) >> 27);
+        const uint32_t rot = oldstate >> 59;
+        return (xorshifted >> rot) | (xorshifted << ((uint32_t)(-(int32_t)rot) & 31));
+    }
+
+    uint64_t State = 0, Inc = 0;
+};
+
+
+//------------------------------------------------------------------------------
+// Self-Checking Packet
+
+static void WriteRandomSelfCheckingPacket(PCGRandom& prng, void* packet, unsigned bytes)
+{
+    uint8_t* buffer = (uint8_t*)packet;
+#ifdef TEST_DATA_ALL_SAME
+    if (bytes != 0)
+#else
+    if (bytes < 16)
+#endif
+    {
+        LEO_DEBUG_ASSERT(bytes >= 2);
+        buffer[0] = (uint8_t)prng.Next();
+        for (unsigned i = 1; i < bytes; ++i)
+        {
+            buffer[i] = buffer[0];
+        }
+    }
+    else
+    {
+        uint32_t crc = bytes;
+        *(uint32_t*)(buffer + 4) = bytes;
+        for (unsigned i = 8; i < bytes; ++i)
+        {
+            uint8_t v = (uint8_t)prng.Next();
+            buffer[i] = v;
+            crc = (crc << 3) | (crc >> (32 - 3));
+            crc += v;
+        }
+        *(uint32_t*)buffer = crc;
+    }
+}
+
+static bool CheckPacket(const void* packet, unsigned bytes)
+{
+    const uint8_t* buffer = (const uint8_t*)packet;
+#ifdef TEST_DATA_ALL_SAME
+    if (bytes != 0)
+#else
+    if (bytes < 16)
+#endif
+    {
+        if (bytes < 2)
+            return false;
+
+        uint8_t v = buffer[0];
+        for (unsigned i = 1; i < bytes; ++i)
+        {
+            if (buffer[i] != v)
+                return false;
+        }
+    }
+    else
+    {
+        uint32_t crc = bytes;
+        uint32_t readBytes = *(const uint32_t*)(buffer + 4);
+        if (readBytes != bytes)
+            return false;
+        for (unsigned i = 8; i < bytes; ++i)
+        {
+            uint8_t v = buffer[i];
+            crc = (crc << 3) | (crc >> (32 - 3));
+            crc += v;
+        }
+        uint32_t readCRC = *(const uint32_t*)buffer;
+        if (readCRC != crc)
+            return false;
+    }
+    return true;
+}
+
+
+//------------------------------------------------------------------------------
+// FunctionTimer
+
+class FunctionTimer
+{
+public:
+    FunctionTimer(const std::string& name)
+    {
+        FunctionName = name;
+    }
+    void BeginCall()
+    {
+        LEO_DEBUG_ASSERT(t0 == 0);
+        t0 = GetTimeUsec();
+    }
+    void EndCall()
+    {
+        LEO_DEBUG_ASSERT(t0 != 0);
+        uint64_t t1 = GetTimeUsec();
+        ++Invocations;
+        TotalUsec += t1 - t0;
+        t0 = 0;
+    }
+    void Reset()
+    {
+        LEO_DEBUG_ASSERT(t0 == 0);
+        t0 = 0;
+        Invocations = 0;
+        TotalUsec = 0;
+    }
+    void Print(unsigned trials)
+    {
+        cout << FunctionName << " called " << Invocations / (float)trials << " times per trial (avg). " << TotalUsec / (double)Invocations << " usec avg for all invocations. " << TotalUsec / (float)trials << " usec (avg) of " << trials << " trials" << endl;
+    }
+
+    uint64_t t0 = 0;
+    uint64_t Invocations = 0;
+    uint64_t TotalUsec = 0;
+    std::string FunctionName;
+};
+
+
+//------------------------------------------------------------------------------
+// Utility: Deck Shuffling function
+
+/*
+ Given a PRNG, generate a deck of cards in a random order.
+ The deck will contain elements with values between 0 and count - 1.
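+
+ This is an "inside-out" Fisher-Yates shuffle: each new index ii is written
+ into a uniformly chosen slot jj among the first ii entries, and the previous
+ occupant of that slot is moved to position ii. The unrolled switch below
+ slices one 32-bit PRNG output into 8-bit (or 16-bit) pieces so a single
+ Next() call covers several swap steps.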
+*/
+
+static void ShuffleDeck16(PCGRandom &prng, uint16_t * LEO_RESTRICT deck, uint32_t count)
+{
+    deck[0] = 0;
+
+    // If we can unroll 4 times,
+    if (count <= 256)
+    {
+        for (uint32_t ii = 1;;)
+        {
+            uint32_t jj, rv = prng.Next();
+
+            // 8-bit unroll
+            switch (count - ii)
+            {
+            default:
+                jj = (uint8_t)rv % ii;
+                deck[ii] = deck[jj];
+                deck[jj] = ii;
+                ++ii;
+                jj = (uint8_t)(rv >> 8) % ii;
+                deck[ii] = deck[jj];
+                deck[jj] = ii;
+                ++ii;
+                jj = (uint8_t)(rv >> 16) % ii;
+                deck[ii] = deck[jj];
+                deck[jj] = ii;
+                ++ii;
+                jj = (uint8_t)(rv >> 24) % ii;
+                deck[ii] = deck[jj];
+                deck[jj] = ii;
+                ++ii;
+                break;
+
+            case 3:
+                jj = (uint8_t)rv % ii;
+                deck[ii] = deck[jj];
+                deck[jj] = ii;
+                ++ii;
+            case 2:
+                jj = (uint8_t)(rv >> 8) % ii;
+                deck[ii] = deck[jj];
+                deck[jj] = ii;
+                ++ii;
+            case 1:
+                jj = (uint8_t)(rv >> 16) % ii;
+                deck[ii] = deck[jj];
+                deck[jj] = ii;
+            case 0:
+                return;
+            }
+        }
+    }
+    else
+    {
+        // For each deck entry,
+        for (uint32_t ii = 1;;)
+        {
+            uint32_t jj, rv = prng.Next();
+
+            // 16-bit unroll
+            switch (count - ii)
+            {
+            default:
+                jj = (uint16_t)rv % ii;
+                deck[ii] = deck[jj];
+                deck[jj] = ii;
+                ++ii;
+                jj = (uint16_t)(rv >> 16) % ii;
+                deck[ii] = deck[jj];
+                deck[jj] = ii;
+                ++ii;
+                break;
+
+            case 1:
+                jj = (uint16_t)rv % ii;
+                deck[ii] = deck[jj];
+                deck[jj] = ii;
+            case 0:
+                return;
+            }
+        }
+    }
+}
+
+
+//------------------------------------------------------------------------------
+// SIMD-Safe Aligned Memory Allocations
+
+static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;
+
+LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
+{
+    return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1);
+}
+
+static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
+{
+    uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
+    if (!data)
+        return nullptr;
+    unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
+    data += kAlignmentBytes - offset;
+    data[-1] = (uint8_t)offset;
+    return data;
+}
+
+static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
+{
+    if (!ptr)
+        return;
+    uint8_t* data = (uint8_t*)ptr;
+    unsigned offset = data[-1];
+    if (offset >= kAlignmentBytes)
+    {
+        LEO_DEBUG_BREAK; // Should never happen
+        return;
+    }
+    data -= kAlignmentBytes - offset;
+    free(data);
+}
+
+
+//------------------------------------------------------------------------------
+// Tests
+
+struct TestParameters
+{
+    unsigned original_count = 200; // under 65536
+    unsigned recovery_count = 100; // under 65536 - original_count
+    unsigned buffer_bytes = 64000; // multiple of 64 bytes
+    unsigned loss_count = 20; // some fraction of original_count
+    unsigned seed = 0;
+    bool multithreaded = true;
+};
+
+static void BasicTest(const TestParameters& params)
+{
+    static const unsigned kTrials = 4;
+
+    std::vector<uint8_t*> original_data(params.original_count);
+
+    const unsigned encode_work_count = leo_encode_work_count(params.original_count, params.recovery_count);
+    const unsigned decode_work_count = leo_decode_work_count(params.original_count, params.recovery_count);
+
+    std::vector<uint8_t*> encode_work_data(encode_work_count);
+    std::vector<uint8_t*> decode_work_data(decode_work_count);
+
+    FunctionTimer t_mem_alloc("memory_allocation");
+    FunctionTimer t_leo_encode("leo_encode");
+    FunctionTimer t_leo_decode("leo_decode");
+    FunctionTimer t_mem_free("memory_free");
+
+    const uint64_t total_bytes = (uint64_t)params.buffer_bytes * params.original_count;
+
+    for (unsigned trial = 0; trial < kTrials; ++trial)
+    {
+        // Allocate memory:
+
+        t_mem_alloc.BeginCall();
+        for (unsigned i = 0, count = params.original_count; i < count; ++i)
+            original_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+        for (unsigned i = 0, count = encode_work_count; i < count; ++i)
+            encode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+        for (unsigned i = 0, count = decode_work_count; i < count; ++i)
+            decode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
+        t_mem_alloc.EndCall();
+
+        // Generate data:
+
+        PCGRandom prng;
+        prng.Seed(params.seed, trial);
+
+        for (unsigned i = 0; i < params.original_count; ++i)
+            WriteRandomSelfCheckingPacket(prng, original_data[i], params.buffer_bytes);
+
+        // Encode:
+
+        t_leo_encode.BeginCall();
+        LeopardResult encodeResult = leo_encode(
+            params.buffer_bytes,
+            params.original_count,
+            params.recovery_count,
+            encode_work_count,
+            (void**)&original_data[0],
+            (void**)&encode_work_data[0], // recovery data written here
+            params.multithreaded ? LeopardFlags_Multithreaded : LeopardFlags_Defaults
+        );
+        t_leo_encode.EndCall();
+
+        if (encodeResult != Leopard_Success)
+        {
+            cout << "Error: Leopard encode failed with result=" << encodeResult << endl;
+            LEO_DEBUG_BREAK;
+            return;
+        }
+
+        // Lose random original data:
+
+        std::vector<uint16_t> original_losses(params.original_count);
+        ShuffleDeck16(prng, &original_losses[0], params.original_count);
+
+        for (unsigned i = 0, count = params.loss_count; i < count; ++i)
+        {
+            const unsigned loss_index = original_losses[i];
+            SIMDSafeFree(original_data[loss_index]);
+            original_data[loss_index] = nullptr;
+        }
+
+        // Lose random recovery data:
+
+        const unsigned recovery_loss_count = params.recovery_count - params.loss_count;
+
+        std::vector<uint16_t> recovery_losses(params.recovery_count);
+        ShuffleDeck16(prng, &recovery_losses[0], params.recovery_count);
+
+        for (unsigned i = 0, count = recovery_loss_count; i < count; ++i)
+        {
+            const unsigned loss_index = recovery_losses[i];
+            SIMDSafeFree(encode_work_data[loss_index]);
+            encode_work_data[loss_index] = nullptr;
+        }
+
+        // Decode:
+
+        t_leo_decode.BeginCall();
+        LeopardResult decodeResult = leo_decode(
+            params.buffer_bytes,
+            params.original_count,
+            params.recovery_count,
+            decode_work_count,
+            (void**)&original_data[0],
+            (void**)&encode_work_data[0],
+            (void**)&decode_work_data[0],
+            params.multithreaded ?
LeopardFlags_Multithreaded : LeopardFlags_Defaults); + t_leo_decode.EndCall(); + + if (decodeResult != Leopard_Success) + { + cout << "Error: Leopard decode failed with result=" << decodeResult << endl; + LEO_DEBUG_BREAK; + return; + } + + // Free memory: + + t_mem_free.BeginCall(); + for (unsigned i = 0, count = params.original_count; i < count; ++i) + SIMDSafeFree(original_data[i]); + for (unsigned i = 0, count = encode_work_count; i < count; ++i) + SIMDSafeFree(encode_work_data[i]); + for (unsigned i = 0, count = decode_work_count; i < count; ++i) + SIMDSafeFree(decode_work_data[i]); + t_mem_free.EndCall(); + } + + t_mem_alloc.Print(kTrials); + t_leo_encode.Print(kTrials); + t_leo_decode.Print(kTrials); + t_mem_free.Print(kTrials); + + float encode_input_MBPS = total_bytes * kTrials / (float)(t_leo_encode.TotalUsec); + float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count * kTrials / (float)(t_leo_encode.TotalUsec); + float decode_input_MBPS = total_bytes * kTrials / (float)(t_leo_decode.TotalUsec); + float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count * kTrials / (float)(t_leo_decode.TotalUsec); + + cout << "Leopard Encoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << encode_input_MBPS << " MB/s, Output=" << encode_output_MBPS << " MB/s" << endl; + cout << "Leopard Decoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << decode_input_MBPS << " MB/s, Output=" << decode_output_MBPS << " MB/s" << endl << endl; +} + + +//------------------------------------------------------------------------------ +// Entrypoint + +int main(int argc, char **argv) +{ + SetCurrentThreadPriority(); + + FunctionTimer t_leo_init("leo_init"); + + t_leo_init.BeginCall(); + if (0 != leo_init()) + { + cout << "Failed to initialize" << endl; + return -1; + } + t_leo_init.EndCall(); + t_leo_init.Print(1); + + TestParameters params; + + if (argc >= 2) + params.original_count = atoi(argv[1]); + if (argc >= 3) + params.recovery_count = atoi(argv[2]); + if (argc >= 4) + params.buffer_bytes = atoi(argv[3]); + if (argc >= 5) + params.loss_count = atoi(argv[4]); + if (argc >= 6) + params.multithreaded = (atoi(argv[5]) != 0); + + cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl; + + BasicTest(params); + + getchar(); + + return 0; +} diff --git a/msvc/LHC_RS.vcxproj b/tests/proj/Benchmark.vcxproj similarity index 91% rename from msvc/LHC_RS.vcxproj rename to tests/proj/Benchmark.vcxproj index e030ced..6c008f5 100644 --- a/msvc/LHC_RS.vcxproj +++ b/tests/proj/Benchmark.vcxproj @@ -18,41 +18,38 @@ x64 - - - - {32176592-2F30-4BD5-B645-EB11C8D3453E} - GF65536 - LHC_RS - 8.1 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45} + Fecal + LeopardBenchmark + 10.0.14393.0 Application true MultiByte - v140 + v141 Application true MultiByte - v140 + v141 Application false true MultiByte - v140 + v141 Application false true MultiByte - v140 + v141 @@ -155,8 +152,8 @@ true true true - OnlyExplicitInline - Size + AnySuitable + Speed false MultiThreaded true @@ -174,6 +171,14 @@ + + + + + + {32176592-2f30-4bd5-b645-eb11c8d3453e} + + diff --git a/msvc/LHC_RS.vcxproj.filters b/tests/proj/Benchmark.vcxproj.filters similarity index 94% rename from 
msvc/LHC_RS.vcxproj.filters rename to tests/proj/Benchmark.vcxproj.filters index d265062..50a05dd 100644 --- a/msvc/LHC_RS.vcxproj.filters +++ b/tests/proj/Benchmark.vcxproj.filters @@ -15,7 +15,7 @@ - + Source Files