diff --git a/LeopardCommon.cpp b/LeopardCommon.cpp index 82bdbcf..55850bc 100644 --- a/LeopardCommon.cpp +++ b/LeopardCommon.cpp @@ -139,818 +139,233 @@ void InitializeCPUArch() } - -// vx[] += vy[] * z -static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) -{ - for (unsigned i = 0; i < symbolCount; ++i) - { - const GFSymbol a = vy[i]; - if (a == 0) - continue; - - GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); - GFSymbol value1 = GFExp[sum1]; - if ((a & 0x0f) == 0) - { - value1 = 0; - } - GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); - GFSymbol value2 = GFExp[sum2]; - if ((a & 0xf0) == 0) - { - value2 = 0; - } - GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); - GFSymbol value3 = GFExp[sum3]; - if ((a & 0x0f00) == 0) - { - value3 = 0; - } - GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); - GFSymbol value4 = GFExp[sum4]; - if ((a & 0xf000) == 0) - { - value4 = 0; - } - - vx[i] ^= value1; - vx[i] ^= value2; - vx[i] ^= value3; - vx[i] ^= value4; - } -} - -// return a*GFExp[b] over GF(2^r) -static GFSymbol mulE(GFSymbol a, GFSymbol b) -{ - if (a == 0) - return 0; - - const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); - return GFExp[sum]; -} - - //------------------------------------------------------------------------------ -// Fast Walsh-Hadamard Transform (FWHT) Mod Q -// -// Q is the maximum symbol value, e.g. 255 or 65535. +// XOR Memory -// Define this to enable the optimized version of FWHT() -#define LEO_FWHT_OPTIMIZED - -typedef GFSymbol fwht_t; - -// {a, b} = {a + b, a - b} (Mod Q) -static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) +void xor_mem( + void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, + unsigned bytes) { - const fwht_t sum = AddModQ(a, b); - const fwht_t dif = SubModQ(a, b); - a = sum; - b = dif; -} - -/* - FWHT is a minor slice of the runtime and does not grow with data size, - but I did attempt a few additional optimizations that failed: - - I've attempted to vectorize (with partial reductions) FWHT_4(data, s), - which is 70% of the algorithm, but it was slower. Left in _attic_. - - I've attempted to avoid reductions in all or parts of the FWHT. - The final modular reduction ends up being slower than the savings. - Specifically I tried doing it for the whole FWHT and also I tried - doing it just for the FWHT_2 loop in the main routine, but both - approaches are slower than partial reductions. - - Replacing word reads with wider reads does speed up the operation, but - at too high a complexity cost relative to minor perf improvement. 
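A property this transform relies on but never states: with Q = 2^16 - 1 we have 2^16 == 1 (mod Q), so applying the unnormalized FWHT twice over all kGFBits levels multiplies every entry by 2^16 == 1 — the full-field transform is its own inverse, which is why the decoder can call FWHT back-to-back with no rescaling. A standalone sanity-check sketch (ours, not part of this patch; the name FWHT_Ref is hypothetical), assembled from the reference transform and the mod-Q helpers that appear elsewhere in this diff:

    // Verifies: FWHT(FWHT(x)) == x (mod Q) for Q = 2^16 - 1
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    typedef uint16_t fwht_t;
    static const unsigned kBits = 16;
    static const unsigned kQ = 65535;

    static fwht_t AddModQ(fwht_t a, fwht_t b)
    {
        const unsigned sum = (unsigned)a + b;
        return (fwht_t)(sum + (sum >> kBits)); // partial reduction: may return Q for 0
    }

    static fwht_t SubModQ(fwht_t a, fwht_t b)
    {
        const unsigned dif = (unsigned)a - b;
        return (fwht_t)(dif + (dif >> kBits));
    }

    // Reference transform, as in this diff
    static void FWHT_Ref(fwht_t* data, unsigned bits)
    {
        const unsigned size = 1U << bits;
        for (unsigned width = 1; width < size; width <<= 1)
            for (unsigned i = 0; i < size; i += (width << 1))
                for (unsigned j = i; j < (width + i); ++j)
                {
                    const fwht_t sum = AddModQ(data[j], data[j + width]);
                    const fwht_t dif = SubModQ(data[j], data[j + width]);
                    data[j] = sum;
                    data[j + width] = dif;
                }
    }

    int main()
    {
        static fwht_t data[1 << kBits], orig[1 << kBits];
        for (unsigned i = 0; i < (1U << kBits); ++i)
            orig[i] = data[i] = (fwht_t)(rand() % kQ);
        FWHT_Ref(data, kBits);
        FWHT_Ref(data, kBits); // H*H = 2^16 * I == I (mod Q)
        for (unsigned i = 0; i < (1U << kBits); ++i)
            if (data[i] % kQ != orig[i] % kQ) // % kQ folds the alias Q back to 0
            {
                printf("involution failed at %u\n", i);
                return 1;
            }
        printf("FWHT mod-Q involution holds\n");
        return 0;
    }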
-*/ - -#ifndef LEO_FWHT_OPTIMIZED - -// Reference implementation -static void FWHT(fwht_t* data, const unsigned bits) -{ - const unsigned size = (unsigned)(1UL << bits); - for (unsigned width = 1; width < size; width <<= 1) - for (unsigned i = 0; i < size; i += (width << 1)) - for (unsigned j = i; j < (width + i); ++j) - FWHT_2(data[j], data[j + width]); -} - -#else - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; -} - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) -{ - unsigned x = 0; - fwht_t t0 = data[x]; x += s; - fwht_t t1 = data[x]; x += s; - fwht_t t2 = data[x]; x += s; - fwht_t t3 = data[x]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - unsigned y = 0; - data[y] = t0; y += s; - data[y] = t1; y += s; - data[y] = t2; y += s; - data[y] = t3; -} - -static inline void FWHT_8(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; -} - -static inline void FWHT_16(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - fwht_t t8 = data[8]; - fwht_t t9 = data[9]; - fwht_t t10 = data[10]; - fwht_t t11 = data[11]; - fwht_t t12 = data[12]; - fwht_t t13 = data[13]; - fwht_t t14 = data[14]; - fwht_t t15 = data[15]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t8, t9); - FWHT_2(t10, t11); - FWHT_2(t12, t13); - FWHT_2(t14, t15); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t8, t10); - FWHT_2(t9, t11); - FWHT_2(t12, t14); - FWHT_2(t13, t15); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - FWHT_2(t8, t12); - FWHT_2(t9, t13); - FWHT_2(t10, t14); - FWHT_2(t11, t15); - FWHT_2(t0, t8); - FWHT_2(t1, t9); - FWHT_2(t2, t10); - FWHT_2(t3, t11); - FWHT_2(t4, t12); - FWHT_2(t5, t13); - FWHT_2(t6, t14); - FWHT_2(t7, t15); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; - data[8] = t8; - data[9] = t9; - data[10] = t10; - data[11] = t11; - data[12] = t12; - data[13] = t13; - data[14] = t14; - data[15] = t15; -} - -static void FWHT_SmallData(fwht_t* data, unsigned ldn) -{ - const unsigned n = (1UL << ldn); - - if (n <= 2) - { - if (n == 2) - FWHT_2(data[0], data[1]); - return; - } - - for (unsigned ldm = ldn; ldm > 3; ldm -= 2) - { - unsigned m = (1UL << ldm); - unsigned m4 = (m >> 2); - for (unsigned r = 0; r < n; r += m) - for (unsigned j = 0; j < m4; j++) - FWHT_4(data + j + r, m4); - } - - if (ldn & 1) - { - for (unsigned i0 = 0; i0 < n; i0 += 8) - FWHT_8(data + i0); - } - else - { - for (unsigned i0 = 0; i0 < n; i0 += 4) - FWHT_4(data + i0); - } -} - -// Decimation in time (DIT) version -static void FWHT(fwht_t* data, const unsigned 
ldn) -{ - if (ldn <= 13) - { - FWHT_SmallData(data, ldn); - return; - } - - FWHT_2(data[2], data[3]); - FWHT_4(data + 4); - FWHT_8(data + 8); - FWHT_16(data + 16); - for (unsigned ldm = 5; ldm < ldn; ++ldm) - FWHT(data + (unsigned)(1UL << ldm), ldm); - - for (unsigned ldm = 0; ldm < ldn; ++ldm) - { - const unsigned mh = (1UL << ldm); - for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) - FWHT_2(data[t1], data[t2]); - } -} - -#endif - - -//------------------------------------------------------------------------------ -// Memory Buffer XOR - -static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) -{ - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); - -#if defined(LEO_TARGET_MOBILE) -# if defined(LEO_TRY_NEON) - // Handle multiples of 64 bytes - if (CpuHasNeon) - { - while (bytes >= 64) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 x1 = vld1q_u8(x16 + 1); - LEO_M128 x2 = vld1q_u8(x16 + 2); - LEO_M128 x3 = vld1q_u8(x16 + 3); - LEO_M128 y0 = vld1q_u8(y16); - LEO_M128 y1 = vld1q_u8(y16 + 1); - LEO_M128 y2 = vld1q_u8(y16 + 2); - LEO_M128 y3 = vld1q_u8(y16 + 3); - - vst1q_u8(x16, veorq_u8(x0, y0)); - vst1q_u8(x16 + 1, veorq_u8(x1, y1)); - vst1q_u8(x16 + 2, veorq_u8(x2, y2)); - vst1q_u8(x16 + 3, veorq_u8(x3, y3)); - - bytes -= 64, x16 += 4, y16 += 4; - } - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 y0 = vld1q_u8(y16); - - vst1q_u8(x16, veorq_u8(x0, y0)); - - bytes -= 16, ++x16, ++y16; - } - } - else -# endif // LEO_TRY_NEON - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); - - const unsigned count = (unsigned)bytes / 8; - for (unsigned ii = 0; ii < count; ++ii) - x8[ii] ^= y8[ii]; - - x16 = reinterpret_cast(x8 + count); - y16 = reinterpret_cast(y8 + count); - } -#else // LEO_TARGET_MOBILE -# if defined(LEO_TRY_AVX2) +#if defined(LEO_TRY_AVX2) if (CpuHasAVX2) { - LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); - const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); - - while (bytes >= 128) + LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(vx); + const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(vy); + do { - LEO_M256 x0 = _mm256_loadu_si256(x32); - LEO_M256 y0 = _mm256_loadu_si256(y32); - x0 = _mm256_xor_si256(x0, y0); - LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); - LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); - x1 = _mm256_xor_si256(x1, y1); - LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); - LEO_M256 y2 = _mm256_loadu_si256(y32 + 2); - x2 = _mm256_xor_si256(x2, y2); - LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); - LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); - x3 = _mm256_xor_si256(x3, y3); - + const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32)); + const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1)); + const LEO_M256 x2 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 2), _mm256_loadu_si256(y32 + 2)); + const LEO_M256 x3 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 3), _mm256_loadu_si256(y32 + 3)); _mm256_storeu_si256(x32, x0); _mm256_storeu_si256(x32 + 1, x1); _mm256_storeu_si256(x32 + 2, x2); _mm256_storeu_si256(x32 + 3, x3); - bytes -= 128, x32 += 4, y32 += 4; - } - - // Handle multiples of 32 bytes - while (bytes >= 32) + } while (bytes >= 128); + if (bytes > 0) { - // x[i] = x[i] xor y[i] - _mm256_storeu_si256(x32, - _mm256_xor_si256( - _mm256_loadu_si256(x32), - _mm256_loadu_si256(y32))); - - bytes 
-= 32, ++x32, ++y32; + const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32)); + const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1)); + _mm256_storeu_si256(x32, x0); + _mm256_storeu_si256(x32 + 1, x1); } - - x16 = reinterpret_cast<LEO_M128 *>(x32); - y16 = reinterpret_cast<const LEO_M128 *>(y32); + return; } - else -# endif // LEO_TRY_AVX2 +#endif // LEO_TRY_AVX2 + LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx); + const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy); + do { - while (bytes >= 64) - { - LEO_M128 x0 = _mm_loadu_si128(x16); - LEO_M128 y0 = _mm_loadu_si128(y16); - x0 = _mm_xor_si128(x0, y0); - LEO_M128 x1 = _mm_loadu_si128(x16 + 1); - LEO_M128 y1 = _mm_loadu_si128(y16 + 1); - x1 = _mm_xor_si128(x1, y1); - LEO_M128 x2 = _mm_loadu_si128(x16 + 2); - LEO_M128 y2 = _mm_loadu_si128(y16 + 2); - x2 = _mm_xor_si128(x2, y2); - LEO_M128 x3 = _mm_loadu_si128(x16 + 3); - LEO_M128 y3 = _mm_loadu_si128(y16 + 3); - x3 = _mm_xor_si128(x3, y3); - - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); - - bytes -= 64, x16 += 4, y16 += 4; - } - } -#endif // LEO_TARGET_MOBILE - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - // x[i] = x[i] xor y[i] - _mm_storeu_si128(x16, - _mm_xor_si128( - _mm_loadu_si128(x16), - _mm_loadu_si128(y16))); - - bytes -= 16, ++x16, ++y16; - } - - uint8_t * LEO_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16); - const uint8_t * LEO_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16); - - // Handle a block of 8 bytes - const unsigned eight = bytes & 8; - if (eight) - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1); - *x8 ^= *y8; - } - - // Handle a block of 4 bytes - const unsigned four = bytes & 4; - if (four) - { - uint32_t * LEO_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight); - const uint32_t * LEO_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight); - *x4 ^= *y4; - } - - // Handle final bytes - const unsigned offset = eight + four; - switch (bytes & 3) - { - case 3: x1[offset + 2] ^= y1[offset + 2]; - case 2: x1[offset + 1] ^= y1[offset + 1]; - case 1: x1[offset] ^= y1[offset]; - default: - break; - } + const LEO_M128 x0 = _mm_xor_si128(_mm_loadu_si128(x16), _mm_loadu_si128(y16)); + const LEO_M128 x1 = _mm_xor_si128(_mm_loadu_si128(x16 + 1), _mm_loadu_si128(y16 + 1)); + const LEO_M128 x2 = _mm_xor_si128(_mm_loadu_si128(x16 + 2), _mm_loadu_si128(y16 + 2)); + const LEO_M128 x3 = _mm_xor_si128(_mm_loadu_si128(x16 + 3), _mm_loadu_si128(y16 + 3)); + _mm_storeu_si128(x16, x0); + _mm_storeu_si128(x16 + 1, x1); + _mm_storeu_si128(x16 + 2, x2); + _mm_storeu_si128(x16 + 3, x3); + bytes -= 64, x16 += 4, y16 += 4; + } while (bytes > 0); } - -//------------------------------------------------------------------------------ -// Formal Derivative - -// Formal derivative of polynomial in the new basis -static void formal_derivative(GFSymbol* cos, const unsigned size) +void xor_mem2( + void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0, + void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1, + unsigned bytes) { - for (unsigned i = 1; i < size; ++i) +#if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) { - const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; - - // If a large number of values are being XORed: - if (leng >= 8) - xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); - else - for (unsigned j = i - leng; j < i; j++) - cos[j] ^= cos[j + leng]; + LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(vx_0); + const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<const LEO_M256 *>(vy_0); + LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(vx_1); + const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1); + do + { + const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); + const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); + const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2)); + const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3)); + const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); + const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); + const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2)); + const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3)); + _mm256_storeu_si256(x32_0, x0_0); + _mm256_storeu_si256(x32_0 + 1, x1_0); + _mm256_storeu_si256(x32_0 + 2, x2_0); + _mm256_storeu_si256(x32_0 + 3, x3_0); + _mm256_storeu_si256(x32_1, x0_1); + _mm256_storeu_si256(x32_1 + 1, x1_1); + _mm256_storeu_si256(x32_1 + 2, x2_1); + _mm256_storeu_si256(x32_1 + 3, x3_1); + x32_0 += 4, y32_0 += 4; + x32_1 += 4, y32_1 += 4; + bytes -= 128; + } while (bytes >= 128); + if (bytes > 0) + { + const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); + const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); + const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); + const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); + _mm256_storeu_si256(x32_0, x0_0); + _mm256_storeu_si256(x32_0 + 1, x1_0); + _mm256_storeu_si256(x32_1, x0_1); + _mm256_storeu_si256(x32_1 + 1, x1_1); + } + return; } - - for (unsigned i = size; i < kFieldSize; i <<= 1) - xor_mem(cos, cos + i, size * sizeof(GFSymbol)); +#endif // LEO_TRY_AVX2 + LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(vx_0); + const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<const LEO_M128 *>(vy_0); + LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(vx_1); + const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1); + do + { + const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0)); + const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1)); + const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2)); + const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3)); + const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1)); + const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1)); + const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2)); + const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3)); + _mm_storeu_si128(x16_0, x0_0); + _mm_storeu_si128(x16_0 + 1, x1_0); + _mm_storeu_si128(x16_0 + 2, x2_0); + _mm_storeu_si128(x16_0 + 3, x3_0); + _mm_storeu_si128(x16_1, x0_1); + _mm_storeu_si128(x16_1 + 1, x1_1); + _mm_storeu_si128(x16_1 + 2, x2_1); + _mm_storeu_si128(x16_1 + 3, x3_1); + x16_0 += 4, y16_0 += 4; + x16_1 += 4, y16_1 += 4; + bytes -= 64; + } while (bytes > 0); } -
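Note the contract change in these rewritten kernels: the old xor_mem handled arbitrary byte counts with 8/4/1-byte tails, while the new do/while loops consume 64 bytes (SSE) or 128 bytes (AVX2) per pass with a single 64-byte tail in the AVX2 path. In other words, bytes is now assumed to be a nonzero multiple of 64, and the AVX2 path additionally appears to need at least 128 bytes on its first iteration — consistent with the "Limit input to multiples of 64 bytes" item on the TODO list elsewhere in this diff. A scalar reference (ours; the name xor_mem_ref is hypothetical) is handy for cross-checking the SIMD paths on such buffers:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Byte-at-a-time reference for x[] ^= y[]
    static void xor_mem_ref(void* vx, const void* vy, unsigned bytes)
    {
        uint8_t* x = reinterpret_cast<uint8_t*>(vx);
        const uint8_t* y = reinterpret_cast<const uint8_t*>(vy);
        for (unsigned i = 0; i < bytes; ++i)
            x[i] ^= y[i];
    }

    int main()
    {
        uint8_t x[256], y[256], saved[256];
        for (unsigned i = 0; i < 256; ++i)
        {
            x[i] = (uint8_t)i;
            y[i] = (uint8_t)(i * 31 + 7);
        }
        memcpy(saved, x, 256);

        // XOR is an involution: applying it twice must restore x.
        // Swap xor_mem_ref for leopard's xor_mem (bytes = 256 satisfies
        // the multiple-of-64, >= 128 assumption) to validate the kernels.
        xor_mem_ref(x, y, 256);
        xor_mem_ref(x, y, 256);
        printf(memcmp(saved, x, 256) == 0 ? "ok\n" : "mismatch\n");
        return 0;
    }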
-//------------------------------------------------------------------------------ -// Fast Fourier Transform - -static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT - -// IFFT in the proposed basis -static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) +void xor_mem3( + void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0, + void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1, + void * LEO_RESTRICT vx_2, const void * LEO_RESTRICT vy_2, + unsigned bytes) { - for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) +#if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) + LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast<LEO_M256 *>(vx_0); + const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast<const LEO_M256 *>(vy_0); + LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast<LEO_M256 *>(vx_1); + const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast<const LEO_M256 *>(vy_1); + LEO_M256 * LEO_RESTRICT x32_2 = reinterpret_cast<LEO_M256 *>(vx_2); + const LEO_M256 * LEO_RESTRICT y32_2 = reinterpret_cast<const LEO_M256 *>(vy_2); + do { - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - } - } -} - -// FFT in the proposed basis -static void FLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) + const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); + const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); + const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2)); + const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3)); + const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); + const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); + const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2)); + const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3)); + const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2)); + const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1)); + const LEO_M256 x2_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 2), _mm256_loadu_si256(y32_2 + 2)); + const LEO_M256 x3_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 3), _mm256_loadu_si256(y32_2 + 3)); + _mm256_storeu_si256(x32_0, x0_0); + _mm256_storeu_si256(x32_0 + 1, x1_0); + _mm256_storeu_si256(x32_0 + 2, x2_0); + _mm256_storeu_si256(x32_0 + 3, x3_0); + _mm256_storeu_si256(x32_1, x0_1); + _mm256_storeu_si256(x32_1 + 1, x1_1); + _mm256_storeu_si256(x32_1 + 2, x2_1); + _mm256_storeu_si256(x32_1 + 3, x3_1); + _mm256_storeu_si256(x32_2, x0_2); + _mm256_storeu_si256(x32_2 + 1, x1_2); + _mm256_storeu_si256(x32_2 + 2, x2_2); + _mm256_storeu_si256(x32_2 + 3, x3_2); + x32_0 += 4, y32_0 += 4; + x32_1 += 4, y32_1 += 4; + x32_2 += 4, y32_2 += 4; + bytes -= 128; + } while (bytes >= 128); + if (bytes > 0) { - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; + const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); + const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); + const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); + const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); + const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2)); + const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1)); + _mm256_storeu_si256(x32_0, x0_0); + _mm256_storeu_si256(x32_0 + 1, x1_0); + _mm256_storeu_si256(x32_1, x0_1); + _mm256_storeu_si256(x32_1 + 1, x1_1); + _mm256_storeu_si256(x32_2, x0_2); + _mm256_storeu_si256(x32_2 + 1, x1_2); + } + return; } -} - - -//------------------------------------------------------------------------------ -// FFT Initialization - -static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative -static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial - -// Initialize skewVec[], B[], log_walsh[] -static void InitFieldOperations() -{ - GFSymbol temp[kGFBits - 1]; - - for (unsigned i = 1; i < kGFBits; ++i) - temp[i - 1] = (GFSymbol)((unsigned)1 << i); - - for (unsigned m = 0; m < (kGFBits - 1); ++m) +#endif // LEO_TRY_AVX2 + LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast<LEO_M128 *>(vx_0); + const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast<const LEO_M128 *>(vy_0); + LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast<LEO_M128 *>(vx_1); + const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast<const LEO_M128 *>(vy_1); + LEO_M128 * LEO_RESTRICT x16_2 = reinterpret_cast<LEO_M128 *>(vx_2); + const LEO_M128 * LEO_RESTRICT y16_2 = reinterpret_cast<const LEO_M128 *>(vy_2); + do { - const unsigned step = (unsigned)1 << (m + 1); - - skewVec[((unsigned)1 << m) - 1] = 0; - - for (unsigned i = m; i < (kGFBits - 1); ++i) - { - const unsigned s = ((unsigned)1 << (i + 1)); - - for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) - skewVec[j + s] = skewVec[j] ^ temp[i]; - } - - temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; - - for (unsigned i = m + 1; i < (kGFBits - 1); ++i) - temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); - } - - for (unsigned i = 0; i < kFieldSize; ++i) - skewVec[i] = GFLog[skewVec[i]]; - - temp[0] = kFieldModulus - temp[0]; - - for (unsigned i = 1; i < (kGFBits - 1); ++i) - temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; - - B[0] = 0; - for (unsigned i = 0; i < (kGFBits - 1); ++i) - { - const unsigned depart = ((unsigned)1 << i); - - for (unsigned j = 0; j < depart; ++j) - B[j + depart] = (B[j] + temp[i]) % kFieldModulus; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh[i] = GFLog[i]; - - log_walsh[0] = 0; - - FWHT(log_walsh, kGFBits); -} - - -//------------------------------------------------------------------------------ -// Encoder - -// Encoding alg for k/n<0.5: message is a power of two -static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) -{ - memcpy(codeword, data, sizeof(GFSymbol) * k); - - IFLT(codeword, k, 0); - - for (unsigned i = k; i <
kFieldSize; i += k) - { - memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); - - FLT(&codeword[i], k, i); - } - - memcpy(codeword, data, sizeof(GFSymbol) * k); -} - -// Encoding alg for k/n>0.5: parity is a power of two. -// data: message array. parity: parity array. mem: buffer(size>= n-k) -static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) -{ - const unsigned t = kFieldSize - k; - - memset(parity, 0, sizeof(GFSymbol) * t); - - for (unsigned i = t; i < kFieldSize; i += t) - { - memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); - - IFLT(mem, t, i); - - xor_mem(parity, mem, t * sizeof(GFSymbol)); - } - - FLT(parity, t, 0); -} - - -//------------------------------------------------------------------------------ -// Decoder - -static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) -{ - fwht_t log_walsh2[kFieldSize]; - - // Compute the evaluations of the error locator polynomial - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = erasure[i] ? 1 : 0; - - FWHT(log_walsh2, kGFBits); - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; - - FWHT(log_walsh2, kGFBits); - - // k2 can be replaced with k - const unsigned k2 = kFieldSize; - //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? - - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) - { - codeword[i] = 0; - } - else - { - codeword[i] = mulE(codeword[i], log_walsh2[i]); - } - } - - IFLT(codeword, kFieldSize, 0); - - // formal derivative - for (unsigned i = 0; i < kFieldSize; i += 2) - { - codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); - } - - formal_derivative(codeword, k2); - - for (unsigned i = 0; i < k2; i += 2) - { - codeword[i] = mulE(codeword[i], B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); - } - - FLT(codeword, k2, 0); - - for (unsigned i = 0; i < k2; ++i) - { - if (erasure[i]) - { - codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); - } - } -} - - -//------------------------------------------------------------------------------ -// Test Application - -void test(unsigned k, unsigned seed) -{ - srand(seed); - - //-----------Generating message---------- - - // Message array - GFSymbol data[kFieldSize] = {0}; - - // Filled with random numbers - for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) - data[i] = (GFSymbol)rand(); - - - //---------encoding---------- - - GFSymbol codeword[kFieldSize]; - encodeH(&data[kFieldSize - k], k, data, codeword); - //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? 
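The decode() routine above never spells out what its FWHT pair computes, so for the record: transforming the erasure indicator, multiplying pointwise by log_walsh (the transformed GFLog table), and transforming back yields — by the convolution theorem for the XOR group — log_walsh2[x] = sum over erased positions e of GFLog[x ^ e] (mod Q), i.e. the logarithm of the error-locator product over all erasures evaluated at every field point at once; the factor 2^16 from the unnormalized double transform vanishes because 2^16 == 1 (mod Q). A standalone sketch (ours, not patch code) checking that Walsh-domain pointwise products really are dyadic convolutions mod Q, at size 2^8 where the direct O(N^2) sum is cheap (there the scale factor 2^8 must be applied explicitly):

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    typedef uint16_t fwht_t;
    static const unsigned kQ = 65535; // field modulus, 2^16 - 1

    static fwht_t AddModQ(fwht_t a, fwht_t b)
    { const unsigned s = (unsigned)a + b; return (fwht_t)(s + (s >> 16)); }
    static fwht_t SubModQ(fwht_t a, fwht_t b)
    { const unsigned d = (unsigned)a - b; return (fwht_t)(d + (d >> 16)); }

    static void FWHT_Q(fwht_t* data, unsigned bits)
    {
        const unsigned size = 1U << bits;
        for (unsigned width = 1; width < size; width <<= 1)
            for (unsigned i = 0; i < size; i += (width << 1))
                for (unsigned j = i; j < (width + i); ++j)
                {
                    const fwht_t sum = AddModQ(data[j], data[j + width]);
                    const fwht_t dif = SubModQ(data[j], data[j + width]);
                    data[j] = sum; data[j + width] = dif;
                }
    }

    int main()
    {
        const unsigned bits = 8, n = 1U << bits;
        fwht_t f[256], g[256], fw[256], gw[256];
        for (unsigned i = 0; i < n; ++i)
        {
            f[i] = fw[i] = (fwht_t)(rand() % kQ);
            g[i] = gw[i] = (fwht_t)(rand() % kQ);
        }
        FWHT_Q(fw, bits);
        FWHT_Q(gw, bits);
        for (unsigned i = 0; i < n; ++i)
            fw[i] = (fwht_t)(((unsigned)fw[i] * gw[i]) % kQ);
        FWHT_Q(fw, bits); // now fw == 2^bits * (f conv g) mod Q

        for (unsigned x = 0; x < n; ++x)
        {
            unsigned direct = 0;
            for (unsigned y = 0; y < n; ++y)
                direct = (direct + (unsigned)f[y] * g[x ^ y]) % kQ;
            direct = (direct << bits) % kQ; // apply the 2^bits scale
            if (fw[x] % kQ != direct)
            { printf("convolution mismatch at %u\n", x); return 1; }
        }
        printf("Walsh-domain product == dyadic convolution (mod %u)\n", kQ);
        return 0;
    }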
- - memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); - - - //--------erasure simulation--------- - - // Array indicating erasures - bool erasure[kFieldSize] = { - false - }; - - for (unsigned i = k; i < kFieldSize; ++i) - erasure[i] = true; - - // permuting the erasure array - for (unsigned i = kFieldSize - 1; i > 0; --i) - { - unsigned pos = rand() % (i + 1); - - if (i != pos) - { - bool tmp = erasure[i]; - erasure[i] = erasure[pos]; - erasure[pos] = tmp; - } - } - - // erasure codeword symbols - for (unsigned i = 0; i < kFieldSize; ++i) - if (erasure[i]) - codeword[i] = 0; - - - //---------main processing---------- - decode(codeword, k, erasure); - - // Check the correctness of the result - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i] == 1) - { - if (data[i] != codeword[i]) - { - printf("Decoding Error with seed = %d!\n", seed); - LEO_DEBUG_BREAK; - return; - } - } - } - - //printf("Decoding is successful!\n"); -} - - -//------------------------------------------------------------------------------ -// Entrypoint - -int main(int argc, char **argv) -{ - // Initialize architecture-specific code - leo_architecture_init(); - - // Fill GFLog table and GFExp table - InitField(); - - // Compute factors used in erasure decoder - InitFieldOperations(); - - unsigned seed = (unsigned)time(NULL); - for (;;) - { - // test(int k), k: message size - /* - EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, - s.t. the number of recovery pieces is a power of two - */ - test(kFieldSize / 2, seed); - - ++seed; - } - - return 0; + const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0)); + const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1)); + const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2)); + const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3)); + const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1)); + const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1)); + const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2)); + const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3)); + const LEO_M128 x0_2 = _mm_xor_si128(_mm_loadu_si128(x16_2), _mm_loadu_si128(y16_2)); + const LEO_M128 x1_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 1), _mm_loadu_si128(y16_2 + 1)); + const LEO_M128 x2_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 2), _mm_loadu_si128(y16_2 + 2)); + const LEO_M128 x3_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 3), _mm_loadu_si128(y16_2 + 3)); + _mm_storeu_si128(x16_0, x0_0); + _mm_storeu_si128(x16_0 + 1, x1_0); + _mm_storeu_si128(x16_0 + 2, x2_0); + _mm_storeu_si128(x16_0 + 3, x3_0); + _mm_storeu_si128(x16_1, x0_1); + _mm_storeu_si128(x16_1 + 1, x1_1); + _mm_storeu_si128(x16_1 + 2, x2_1); + _mm_storeu_si128(x16_1 + 3, x3_1); + _mm_storeu_si128(x16_2, x0_2); + _mm_storeu_si128(x16_2 + 1, x1_2); + _mm_storeu_si128(x16_2 + 2, x2_2); + _mm_storeu_si128(x16_2 + 3, x3_2); + x16_0 += 4, y16_0 += 4; + x16_1 += 4, y16_1 += 4; + x16_2 += 4, y16_2 += 4; + bytes -= 64; + } while (bytes > 0); } diff --git a/LeopardCommon.h b/LeopardCommon.h index 17425c0..a737304 100644 --- a/LeopardCommon.h +++ b/LeopardCommon.h @@ -30,42 +30,20 @@ /* TODO: - + Refactor software - + I think it should be split up into several C++ modules - + Replace GFSymbol with a file data pointer - + New 16-bit Muladd inner loops - + 
Class to contain the (large) muladd tables - + Preliminary benchmarks for large data! + New 8-bit Muladd inner loops - + Benchmarks for smaller data! - + Write detailed comments for all the routines - + Look into getting EncodeL working so we can support smaller data (Ask Lin) - + Look into using k instead of k2 to speed up decoder (Ask Lin) - + Avoid performing FFT/IFFT intermediate calculations we're not going to use - + Benchmarks, fun! + + Benchmarks for smaller data! + + New 16-bit Muladd inner loops + + Benchmarks for large data! + + Use parallel row ops + Add multi-threading to split up long parallelizable calculations - + Final benchmarks! - + Finish up documentation + + Write detailed comments for all the routines + + Final benchmarks! + Release version 1 + + Finish up documentation - - Muladd implementation notes: - - Specialize for 1-3 rows at a time since often times we're multiplying by - the same (skew) value repeatedly, as the ISA-L library does here: - - https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 - - Except we should be doing it for 16-bit Galois Field. - To implement that use the ALTMAP trick from Jerasure: - - http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 - - Except we should also support AVX2 since that is a 40% perf boost, so put - the high and low bytes 32 bytes instead of 16 bytes apart. - - Also I think we should go ahead and precompute the multiply tables since - it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. + TBD: + + Look into getting EncodeL working so we can support smaller data (Ask Lin) + + Look into using FFT_m instead of FFT_n for decoder */ #include @@ -191,4 +169,57 @@ extern bool CpuHasSSSE3; #endif // LEO_TARGET_MOBILE +//------------------------------------------------------------------------------ +// Portable Intrinsics + +#ifdef _MSC_VER +#include <intrin.h> +#endif + +// Returns highest bit index 0..31 where the first non-zero bit is found +// Precondition: x != 0 +LEO_FORCE_INLINE unsigned LastNonzeroBit32(unsigned x) +{ +#ifdef _MSC_VER + unsigned long index; + // Note: Ignoring result because x != 0 + _BitScanReverse(&index, (uint32_t)x); + return (unsigned)index; +#else + // Note: Ignoring return value of 0 because x != 0 + return 31 - (unsigned)__builtin_clz(x); +#endif +} + +// Returns next power of two at or above given value +LEO_FORCE_INLINE unsigned NextPow2(unsigned n) +{ + return 2UL << LastNonzeroBit32(n - 1); +} + + +//------------------------------------------------------------------------------ +// XOR Memory +// +// This works for both 8-bit and 16-bit finite fields + +// x[] ^= y[] +void xor_mem( + void * LEO_RESTRICT x, const void * LEO_RESTRICT y, + unsigned bytes); + +// For i = {0, 1}: x_i[] ^= y_i[] +void xor_mem2( + void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1, + unsigned bytes); + +// For i = {0, 1, 2}: x_i[] ^= y_i[] +void xor_mem3( + void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1, + void * LEO_RESTRICT x_2, const void * LEO_RESTRICT y_2, + unsigned bytes); + + } // namespace leopard diff --git a/LeopardDecoder.cpp b/LeopardDecoder.cpp deleted file mode 100644 index 71d22e2..0000000 --- a/LeopardDecoder.cpp +++ /dev/null @@ -1,1220 +0,0 @@ -/* - Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of LHC-RS nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. -*/ - -#include -#include -#include -#include -#include - - -/* - TODO: - + Write C API and unit tester - + Limit input to multiples of 64 bytes - + Replace GFSymbol with a file data pointer - + New 16-bit Muladd inner loops - + Class to contain the (large) muladd tables - + Preliminary benchmarks for large data! - + New 8-bit Muladd inner loops - + Benchmarks for smaller data! - + Refactor software - + Pick a name for the software better than LEO_RS - + I think it should be split up into several C++ modules - + Write detailed comments for all the routines - + Look into getting EncodeL working so we can support smaller data (Ask Lin) - + Look into using k instead of k2 to speed up decoder (Ask Lin) - + Avoid performing FFT/IFFT intermediate calculations we're not going to use - + Benchmarks, fun! - + Add multi-threading to split up long parallelizable calculations - + Final benchmarks! - + Finish up documentation - + Release version 1 - - - Muladd implementation notes: - - Specialize for 1-3 rows at a time since often times we're multiplying by - the same (skew) value repeatedly, as the ISA-L library does here: - - https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 - - Except we should be doing it for 16-bit Galois Field. - To implement that use the ALTMAP trick from Jerasure: - - http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 - - Except we should also support AVX2 since that is a 40% perf boost, so put - the high and low bytes 32 bytes instead of 16 bytes apart. - - Also I think we should go ahead and precompute the multiply tables since - it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. 
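To make the table scheme sketched in those notes concrete: because GF(2^16) multiplication distributes over XOR, a 16-bit symbol a = a0 ^ a1 ^ a2 ^ a3 (its four 4-bit slices) satisfies a*y = a0*y ^ a1*y ^ a2*y ^ a3*y, so a fixed multiplier y needs only four 16-entry tables — exactly the shape a PSHUFB lookup (or its 32-byte AVX2 form) consumes per lane. A scalar demonstration (ours, not patch code; GFMul is a plain polynomial-basis multiply modulo the kGFPolynomial 0x1002D used in this codebase, standing in for the Cantor-basis log/exp multiply):

    #include <cstdint>
    #include <cstdio>

    // Carryless multiply reduced mod x^16 + x^5 + x^3 + x^2 + 1 (0x1002D)
    static uint16_t GFMul(uint16_t a, uint16_t b)
    {
        uint32_t prod = 0, aa = a;
        for (unsigned i = 0; i < 16; ++i, aa <<= 1)
            if (b & (1u << i))
                prod ^= aa;
        for (int bit = 30; bit >= 16; --bit)
            if (prod & (1u << bit))
                prod ^= (uint32_t)0x1002D << (bit - 16);
        return (uint16_t)prod;
    }

    int main()
    {
        const uint16_t y = 0x1234; // fixed (skew) multiplier
        uint16_t lut[4][16];
        for (unsigned slot = 0; slot < 4; ++slot)
            for (unsigned nib = 0; nib < 16; ++nib)
                lut[slot][nib] = GFMul((uint16_t)(nib << (4 * slot)), y);

        for (unsigned a16 = 0; a16 < 65536; ++a16) // exhaustive check
        {
            const uint16_t a = (uint16_t)a16;
            const uint16_t viaTables = lut[0][a & 15] ^ lut[1][(a >> 4) & 15]
                                     ^ lut[2][(a >> 8) & 15] ^ lut[3][a >> 12];
            if (viaTables != GFMul(a, y))
            { printf("mismatch at %u\n", a16); return 1; }
        }
        printf("4-bit slice tables agree with direct GF(2^16) multiply\n");
        return 0;
    }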
-*/ - - -//------------------------------------------------------------------------------ -// Debug - -// Some bugs only repro in release mode, so this can be helpful -//#define LEO_DEBUG_IN_RELEASE - -#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) - #define LEO_DEBUG - #ifdef _WIN32 - #define LEO_DEBUG_BREAK __debugbreak() - #else - #define LEO_DEBUG_BREAK __builtin_trap() - #endif - #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } -#else - #define LEO_DEBUG_BREAK ; - #define LEO_DEBUG_ASSERT(cond) ; -#endif - - -//------------------------------------------------------------------------------ -// Platform/Architecture - -#if defined(ANDROID) || defined(IOS) - #define LEO_TARGET_MOBILE -#endif // ANDROID - -#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) - #define LEO_TRY_AVX2 /* 256-bit */ - #include - #define LEO_ALIGN_BYTES 32 -#else // __AVX2__ - #define LEO_ALIGN_BYTES 16 -#endif // __AVX2__ - -#if !defined(LEO_TARGET_MOBILE) - // Note: MSVC currently only supports SSSE3 but not AVX2 - #include // SSSE3: _mm_shuffle_epi8 - #include // SSE2 -#endif // LEO_TARGET_MOBILE - -#if defined(HAVE_ARM_NEON_H) - #include -#endif // HAVE_ARM_NEON_H - -#if defined(LEO_TARGET_MOBILE) - - #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ - -# if defined(HAVE_ARM_NEON_H) - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 uint8x16_t - #define LEO_TRY_NEON -#else - #define LEO_M128 uint64_t -# endif - -#else // LEO_TARGET_MOBILE - - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 __m128i - -#endif // LEO_TARGET_MOBILE - -#ifdef LEO_TRY_AVX2 - // Compiler-specific 256-bit SIMD register keyword - #define LEO_M256 __m256i -#endif - -// Compiler-specific C++11 restrict keyword -#define LEO_RESTRICT __restrict - -// Compiler-specific force inline keyword -#ifdef _MSC_VER - #define LEO_FORCE_INLINE inline __forceinline -#else - #define LEO_FORCE_INLINE inline __attribute__((always_inline)) -#endif - -// Compiler-specific alignment keyword -// Note: Alignment only matters for ARM NEON where it should be 16 -#ifdef _MSC_VER - #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) -#else // _MSC_VER - #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) -#endif // _MSC_VER - - -//------------------------------------------------------------------------------ -// Runtime CPU Architecture Check -// -// Feature checks stolen shamelessly from -// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c - -#if defined(HAVE_ANDROID_GETCPUFEATURES) - #include -#endif - -#if defined(LEO_TRY_NEON) -# if defined(IOS) && defined(__ARM_NEON__) - // Requires iPhone 5S or newer - static const bool CpuHasNeon = true; - static const bool CpuHasNeon64 = true; -# else - // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures - static bool CpuHasNeon = false; // V6 / V7 - static bool CpuHasNeon64 = false; // 64-bit -# endif -#endif - - -#if !defined(LEO_TARGET_MOBILE) - -#ifdef _MSC_VER - #include // __cpuid - #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX -#endif - -#ifdef LEO_TRY_AVX2 -static bool CpuHasAVX2 = false; -#endif -static bool CpuHasSSSE3 = false; - -#define CPUID_EBX_AVX2 0x00000020 -#define CPUID_ECX_SSSE3 0x00000200 - -static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) -{ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) - __cpuid((int 
*) cpu_info, cpu_info_type); -#else //if defined(HAVE_CPUID) - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -# ifdef __i386__ - __asm__ __volatile__ ("pushfl; pushfl; " - "popl %0; " - "movl %0, %1; xorl %2, %0; " - "pushl %0; " - "popfl; pushfl; popl %0; popfl" : - "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : - "i" (0x200000)); - if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { - return; /* LCOV_EXCL_LINE */ - } -# endif -# ifdef __i386__ - __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# elif defined(__x86_64__) - __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# else - __asm__ __volatile__ ("cpuid" : - "=a" (cpu_info[0]), "=b" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# endif -#endif -} - -#endif // defined(LEO_TARGET_MOBILE) - - -static void leo_architecture_init() -{ -#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) - AndroidCpuFamily family = android_getCpuFamily(); - if (family == ANDROID_CPU_FAMILY_ARM) - { - if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) - CpuHasNeon = true; - } - else if (family == ANDROID_CPU_FAMILY_ARM64) - { - CpuHasNeon = true; - if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) - CpuHasNeon64 = true; - } -#endif - -#if !defined(LEO_TARGET_MOBILE) - unsigned int cpu_info[4]; - - _cpuid(cpu_info, 1); - CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); - -#if defined(LEO_TRY_AVX2) - _cpuid(cpu_info, 7); - CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); -#endif // LEO_TRY_AVX2 - -#endif // LEO_TARGET_MOBILE -} - - -//------------------------------------------------------------------------------ -// SIMD-Safe Aligned Memory Allocations - -static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; - -LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) -{ - return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); -} - -static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) -{ - uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); - if (!data) - return nullptr; - unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); - data += kAlignmentBytes - offset; - data[-1] = (uint8_t)offset; - return data; -} - -static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) -{ - if (!ptr) - return; - uint8_t* data = (uint8_t*)ptr; - unsigned offset = data[-1]; - if (offset >= kAlignmentBytes) - { - LEO_DEBUG_BREAK; // Should never happen - return; - } - data -= kAlignmentBytes - offset; - free(data); -} - - -//------------------------------------------------------------------------------ -// Field - -//#define LEO_SHORT_FIELD - -#ifdef LEO_SHORT_FIELD -typedef uint8_t GFSymbol; -static const unsigned kGFBits = 8; -static const unsigned kGFPolynomial = 0x11D; -GFSymbol kGFBasis[kGFBits] = { - 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis -}; -#else -typedef uint16_t GFSymbol; -static const unsigned kGFBits = 16; -static const unsigned kGFPolynomial = 0x1002D; -GFSymbol kGFBasis[kGFBits] = { - 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis - 0xC582, 0xED2E, 0x914C, 0x4012, - 0x6C98, 0x10D8, 0x6A72, 0xB900, - 0xFDB8, 0xFB34, 0xFF38, 0x991E -}; -#endif - -/* - Cantor Basis introduced by: - D. G. 
Cantor, "On arithmetical algorithms over finite fields", - Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. -*/ - -static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size -static const unsigned kFieldModulus = kFieldSize - 1; - -static GFSymbol GFLog[kFieldSize]; -static GFSymbol GFExp[kFieldSize]; - -// Initialize GFLog[], GFExp[] -static void InitField() -{ - unsigned state = 1; - for (unsigned i = 0; i < kFieldModulus; ++i) - { - GFExp[state] = static_cast(i); - state <<= 1; - if (state >= kFieldSize) - state ^= kGFPolynomial; - } - GFExp[0] = kFieldModulus; - - // Conversion to chosen basis: - - GFLog[0] = 0; - for (unsigned i = 0; i < kGFBits; ++i) - { - const GFSymbol basis = kGFBasis[i]; - const unsigned width = (unsigned)(1UL << i); - - for (unsigned j = 0; j < width; ++j) - GFLog[j + width] = GFLog[j] ^ basis; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - GFLog[i] = GFExp[GFLog[i]]; - - for (unsigned i = 0; i < kFieldSize; ++i) - GFExp[GFLog[i]] = i; - - GFExp[kFieldModulus] = GFExp[0]; -} - - -//------------------------------------------------------------------------------ -// Mod Q Field Operations -// -// Q is the maximum symbol value, e.g. 255 or 65535. - -// z = x + y (mod Q) -static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b) -{ - const unsigned sum = (unsigned)a + b; - - // Partial reduction step, allowing for Q to be returned - return static_cast(sum + (sum >> kGFBits)); -} - -// z = x - y (mod Q) -static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) -{ - const unsigned dif = (unsigned)a - b; - - // Partial reduction step, allowing for Q to be returned - return static_cast(dif + (dif >> kGFBits)); -} - -// vx[] += vy[] * z -static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) -{ - for (unsigned i = 0; i < symbolCount; ++i) - { - const GFSymbol a = vy[i]; - if (a == 0) - continue; - - GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); - GFSymbol value1 = GFExp[sum1]; - if ((a & 0x0f) == 0) - { - value1 = 0; - } - GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); - GFSymbol value2 = GFExp[sum2]; - if ((a & 0xf0) == 0) - { - value2 = 0; - } - GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); - GFSymbol value3 = GFExp[sum3]; - if ((a & 0x0f00) == 0) - { - value3 = 0; - } - GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); - GFSymbol value4 = GFExp[sum4]; - if ((a & 0xf000) == 0) - { - value4 = 0; - } - - vx[i] ^= value1; - vx[i] ^= value2; - vx[i] ^= value3; - vx[i] ^= value4; - } -} - -// return a*GFExp[b] over GF(2^r) -static GFSymbol mulE(GFSymbol a, GFSymbol b) -{ - if (a == 0) - return 0; - - const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); - return GFExp[sum]; -} - - -//------------------------------------------------------------------------------ -// Fast Walsh-Hadamard Transform (FWHT) Mod Q -// -// Q is the maximum symbol value, e.g. 255 or 65535. 
- -// Define this to enable the optimized version of FWHT() -#define LEO_FWHT_OPTIMIZED - -typedef GFSymbol fwht_t; - -// {a, b} = {a + b, a - b} (Mod Q) -static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) -{ - const fwht_t sum = AddModQ(a, b); - const fwht_t dif = SubModQ(a, b); - a = sum; - b = dif; -} - -/* - FWHT is a minor slice of the runtime and does not grow with data size, - but I did attempt a few additional optimizations that failed: - - I've attempted to vectorize (with partial reductions) FWHT_4(data, s), - which is 70% of the algorithm, but it was slower. Left in _attic_. - - I've attempted to avoid reductions in all or parts of the FWHT. - The final modular reduction ends up being slower than the savings. - Specifically I tried doing it for the whole FWHT and also I tried - doing it just for the FWHT_2 loop in the main routine, but both - approaches are slower than partial reductions. - - Replacing word reads with wider reads does speed up the operation, but - at too high a complexity cost relative to minor perf improvement. -*/ - -#ifndef LEO_FWHT_OPTIMIZED - -// Reference implementation -static void FWHT(fwht_t* data, const unsigned bits) -{ - const unsigned size = (unsigned)(1UL << bits); - for (unsigned width = 1; width < size; width <<= 1) - for (unsigned i = 0; i < size; i += (width << 1)) - for (unsigned j = i; j < (width + i); ++j) - FWHT_2(data[j], data[j + width]); -} - -#else - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; -} - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) -{ - unsigned x = 0; - fwht_t t0 = data[x]; x += s; - fwht_t t1 = data[x]; x += s; - fwht_t t2 = data[x]; x += s; - fwht_t t3 = data[x]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - unsigned y = 0; - data[y] = t0; y += s; - data[y] = t1; y += s; - data[y] = t2; y += s; - data[y] = t3; -} - -static inline void FWHT_8(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; -} - -static inline void FWHT_16(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - fwht_t t8 = data[8]; - fwht_t t9 = data[9]; - fwht_t t10 = data[10]; - fwht_t t11 = data[11]; - fwht_t t12 = data[12]; - fwht_t t13 = data[13]; - fwht_t t14 = data[14]; - fwht_t t15 = data[15]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t8, t9); - FWHT_2(t10, t11); - FWHT_2(t12, t13); - FWHT_2(t14, t15); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t8, t10); - FWHT_2(t9, t11); - FWHT_2(t12, t14); - FWHT_2(t13, t15); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - FWHT_2(t8, t12); - FWHT_2(t9, t13); - 
FWHT_2(t10, t14); - FWHT_2(t11, t15); - FWHT_2(t0, t8); - FWHT_2(t1, t9); - FWHT_2(t2, t10); - FWHT_2(t3, t11); - FWHT_2(t4, t12); - FWHT_2(t5, t13); - FWHT_2(t6, t14); - FWHT_2(t7, t15); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; - data[8] = t8; - data[9] = t9; - data[10] = t10; - data[11] = t11; - data[12] = t12; - data[13] = t13; - data[14] = t14; - data[15] = t15; -} - -static void FWHT_SmallData(fwht_t* data, unsigned ldn) -{ - const unsigned n = (1UL << ldn); - - if (n <= 2) - { - if (n == 2) - FWHT_2(data[0], data[1]); - return; - } - - for (unsigned ldm = ldn; ldm > 3; ldm -= 2) - { - unsigned m = (1UL << ldm); - unsigned m4 = (m >> 2); - for (unsigned r = 0; r < n; r += m) - for (unsigned j = 0; j < m4; j++) - FWHT_4(data + j + r, m4); - } - - if (ldn & 1) - { - for (unsigned i0 = 0; i0 < n; i0 += 8) - FWHT_8(data + i0); - } - else - { - for (unsigned i0 = 0; i0 < n; i0 += 4) - FWHT_4(data + i0); - } -} - -// Decimation in time (DIT) version -static void FWHT(fwht_t* data, const unsigned ldn) -{ - if (ldn <= 13) - { - FWHT_SmallData(data, ldn); - return; - } - - FWHT_2(data[2], data[3]); - FWHT_4(data + 4); - FWHT_8(data + 8); - FWHT_16(data + 16); - for (unsigned ldm = 5; ldm < ldn; ++ldm) - FWHT(data + (unsigned)(1UL << ldm), ldm); - - for (unsigned ldm = 0; ldm < ldn; ++ldm) - { - const unsigned mh = (1UL << ldm); - for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) - FWHT_2(data[t1], data[t2]); - } -} - -#endif - - -//------------------------------------------------------------------------------ -// Memory Buffer XOR - -static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) -{ - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); - -#if defined(LEO_TARGET_MOBILE) -# if defined(LEO_TRY_NEON) - // Handle multiples of 64 bytes - if (CpuHasNeon) - { - while (bytes >= 64) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 x1 = vld1q_u8(x16 + 1); - LEO_M128 x2 = vld1q_u8(x16 + 2); - LEO_M128 x3 = vld1q_u8(x16 + 3); - LEO_M128 y0 = vld1q_u8(y16); - LEO_M128 y1 = vld1q_u8(y16 + 1); - LEO_M128 y2 = vld1q_u8(y16 + 2); - LEO_M128 y3 = vld1q_u8(y16 + 3); - - vst1q_u8(x16, veorq_u8(x0, y0)); - vst1q_u8(x16 + 1, veorq_u8(x1, y1)); - vst1q_u8(x16 + 2, veorq_u8(x2, y2)); - vst1q_u8(x16 + 3, veorq_u8(x3, y3)); - - bytes -= 64, x16 += 4, y16 += 4; - } - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 y0 = vld1q_u8(y16); - - vst1q_u8(x16, veorq_u8(x0, y0)); - - bytes -= 16, ++x16, ++y16; - } - } - else -# endif // LEO_TRY_NEON - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); - - const unsigned count = (unsigned)bytes / 8; - for (unsigned ii = 0; ii < count; ++ii) - x8[ii] ^= y8[ii]; - - x16 = reinterpret_cast(x8 + count); - y16 = reinterpret_cast(y8 + count); - } -#else // LEO_TARGET_MOBILE -# if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) - { - LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); - const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); - - while (bytes >= 128) - { - LEO_M256 x0 = _mm256_loadu_si256(x32); - LEO_M256 y0 = _mm256_loadu_si256(y32); - x0 = _mm256_xor_si256(x0, y0); - LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); - LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); - x1 = _mm256_xor_si256(x1, y1); - LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); - LEO_M256 y2 = _mm256_loadu_si256(y32 + 
2); - x2 = _mm256_xor_si256(x2, y2); - LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); - LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); - x3 = _mm256_xor_si256(x3, y3); - - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - _mm256_storeu_si256(x32 + 2, x2); - _mm256_storeu_si256(x32 + 3, x3); - - bytes -= 128, x32 += 4, y32 += 4; - } - - // Handle multiples of 32 bytes - while (bytes >= 32) - { - // x[i] = x[i] xor y[i] - _mm256_storeu_si256(x32, - _mm256_xor_si256( - _mm256_loadu_si256(x32), - _mm256_loadu_si256(y32))); - - bytes -= 32, ++x32, ++y32; - } - - x16 = reinterpret_cast(x32); - y16 = reinterpret_cast(y32); - } - else -# endif // LEO_TRY_AVX2 - { - while (bytes >= 64) - { - LEO_M128 x0 = _mm_loadu_si128(x16); - LEO_M128 y0 = _mm_loadu_si128(y16); - x0 = _mm_xor_si128(x0, y0); - LEO_M128 x1 = _mm_loadu_si128(x16 + 1); - LEO_M128 y1 = _mm_loadu_si128(y16 + 1); - x1 = _mm_xor_si128(x1, y1); - LEO_M128 x2 = _mm_loadu_si128(x16 + 2); - LEO_M128 y2 = _mm_loadu_si128(y16 + 2); - x2 = _mm_xor_si128(x2, y2); - LEO_M128 x3 = _mm_loadu_si128(x16 + 3); - LEO_M128 y3 = _mm_loadu_si128(y16 + 3); - x3 = _mm_xor_si128(x3, y3); - - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); - - bytes -= 64, x16 += 4, y16 += 4; - } - } -#endif // LEO_TARGET_MOBILE - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - // x[i] = x[i] xor y[i] - _mm_storeu_si128(x16, - _mm_xor_si128( - _mm_loadu_si128(x16), - _mm_loadu_si128(y16))); - - bytes -= 16, ++x16, ++y16; - } - - uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); - const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); - - // Handle a block of 8 bytes - const unsigned eight = bytes & 8; - if (eight) - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); - *x8 ^= *y8; - } - - // Handle a block of 4 bytes - const unsigned four = bytes & 4; - if (four) - { - uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); - const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); - *x4 ^= *y4; - } - - // Handle final bytes - const unsigned offset = eight + four; - switch (bytes & 3) - { - case 3: x1[offset + 2] ^= y1[offset + 2]; - case 2: x1[offset + 1] ^= y1[offset + 1]; - case 1: x1[offset] ^= y1[offset]; - default: - break; - } -} - - -//------------------------------------------------------------------------------ -// Formal Derivative - -// Formal derivative of polynomial in the new basis -static void formal_derivative(GFSymbol* cos, const unsigned size) -{ - for (unsigned i = 1; i < size; ++i) - { - const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; - - // If a large number of values are being XORed: - if (leng >= 8) - xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); - else - for (unsigned j = i - leng; j < i; j++) - cos[j] ^= cos[j + leng]; - } - - for (unsigned i = size; i < kFieldSize; i <<= 1) - xor_mem(cos, cos + i, size * sizeof(GFSymbol)); -} - - -//------------------------------------------------------------------------------ -// Fast Fourier Transform - -static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT - -// IFFT in the proposed basis -static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - 
depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - } - } -} - -// FFT in the proposed basis -static void FLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - } - } -} - - -//------------------------------------------------------------------------------ -// FFT Initialization - -static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative -static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial - -// Initialize skewVec[], B[], log_walsh[] -static void InitFieldOperations() -{ - GFSymbol temp[kGFBits - 1]; - - for (unsigned i = 1; i < kGFBits; ++i) - temp[i - 1] = (GFSymbol)((unsigned)1 << i); - - for (unsigned m = 0; m < (kGFBits - 1); ++m) - { - const unsigned step = (unsigned)1 << (m + 1); - - skewVec[((unsigned)1 << m) - 1] = 0; - - for (unsigned i = m; i < (kGFBits - 1); ++i) - { - const unsigned s = ((unsigned)1 << (i + 1)); - - for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) - skewVec[j + s] = skewVec[j] ^ temp[i]; - } - - temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; - - for (unsigned i = m + 1; i < (kGFBits - 1); ++i) - temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); - } - - for (unsigned i = 0; i < kFieldSize; ++i) - skewVec[i] = GFLog[skewVec[i]]; - - temp[0] = kFieldModulus - temp[0]; - - for (unsigned i = 1; i < (kGFBits - 1); ++i) - temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; - - B[0] = 0; - for (unsigned i = 0; i < (kGFBits - 1); ++i) - { - const unsigned depart = ((unsigned)1 << i); - - for (unsigned j = 0; j < depart; ++j) - B[j + depart] = (B[j] + temp[i]) % kFieldModulus; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh[i] = GFLog[i]; - - log_walsh[0] = 0; - - FWHT(log_walsh, kGFBits); -} - - -//------------------------------------------------------------------------------ -// Encoder - -// Encoding alg for k/n<0.5: message is a power of two -static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) -{ - memcpy(codeword, data, sizeof(GFSymbol) * k); - - IFLT(codeword, k, 0); - - for (unsigned i = k; i < kFieldSize; i += k) - { - memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); - - FLT(&codeword[i], k, i); - } - - memcpy(codeword, data, sizeof(GFSymbol) * k); -} - -// Encoding alg for k/n>0.5: parity is a power of two. -// data: message array. parity: parity array. 
mem: scratch buffer (size >= n - k) -static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) -{ - const unsigned t = kFieldSize - k; - - memset(parity, 0, sizeof(GFSymbol) * t); - - for (unsigned i = t; i < kFieldSize; i += t) - { - memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); - - IFLT(mem, t, i); - - xor_mem(parity, mem, t * sizeof(GFSymbol)); - } - - FLT(parity, t, 0); -} - - -//------------------------------------------------------------------------------ -// Decoder - -static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) -{ - fwht_t log_walsh2[kFieldSize]; // Note: 128 KB on the stack for the 16-bit field - - // Compute the evaluations of the error locator polynomial - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = erasure[i] ? 1 : 0; - - FWHT(log_walsh2, kGFBits); - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; - - FWHT(log_walsh2, kGFBits); - - // Ideally k2 would be k, but the transforms below still assume the full field size - const unsigned k2 = kFieldSize; - //const unsigned k2 = k; // does not work yet; unclear what else needs to change - - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) - { - codeword[i] = 0; - } - else - { - codeword[i] = mulE(codeword[i], log_walsh2[i]); - } - } - - IFLT(codeword, kFieldSize, 0); - - // Formal derivative - for (unsigned i = 0; i < kFieldSize; i += 2) - { - codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); - } - - formal_derivative(codeword, k2); - - for (unsigned i = 0; i < k2; i += 2) - { - codeword[i] = mulE(codeword[i], B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); - } - - FLT(codeword, k2, 0); - - for (unsigned i = 0; i < k2; ++i) - { - if (erasure[i]) - { - codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); - } - } -} - - -//------------------------------------------------------------------------------ -// Test Application - -void test(unsigned k, unsigned seed) -{ - srand(seed); - - //-----------Generating message---------- - - // Message array - GFSymbol data[kFieldSize] = {0}; - - // Fill with random numbers - for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) - data[i] = (GFSymbol)rand(); - - - //---------encoding---------- - - GFSymbol codeword[kFieldSize]; - encodeH(&data[kFieldSize - k], k, data, codeword); - //encodeL(data, k, codeword); // does not seem to work with any input; unclear what else needs to change
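One constraint at this call site is easy to miss: encodeH() only works when the parity count t = kFieldSize - k is a nonzero power of two, since its i += t walk must tile the field exactly and IFLT() only handles power-of-two block sizes (the comment in main() says the same). A minimal guard, sketched here with a hypothetical helper name that is not part of this code, could make the contract explicit:

    // Hypothetical guard (not in the original source): verify that the
    // number of recovery symbols t = kFieldSize - k is a nonzero power
    // of two before calling encodeH(), which silently assumes it.
    static bool IsEncodeHSupported(unsigned k)
    {
        const unsigned t = kFieldSize - k;
        return t != 0 && (t & (t - 1)) == 0;
    }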
- - memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); - - - //--------erasure simulation--------- - - // Array indicating erasures - bool erasure[kFieldSize] = { - false - }; - - for (unsigned i = k; i < kFieldSize; ++i) - erasure[i] = true; - - // Permute the erasure array with a Fisher-Yates shuffle - for (unsigned i = kFieldSize - 1; i > 0; --i) - { - unsigned pos = rand() % (i + 1); - - if (i != pos) - { - bool tmp = erasure[i]; - erasure[i] = erasure[pos]; - erasure[pos] = tmp; - } - } - - // Zero the erased codeword symbols - for (unsigned i = 0; i < kFieldSize; ++i) - if (erasure[i]) - codeword[i] = 0; - - - //---------main processing---------- - decode(codeword, k, erasure); - - // Check the correctness of the result - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) - { - if (data[i] != codeword[i]) - { - printf("Decoding Error with seed = %u!\n", seed); - LEO_DEBUG_BREAK; - return; - } - } - } - - //printf("Decoding is successful!\n"); -} - - -//------------------------------------------------------------------------------ -// Entrypoint - -int main(int argc, char **argv) -{ - // Initialize architecture-specific code - leo_architecture_init(); - - // Fill GFLog table and GFExp table - InitField(); - - // Compute factors used in erasure decoder - InitFieldOperations(); - - unsigned seed = (unsigned)time(NULL); - for (;;) - { - // test(k, seed); k is the message size - /* - EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, - s.t. the number of recovery pieces is a power of two - */ - test(kFieldSize / 2, seed); - - ++seed; - } - - return 0; -} diff --git a/LeopardDecoder.h b/LeopardDecoder.h deleted file mode 100644 index 71d22e2..0000000 --- a/LeopardDecoder.h +++ /dev/null @@ -1,1220 +0,0 @@ -/* - Copyright (c) 2017 Christopher A. Taylor. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of LHC-RS nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE.
-*/ - -#include <stdint.h> // uint8_t, uint16_t -#include <stdio.h> // printf -#include <stdlib.h> // rand, srand -#include <string.h> // memcpy, memset -#include <time.h> // time - - -/* - TODO: - + Write C API and unit tester - + Limit input to multiples of 64 bytes - + Replace GFSymbol with a file data pointer - + New 16-bit Muladd inner loops - + Class to contain the (large) muladd tables - + Preliminary benchmarks for large data! - + New 8-bit Muladd inner loops - + Benchmarks for smaller data! - + Refactor software - + Pick a name for the software better than LEO_RS - + I think it should be split up into several C++ modules - + Write detailed comments for all the routines - + Look into getting EncodeL working so we can support smaller data (Ask Lin) - + Look into using k instead of k2 to speed up decoder (Ask Lin) - + Avoid performing FFT/IFFT intermediate calculations we're not going to use - + Benchmarks, fun! - + Add multi-threading to split up long parallelizable calculations - + Final benchmarks! - + Finish up documentation - + Release version 1 - - - Muladd implementation notes: - - Specialize for 1-3 rows at a time since oftentimes we're multiplying by - the same (skew) value repeatedly, as the ISA-L library does here: - - https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 - - Except we should be doing it for a 16-bit Galois field. - To implement that, use the ALTMAP trick from Jerasure: - - http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 - - Except we should also support AVX2 since that is a 40% perf boost, so put - the high and low bytes 32 bytes instead of 16 bytes apart. - - Also I think we should go ahead and precompute the multiply tables since - it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. -*/ - - -//------------------------------------------------------------------------------ -// Debug - -// Some bugs only repro in release mode, so this can be helpful -//#define LEO_DEBUG_IN_RELEASE - -#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) - #define LEO_DEBUG - #ifdef _WIN32 - #define LEO_DEBUG_BREAK __debugbreak() - #else - #define LEO_DEBUG_BREAK __builtin_trap() - #endif - #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } -#else - #define LEO_DEBUG_BREAK ; - #define LEO_DEBUG_ASSERT(cond) ; -#endif - - -//------------------------------------------------------------------------------ -// Platform/Architecture - -#if defined(ANDROID) || defined(IOS) - #define LEO_TARGET_MOBILE -#endif // ANDROID - -#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) - #define LEO_TRY_AVX2 /* 256-bit */ - #include <immintrin.h> // AVX2 - #define LEO_ALIGN_BYTES 32 -#else // __AVX2__ - #define LEO_ALIGN_BYTES 16 -#endif // __AVX2__ - -#if !defined(LEO_TARGET_MOBILE) - // Note: MSVC currently only supports SSSE3 but not AVX2 - #include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8 - #include <emmintrin.h> // SSE2 -#endif // LEO_TARGET_MOBILE - -#if defined(HAVE_ARM_NEON_H) - #include <arm_neon.h> -#endif // HAVE_ARM_NEON_H - -#if defined(LEO_TARGET_MOBILE) - - #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ - -# if defined(HAVE_ARM_NEON_H) - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 uint8x16_t - #define LEO_TRY_NEON -#else - #define LEO_M128 uint64_t -# endif - -#else // LEO_TARGET_MOBILE - - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 __m128i - -#endif // LEO_TARGET_MOBILE - -#ifdef LEO_TRY_AVX2 - // Compiler-specific 256-bit SIMD register keyword - #define LEO_M256 __m256i -#endif - -// Compiler-specific restrict keyword (a vendor extension, not standard C++) -#define LEO_RESTRICT __restrict
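A slightly more defensive definition of this macro could compile the hint away on unknown toolchains. This is a sketch under the assumption that MSVC, GCC and Clang are the compilers that matter here; all three accept the __restrict spelling:

    // Sketch: keep the aliasing hint where it is known to work,
    // and degrade to a no-op macro elsewhere.
    #if defined(_MSC_VER) || defined(__GNUC__) || defined(__clang__)
        #define LEO_RESTRICT __restrict
    #else
        #define LEO_RESTRICT
    #endif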
- -// Compiler-specific force inline keyword -#ifdef _MSC_VER - #define LEO_FORCE_INLINE inline __forceinline -#else - #define LEO_FORCE_INLINE inline __attribute__((always_inline)) -#endif - -// Compiler-specific alignment keyword -// Note: Alignment only matters for ARM NEON where it should be 16 -#ifdef _MSC_VER - #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) -#else // _MSC_VER - #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) -#endif // _MSC_VER - - -//------------------------------------------------------------------------------ -// Runtime CPU Architecture Check -// -// Feature checks stolen shamelessly from -// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c - -#if defined(HAVE_ANDROID_GETCPUFEATURES) - #include -#endif - -#if defined(LEO_TRY_NEON) -# if defined(IOS) && defined(__ARM_NEON__) - // Requires iPhone 5S or newer - static const bool CpuHasNeon = true; - static const bool CpuHasNeon64 = true; -# else - // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures - static bool CpuHasNeon = false; // V6 / V7 - static bool CpuHasNeon64 = false; // 64-bit -# endif -#endif - - -#if !defined(LEO_TARGET_MOBILE) - -#ifdef _MSC_VER - #include // __cpuid - #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX -#endif - -#ifdef LEO_TRY_AVX2 -static bool CpuHasAVX2 = false; -#endif -static bool CpuHasSSSE3 = false; - -#define CPUID_EBX_AVX2 0x00000020 -#define CPUID_ECX_SSSE3 0x00000200 - -static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) -{ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) - __cpuid((int *) cpu_info, cpu_info_type); -#else //if defined(HAVE_CPUID) - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -# ifdef __i386__ - __asm__ __volatile__ ("pushfl; pushfl; " - "popl %0; " - "movl %0, %1; xorl %2, %0; " - "pushl %0; " - "popfl; pushfl; popl %0; popfl" : - "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : - "i" (0x200000)); - if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { - return; /* LCOV_EXCL_LINE */ - } -# endif -# ifdef __i386__ - __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# elif defined(__x86_64__) - __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# else - __asm__ __volatile__ ("cpuid" : - "=a" (cpu_info[0]), "=b" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# endif -#endif -} - -#endif // defined(LEO_TARGET_MOBILE) - - -static void leo_architecture_init() -{ -#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) - AndroidCpuFamily family = android_getCpuFamily(); - if (family == ANDROID_CPU_FAMILY_ARM) - { - if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) - CpuHasNeon = true; - } - else if (family == ANDROID_CPU_FAMILY_ARM64) - { - CpuHasNeon = true; - if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) - CpuHasNeon64 = true; - } -#endif - -#if !defined(LEO_TARGET_MOBILE) - unsigned int cpu_info[4]; - - _cpuid(cpu_info, 1); - CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); - -#if defined(LEO_TRY_AVX2) - _cpuid(cpu_info, 7); - CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); -#endif // LEO_TRY_AVX2 - -#endif // 
LEO_TARGET_MOBILE -} - - -//------------------------------------------------------------------------------ -// SIMD-Safe Aligned Memory Allocations - -static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; - -LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) -{ - return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); -} - -static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) -{ - uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); - if (!data) - return nullptr; - unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); - data += kAlignmentBytes - offset; - // Record the shift in the byte just before the returned pointer so SIMDSafeFree() can undo it - data[-1] = (uint8_t)offset; - return data; -} - -static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) -{ - if (!ptr) - return; - uint8_t* data = (uint8_t*)ptr; - unsigned offset = data[-1]; - if (offset >= kAlignmentBytes) - { - LEO_DEBUG_BREAK; // Should never happen - return; - } - data -= kAlignmentBytes - offset; - free(data); -} - - -//------------------------------------------------------------------------------ -// Field - -//#define LEO_SHORT_FIELD - -#ifdef LEO_SHORT_FIELD -typedef uint8_t GFSymbol; -static const unsigned kGFBits = 8; -static const unsigned kGFPolynomial = 0x11D; -GFSymbol kGFBasis[kGFBits] = { - 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis -}; -#else -typedef uint16_t GFSymbol; -static const unsigned kGFBits = 16; -static const unsigned kGFPolynomial = 0x1002D; -GFSymbol kGFBasis[kGFBits] = { - 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis - 0xC582, 0xED2E, 0x914C, 0x4012, - 0x6C98, 0x10D8, 0x6A72, 0xB900, - 0xFDB8, 0xFB34, 0xFF38, 0x991E -}; -#endif - -/* - Cantor Basis introduced by: - D. G. Cantor, "On arithmetical algorithms over finite fields", - Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. -*/ - -static const unsigned kFieldSize = (unsigned)1 << kGFBits; // Field size -static const unsigned kFieldModulus = kFieldSize - 1; - -static GFSymbol GFLog[kFieldSize]; -static GFSymbol GFExp[kFieldSize]; - -// Initialize GFLog[], GFExp[] -static void InitField() -{ - unsigned state = 1; - for (unsigned i = 0; i < kFieldModulus; ++i) - { - GFExp[state] = static_cast<GFSymbol>(i); - state <<= 1; - if (state >= kFieldSize) - state ^= kGFPolynomial; - } - GFExp[0] = kFieldModulus; - - // Conversion to the chosen basis: - - GFLog[0] = 0; - for (unsigned i = 0; i < kGFBits; ++i) - { - const GFSymbol basis = kGFBasis[i]; - const unsigned width = (unsigned)(1UL << i); - - for (unsigned j = 0; j < width; ++j) - GFLog[j + width] = GFLog[j] ^ basis; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - GFLog[i] = GFExp[GFLog[i]]; - - for (unsigned i = 0; i < kFieldSize; ++i) - GFExp[GFLog[i]] = i; - - GFExp[kFieldModulus] = GFExp[0]; -} - - -//------------------------------------------------------------------------------ -// Mod Q Field Operations -// -// Q is the maximum symbol value, e.g. 255 or 65535.
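The helpers below reduce modulo Q = 2^kGFBits - 1 with a shift rather than a divide: since 2^kGFBits is congruent to 1 mod Q, the overflow bit of a (kGFBits+1)-bit intermediate can simply be folded back in as +1. The result can come out as exactly Q instead of 0, which is why the comments call it a partial reduction; the log/exp tables are laid out to tolerate an index of Q. A worked example of the addition path, with illustrative values in the 16-bit field:

    // AddModQ(60000, 30000) with kGFBits = 16, Q = 65535:
    //   sum       = 90000 (needs 17 bits)
    //   sum >> 16 = 1, and 2^16 mod 65535 = 1, so the carry folds in as +1
    //   sum + 1   = 90001; truncating to 16 bits subtracts 65536, giving 24465
    //   check: 90000 mod 65535 = 24465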
- -// Returns a + b (mod Q) -static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b) -{ - const unsigned sum = (unsigned)a + b; - - // Partial reduction step, allowing for Q to be returned - return static_cast<GFSymbol>(sum + (sum >> kGFBits)); -} - -// Returns a - b (mod Q) -static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) -{ - const unsigned dif = (unsigned)a - b; - - // Partial reduction step, allowing for Q to be returned - return static_cast<GFSymbol>(dif + (dif >> kGFBits)); -} - -// vx[] += vy[] * GFExp[z] (the multiplier is passed in log form) -static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) -{ - for (unsigned i = 0; i < symbolCount; ++i) - { - const GFSymbol a = vy[i]; - if (a == 0) - continue; - - // Split a into four nibble components; multiplication by a fixed element - // is GF(2)-linear, so the partial products XOR together to a * GFExp[z] - GFSymbol sum1 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f], z)); - GFSymbol value1 = GFExp[sum1]; - if ((a & 0x0f) == 0) - { - value1 = 0; - } - GFSymbol sum2 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf0], z)); - GFSymbol value2 = GFExp[sum2]; - if ((a & 0xf0) == 0) - { - value2 = 0; - } - GFSymbol sum3 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0x0f00], z)); - GFSymbol value3 = GFExp[sum3]; - if ((a & 0x0f00) == 0) - { - value3 = 0; - } - GFSymbol sum4 = static_cast<GFSymbol>(AddModQ(GFLog[a & 0xf000], z)); - GFSymbol value4 = GFExp[sum4]; - if ((a & 0xf000) == 0) - { - value4 = 0; - } - - vx[i] ^= value1; - vx[i] ^= value2; - vx[i] ^= value3; - vx[i] ^= value4; - } -} - -// Returns a * GFExp[b] over GF(2^r) -static GFSymbol mulE(GFSymbol a, GFSymbol b) -{ - if (a == 0) - return 0; - - const GFSymbol sum = static_cast<GFSymbol>(AddModQ(GFLog[a], b)); - return GFExp[sum]; -} - - -//------------------------------------------------------------------------------ -// Fast Walsh-Hadamard Transform (FWHT) Mod Q -// -// Q is the maximum symbol value, e.g. 255 or 65535. - -// Define this to enable the optimized version of FWHT() -#define LEO_FWHT_OPTIMIZED - -typedef GFSymbol fwht_t; - -// {a, b} = {a + b, a - b} (Mod Q) -static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) -{ - const fwht_t sum = AddModQ(a, b); - const fwht_t dif = SubModQ(a, b); - a = sum; - b = dif; -} - -/* - FWHT is a minor slice of the runtime and does not grow with data size, - but I did attempt a few additional optimizations that failed: - - I've attempted to vectorize (with partial reductions) FWHT_4(data, s), - which is 70% of the algorithm, but it was slower. Left in _attic_. - - I've attempted to avoid reductions in all or parts of the FWHT. - The final modular reduction ends up being slower than the savings. - Specifically I tried doing it for the whole FWHT and also I tried - doing it just for the FWHT_2 loop in the main routine, but both - approaches are slower than partial reductions. - - Replacing word reads with wider reads does speed up the operation, but - at too high a complexity cost relative to minor perf improvement.
-*/ - -#ifndef LEO_FWHT_OPTIMIZED - -// Reference implementation -static void FWHT(fwht_t* data, const unsigned bits) -{ - const unsigned size = (unsigned)(1UL << bits); - for (unsigned width = 1; width < size; width <<= 1) - for (unsigned i = 0; i < size; i += (width << 1)) - for (unsigned j = i; j < (width + i); ++j) - FWHT_2(data[j], data[j + width]); -} - -#else - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; -} - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) -{ - unsigned x = 0; - fwht_t t0 = data[x]; x += s; - fwht_t t1 = data[x]; x += s; - fwht_t t2 = data[x]; x += s; - fwht_t t3 = data[x]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - unsigned y = 0; - data[y] = t0; y += s; - data[y] = t1; y += s; - data[y] = t2; y += s; - data[y] = t3; -} - -static inline void FWHT_8(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; -} - -static inline void FWHT_16(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - fwht_t t8 = data[8]; - fwht_t t9 = data[9]; - fwht_t t10 = data[10]; - fwht_t t11 = data[11]; - fwht_t t12 = data[12]; - fwht_t t13 = data[13]; - fwht_t t14 = data[14]; - fwht_t t15 = data[15]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t8, t9); - FWHT_2(t10, t11); - FWHT_2(t12, t13); - FWHT_2(t14, t15); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t8, t10); - FWHT_2(t9, t11); - FWHT_2(t12, t14); - FWHT_2(t13, t15); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - FWHT_2(t8, t12); - FWHT_2(t9, t13); - FWHT_2(t10, t14); - FWHT_2(t11, t15); - FWHT_2(t0, t8); - FWHT_2(t1, t9); - FWHT_2(t2, t10); - FWHT_2(t3, t11); - FWHT_2(t4, t12); - FWHT_2(t5, t13); - FWHT_2(t6, t14); - FWHT_2(t7, t15); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; - data[8] = t8; - data[9] = t9; - data[10] = t10; - data[11] = t11; - data[12] = t12; - data[13] = t13; - data[14] = t14; - data[15] = t15; -} - -static void FWHT_SmallData(fwht_t* data, unsigned ldn) -{ - const unsigned n = (1UL << ldn); - - if (n <= 2) - { - if (n == 2) - FWHT_2(data[0], data[1]); - return; - } - - for (unsigned ldm = ldn; ldm > 3; ldm -= 2) - { - unsigned m = (1UL << ldm); - unsigned m4 = (m >> 2); - for (unsigned r = 0; r < n; r += m) - for (unsigned j = 0; j < m4; j++) - FWHT_4(data + j + r, m4); - } - - if (ldn & 1) - { - for (unsigned i0 = 0; i0 < n; i0 += 8) - FWHT_8(data + i0); - } - else - { - for (unsigned i0 = 0; i0 < n; i0 += 4) - FWHT_4(data + i0); - } -} - -// Decimation in time (DIT) version -static void FWHT(fwht_t* data, const unsigned 
ldn) -{ - if (ldn <= 13) - { - FWHT_SmallData(data, ldn); - return; - } - - FWHT_2(data[2], data[3]); - FWHT_4(data + 4); - FWHT_8(data + 8); - FWHT_16(data + 16); - for (unsigned ldm = 5; ldm < ldn; ++ldm) - FWHT(data + (unsigned)(1UL << ldm), ldm); - - for (unsigned ldm = 0; ldm < ldn; ++ldm) - { - const unsigned mh = (1UL << ldm); - for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) - FWHT_2(data[t1], data[t2]); - } -} - -#endif - - -//------------------------------------------------------------------------------ -// Memory Buffer XOR - -static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) -{ - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); - -#if defined(LEO_TARGET_MOBILE) -# if defined(LEO_TRY_NEON) - // Handle multiples of 64 bytes - if (CpuHasNeon) - { - while (bytes >= 64) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 x1 = vld1q_u8(x16 + 1); - LEO_M128 x2 = vld1q_u8(x16 + 2); - LEO_M128 x3 = vld1q_u8(x16 + 3); - LEO_M128 y0 = vld1q_u8(y16); - LEO_M128 y1 = vld1q_u8(y16 + 1); - LEO_M128 y2 = vld1q_u8(y16 + 2); - LEO_M128 y3 = vld1q_u8(y16 + 3); - - vst1q_u8(x16, veorq_u8(x0, y0)); - vst1q_u8(x16 + 1, veorq_u8(x1, y1)); - vst1q_u8(x16 + 2, veorq_u8(x2, y2)); - vst1q_u8(x16 + 3, veorq_u8(x3, y3)); - - bytes -= 64, x16 += 4, y16 += 4; - } - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 y0 = vld1q_u8(y16); - - vst1q_u8(x16, veorq_u8(x0, y0)); - - bytes -= 16, ++x16, ++y16; - } - } - else -# endif // LEO_TRY_NEON - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); - - const unsigned count = (unsigned)bytes / 8; - for (unsigned ii = 0; ii < count; ++ii) - x8[ii] ^= y8[ii]; - - x16 = reinterpret_cast(x8 + count); - y16 = reinterpret_cast(y8 + count); - } -#else // LEO_TARGET_MOBILE -# if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) - { - LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); - const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); - - while (bytes >= 128) - { - LEO_M256 x0 = _mm256_loadu_si256(x32); - LEO_M256 y0 = _mm256_loadu_si256(y32); - x0 = _mm256_xor_si256(x0, y0); - LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); - LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); - x1 = _mm256_xor_si256(x1, y1); - LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); - LEO_M256 y2 = _mm256_loadu_si256(y32 + 2); - x2 = _mm256_xor_si256(x2, y2); - LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); - LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); - x3 = _mm256_xor_si256(x3, y3); - - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - _mm256_storeu_si256(x32 + 2, x2); - _mm256_storeu_si256(x32 + 3, x3); - - bytes -= 128, x32 += 4, y32 += 4; - } - - // Handle multiples of 32 bytes - while (bytes >= 32) - { - // x[i] = x[i] xor y[i] - _mm256_storeu_si256(x32, - _mm256_xor_si256( - _mm256_loadu_si256(x32), - _mm256_loadu_si256(y32))); - - bytes -= 32, ++x32, ++y32; - } - - x16 = reinterpret_cast(x32); - y16 = reinterpret_cast(y32); - } - else -# endif // LEO_TRY_AVX2 - { - while (bytes >= 64) - { - LEO_M128 x0 = _mm_loadu_si128(x16); - LEO_M128 y0 = _mm_loadu_si128(y16); - x0 = _mm_xor_si128(x0, y0); - LEO_M128 x1 = _mm_loadu_si128(x16 + 1); - LEO_M128 y1 = _mm_loadu_si128(y16 + 1); - x1 = _mm_xor_si128(x1, y1); - LEO_M128 x2 = _mm_loadu_si128(x16 + 2); - LEO_M128 y2 = _mm_loadu_si128(y16 + 2); - x2 = _mm_xor_si128(x2, y2); - LEO_M128 x3 = _mm_loadu_si128(x16 + 3); - LEO_M128 y3 = 
_mm_loadu_si128(y16 + 3); - x3 = _mm_xor_si128(x3, y3); - - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); - - bytes -= 64, x16 += 4, y16 += 4; - } - } -#endif // LEO_TARGET_MOBILE - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - // x[i] = x[i] xor y[i] - _mm_storeu_si128(x16, - _mm_xor_si128( - _mm_loadu_si128(x16), - _mm_loadu_si128(y16))); - - bytes -= 16, ++x16, ++y16; - } - - uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); - const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); - - // Handle a block of 8 bytes - const unsigned eight = bytes & 8; - if (eight) - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); - *x8 ^= *y8; - } - - // Handle a block of 4 bytes - const unsigned four = bytes & 4; - if (four) - { - uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); - const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); - *x4 ^= *y4; - } - - // Handle final bytes - const unsigned offset = eight + four; - switch (bytes & 3) - { - case 3: x1[offset + 2] ^= y1[offset + 2]; - case 2: x1[offset + 1] ^= y1[offset + 1]; - case 1: x1[offset] ^= y1[offset]; - default: - break; - } -} - - -//------------------------------------------------------------------------------ -// Formal Derivative - -// Formal derivative of polynomial in the new basis -static void formal_derivative(GFSymbol* cos, const unsigned size) -{ - for (unsigned i = 1; i < size; ++i) - { - const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; - - // If a large number of values are being XORed: - if (leng >= 8) - xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); - else - for (unsigned j = i - leng; j < i; j++) - cos[j] ^= cos[j + leng]; - } - - for (unsigned i = size; i < kFieldSize; i <<= 1) - xor_mem(cos, cos + i, size * sizeof(GFSymbol)); -} - - -//------------------------------------------------------------------------------ -// Fast Fourier Transform - -static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT - -// IFFT in the proposed basis -static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - } - } -} - -// FFT in the proposed basis -static void FLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - } - } -} - - -//------------------------------------------------------------------------------ -// FFT Initialization - -static GFSymbol B[kFieldSize >> 1]; // factors used in 
formal derivative -static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial - -// Initialize skewVec[], B[], log_walsh[] -static void InitFieldOperations() -{ - GFSymbol temp[kGFBits - 1]; - - for (unsigned i = 1; i < kGFBits; ++i) - temp[i - 1] = (GFSymbol)((unsigned)1 << i); - - for (unsigned m = 0; m < (kGFBits - 1); ++m) - { - const unsigned step = (unsigned)1 << (m + 1); - - skewVec[((unsigned)1 << m) - 1] = 0; - - for (unsigned i = m; i < (kGFBits - 1); ++i) - { - const unsigned s = ((unsigned)1 << (i + 1)); - - for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) - skewVec[j + s] = skewVec[j] ^ temp[i]; - } - - temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; - - for (unsigned i = m + 1; i < (kGFBits - 1); ++i) - temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); - } - - for (unsigned i = 0; i < kFieldSize; ++i) - skewVec[i] = GFLog[skewVec[i]]; - - temp[0] = kFieldModulus - temp[0]; - - for (unsigned i = 1; i < (kGFBits - 1); ++i) - temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; - - B[0] = 0; - for (unsigned i = 0; i < (kGFBits - 1); ++i) - { - const unsigned depart = ((unsigned)1 << i); - - for (unsigned j = 0; j < depart; ++j) - B[j + depart] = (B[j] + temp[i]) % kFieldModulus; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh[i] = GFLog[i]; - - log_walsh[0] = 0; - - FWHT(log_walsh, kGFBits); -} - - -//------------------------------------------------------------------------------ -// Encoder - -// Encoding alg for k/n<0.5: message is a power of two -static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) -{ - memcpy(codeword, data, sizeof(GFSymbol) * k); - - IFLT(codeword, k, 0); - - for (unsigned i = k; i < kFieldSize; i += k) - { - memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); - - FLT(&codeword[i], k, i); - } - - memcpy(codeword, data, sizeof(GFSymbol) * k); -} - -// Encoding alg for k/n>0.5: parity is a power of two. -// data: message array. parity: parity array. mem: buffer(size>= n-k) -static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) -{ - const unsigned t = kFieldSize - k; - - memset(parity, 0, sizeof(GFSymbol) * t); - - for (unsigned i = t; i < kFieldSize; i += t) - { - memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); - - IFLT(mem, t, i); - - xor_mem(parity, mem, t * sizeof(GFSymbol)); - } - - FLT(parity, t, 0); -} - - -//------------------------------------------------------------------------------ -// Decoder - -static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) -{ - fwht_t log_walsh2[kFieldSize]; - - // Compute the evaluations of the error locator polynomial - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = erasure[i] ? 1 : 0; - - FWHT(log_walsh2, kGFBits); - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; - - FWHT(log_walsh2, kGFBits); - - // k2 can be replaced with k - const unsigned k2 = kFieldSize; - //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? 
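A note on the transform pair above, reasoning from the code rather than anything stated in it: the FWHT maps dyadic (XOR-index) convolution to pointwise multiplication, and applying the FWHT twice scales a length-2^kGFBits vector by 2^kGFBits. Because Q = 2^kGFBits - 1 makes that scale factor congruent to 1 mod Q, transforming the erasure indicator, multiplying by the pre-transformed log_walsh[], and transforming back yields the convolution directly, with no normalization pass:

    // After the second FWHT (mod Q arithmetic throughout):
    //   log_walsh2[i] = sum over erased positions e of GFLog[i ^ e] (mod Q)
    // i.e. the discrete log of the error-locator-style product that the
    // surrounding code applies with mulE(); the usual 1/2^n normalization
    // vanishes because 2^kGFBits mod kFieldModulus = 1.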
- - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) - { - codeword[i] = 0; - } - else - { - codeword[i] = mulE(codeword[i], log_walsh2[i]); - } - } - - IFLT(codeword, kFieldSize, 0); - - // formal derivative - for (unsigned i = 0; i < kFieldSize; i += 2) - { - codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); - } - - formal_derivative(codeword, k2); - - for (unsigned i = 0; i < k2; i += 2) - { - codeword[i] = mulE(codeword[i], B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); - } - - FLT(codeword, k2, 0); - - for (unsigned i = 0; i < k2; ++i) - { - if (erasure[i]) - { - codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); - } - } -} - - -//------------------------------------------------------------------------------ -// Test Application - -void test(unsigned k, unsigned seed) -{ - srand(seed); - - //-----------Generating message---------- - - // Message array - GFSymbol data[kFieldSize] = {0}; - - // Filled with random numbers - for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) - data[i] = (GFSymbol)rand(); - - - //---------encoding---------- - - GFSymbol codeword[kFieldSize]; - encodeH(&data[kFieldSize - k], k, data, codeword); - //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? - - memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); - - - //--------erasure simulation--------- - - // Array indicating erasures - bool erasure[kFieldSize] = { - false - }; - - for (unsigned i = k; i < kFieldSize; ++i) - erasure[i] = true; - - // permuting the erasure array - for (unsigned i = kFieldSize - 1; i > 0; --i) - { - unsigned pos = rand() % (i + 1); - - if (i != pos) - { - bool tmp = erasure[i]; - erasure[i] = erasure[pos]; - erasure[pos] = tmp; - } - } - - // erasure codeword symbols - for (unsigned i = 0; i < kFieldSize; ++i) - if (erasure[i]) - codeword[i] = 0; - - - //---------main processing---------- - decode(codeword, k, erasure); - - // Check the correctness of the result - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i] == 1) - { - if (data[i] != codeword[i]) - { - printf("Decoding Error with seed = %d!\n", seed); - LEO_DEBUG_BREAK; - return; - } - } - } - - //printf("Decoding is successful!\n"); -} - - -//------------------------------------------------------------------------------ -// Entrypoint - -int main(int argc, char **argv) -{ - // Initialize architecture-specific code - leo_architecture_init(); - - // Fill GFLog table and GFExp table - InitField(); - - // Compute factors used in erasure decoder - InitFieldOperations(); - - unsigned seed = (unsigned)time(NULL); - for (;;) - { - // test(int k), k: message size - /* - EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, - s.t. the number of recovery pieces is a power of two - */ - test(kFieldSize / 2, seed); - - ++seed; - } - - return 0; -} diff --git a/LeopardEncoder.cpp b/LeopardEncoder.cpp deleted file mode 100644 index 71d22e2..0000000 --- a/LeopardEncoder.cpp +++ /dev/null @@ -1,1220 +0,0 @@ -/* - Copyright (c) 2017 Christopher A. Taylor. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of LHC-RS nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. -*/ - -#include -#include -#include -#include -#include - - -/* - TODO: - + Write C API and unit tester - + Limit input to multiples of 64 bytes - + Replace GFSymbol with a file data pointer - + New 16-bit Muladd inner loops - + Class to contain the (large) muladd tables - + Preliminary benchmarks for large data! - + New 8-bit Muladd inner loops - + Benchmarks for smaller data! - + Refactor software - + Pick a name for the software better than LEO_RS - + I think it should be split up into several C++ modules - + Write detailed comments for all the routines - + Look into getting EncodeL working so we can support smaller data (Ask Lin) - + Look into using k instead of k2 to speed up decoder (Ask Lin) - + Avoid performing FFT/IFFT intermediate calculations we're not going to use - + Benchmarks, fun! - + Add multi-threading to split up long parallelizable calculations - + Final benchmarks! - + Finish up documentation - + Release version 1 - - - Muladd implementation notes: - - Specialize for 1-3 rows at a time since often times we're multiplying by - the same (skew) value repeatedly, as the ISA-L library does here: - - https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 - - Except we should be doing it for 16-bit Galois Field. - To implement that use the ALTMAP trick from Jerasure: - - http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 - - Except we should also support AVX2 since that is a 40% perf boost, so put - the high and low bytes 32 bytes instead of 16 bytes apart. - - Also I think we should go ahead and precompute the multiply tables since - it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. 
-*/ - - -//------------------------------------------------------------------------------ -// Debug - -// Some bugs only repro in release mode, so this can be helpful -//#define LEO_DEBUG_IN_RELEASE - -#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) - #define LEO_DEBUG - #ifdef _WIN32 - #define LEO_DEBUG_BREAK __debugbreak() - #else - #define LEO_DEBUG_BREAK __builtin_trap() - #endif - #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } -#else - #define LEO_DEBUG_BREAK ; - #define LEO_DEBUG_ASSERT(cond) ; -#endif - - -//------------------------------------------------------------------------------ -// Platform/Architecture - -#if defined(ANDROID) || defined(IOS) - #define LEO_TARGET_MOBILE -#endif // ANDROID - -#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) - #define LEO_TRY_AVX2 /* 256-bit */ - #include - #define LEO_ALIGN_BYTES 32 -#else // __AVX2__ - #define LEO_ALIGN_BYTES 16 -#endif // __AVX2__ - -#if !defined(LEO_TARGET_MOBILE) - // Note: MSVC currently only supports SSSE3 but not AVX2 - #include // SSSE3: _mm_shuffle_epi8 - #include // SSE2 -#endif // LEO_TARGET_MOBILE - -#if defined(HAVE_ARM_NEON_H) - #include -#endif // HAVE_ARM_NEON_H - -#if defined(LEO_TARGET_MOBILE) - - #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ - -# if defined(HAVE_ARM_NEON_H) - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 uint8x16_t - #define LEO_TRY_NEON -#else - #define LEO_M128 uint64_t -# endif - -#else // LEO_TARGET_MOBILE - - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 __m128i - -#endif // LEO_TARGET_MOBILE - -#ifdef LEO_TRY_AVX2 - // Compiler-specific 256-bit SIMD register keyword - #define LEO_M256 __m256i -#endif - -// Compiler-specific C++11 restrict keyword -#define LEO_RESTRICT __restrict - -// Compiler-specific force inline keyword -#ifdef _MSC_VER - #define LEO_FORCE_INLINE inline __forceinline -#else - #define LEO_FORCE_INLINE inline __attribute__((always_inline)) -#endif - -// Compiler-specific alignment keyword -// Note: Alignment only matters for ARM NEON where it should be 16 -#ifdef _MSC_VER - #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) -#else // _MSC_VER - #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) -#endif // _MSC_VER - - -//------------------------------------------------------------------------------ -// Runtime CPU Architecture Check -// -// Feature checks stolen shamelessly from -// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c - -#if defined(HAVE_ANDROID_GETCPUFEATURES) - #include -#endif - -#if defined(LEO_TRY_NEON) -# if defined(IOS) && defined(__ARM_NEON__) - // Requires iPhone 5S or newer - static const bool CpuHasNeon = true; - static const bool CpuHasNeon64 = true; -# else - // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures - static bool CpuHasNeon = false; // V6 / V7 - static bool CpuHasNeon64 = false; // 64-bit -# endif -#endif - - -#if !defined(LEO_TARGET_MOBILE) - -#ifdef _MSC_VER - #include // __cpuid - #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX -#endif - -#ifdef LEO_TRY_AVX2 -static bool CpuHasAVX2 = false; -#endif -static bool CpuHasSSSE3 = false; - -#define CPUID_EBX_AVX2 0x00000020 -#define CPUID_ECX_SSSE3 0x00000200 - -static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) -{ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) - __cpuid((int 
*) cpu_info, cpu_info_type); -#else //if defined(HAVE_CPUID) - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -# ifdef __i386__ - __asm__ __volatile__ ("pushfl; pushfl; " - "popl %0; " - "movl %0, %1; xorl %2, %0; " - "pushl %0; " - "popfl; pushfl; popl %0; popfl" : - "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : - "i" (0x200000)); - if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { - return; /* LCOV_EXCL_LINE */ - } -# endif -# ifdef __i386__ - __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# elif defined(__x86_64__) - __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# else - __asm__ __volatile__ ("cpuid" : - "=a" (cpu_info[0]), "=b" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# endif -#endif -} - -#endif // defined(LEO_TARGET_MOBILE) - - -static void leo_architecture_init() -{ -#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) - AndroidCpuFamily family = android_getCpuFamily(); - if (family == ANDROID_CPU_FAMILY_ARM) - { - if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) - CpuHasNeon = true; - } - else if (family == ANDROID_CPU_FAMILY_ARM64) - { - CpuHasNeon = true; - if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) - CpuHasNeon64 = true; - } -#endif - -#if !defined(LEO_TARGET_MOBILE) - unsigned int cpu_info[4]; - - _cpuid(cpu_info, 1); - CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); - -#if defined(LEO_TRY_AVX2) - _cpuid(cpu_info, 7); - CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); -#endif // LEO_TRY_AVX2 - -#endif // LEO_TARGET_MOBILE -} - - -//------------------------------------------------------------------------------ -// SIMD-Safe Aligned Memory Allocations - -static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; - -LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) -{ - return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); -} - -static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) -{ - uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); - if (!data) - return nullptr; - unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); - data += kAlignmentBytes - offset; - data[-1] = (uint8_t)offset; - return data; -} - -static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) -{ - if (!ptr) - return; - uint8_t* data = (uint8_t*)ptr; - unsigned offset = data[-1]; - if (offset >= kAlignmentBytes) - { - LEO_DEBUG_BREAK; // Should never happen - return; - } - data -= kAlignmentBytes - offset; - free(data); -} - - -//------------------------------------------------------------------------------ -// Field - -//#define LEO_SHORT_FIELD - -#ifdef LEO_SHORT_FIELD -typedef uint8_t GFSymbol; -static const unsigned kGFBits = 8; -static const unsigned kGFPolynomial = 0x11D; -GFSymbol kGFBasis[kGFBits] = { - 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis -}; -#else -typedef uint16_t GFSymbol; -static const unsigned kGFBits = 16; -static const unsigned kGFPolynomial = 0x1002D; -GFSymbol kGFBasis[kGFBits] = { - 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis - 0xC582, 0xED2E, 0x914C, 0x4012, - 0x6C98, 0x10D8, 0x6A72, 0xB900, - 0xFDB8, 0xFB34, 0xFF38, 0x991E -}; -#endif - -/* - Cantor Basis introduced by: - D. G. 
Cantor, "On arithmetical algorithms over finite fields", - Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. -*/ - -static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size -static const unsigned kFieldModulus = kFieldSize - 1; - -static GFSymbol GFLog[kFieldSize]; -static GFSymbol GFExp[kFieldSize]; - -// Initialize GFLog[], GFExp[] -static void InitField() -{ - unsigned state = 1; - for (unsigned i = 0; i < kFieldModulus; ++i) - { - GFExp[state] = static_cast(i); - state <<= 1; - if (state >= kFieldSize) - state ^= kGFPolynomial; - } - GFExp[0] = kFieldModulus; - - // Conversion to chosen basis: - - GFLog[0] = 0; - for (unsigned i = 0; i < kGFBits; ++i) - { - const GFSymbol basis = kGFBasis[i]; - const unsigned width = (unsigned)(1UL << i); - - for (unsigned j = 0; j < width; ++j) - GFLog[j + width] = GFLog[j] ^ basis; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - GFLog[i] = GFExp[GFLog[i]]; - - for (unsigned i = 0; i < kFieldSize; ++i) - GFExp[GFLog[i]] = i; - - GFExp[kFieldModulus] = GFExp[0]; -} - - -//------------------------------------------------------------------------------ -// Mod Q Field Operations -// -// Q is the maximum symbol value, e.g. 255 or 65535. - -// z = x + y (mod Q) -static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b) -{ - const unsigned sum = (unsigned)a + b; - - // Partial reduction step, allowing for Q to be returned - return static_cast(sum + (sum >> kGFBits)); -} - -// z = x - y (mod Q) -static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) -{ - const unsigned dif = (unsigned)a - b; - - // Partial reduction step, allowing for Q to be returned - return static_cast(dif + (dif >> kGFBits)); -} - -// vx[] += vy[] * z -static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) -{ - for (unsigned i = 0; i < symbolCount; ++i) - { - const GFSymbol a = vy[i]; - if (a == 0) - continue; - - GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); - GFSymbol value1 = GFExp[sum1]; - if ((a & 0x0f) == 0) - { - value1 = 0; - } - GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); - GFSymbol value2 = GFExp[sum2]; - if ((a & 0xf0) == 0) - { - value2 = 0; - } - GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); - GFSymbol value3 = GFExp[sum3]; - if ((a & 0x0f00) == 0) - { - value3 = 0; - } - GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); - GFSymbol value4 = GFExp[sum4]; - if ((a & 0xf000) == 0) - { - value4 = 0; - } - - vx[i] ^= value1; - vx[i] ^= value2; - vx[i] ^= value3; - vx[i] ^= value4; - } -} - -// return a*GFExp[b] over GF(2^r) -static GFSymbol mulE(GFSymbol a, GFSymbol b) -{ - if (a == 0) - return 0; - - const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); - return GFExp[sum]; -} - - -//------------------------------------------------------------------------------ -// Fast Walsh-Hadamard Transform (FWHT) Mod Q -// -// Q is the maximum symbol value, e.g. 255 or 65535. 
- -// Define this to enable the optimized version of FWHT() -#define LEO_FWHT_OPTIMIZED - -typedef GFSymbol fwht_t; - -// {a, b} = {a + b, a - b} (Mod Q) -static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) -{ - const fwht_t sum = AddModQ(a, b); - const fwht_t dif = SubModQ(a, b); - a = sum; - b = dif; -} - -/* - FWHT is a minor slice of the runtime and does not grow with data size, - but I did attempt a few additional optimizations that failed: - - I've attempted to vectorize (with partial reductions) FWHT_4(data, s), - which is 70% of the algorithm, but it was slower. Left in _attic_. - - I've attempted to avoid reductions in all or parts of the FWHT. - The final modular reduction ends up being slower than the savings. - Specifically I tried doing it for the whole FWHT and also I tried - doing it just for the FWHT_2 loop in the main routine, but both - approaches are slower than partial reductions. - - Replacing word reads with wider reads does speed up the operation, but - at too high a complexity cost relative to minor perf improvement. -*/ - -#ifndef LEO_FWHT_OPTIMIZED - -// Reference implementation -static void FWHT(fwht_t* data, const unsigned bits) -{ - const unsigned size = (unsigned)(1UL << bits); - for (unsigned width = 1; width < size; width <<= 1) - for (unsigned i = 0; i < size; i += (width << 1)) - for (unsigned j = i; j < (width + i); ++j) - FWHT_2(data[j], data[j + width]); -} - -#else - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; -} - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) -{ - unsigned x = 0; - fwht_t t0 = data[x]; x += s; - fwht_t t1 = data[x]; x += s; - fwht_t t2 = data[x]; x += s; - fwht_t t3 = data[x]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - unsigned y = 0; - data[y] = t0; y += s; - data[y] = t1; y += s; - data[y] = t2; y += s; - data[y] = t3; -} - -static inline void FWHT_8(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; -} - -static inline void FWHT_16(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - fwht_t t8 = data[8]; - fwht_t t9 = data[9]; - fwht_t t10 = data[10]; - fwht_t t11 = data[11]; - fwht_t t12 = data[12]; - fwht_t t13 = data[13]; - fwht_t t14 = data[14]; - fwht_t t15 = data[15]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t8, t9); - FWHT_2(t10, t11); - FWHT_2(t12, t13); - FWHT_2(t14, t15); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t8, t10); - FWHT_2(t9, t11); - FWHT_2(t12, t14); - FWHT_2(t13, t15); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - FWHT_2(t8, t12); - FWHT_2(t9, t13); - 
FWHT_2(t10, t14); - FWHT_2(t11, t15); - FWHT_2(t0, t8); - FWHT_2(t1, t9); - FWHT_2(t2, t10); - FWHT_2(t3, t11); - FWHT_2(t4, t12); - FWHT_2(t5, t13); - FWHT_2(t6, t14); - FWHT_2(t7, t15); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; - data[8] = t8; - data[9] = t9; - data[10] = t10; - data[11] = t11; - data[12] = t12; - data[13] = t13; - data[14] = t14; - data[15] = t15; -} - -static void FWHT_SmallData(fwht_t* data, unsigned ldn) -{ - const unsigned n = (1UL << ldn); - - if (n <= 2) - { - if (n == 2) - FWHT_2(data[0], data[1]); - return; - } - - for (unsigned ldm = ldn; ldm > 3; ldm -= 2) - { - unsigned m = (1UL << ldm); - unsigned m4 = (m >> 2); - for (unsigned r = 0; r < n; r += m) - for (unsigned j = 0; j < m4; j++) - FWHT_4(data + j + r, m4); - } - - if (ldn & 1) - { - for (unsigned i0 = 0; i0 < n; i0 += 8) - FWHT_8(data + i0); - } - else - { - for (unsigned i0 = 0; i0 < n; i0 += 4) - FWHT_4(data + i0); - } -} - -// Decimation in time (DIT) version -static void FWHT(fwht_t* data, const unsigned ldn) -{ - if (ldn <= 13) - { - FWHT_SmallData(data, ldn); - return; - } - - FWHT_2(data[2], data[3]); - FWHT_4(data + 4); - FWHT_8(data + 8); - FWHT_16(data + 16); - for (unsigned ldm = 5; ldm < ldn; ++ldm) - FWHT(data + (unsigned)(1UL << ldm), ldm); - - for (unsigned ldm = 0; ldm < ldn; ++ldm) - { - const unsigned mh = (1UL << ldm); - for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) - FWHT_2(data[t1], data[t2]); - } -} - -#endif - - -//------------------------------------------------------------------------------ -// Memory Buffer XOR - -static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) -{ - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); - -#if defined(LEO_TARGET_MOBILE) -# if defined(LEO_TRY_NEON) - // Handle multiples of 64 bytes - if (CpuHasNeon) - { - while (bytes >= 64) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 x1 = vld1q_u8(x16 + 1); - LEO_M128 x2 = vld1q_u8(x16 + 2); - LEO_M128 x3 = vld1q_u8(x16 + 3); - LEO_M128 y0 = vld1q_u8(y16); - LEO_M128 y1 = vld1q_u8(y16 + 1); - LEO_M128 y2 = vld1q_u8(y16 + 2); - LEO_M128 y3 = vld1q_u8(y16 + 3); - - vst1q_u8(x16, veorq_u8(x0, y0)); - vst1q_u8(x16 + 1, veorq_u8(x1, y1)); - vst1q_u8(x16 + 2, veorq_u8(x2, y2)); - vst1q_u8(x16 + 3, veorq_u8(x3, y3)); - - bytes -= 64, x16 += 4, y16 += 4; - } - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 y0 = vld1q_u8(y16); - - vst1q_u8(x16, veorq_u8(x0, y0)); - - bytes -= 16, ++x16, ++y16; - } - } - else -# endif // LEO_TRY_NEON - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); - - const unsigned count = (unsigned)bytes / 8; - for (unsigned ii = 0; ii < count; ++ii) - x8[ii] ^= y8[ii]; - - x16 = reinterpret_cast(x8 + count); - y16 = reinterpret_cast(y8 + count); - } -#else // LEO_TARGET_MOBILE -# if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) - { - LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); - const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); - - while (bytes >= 128) - { - LEO_M256 x0 = _mm256_loadu_si256(x32); - LEO_M256 y0 = _mm256_loadu_si256(y32); - x0 = _mm256_xor_si256(x0, y0); - LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); - LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); - x1 = _mm256_xor_si256(x1, y1); - LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); - LEO_M256 y2 = _mm256_loadu_si256(y32 + 
2); - x2 = _mm256_xor_si256(x2, y2); - LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); - LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); - x3 = _mm256_xor_si256(x3, y3); - - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - _mm256_storeu_si256(x32 + 2, x2); - _mm256_storeu_si256(x32 + 3, x3); - - bytes -= 128, x32 += 4, y32 += 4; - } - - // Handle multiples of 32 bytes - while (bytes >= 32) - { - // x[i] = x[i] xor y[i] - _mm256_storeu_si256(x32, - _mm256_xor_si256( - _mm256_loadu_si256(x32), - _mm256_loadu_si256(y32))); - - bytes -= 32, ++x32, ++y32; - } - - x16 = reinterpret_cast(x32); - y16 = reinterpret_cast(y32); - } - else -# endif // LEO_TRY_AVX2 - { - while (bytes >= 64) - { - LEO_M128 x0 = _mm_loadu_si128(x16); - LEO_M128 y0 = _mm_loadu_si128(y16); - x0 = _mm_xor_si128(x0, y0); - LEO_M128 x1 = _mm_loadu_si128(x16 + 1); - LEO_M128 y1 = _mm_loadu_si128(y16 + 1); - x1 = _mm_xor_si128(x1, y1); - LEO_M128 x2 = _mm_loadu_si128(x16 + 2); - LEO_M128 y2 = _mm_loadu_si128(y16 + 2); - x2 = _mm_xor_si128(x2, y2); - LEO_M128 x3 = _mm_loadu_si128(x16 + 3); - LEO_M128 y3 = _mm_loadu_si128(y16 + 3); - x3 = _mm_xor_si128(x3, y3); - - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); - - bytes -= 64, x16 += 4, y16 += 4; - } - } -#endif // LEO_TARGET_MOBILE - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - // x[i] = x[i] xor y[i] - _mm_storeu_si128(x16, - _mm_xor_si128( - _mm_loadu_si128(x16), - _mm_loadu_si128(y16))); - - bytes -= 16, ++x16, ++y16; - } - - uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); - const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); - - // Handle a block of 8 bytes - const unsigned eight = bytes & 8; - if (eight) - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); - *x8 ^= *y8; - } - - // Handle a block of 4 bytes - const unsigned four = bytes & 4; - if (four) - { - uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); - const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); - *x4 ^= *y4; - } - - // Handle final bytes - const unsigned offset = eight + four; - switch (bytes & 3) - { - case 3: x1[offset + 2] ^= y1[offset + 2]; - case 2: x1[offset + 1] ^= y1[offset + 1]; - case 1: x1[offset] ^= y1[offset]; - default: - break; - } -} - - -//------------------------------------------------------------------------------ -// Formal Derivative - -// Formal derivative of polynomial in the new basis -static void formal_derivative(GFSymbol* cos, const unsigned size) -{ - for (unsigned i = 1; i < size; ++i) - { - const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; - - // If a large number of values are being XORed: - if (leng >= 8) - xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); - else - for (unsigned j = i - leng; j < i; j++) - cos[j] ^= cos[j + leng]; - } - - for (unsigned i = size; i < kFieldSize; i <<= 1) - xor_mem(cos, cos + i, size * sizeof(GFSymbol)); -} - - -//------------------------------------------------------------------------------ -// Fast Fourier Transform - -static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT - -// IFFT in the proposed basis -static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - 
depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - } - } -} - -// FFT in the proposed basis -static void FLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - } - } -} - - -//------------------------------------------------------------------------------ -// FFT Initialization - -static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative -static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial - -// Initialize skewVec[], B[], log_walsh[] -static void InitFieldOperations() -{ - GFSymbol temp[kGFBits - 1]; - - for (unsigned i = 1; i < kGFBits; ++i) - temp[i - 1] = (GFSymbol)((unsigned)1 << i); - - for (unsigned m = 0; m < (kGFBits - 1); ++m) - { - const unsigned step = (unsigned)1 << (m + 1); - - skewVec[((unsigned)1 << m) - 1] = 0; - - for (unsigned i = m; i < (kGFBits - 1); ++i) - { - const unsigned s = ((unsigned)1 << (i + 1)); - - for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) - skewVec[j + s] = skewVec[j] ^ temp[i]; - } - - temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; - - for (unsigned i = m + 1; i < (kGFBits - 1); ++i) - temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); - } - - for (unsigned i = 0; i < kFieldSize; ++i) - skewVec[i] = GFLog[skewVec[i]]; - - temp[0] = kFieldModulus - temp[0]; - - for (unsigned i = 1; i < (kGFBits - 1); ++i) - temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; - - B[0] = 0; - for (unsigned i = 0; i < (kGFBits - 1); ++i) - { - const unsigned depart = ((unsigned)1 << i); - - for (unsigned j = 0; j < depart; ++j) - B[j + depart] = (B[j] + temp[i]) % kFieldModulus; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh[i] = GFLog[i]; - - log_walsh[0] = 0; - - FWHT(log_walsh, kGFBits); -} - - -//------------------------------------------------------------------------------ -// Encoder - -// Encoding alg for k/n<0.5: message is a power of two -static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) -{ - memcpy(codeword, data, sizeof(GFSymbol) * k); - - IFLT(codeword, k, 0); - - for (unsigned i = k; i < kFieldSize; i += k) - { - memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); - - FLT(&codeword[i], k, i); - } - - memcpy(codeword, data, sizeof(GFSymbol) * k); -} - -// Encoding alg for k/n>0.5: parity is a power of two. -// data: message array. parity: parity array. 
mem: buffer(size>= n-k) -static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) -{ - const unsigned t = kFieldSize - k; - - memset(parity, 0, sizeof(GFSymbol) * t); - - for (unsigned i = t; i < kFieldSize; i += t) - { - memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); - - IFLT(mem, t, i); - - xor_mem(parity, mem, t * sizeof(GFSymbol)); - } - - FLT(parity, t, 0); -} - - -//------------------------------------------------------------------------------ -// Decoder - -static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) -{ - fwht_t log_walsh2[kFieldSize]; - - // Compute the evaluations of the error locator polynomial - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = erasure[i] ? 1 : 0; - - FWHT(log_walsh2, kGFBits); - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; - - FWHT(log_walsh2, kGFBits); - - // k2 can be replaced with k - const unsigned k2 = kFieldSize; - //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? - - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) - { - codeword[i] = 0; - } - else - { - codeword[i] = mulE(codeword[i], log_walsh2[i]); - } - } - - IFLT(codeword, kFieldSize, 0); - - // formal derivative - for (unsigned i = 0; i < kFieldSize; i += 2) - { - codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); - } - - formal_derivative(codeword, k2); - - for (unsigned i = 0; i < k2; i += 2) - { - codeword[i] = mulE(codeword[i], B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); - } - - FLT(codeword, k2, 0); - - for (unsigned i = 0; i < k2; ++i) - { - if (erasure[i]) - { - codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); - } - } -} - - -//------------------------------------------------------------------------------ -// Test Application - -void test(unsigned k, unsigned seed) -{ - srand(seed); - - //-----------Generating message---------- - - // Message array - GFSymbol data[kFieldSize] = {0}; - - // Filled with random numbers - for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) - data[i] = (GFSymbol)rand(); - - - //---------encoding---------- - - GFSymbol codeword[kFieldSize]; - encodeH(&data[kFieldSize - k], k, data, codeword); - //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? 
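To make the encodeH() call above easier to trace, here is a map of the buffer roles it assumes, read directly off the encodeH() definition earlier in this file (with t = kFieldSize - k):

// encodeH(&data[kFieldSize - k], k, data, codeword):
//   &data[kFieldSize - k]  -- the k message symbols (input)
//   data                   -- the first t symbols receive the parity (output)
//   codeword               -- borrowed as the scratch "mem" buffer (size >= t)
// The memcpy() just below then copies parity + message out of data[] into
// codeword[] to assemble the complete codeword before erasures are simulated.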
- - memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); - - - //--------erasure simulation--------- - - // Array indicating erasures - bool erasure[kFieldSize] = { - false - }; - - for (unsigned i = k; i < kFieldSize; ++i) - erasure[i] = true; - - // permuting the erasure array - for (unsigned i = kFieldSize - 1; i > 0; --i) - { - unsigned pos = rand() % (i + 1); - - if (i != pos) - { - bool tmp = erasure[i]; - erasure[i] = erasure[pos]; - erasure[pos] = tmp; - } - } - - // erasure codeword symbols - for (unsigned i = 0; i < kFieldSize; ++i) - if (erasure[i]) - codeword[i] = 0; - - - //---------main processing---------- - decode(codeword, k, erasure); - - // Check the correctness of the result - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i] == 1) - { - if (data[i] != codeword[i]) - { - printf("Decoding Error with seed = %d!\n", seed); - LEO_DEBUG_BREAK; - return; - } - } - } - - //printf("Decoding is successful!\n"); -} - - -//------------------------------------------------------------------------------ -// Entrypoint - -int main(int argc, char **argv) -{ - // Initialize architecture-specific code - leo_architecture_init(); - - // Fill GFLog table and GFExp table - InitField(); - - // Compute factors used in erasure decoder - InitFieldOperations(); - - unsigned seed = (unsigned)time(NULL); - for (;;) - { - // test(int k), k: message size - /* - EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, - s.t. the number of recovery pieces is a power of two - */ - test(kFieldSize / 2, seed); - - ++seed; - } - - return 0; -} diff --git a/LeopardEncoder.h b/LeopardEncoder.h deleted file mode 100644 index 71d22e2..0000000 --- a/LeopardEncoder.h +++ /dev/null @@ -1,1220 +0,0 @@ -/* - Copyright (c) 2017 Christopher A. Taylor. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of LHC-RS nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. 
-*/ - -#include -#include -#include -#include -#include - - -/* - TODO: - + Write C API and unit tester - + Limit input to multiples of 64 bytes - + Replace GFSymbol with a file data pointer - + New 16-bit Muladd inner loops - + Class to contain the (large) muladd tables - + Preliminary benchmarks for large data! - + New 8-bit Muladd inner loops - + Benchmarks for smaller data! - + Refactor software - + Pick a name for the software better than LEO_RS - + I think it should be split up into several C++ modules - + Write detailed comments for all the routines - + Look into getting EncodeL working so we can support smaller data (Ask Lin) - + Look into using k instead of k2 to speed up decoder (Ask Lin) - + Avoid performing FFT/IFFT intermediate calculations we're not going to use - + Benchmarks, fun! - + Add multi-threading to split up long parallelizable calculations - + Final benchmarks! - + Finish up documentation - + Release version 1 - - - Muladd implementation notes: - - Specialize for 1-3 rows at a time since often times we're multiplying by - the same (skew) value repeatedly, as the ISA-L library does here: - - https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 - - Except we should be doing it for 16-bit Galois Field. - To implement that use the ALTMAP trick from Jerasure: - - http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 - - Except we should also support AVX2 since that is a 40% perf boost, so put - the high and low bytes 32 bytes instead of 16 bytes apart. - - Also I think we should go ahead and precompute the multiply tables since - it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. -*/ - - -//------------------------------------------------------------------------------ -// Debug - -// Some bugs only repro in release mode, so this can be helpful -//#define LEO_DEBUG_IN_RELEASE - -#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) - #define LEO_DEBUG - #ifdef _WIN32 - #define LEO_DEBUG_BREAK __debugbreak() - #else - #define LEO_DEBUG_BREAK __builtin_trap() - #endif - #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } -#else - #define LEO_DEBUG_BREAK ; - #define LEO_DEBUG_ASSERT(cond) ; -#endif - - -//------------------------------------------------------------------------------ -// Platform/Architecture - -#if defined(ANDROID) || defined(IOS) - #define LEO_TARGET_MOBILE -#endif // ANDROID - -#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) - #define LEO_TRY_AVX2 /* 256-bit */ - #include - #define LEO_ALIGN_BYTES 32 -#else // __AVX2__ - #define LEO_ALIGN_BYTES 16 -#endif // __AVX2__ - -#if !defined(LEO_TARGET_MOBILE) - // Note: MSVC currently only supports SSSE3 but not AVX2 - #include // SSSE3: _mm_shuffle_epi8 - #include // SSE2 -#endif // LEO_TARGET_MOBILE - -#if defined(HAVE_ARM_NEON_H) - #include -#endif // HAVE_ARM_NEON_H - -#if defined(LEO_TARGET_MOBILE) - - #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ - -# if defined(HAVE_ARM_NEON_H) - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 uint8x16_t - #define LEO_TRY_NEON -#else - #define LEO_M128 uint64_t -# endif - -#else // LEO_TARGET_MOBILE - - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 __m128i - -#endif // LEO_TARGET_MOBILE - -#ifdef LEO_TRY_AVX2 - // Compiler-specific 256-bit SIMD register keyword - #define LEO_M256 __m256i -#endif - -// Compiler-specific C++11 restrict keyword -#define LEO_RESTRICT __restrict 
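Since the no-aliasing promise behind the macro above is what lets the bulk memory loops in this code vectorize, a minimal illustration may help; this sketch is for exposition and is not part of the library:

// With __restrict the compiler may assume x[] and y[] never overlap, so it
// can keep values in registers and vectorize freely; without it, every store
// to x[i] forces a conservative reload of y[i].
static void xor_words(uint64_t * LEO_RESTRICT x,
                      const uint64_t * LEO_RESTRICT y,
                      unsigned count)
{
    for (unsigned i = 0; i < count; ++i)
        x[i] ^= y[i];
}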
- -// Compiler-specific force inline keyword -#ifdef _MSC_VER - #define LEO_FORCE_INLINE inline __forceinline -#else - #define LEO_FORCE_INLINE inline __attribute__((always_inline)) -#endif - -// Compiler-specific alignment keyword -// Note: Alignment only matters for ARM NEON where it should be 16 -#ifdef _MSC_VER - #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) -#else // _MSC_VER - #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) -#endif // _MSC_VER - - -//------------------------------------------------------------------------------ -// Runtime CPU Architecture Check -// -// Feature checks stolen shamelessly from -// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c - -#if defined(HAVE_ANDROID_GETCPUFEATURES) - #include -#endif - -#if defined(LEO_TRY_NEON) -# if defined(IOS) && defined(__ARM_NEON__) - // Requires iPhone 5S or newer - static const bool CpuHasNeon = true; - static const bool CpuHasNeon64 = true; -# else - // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures - static bool CpuHasNeon = false; // V6 / V7 - static bool CpuHasNeon64 = false; // 64-bit -# endif -#endif - - -#if !defined(LEO_TARGET_MOBILE) - -#ifdef _MSC_VER - #include // __cpuid - #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX -#endif - -#ifdef LEO_TRY_AVX2 -static bool CpuHasAVX2 = false; -#endif -static bool CpuHasSSSE3 = false; - -#define CPUID_EBX_AVX2 0x00000020 -#define CPUID_ECX_SSSE3 0x00000200 - -static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) -{ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) - __cpuid((int *) cpu_info, cpu_info_type); -#else //if defined(HAVE_CPUID) - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -# ifdef __i386__ - __asm__ __volatile__ ("pushfl; pushfl; " - "popl %0; " - "movl %0, %1; xorl %2, %0; " - "pushl %0; " - "popfl; pushfl; popl %0; popfl" : - "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : - "i" (0x200000)); - if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { - return; /* LCOV_EXCL_LINE */ - } -# endif -# ifdef __i386__ - __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# elif defined(__x86_64__) - __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# else - __asm__ __volatile__ ("cpuid" : - "=a" (cpu_info[0]), "=b" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# endif -#endif -} - -#endif // defined(LEO_TARGET_MOBILE) - - -static void leo_architecture_init() -{ -#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) - AndroidCpuFamily family = android_getCpuFamily(); - if (family == ANDROID_CPU_FAMILY_ARM) - { - if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) - CpuHasNeon = true; - } - else if (family == ANDROID_CPU_FAMILY_ARM64) - { - CpuHasNeon = true; - if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) - CpuHasNeon64 = true; - } -#endif - -#if !defined(LEO_TARGET_MOBILE) - unsigned int cpu_info[4]; - - _cpuid(cpu_info, 1); - CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); - -#if defined(LEO_TRY_AVX2) - _cpuid(cpu_info, 7); - CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); -#endif // LEO_TRY_AVX2 - -#endif // 
LEO_TARGET_MOBILE -} - - -//------------------------------------------------------------------------------ -// SIMD-Safe Aligned Memory Allocations - -static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; - -LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) -{ - return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); -} - -static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) -{ - uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); - if (!data) - return nullptr; - unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); - data += kAlignmentBytes - offset; - data[-1] = (uint8_t)offset; - return data; -} - -static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) -{ - if (!ptr) - return; - uint8_t* data = (uint8_t*)ptr; - unsigned offset = data[-1]; - if (offset >= kAlignmentBytes) - { - LEO_DEBUG_BREAK; // Should never happen - return; - } - data -= kAlignmentBytes - offset; - free(data); -} - - -//------------------------------------------------------------------------------ -// Field - -//#define LEO_SHORT_FIELD - -#ifdef LEO_SHORT_FIELD -typedef uint8_t GFSymbol; -static const unsigned kGFBits = 8; -static const unsigned kGFPolynomial = 0x11D; -GFSymbol kGFBasis[kGFBits] = { - 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis -}; -#else -typedef uint16_t GFSymbol; -static const unsigned kGFBits = 16; -static const unsigned kGFPolynomial = 0x1002D; -GFSymbol kGFBasis[kGFBits] = { - 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis - 0xC582, 0xED2E, 0x914C, 0x4012, - 0x6C98, 0x10D8, 0x6A72, 0xB900, - 0xFDB8, 0xFB34, 0xFF38, 0x991E -}; -#endif - -/* - Cantor Basis introduced by: - D. G. Cantor, "On arithmetical algorithms over finite fields", - Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. -*/ - -static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size -static const unsigned kFieldModulus = kFieldSize - 1; - -static GFSymbol GFLog[kFieldSize]; -static GFSymbol GFExp[kFieldSize]; - -// Initialize GFLog[], GFExp[] -static void InitField() -{ - unsigned state = 1; - for (unsigned i = 0; i < kFieldModulus; ++i) - { - GFExp[state] = static_cast(i); - state <<= 1; - if (state >= kFieldSize) - state ^= kGFPolynomial; - } - GFExp[0] = kFieldModulus; - - // Conversion to chosen basis: - - GFLog[0] = 0; - for (unsigned i = 0; i < kGFBits; ++i) - { - const GFSymbol basis = kGFBasis[i]; - const unsigned width = (unsigned)(1UL << i); - - for (unsigned j = 0; j < width; ++j) - GFLog[j + width] = GFLog[j] ^ basis; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - GFLog[i] = GFExp[GFLog[i]]; - - for (unsigned i = 0; i < kFieldSize; ++i) - GFExp[GFLog[i]] = i; - - GFExp[kFieldModulus] = GFExp[0]; -} - - -//------------------------------------------------------------------------------ -// Mod Q Field Operations -// -// Q is the maximum symbol value, e.g. 255 or 65535. 
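The AddModQ()/SubModQ() helpers that follow reduce modulo Q = 2^kGFBits - 1 with an end-around carry: 2^kGFBits is congruent to 1 (mod Q), so the bit that overflows the top of the sum is simply added back in. A self-contained sketch for the 8-bit case (Q = 255); the function name here is hypothetical:

#include <cassert>
#include <cstdint>

// End-around-carry addition modulo Q = 2^8 - 1 = 255, mirroring AddModQ().
static uint8_t AddMod255(uint8_t a, uint8_t b)
{
    const unsigned sum = (unsigned)a + b;
    // 2^8 == Q + 1, so the carry bit is worth exactly 1 (mod Q).
    return (uint8_t)(sum + (sum >> 8));
}

int main()
{
    assert(AddMod255(200, 100) == 45); // (200 + 100) % 255 == 45
    assert(AddMod255(255, 0) == 255);  // "partial": Q itself may be returned
    return 0;
}

Subtraction reduces the same way through the borrow bit, which is why SubModQ() can reuse the identical fold.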
- -// z = x + y (mod Q) -static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b) -{ - const unsigned sum = (unsigned)a + b; - - // Partial reduction step, allowing for Q to be returned - return static_cast(sum + (sum >> kGFBits)); -} - -// z = x - y (mod Q) -static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) -{ - const unsigned dif = (unsigned)a - b; - - // Partial reduction step, allowing for Q to be returned - return static_cast(dif + (dif >> kGFBits)); -} - -// vx[] += vy[] * z -static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) -{ - for (unsigned i = 0; i < symbolCount; ++i) - { - const GFSymbol a = vy[i]; - if (a == 0) - continue; - - GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); - GFSymbol value1 = GFExp[sum1]; - if ((a & 0x0f) == 0) - { - value1 = 0; - } - GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); - GFSymbol value2 = GFExp[sum2]; - if ((a & 0xf0) == 0) - { - value2 = 0; - } - GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); - GFSymbol value3 = GFExp[sum3]; - if ((a & 0x0f00) == 0) - { - value3 = 0; - } - GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); - GFSymbol value4 = GFExp[sum4]; - if ((a & 0xf000) == 0) - { - value4 = 0; - } - - vx[i] ^= value1; - vx[i] ^= value2; - vx[i] ^= value3; - vx[i] ^= value4; - } -} - -// return a*GFExp[b] over GF(2^r) -static GFSymbol mulE(GFSymbol a, GFSymbol b) -{ - if (a == 0) - return 0; - - const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); - return GFExp[sum]; -} - - -//------------------------------------------------------------------------------ -// Fast Walsh-Hadamard Transform (FWHT) Mod Q -// -// Q is the maximum symbol value, e.g. 255 or 65535. - -// Define this to enable the optimized version of FWHT() -#define LEO_FWHT_OPTIMIZED - -typedef GFSymbol fwht_t; - -// {a, b} = {a + b, a - b} (Mod Q) -static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) -{ - const fwht_t sum = AddModQ(a, b); - const fwht_t dif = SubModQ(a, b); - a = sum; - b = dif; -} - -/* - FWHT is a minor slice of the runtime and does not grow with data size, - but I did attempt a few additional optimizations that failed: - - I've attempted to vectorize (with partial reductions) FWHT_4(data, s), - which is 70% of the algorithm, but it was slower. Left in _attic_. - - I've attempted to avoid reductions in all or parts of the FWHT. - The final modular reduction ends up being slower than the savings. - Specifically I tried doing it for the whole FWHT and also I tried - doing it just for the FWHT_2 loop in the main routine, but both - approaches are slower than partial reductions. - - Replacing word reads with wider reads does speed up the operation, but - at too high a complexity cost relative to minor perf improvement. 
-*/ - -#ifndef LEO_FWHT_OPTIMIZED - -// Reference implementation -static void FWHT(fwht_t* data, const unsigned bits) -{ - const unsigned size = (unsigned)(1UL << bits); - for (unsigned width = 1; width < size; width <<= 1) - for (unsigned i = 0; i < size; i += (width << 1)) - for (unsigned j = i; j < (width + i); ++j) - FWHT_2(data[j], data[j + width]); -} - -#else - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; -} - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) -{ - unsigned x = 0; - fwht_t t0 = data[x]; x += s; - fwht_t t1 = data[x]; x += s; - fwht_t t2 = data[x]; x += s; - fwht_t t3 = data[x]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - unsigned y = 0; - data[y] = t0; y += s; - data[y] = t1; y += s; - data[y] = t2; y += s; - data[y] = t3; -} - -static inline void FWHT_8(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; -} - -static inline void FWHT_16(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - fwht_t t8 = data[8]; - fwht_t t9 = data[9]; - fwht_t t10 = data[10]; - fwht_t t11 = data[11]; - fwht_t t12 = data[12]; - fwht_t t13 = data[13]; - fwht_t t14 = data[14]; - fwht_t t15 = data[15]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t8, t9); - FWHT_2(t10, t11); - FWHT_2(t12, t13); - FWHT_2(t14, t15); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t8, t10); - FWHT_2(t9, t11); - FWHT_2(t12, t14); - FWHT_2(t13, t15); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - FWHT_2(t8, t12); - FWHT_2(t9, t13); - FWHT_2(t10, t14); - FWHT_2(t11, t15); - FWHT_2(t0, t8); - FWHT_2(t1, t9); - FWHT_2(t2, t10); - FWHT_2(t3, t11); - FWHT_2(t4, t12); - FWHT_2(t5, t13); - FWHT_2(t6, t14); - FWHT_2(t7, t15); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; - data[8] = t8; - data[9] = t9; - data[10] = t10; - data[11] = t11; - data[12] = t12; - data[13] = t13; - data[14] = t14; - data[15] = t15; -} - -static void FWHT_SmallData(fwht_t* data, unsigned ldn) -{ - const unsigned n = (1UL << ldn); - - if (n <= 2) - { - if (n == 2) - FWHT_2(data[0], data[1]); - return; - } - - for (unsigned ldm = ldn; ldm > 3; ldm -= 2) - { - unsigned m = (1UL << ldm); - unsigned m4 = (m >> 2); - for (unsigned r = 0; r < n; r += m) - for (unsigned j = 0; j < m4; j++) - FWHT_4(data + j + r, m4); - } - - if (ldn & 1) - { - for (unsigned i0 = 0; i0 < n; i0 += 8) - FWHT_8(data + i0); - } - else - { - for (unsigned i0 = 0; i0 < n; i0 += 4) - FWHT_4(data + i0); - } -} - -// Decimation in time (DIT) version -static void FWHT(fwht_t* data, const unsigned 
ldn) -{ - if (ldn <= 13) - { - FWHT_SmallData(data, ldn); - return; - } - - FWHT_2(data[2], data[3]); - FWHT_4(data + 4); - FWHT_8(data + 8); - FWHT_16(data + 16); - for (unsigned ldm = 5; ldm < ldn; ++ldm) - FWHT(data + (unsigned)(1UL << ldm), ldm); - - for (unsigned ldm = 0; ldm < ldn; ++ldm) - { - const unsigned mh = (1UL << ldm); - for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) - FWHT_2(data[t1], data[t2]); - } -} - -#endif - - -//------------------------------------------------------------------------------ -// Memory Buffer XOR - -static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) -{ - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); - -#if defined(LEO_TARGET_MOBILE) -# if defined(LEO_TRY_NEON) - // Handle multiples of 64 bytes - if (CpuHasNeon) - { - while (bytes >= 64) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 x1 = vld1q_u8(x16 + 1); - LEO_M128 x2 = vld1q_u8(x16 + 2); - LEO_M128 x3 = vld1q_u8(x16 + 3); - LEO_M128 y0 = vld1q_u8(y16); - LEO_M128 y1 = vld1q_u8(y16 + 1); - LEO_M128 y2 = vld1q_u8(y16 + 2); - LEO_M128 y3 = vld1q_u8(y16 + 3); - - vst1q_u8(x16, veorq_u8(x0, y0)); - vst1q_u8(x16 + 1, veorq_u8(x1, y1)); - vst1q_u8(x16 + 2, veorq_u8(x2, y2)); - vst1q_u8(x16 + 3, veorq_u8(x3, y3)); - - bytes -= 64, x16 += 4, y16 += 4; - } - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 y0 = vld1q_u8(y16); - - vst1q_u8(x16, veorq_u8(x0, y0)); - - bytes -= 16, ++x16, ++y16; - } - } - else -# endif // LEO_TRY_NEON - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); - - const unsigned count = (unsigned)bytes / 8; - for (unsigned ii = 0; ii < count; ++ii) - x8[ii] ^= y8[ii]; - - x16 = reinterpret_cast(x8 + count); - y16 = reinterpret_cast(y8 + count); - } -#else // LEO_TARGET_MOBILE -# if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) - { - LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); - const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); - - while (bytes >= 128) - { - LEO_M256 x0 = _mm256_loadu_si256(x32); - LEO_M256 y0 = _mm256_loadu_si256(y32); - x0 = _mm256_xor_si256(x0, y0); - LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); - LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); - x1 = _mm256_xor_si256(x1, y1); - LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); - LEO_M256 y2 = _mm256_loadu_si256(y32 + 2); - x2 = _mm256_xor_si256(x2, y2); - LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); - LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); - x3 = _mm256_xor_si256(x3, y3); - - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - _mm256_storeu_si256(x32 + 2, x2); - _mm256_storeu_si256(x32 + 3, x3); - - bytes -= 128, x32 += 4, y32 += 4; - } - - // Handle multiples of 32 bytes - while (bytes >= 32) - { - // x[i] = x[i] xor y[i] - _mm256_storeu_si256(x32, - _mm256_xor_si256( - _mm256_loadu_si256(x32), - _mm256_loadu_si256(y32))); - - bytes -= 32, ++x32, ++y32; - } - - x16 = reinterpret_cast(x32); - y16 = reinterpret_cast(y32); - } - else -# endif // LEO_TRY_AVX2 - { - while (bytes >= 64) - { - LEO_M128 x0 = _mm_loadu_si128(x16); - LEO_M128 y0 = _mm_loadu_si128(y16); - x0 = _mm_xor_si128(x0, y0); - LEO_M128 x1 = _mm_loadu_si128(x16 + 1); - LEO_M128 y1 = _mm_loadu_si128(y16 + 1); - x1 = _mm_xor_si128(x1, y1); - LEO_M128 x2 = _mm_loadu_si128(x16 + 2); - LEO_M128 y2 = _mm_loadu_si128(y16 + 2); - x2 = _mm_xor_si128(x2, y2); - LEO_M128 x3 = _mm_loadu_si128(x16 + 3); - LEO_M128 y3 = 
_mm_loadu_si128(y16 + 3); - x3 = _mm_xor_si128(x3, y3); - - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); - - bytes -= 64, x16 += 4, y16 += 4; - } - } -#endif // LEO_TARGET_MOBILE - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - // x[i] = x[i] xor y[i] - _mm_storeu_si128(x16, - _mm_xor_si128( - _mm_loadu_si128(x16), - _mm_loadu_si128(y16))); - - bytes -= 16, ++x16, ++y16; - } - - uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); - const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); - - // Handle a block of 8 bytes - const unsigned eight = bytes & 8; - if (eight) - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); - *x8 ^= *y8; - } - - // Handle a block of 4 bytes - const unsigned four = bytes & 4; - if (four) - { - uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); - const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); - *x4 ^= *y4; - } - - // Handle final bytes - const unsigned offset = eight + four; - switch (bytes & 3) - { - case 3: x1[offset + 2] ^= y1[offset + 2]; - case 2: x1[offset + 1] ^= y1[offset + 1]; - case 1: x1[offset] ^= y1[offset]; - default: - break; - } -} - - -//------------------------------------------------------------------------------ -// Formal Derivative - -// Formal derivative of polynomial in the new basis -static void formal_derivative(GFSymbol* cos, const unsigned size) -{ - for (unsigned i = 1; i < size; ++i) - { - const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; - - // If a large number of values are being XORed: - if (leng >= 8) - xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); - else - for (unsigned j = i - leng; j < i; j++) - cos[j] ^= cos[j + leng]; - } - - for (unsigned i = size; i < kFieldSize; i <<= 1) - xor_mem(cos, cos + i, size * sizeof(GFSymbol)); -} - - -//------------------------------------------------------------------------------ -// Fast Fourier Transform - -static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT - -// IFFT in the proposed basis -static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - } - } -} - -// FFT in the proposed basis -static void FLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - } - } -} - - -//------------------------------------------------------------------------------ -// FFT Initialization - -static GFSymbol B[kFieldSize >> 1]; // factors used in 
formal derivative -static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial - -// Initialize skewVec[], B[], log_walsh[] -static void InitFieldOperations() -{ - GFSymbol temp[kGFBits - 1]; - - for (unsigned i = 1; i < kGFBits; ++i) - temp[i - 1] = (GFSymbol)((unsigned)1 << i); - - for (unsigned m = 0; m < (kGFBits - 1); ++m) - { - const unsigned step = (unsigned)1 << (m + 1); - - skewVec[((unsigned)1 << m) - 1] = 0; - - for (unsigned i = m; i < (kGFBits - 1); ++i) - { - const unsigned s = ((unsigned)1 << (i + 1)); - - for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) - skewVec[j + s] = skewVec[j] ^ temp[i]; - } - - temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; - - for (unsigned i = m + 1; i < (kGFBits - 1); ++i) - temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); - } - - for (unsigned i = 0; i < kFieldSize; ++i) - skewVec[i] = GFLog[skewVec[i]]; - - temp[0] = kFieldModulus - temp[0]; - - for (unsigned i = 1; i < (kGFBits - 1); ++i) - temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; - - B[0] = 0; - for (unsigned i = 0; i < (kGFBits - 1); ++i) - { - const unsigned depart = ((unsigned)1 << i); - - for (unsigned j = 0; j < depart; ++j) - B[j + depart] = (B[j] + temp[i]) % kFieldModulus; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh[i] = GFLog[i]; - - log_walsh[0] = 0; - - FWHT(log_walsh, kGFBits); -} - - -//------------------------------------------------------------------------------ -// Encoder - -// Encoding alg for k/n<0.5: message is a power of two -static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) -{ - memcpy(codeword, data, sizeof(GFSymbol) * k); - - IFLT(codeword, k, 0); - - for (unsigned i = k; i < kFieldSize; i += k) - { - memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); - - FLT(&codeword[i], k, i); - } - - memcpy(codeword, data, sizeof(GFSymbol) * k); -} - -// Encoding alg for k/n>0.5: parity is a power of two. -// data: message array. parity: parity array. mem: buffer(size>= n-k) -static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) -{ - const unsigned t = kFieldSize - k; - - memset(parity, 0, sizeof(GFSymbol) * t); - - for (unsigned i = t; i < kFieldSize; i += t) - { - memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); - - IFLT(mem, t, i); - - xor_mem(parity, mem, t * sizeof(GFSymbol)); - } - - FLT(parity, t, 0); -} - - -//------------------------------------------------------------------------------ -// Decoder - -static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) -{ - fwht_t log_walsh2[kFieldSize]; - - // Compute the evaluations of the error locator polynomial - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = erasure[i] ? 1 : 0; - - FWHT(log_walsh2, kGFBits); - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; - - FWHT(log_walsh2, kGFBits); - - // k2 can be replaced with k - const unsigned k2 = kFieldSize; - //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? 
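For orientation, a summary of what decode() is computing around this point; this is a reading of the surrounding code rather than a change to it:

// 1. log_walsh2[] starts as the 0/1 indicator of the erased positions.
// 2. One FWHT, a pointwise multiply by log_walsh[] = FWHT(GFLog) (mod Q), and
//    a second FWHT leave log_walsh2[i] holding, in effect, the discrete log
//    of the error locator polynomial evaluated at field point i.
// 3. The loops below scale each surviving symbol by that evaluation via
//    mulE(), zero the erased positions, and then IFLT + formal_derivative +
//    FLT reconstruct the missing values.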
- - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) - { - codeword[i] = 0; - } - else - { - codeword[i] = mulE(codeword[i], log_walsh2[i]); - } - } - - IFLT(codeword, kFieldSize, 0); - - // formal derivative - for (unsigned i = 0; i < kFieldSize; i += 2) - { - codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); - } - - formal_derivative(codeword, k2); - - for (unsigned i = 0; i < k2; i += 2) - { - codeword[i] = mulE(codeword[i], B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); - } - - FLT(codeword, k2, 0); - - for (unsigned i = 0; i < k2; ++i) - { - if (erasure[i]) - { - codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); - } - } -} - - -//------------------------------------------------------------------------------ -// Test Application - -void test(unsigned k, unsigned seed) -{ - srand(seed); - - //-----------Generating message---------- - - // Message array - GFSymbol data[kFieldSize] = {0}; - - // Filled with random numbers - for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) - data[i] = (GFSymbol)rand(); - - - //---------encoding---------- - - GFSymbol codeword[kFieldSize]; - encodeH(&data[kFieldSize - k], k, data, codeword); - //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? - - memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); - - - //--------erasure simulation--------- - - // Array indicating erasures - bool erasure[kFieldSize] = { - false - }; - - for (unsigned i = k; i < kFieldSize; ++i) - erasure[i] = true; - - // permuting the erasure array - for (unsigned i = kFieldSize - 1; i > 0; --i) - { - unsigned pos = rand() % (i + 1); - - if (i != pos) - { - bool tmp = erasure[i]; - erasure[i] = erasure[pos]; - erasure[pos] = tmp; - } - } - - // erasure codeword symbols - for (unsigned i = 0; i < kFieldSize; ++i) - if (erasure[i]) - codeword[i] = 0; - - - //---------main processing---------- - decode(codeword, k, erasure); - - // Check the correctness of the result - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i] == 1) - { - if (data[i] != codeword[i]) - { - printf("Decoding Error with seed = %d!\n", seed); - LEO_DEBUG_BREAK; - return; - } - } - } - - //printf("Decoding is successful!\n"); -} - - -//------------------------------------------------------------------------------ -// Entrypoint - -int main(int argc, char **argv) -{ - // Initialize architecture-specific code - leo_architecture_init(); - - // Fill GFLog table and GFExp table - InitField(); - - // Compute factors used in erasure decoder - InitFieldOperations(); - - unsigned seed = (unsigned)time(NULL); - for (;;) - { - // test(int k), k: message size - /* - EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, - s.t. the number of recovery pieces is a power of two - */ - test(kFieldSize / 2, seed); - - ++seed; - } - - return 0; -} diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp index 71d22e2..bd5c1cb 100644 --- a/LeopardFF16.cpp +++ b/LeopardFF16.cpp @@ -9,7 +9,7 @@ * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of LHC-RS nor the names of its contributors may be + * Neither the name of Leopard-RS nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
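One identity is worth keeping in mind before the large refactor hunk below: the old mulE() and the new FFEMultiplyLog() are the same log-domain multiply, with one operand kept pre-converted to its logarithm so that each FFT butterfly pays for a single extra table lookup. A minimal sketch using the names introduced in the next hunk:

// a * Exp(log_b) == Exp(Log(a) + log_b) for a != 0; zero is special-cased
// because it has no discrete logarithm.
static ffe_t mul_log(ffe_t a, ffe_t log_b)
{
    if (a == 0)
        return 0;
    return ExpLUT[AddMod(LogLUT[a], log_b)];
}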
@@ -26,494 +26,75 @@ POSSIBILITY OF SUCH DAMAGE. */ +#include "LeopardFF16.h" #include -#include -#include -#include -#include +// Define this to enable the optimized version of FWHT() +#define LEO_FF16_FWHT_OPTIMIZED -/* - TODO: - + Write C API and unit tester - + Limit input to multiples of 64 bytes - + Replace GFSymbol with a file data pointer - + New 16-bit Muladd inner loops - + Class to contain the (large) muladd tables - + Preliminary benchmarks for large data! - + New 8-bit Muladd inner loops - + Benchmarks for smaller data! - + Refactor software - + Pick a name for the software better than LEO_RS - + I think it should be split up into several C++ modules - + Write detailed comments for all the routines - + Look into getting EncodeL working so we can support smaller data (Ask Lin) - + Look into using k instead of k2 to speed up decoder (Ask Lin) - + Avoid performing FFT/IFFT intermediate calculations we're not going to use - + Benchmarks, fun! - + Add multi-threading to split up long parallelizable calculations - + Final benchmarks! - + Finish up documentation - + Release version 1 - - - Muladd implementation notes: - - Specialize for 1-3 rows at a time since often times we're multiplying by - the same (skew) value repeatedly, as the ISA-L library does here: - - https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 - - Except we should be doing it for 16-bit Galois Field. - To implement that use the ALTMAP trick from Jerasure: - - http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 - - Except we should also support AVX2 since that is a 40% perf boost, so put - the high and low bytes 32 bytes instead of 16 bytes apart. - - Also I think we should go ahead and precompute the multiply tables since - it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. 
-*/ +namespace leopard { namespace ff16 { //------------------------------------------------------------------------------ -// Debug +// Datatypes and Constants -// Some bugs only repro in release mode, so this can be helpful -//#define LEO_DEBUG_IN_RELEASE +// Modulus for field operations +static const ffe_t kModulus = 65535; -#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) - #define LEO_DEBUG - #ifdef _WIN32 - #define LEO_DEBUG_BREAK __debugbreak() - #else - #define LEO_DEBUG_BREAK __builtin_trap() - #endif - #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } -#else - #define LEO_DEBUG_BREAK ; - #define LEO_DEBUG_ASSERT(cond) ; -#endif +// LFSR Polynomial that generates the field elements +static const unsigned kPolynomial = 0x1002D; - -//------------------------------------------------------------------------------ -// Platform/Architecture - -#if defined(ANDROID) || defined(IOS) - #define LEO_TARGET_MOBILE -#endif // ANDROID - -#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) - #define LEO_TRY_AVX2 /* 256-bit */ - #include - #define LEO_ALIGN_BYTES 32 -#else // __AVX2__ - #define LEO_ALIGN_BYTES 16 -#endif // __AVX2__ - -#if !defined(LEO_TARGET_MOBILE) - // Note: MSVC currently only supports SSSE3 but not AVX2 - #include // SSSE3: _mm_shuffle_epi8 - #include // SSE2 -#endif // LEO_TARGET_MOBILE - -#if defined(HAVE_ARM_NEON_H) - #include -#endif // HAVE_ARM_NEON_H - -#if defined(LEO_TARGET_MOBILE) - - #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ - -# if defined(HAVE_ARM_NEON_H) - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 uint8x16_t - #define LEO_TRY_NEON -#else - #define LEO_M128 uint64_t -# endif - -#else // LEO_TARGET_MOBILE - - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 __m128i - -#endif // LEO_TARGET_MOBILE - -#ifdef LEO_TRY_AVX2 - // Compiler-specific 256-bit SIMD register keyword - #define LEO_M256 __m256i -#endif - -// Compiler-specific C++11 restrict keyword -#define LEO_RESTRICT __restrict - -// Compiler-specific force inline keyword -#ifdef _MSC_VER - #define LEO_FORCE_INLINE inline __forceinline -#else - #define LEO_FORCE_INLINE inline __attribute__((always_inline)) -#endif - -// Compiler-specific alignment keyword -// Note: Alignment only matters for ARM NEON where it should be 16 -#ifdef _MSC_VER - #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) -#else // _MSC_VER - #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) -#endif // _MSC_VER - - -//------------------------------------------------------------------------------ -// Runtime CPU Architecture Check -// -// Feature checks stolen shamelessly from -// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c - -#if defined(HAVE_ANDROID_GETCPUFEATURES) - #include -#endif - -#if defined(LEO_TRY_NEON) -# if defined(IOS) && defined(__ARM_NEON__) - // Requires iPhone 5S or newer - static const bool CpuHasNeon = true; - static const bool CpuHasNeon64 = true; -# else - // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures - static bool CpuHasNeon = false; // V6 / V7 - static bool CpuHasNeon64 = false; // 64-bit -# endif -#endif - - -#if !defined(LEO_TARGET_MOBILE) - -#ifdef _MSC_VER - #include // __cpuid - #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX -#endif - -#ifdef LEO_TRY_AVX2 -static bool CpuHasAVX2 = false; -#endif -static bool CpuHasSSSE3 = false; - -#define CPUID_EBX_AVX2 
0x00000020 -#define CPUID_ECX_SSSE3 0x00000200 - -static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) -{ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) - __cpuid((int *) cpu_info, cpu_info_type); -#else //if defined(HAVE_CPUID) - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -# ifdef __i386__ - __asm__ __volatile__ ("pushfl; pushfl; " - "popl %0; " - "movl %0, %1; xorl %2, %0; " - "pushl %0; " - "popfl; pushfl; popl %0; popfl" : - "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : - "i" (0x200000)); - if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { - return; /* LCOV_EXCL_LINE */ - } -# endif -# ifdef __i386__ - __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# elif defined(__x86_64__) - __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# else - __asm__ __volatile__ ("cpuid" : - "=a" (cpu_info[0]), "=b" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# endif -#endif -} - -#endif // defined(LEO_TARGET_MOBILE) - - -static void leo_architecture_init() -{ -#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) - AndroidCpuFamily family = android_getCpuFamily(); - if (family == ANDROID_CPU_FAMILY_ARM) - { - if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) - CpuHasNeon = true; - } - else if (family == ANDROID_CPU_FAMILY_ARM64) - { - CpuHasNeon = true; - if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) - CpuHasNeon64 = true; - } -#endif - -#if !defined(LEO_TARGET_MOBILE) - unsigned int cpu_info[4]; - - _cpuid(cpu_info, 1); - CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); - -#if defined(LEO_TRY_AVX2) - _cpuid(cpu_info, 7); - CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); -#endif // LEO_TRY_AVX2 - -#endif // LEO_TARGET_MOBILE -} - - -//------------------------------------------------------------------------------ -// SIMD-Safe Aligned Memory Allocations - -static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; - -LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) -{ - return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); -} - -static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) -{ - uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); - if (!data) - return nullptr; - unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); - data += kAlignmentBytes - offset; - data[-1] = (uint8_t)offset; - return data; -} - -static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) -{ - if (!ptr) - return; - uint8_t* data = (uint8_t*)ptr; - unsigned offset = data[-1]; - if (offset >= kAlignmentBytes) - { - LEO_DEBUG_BREAK; // Should never happen - return; - } - data -= kAlignmentBytes - offset; - free(data); -} - - -//------------------------------------------------------------------------------ -// Field - -//#define LEO_SHORT_FIELD - -#ifdef LEO_SHORT_FIELD -typedef uint8_t GFSymbol; -static const unsigned kGFBits = 8; -static const unsigned kGFPolynomial = 0x11D; -GFSymbol kGFBasis[kGFBits] = { - 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis -}; -#else -typedef uint16_t GFSymbol; -static const unsigned kGFBits = 16; -static const unsigned kGFPolynomial = 0x1002D; -GFSymbol kGFBasis[kGFBits] = { +// Basis used for 
generating logarithm tables +static const ffe_t kBasis[kBits] = { 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis 0xC582, 0xED2E, 0x914C, 0x4012, 0x6C98, 0x10D8, 0x6A72, 0xB900, 0xFDB8, 0xFB34, 0xFF38, 0x991E }; -#endif - -/* - Cantor Basis introduced by: - D. G. Cantor, "On arithmetical algorithms over finite fields", - Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. -*/ - -static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size -static const unsigned kFieldModulus = kFieldSize - 1; - -static GFSymbol GFLog[kFieldSize]; -static GFSymbol GFExp[kFieldSize]; - -// Initialize GFLog[], GFExp[] -static void InitField() -{ - unsigned state = 1; - for (unsigned i = 0; i < kFieldModulus; ++i) - { - GFExp[state] = static_cast(i); - state <<= 1; - if (state >= kFieldSize) - state ^= kGFPolynomial; - } - GFExp[0] = kFieldModulus; - - // Conversion to chosen basis: - - GFLog[0] = 0; - for (unsigned i = 0; i < kGFBits; ++i) - { - const GFSymbol basis = kGFBasis[i]; - const unsigned width = (unsigned)(1UL << i); - - for (unsigned j = 0; j < width; ++j) - GFLog[j + width] = GFLog[j] ^ basis; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - GFLog[i] = GFExp[GFLog[i]]; - - for (unsigned i = 0; i < kFieldSize; ++i) - GFExp[GFLog[i]] = i; - - GFExp[kFieldModulus] = GFExp[0]; -} //------------------------------------------------------------------------------ -// Mod Q Field Operations -// -// Q is the maximum symbol value, e.g. 255 or 65535. +// Field Operations -// z = x + y (mod Q) -static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b) +// z = x + y (mod kModulus) +static inline ffe_t AddMod(const ffe_t a, const ffe_t b) { const unsigned sum = (unsigned)a + b; - // Partial reduction step, allowing for Q to be returned - return static_cast(sum + (sum >> kGFBits)); + // Partial reduction step, allowing for kModulus to be returned + return static_cast(sum + (sum >> kBits)); } -// z = x - y (mod Q) -static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) +// z = x - y (mod kModulus) +static inline ffe_t SubMod(const ffe_t a, const ffe_t b) { const unsigned dif = (unsigned)a - b; - // Partial reduction step, allowing for Q to be returned - return static_cast(dif + (dif >> kGFBits)); -} - -// vx[] += vy[] * z -static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) -{ - for (unsigned i = 0; i < symbolCount; ++i) - { - const GFSymbol a = vy[i]; - if (a == 0) - continue; - - GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); - GFSymbol value1 = GFExp[sum1]; - if ((a & 0x0f) == 0) - { - value1 = 0; - } - GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); - GFSymbol value2 = GFExp[sum2]; - if ((a & 0xf0) == 0) - { - value2 = 0; - } - GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); - GFSymbol value3 = GFExp[sum3]; - if ((a & 0x0f00) == 0) - { - value3 = 0; - } - GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); - GFSymbol value4 = GFExp[sum4]; - if ((a & 0xf000) == 0) - { - value4 = 0; - } - - vx[i] ^= value1; - vx[i] ^= value2; - vx[i] ^= value3; - vx[i] ^= value4; - } -} - -// return a*GFExp[b] over GF(2^r) -static GFSymbol mulE(GFSymbol a, GFSymbol b) -{ - if (a == 0) - return 0; - - const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); - return GFExp[sum]; + // Partial reduction step, allowing for kModulus to be returned + return static_cast(dif + (dif >> kBits)); } //------------------------------------------------------------------------------ -// Fast 
Walsh-Hadamard Transform (FWHT) Mod Q -// -// Q is the maximum symbol value, e.g. 255 or 65535. +// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus) -// Define this to enable the optimized version of FWHT() -#define LEO_FWHT_OPTIMIZED - -typedef GFSymbol fwht_t; +#if defined(LEO_FF16_FWHT_OPTIMIZED) // {a, b} = {a + b, a - b} (Mod Q) -static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) +static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b) { - const fwht_t sum = AddModQ(a, b); - const fwht_t dif = SubModQ(a, b); + const ffe_t sum = AddMod(a, b); + const ffe_t dif = SubMod(a, b); a = sum; b = dif; } -/* - FWHT is a minor slice of the runtime and does not grow with data size, - but I did attempt a few additional optimizations that failed: - - I've attempted to vectorize (with partial reductions) FWHT_4(data, s), - which is 70% of the algorithm, but it was slower. Left in _attic_. - - I've attempted to avoid reductions in all or parts of the FWHT. - The final modular reduction ends up being slower than the savings. - Specifically I tried doing it for the whole FWHT and also I tried - doing it just for the FWHT_2 loop in the main routine, but both - approaches are slower than partial reductions. - - Replacing word reads with wider reads does speed up the operation, but - at too high a complexity cost relative to minor perf improvement. -*/ - -#ifndef LEO_FWHT_OPTIMIZED - -// Reference implementation -static void FWHT(fwht_t* data, const unsigned bits) +static LEO_FORCE_INLINE void FWHT_4(ffe_t* data) { - const unsigned size = (unsigned)(1UL << bits); - for (unsigned width = 1; width < size; width <<= 1) - for (unsigned i = 0; i < size; i += (width << 1)) - for (unsigned j = i; j < (width + i); ++j) - FWHT_2(data[j], data[j + width]); -} - -#else - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; + ffe_t t0 = data[0]; + ffe_t t1 = data[1]; + ffe_t t2 = data[2]; + ffe_t t3 = data[3]; FWHT_2(t0, t1); FWHT_2(t2, t3); FWHT_2(t0, t2); @@ -524,13 +105,13 @@ static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) data[3] = t3; } -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) +static LEO_FORCE_INLINE void FWHT_4(ffe_t* data, unsigned s) { unsigned x = 0; - fwht_t t0 = data[x]; x += s; - fwht_t t1 = data[x]; x += s; - fwht_t t2 = data[x]; x += s; - fwht_t t3 = data[x]; + ffe_t t0 = data[x]; x += s; + ffe_t t1 = data[x]; x += s; + ffe_t t2 = data[x]; x += s; + ffe_t t3 = data[x]; FWHT_2(t0, t1); FWHT_2(t2, t3); FWHT_2(t0, t2); @@ -542,16 +123,16 @@ static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) data[y] = t3; } -static inline void FWHT_8(fwht_t* data) +static inline void FWHT_8(ffe_t* data) { - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; + ffe_t t0 = data[0]; + ffe_t t1 = data[1]; + ffe_t t2 = data[2]; + ffe_t t3 = data[3]; + ffe_t t4 = data[4]; + ffe_t t5 = data[5]; + ffe_t t6 = data[6]; + ffe_t t7 = data[7]; FWHT_2(t0, t1); FWHT_2(t2, t3); FWHT_2(t4, t5); @@ -574,24 +155,24 @@ static inline void FWHT_8(fwht_t* data) data[7] = t7; } -static inline void FWHT_16(fwht_t* data) +static inline void FWHT_16(ffe_t* data) { - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = 
data[7]; - fwht_t t8 = data[8]; - fwht_t t9 = data[9]; - fwht_t t10 = data[10]; - fwht_t t11 = data[11]; - fwht_t t12 = data[12]; - fwht_t t13 = data[13]; - fwht_t t14 = data[14]; - fwht_t t15 = data[15]; + ffe_t t0 = data[0]; + ffe_t t1 = data[1]; + ffe_t t2 = data[2]; + ffe_t t3 = data[3]; + ffe_t t4 = data[4]; + ffe_t t5 = data[5]; + ffe_t t6 = data[6]; + ffe_t t7 = data[7]; + ffe_t t8 = data[8]; + ffe_t t9 = data[9]; + ffe_t t10 = data[10]; + ffe_t t11 = data[11]; + ffe_t t12 = data[12]; + ffe_t t13 = data[13]; + ffe_t t14 = data[14]; + ffe_t t15 = data[15]; FWHT_2(t0, t1); FWHT_2(t2, t3); FWHT_2(t4, t5); @@ -642,7 +223,7 @@ static inline void FWHT_16(fwht_t* data) data[15] = t15; } -static void FWHT_SmallData(fwht_t* data, unsigned ldn) +static void FWHT_SmallData(ffe_t* data, unsigned ldn) { const unsigned n = (1UL << ldn); @@ -675,7 +256,7 @@ static void FWHT_SmallData(fwht_t* data, unsigned ldn) } // Decimation in time (DIT) version -static void FWHT(fwht_t* data, const unsigned ldn) +static void FWHT(ffe_t* data, const unsigned ldn) { if (ldn <= 13) { @@ -698,523 +279,774 @@ static void FWHT(fwht_t* data, const unsigned ldn) } } -#endif +#else // LEO_FF16_FWHT_OPTIMIZED + +// Reference implementation +void FWHT(ffe_t* data, const unsigned bits) +{ + const unsigned size = (unsigned)(1UL << bits); + for (unsigned width = 1; width < size; width <<= 1) + for (unsigned i = 0; i < size; i += (width << 1)) + for (unsigned j = i; j < (width + i); ++j) + FWHT_2(data[j], data[j + width]); +} + +#endif // LEO_FF16_FWHT_OPTIMIZED + +// Transform specialized for the finite field order +void FWHT(ffe_t data[kOrder]) +{ + FWHT(data, kBits); +} //------------------------------------------------------------------------------ -// Memory Buffer XOR +// Logarithm Tables -static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) +static ffe_t LogLUT[kOrder]; +static ffe_t ExpLUT[kOrder]; + + +// Initialize LogLUT[], ExpLUT[] +static void InitializeLogarithmTables() { - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); + // LFSR table generation: -#if defined(LEO_TARGET_MOBILE) -# if defined(LEO_TRY_NEON) - // Handle multiples of 64 bytes - if (CpuHasNeon) + unsigned state = 1; + for (unsigned i = 0; i < kModulus; ++i) { - while (bytes >= 64) + ExpLUT[state] = static_cast(i); + state <<= 1; + if (state >= kOrder) + state ^= kPolynomial; + } + ExpLUT[0] = kModulus; + + // Conversion to chosen basis: + + LogLUT[0] = 0; + for (unsigned i = 0; i < kBits; ++i) + { + const ffe_t basis = kBasis[i]; + const unsigned width = static_cast(1UL << i); + + for (unsigned j = 0; j < width; ++j) + LogLUT[j + width] = LogLUT[j] ^ basis; + } + + for (unsigned i = 0; i < kOrder; ++i) + LogLUT[i] = ExpLUT[LogLUT[i]]; + + for (unsigned i = 0; i < kOrder; ++i) + ExpLUT[LogLUT[i]] = i; + + ExpLUT[kModulus] = ExpLUT[0]; +} + +//------------------------------------------------------------------------------ +// Multiplies + +/* + Muladd implementation notes: + + Specialize for 1-3 rows at a time since often times we're multiplying by + the same (skew) value repeatedly, as the ISA-L library does here: + + https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 + + Except we should be doing it for 16-bit Galois Field. 
+    To implement that use the ALTMAP trick from Jerasure:
+
+    http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140
+
+    Except we should also support AVX2 since that is a 40% perf boost, so put
+    the high and low bytes 32 bytes instead of 16 bytes apart.
+
+    Also I think we should go ahead and precompute the multiply tables since
+    it avoids a bunch of memory lookups for each muladd, and only costs 8 MB.
+*/
+
+// We require memory to be aligned since the SIMD instructions benefit from
+// or require aligned accesses to the table data.
+struct {
+    LEO_ALIGNED LEO_M128 Lo[65536];
+    LEO_ALIGNED LEO_M128 Hi[65536];
+} static Multiply128LUT;
+#if defined(LEO_TRY_AVX2)
+struct {
+    LEO_ALIGNED LEO_M256 Lo[65536];
+    LEO_ALIGNED LEO_M256 Hi[65536];
+} static Multiply256LUT;
+#endif // LEO_TRY_AVX2
+
+// Returns a * b
+static ffe_t FFEMultiply(ffe_t a, ffe_t b)
+{
+    if (a == 0 || b == 0)
+        return 0;
+    return ExpLUT[AddMod(LogLUT[a], LogLUT[b])];
+}
+
+// Returns a * Exp(log_b)
+static ffe_t FFEMultiplyLog(ffe_t a, ffe_t log_b)
+{
+    if (a == 0)
+        return 0;
+    return ExpLUT[AddMod(LogLUT[a], log_b)];
+}
+
+bool InitializeMultiplyTables()
+{
+    for (int y = 0; y < 256; ++y)
+    {
+        // FIXME: Byte tables cannot hold 16-bit products; this still needs
+        // the ALTMAP layout described in the notes above.
+        uint8_t lo[16], hi[16];
+        for (unsigned char x = 0; x < 16; ++x)
         {
-            LEO_M128 x0 = vld1q_u8(x16);
-            LEO_M128 x1 = vld1q_u8(x16 + 1);
-            LEO_M128 x2 = vld1q_u8(x16 + 2);
-            LEO_M128 x3 = vld1q_u8(x16 + 3);
-            LEO_M128 y0 = vld1q_u8(y16);
-            LEO_M128 y1 = vld1q_u8(y16 + 1);
-            LEO_M128 y2 = vld1q_u8(y16 + 2);
-            LEO_M128 y3 = vld1q_u8(y16 + 3);
-
-            vst1q_u8(x16, veorq_u8(x0, y0));
-            vst1q_u8(x16 + 1, veorq_u8(x1, y1));
-            vst1q_u8(x16 + 2, veorq_u8(x2, y2));
-            vst1q_u8(x16 + 3, veorq_u8(x3, y3));
-
-            bytes -= 64, x16 += 4, y16 += 4;
+            lo[x] = FFEMultiply(x, static_cast<ffe_t>(y));
+            hi[x] = FFEMultiply(x << 4, static_cast<ffe_t>(y));
         }
 
-        // Handle multiples of 16 bytes
-        while (bytes >= 16)
+        const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo);
+        const LEO_M128 table_hi = _mm_loadu_si128((LEO_M128*)hi);
+
+        _mm_storeu_si128(Multiply128LUT.Lo + y, table_lo);
+        _mm_storeu_si128(Multiply128LUT.Hi + y, table_hi);
+
+#if defined(LEO_TRY_AVX2)
+        if (CpuHasAVX2)
         {
-            LEO_M128 x0 = vld1q_u8(x16);
-            LEO_M128 y0 = vld1q_u8(y16);
-
-            vst1q_u8(x16, veorq_u8(x0, y0));
-
-            bytes -= 16, ++x16, ++y16;
+            _mm256_storeu_si256(Multiply256LUT.Lo + y,
+                _mm256_broadcastsi128_si256(table_lo));
+            _mm256_storeu_si256(Multiply256LUT.Hi + y,
+                _mm256_broadcastsi128_si256(table_hi));
         }
+#endif // LEO_TRY_AVX2
     }
-    else
-# endif // LEO_TRY_NEON
+
+    return true;
+}
+
+// vx[] = vy[] * m
+void mul_mem_set(
+    void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy,
+    ffe_t m, uint64_t bytes)
+{
+    if (m <= 1)
     {
-        uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
-        const uint64_t * LEO_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
-
-        const unsigned count = (unsigned)bytes / 8;
-        for (unsigned ii = 0; ii < count; ++ii)
-            x8[ii] ^= y8[ii];
-
-        x16 = reinterpret_cast<LEO_M128 *>(x8 + count);
-        y16 = reinterpret_cast<const LEO_M128 *>(y8 + count);
+        if (m == 1)
+            memcpy(vx, vy, bytes);
+        else
+            memset(vx, 0, bytes);
+        return;
     }
-#else // LEO_TARGET_MOBILE
-# if defined(LEO_TRY_AVX2)
+
+#if defined(LEO_TRY_AVX2)
    if (CpuHasAVX2)
    {
-        LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast<LEO_M256 *>(x16);
-        const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast<const LEO_M256 *>(y16);
+        const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m);
+        const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m);
 
-        while (bytes >= 128)
+        const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f);
+
+        LEO_M256 * LEO_RESTRICT z32 = reinterpret_cast<LEO_M256 *>(vx);
+        const LEO_M256 * LEO_RESTRICT x32 =
reinterpret_cast(vy); + + const unsigned count = bytes / 64; + for (unsigned i = 0; i < count; ++i) { - LEO_M256 x0 = _mm256_loadu_si256(x32); - LEO_M256 y0 = _mm256_loadu_si256(y32); - x0 = _mm256_xor_si256(x0, y0); - LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); - LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); - x1 = _mm256_xor_si256(x1, y1); - LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); - LEO_M256 y2 = _mm256_loadu_si256(y32 + 2); - x2 = _mm256_xor_si256(x2, y2); - LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); - LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); - x3 = _mm256_xor_si256(x3, y3); + LEO_M256 x0 = _mm256_loadu_si256(x32 + i * 2); + LEO_M256 l0 = _mm256_and_si256(x0, clr_mask); + x0 = _mm256_srli_epi64(x0, 4); + LEO_M256 h0 = _mm256_and_si256(x0, clr_mask); + l0 = _mm256_shuffle_epi8(table_lo_y, l0); + h0 = _mm256_shuffle_epi8(table_hi_y, h0); + _mm256_storeu_si256(z32 + i * 2, _mm256_xor_si256(l0, h0)); - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - _mm256_storeu_si256(x32 + 2, x2); - _mm256_storeu_si256(x32 + 3, x3); - - bytes -= 128, x32 += 4, y32 += 4; + LEO_M256 x1 = _mm256_loadu_si256(x32 + i * 2 + 1); + LEO_M256 l1 = _mm256_and_si256(x1, clr_mask); + x1 = _mm256_srli_epi64(x1, 4); + LEO_M256 h1 = _mm256_and_si256(x1, clr_mask); + l1 = _mm256_shuffle_epi8(table_lo_y, l1); + h1 = _mm256_shuffle_epi8(table_hi_y, h1); + _mm256_storeu_si256(z32 + i * 2 + 1, _mm256_xor_si256(l1, h1)); } + return; + } +#endif // LEO_TRY_AVX2 - // Handle multiples of 32 bytes - while (bytes >= 32) + const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m); + const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m); + + const LEO_M128 clr_mask = _mm_set1_epi8(0x0f); + + LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast (vx); + const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); + + do + { + LEO_M128 x3 = _mm_loadu_si128(y16 + 3); + LEO_M128 l3 = _mm_and_si128(x3, clr_mask); + x3 = _mm_srli_epi64(x3, 4); + LEO_M128 h3 = _mm_and_si128(x3, clr_mask); + l3 = _mm_shuffle_epi8(table_lo_y, l3); + h3 = _mm_shuffle_epi8(table_hi_y, h3); + + LEO_M128 x2 = _mm_loadu_si128(y16 + 2); + LEO_M128 l2 = _mm_and_si128(x2, clr_mask); + x2 = _mm_srli_epi64(x2, 4); + LEO_M128 h2 = _mm_and_si128(x2, clr_mask); + l2 = _mm_shuffle_epi8(table_lo_y, l2); + h2 = _mm_shuffle_epi8(table_hi_y, h2); + + LEO_M128 x1 = _mm_loadu_si128(y16 + 1); + LEO_M128 l1 = _mm_and_si128(x1, clr_mask); + x1 = _mm_srli_epi64(x1, 4); + LEO_M128 h1 = _mm_and_si128(x1, clr_mask); + l1 = _mm_shuffle_epi8(table_lo_y, l1); + h1 = _mm_shuffle_epi8(table_hi_y, h1); + + LEO_M128 x0 = _mm_loadu_si128(y16); + LEO_M128 l0 = _mm_and_si128(x0, clr_mask); + x0 = _mm_srli_epi64(x0, 4); + LEO_M128 h0 = _mm_and_si128(x0, clr_mask); + l0 = _mm_shuffle_epi8(table_lo_y, l0); + h0 = _mm_shuffle_epi8(table_hi_y, h0); + + _mm_storeu_si128(x16 + 3, _mm_xor_si128(l3, h3)); + _mm_storeu_si128(x16 + 2, _mm_xor_si128(l2, h2)); + _mm_storeu_si128(x16 + 1, _mm_xor_si128(l1, h1)); + _mm_storeu_si128(x16, _mm_xor_si128(l0, h0)); + + x16 += 4, y16 += 4; + bytes -= 64; + } while (bytes > 0); +} + +// vx0[] *= m, vx1[] *= m +void mul_mem2_inplace( + void * LEO_RESTRICT vx_0, + void * LEO_RESTRICT vx_1, + ffe_t m, uint64_t bytes) +{ + if (m <= 1) + { + if (m == 0) { - // x[i] = x[i] xor y[i] - _mm256_storeu_si256(x32, - _mm256_xor_si256( - _mm256_loadu_si256(x32), - _mm256_loadu_si256(y32))); - - bytes -= 32, ++x32, ++y32; + memset(vx_0, 0, bytes); + memset(vx_1, 0, bytes); } - - x16 = reinterpret_cast(x32); - y16 = reinterpret_cast(y32); + return; } 
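// A minimal scalar sketch (not part of this patch) of the nibble-table
// technique the SSSE3/AVX2 loops in mul_mem_set() implement with
// _mm_shuffle_epi8. Shown over the 8-bit field (kGFPolynomial = 0x11D,
// the LEO_SHORT_FIELD configuration) for clarity; the 16-bit variant
// needs the ALTMAP byte layout described in the muladd notes. All names
// below (InitTables8, Multiply8, mul_mem_set_ref) are hypothetical.

#include <cstdint>

static uint8_t ExpTab8[512], LogTab8[256];

static void InitTables8()
{
    unsigned state = 1;
    for (unsigned i = 0; i < 255; ++i)
    {
        ExpTab8[i] = static_cast<uint8_t>(state);
        LogTab8[state] = static_cast<uint8_t>(i);
        state <<= 1;
        if (state >= 256)
            state ^= 0x11D; // kGFPolynomial for the 8-bit field
    }
    for (unsigned i = 255; i < 512; ++i)
        ExpTab8[i] = ExpTab8[i - 255]; // wrap so Multiply8 needs no modulo
}

static uint8_t Multiply8(uint8_t a, uint8_t b)
{
    if (a == 0 || b == 0)
        return 0;
    return ExpTab8[LogTab8[a] + LogTab8[b]];
}

// out[i] = in[i] * m: the scalar equivalent of one shuffle-based lane.
static void mul_mem_set_ref(uint8_t* out, const uint8_t* in, uint8_t m, unsigned bytes)
{
    // The same two 16-entry tables the SIMD code keeps in registers:
    uint8_t lo[16], hi[16];
    for (unsigned x = 0; x < 16; ++x)
    {
        lo[x] = Multiply8(static_cast<uint8_t>(x), m);      // low-nibble products
        hi[x] = Multiply8(static_cast<uint8_t>(x << 4), m); // high-nibble products
    }

    // Multiplication distributes over XOR in GF(2^q), so
    // in[i] * m == (in[i] & 0x0f) * m ^ (in[i] & 0xf0) * m.
    for (unsigned i = 0; i < bytes; ++i)
        out[i] = lo[in[i] & 0x0f] ^ hi[in[i] >> 4];
}

// Two 16-entry tables fit in a single SIMD register each, which is what
// makes the shuffle-based lookup possible: mul_mem_set_ref() agrees with
// Multiply8(in[i], m) applied byte-by-byte.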
- else -# endif // LEO_TRY_AVX2 + +#if defined(LEO_TRY_AVX2) + if (CpuHasAVX2) { - while (bytes >= 64) + const LEO_M256 table_lo_y = _mm256_loadu_si256(Multiply256LUT.Lo + m); + const LEO_M256 table_hi_y = _mm256_loadu_si256(Multiply256LUT.Hi + m); + + const LEO_M256 clr_mask = _mm256_set1_epi8(0x0f); + + LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast(vx_0); + LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast(vx_1); + + do { - LEO_M128 x0 = _mm_loadu_si128(x16); - LEO_M128 y0 = _mm_loadu_si128(y16); - x0 = _mm_xor_si128(x0, y0); - LEO_M128 x1 = _mm_loadu_si128(x16 + 1); - LEO_M128 y1 = _mm_loadu_si128(y16 + 1); - x1 = _mm_xor_si128(x1, y1); - LEO_M128 x2 = _mm_loadu_si128(x16 + 2); - LEO_M128 y2 = _mm_loadu_si128(y16 + 2); - x2 = _mm_xor_si128(x2, y2); - LEO_M128 x3 = _mm_loadu_si128(x16 + 3); - LEO_M128 y3 = _mm_loadu_si128(y16 + 3); - x3 = _mm_xor_si128(x3, y3); + LEO_M256 x0_0 = _mm256_loadu_si256(x32_0 + 1); + LEO_M256 l0_0 = _mm256_and_si256(x0_0, clr_mask); + x0_0 = _mm256_srli_epi64(x0_0, 4); + LEO_M256 h0_0 = _mm256_and_si256(x0_0, clr_mask); + l0_0 = _mm256_shuffle_epi8(table_lo_y, l0_0); + h0_0 = _mm256_shuffle_epi8(table_hi_y, h0_0); + l0_0 = _mm256_xor_si256(l0_0, h0_0); - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); + LEO_M256 x1_0 = _mm256_loadu_si256(x32_0); + LEO_M256 l1_0 = _mm256_and_si256(x1_0, clr_mask); + x1_0 = _mm256_srli_epi64(x1_0, 4); + LEO_M256 h1_0 = _mm256_and_si256(x1_0, clr_mask); + l1_0 = _mm256_shuffle_epi8(table_lo_y, l1_0); + h1_0 = _mm256_shuffle_epi8(table_hi_y, h1_0); + l1_0 = _mm256_xor_si256(l1_0, h1_0); - bytes -= 64, x16 += 4, y16 += 4; - } + LEO_M256 x0_1 = _mm256_loadu_si256(x32_1 + 1); + LEO_M256 l0_1 = _mm256_and_si256(x0_1, clr_mask); + x0_1 = _mm256_srli_epi64(x0_1, 4); + LEO_M256 h0_1 = _mm256_and_si256(x0_1, clr_mask); + l0_1 = _mm256_shuffle_epi8(table_lo_y, l0_1); + h0_1 = _mm256_shuffle_epi8(table_hi_y, h0_1); + l0_1 = _mm256_xor_si256(l0_1, h0_1); + + LEO_M256 x1_1 = _mm256_loadu_si256(x32_1); + LEO_M256 l1_1 = _mm256_and_si256(x1_1, clr_mask); + x1_1 = _mm256_srli_epi64(x1_1, 4); + LEO_M256 h1_1 = _mm256_and_si256(x1_1, clr_mask); + l1_1 = _mm256_shuffle_epi8(table_lo_y, l1_1); + h1_1 = _mm256_shuffle_epi8(table_hi_y, h1_1); + l1_1 = _mm256_xor_si256(l1_1, h1_1); + + _mm256_storeu_si256(x32_0 + 1, l0_0); + _mm256_storeu_si256(x32_0, l1_0); + _mm256_storeu_si256(x32_1 + 1, l0_1); + _mm256_storeu_si256(x32_1, l1_1); + + x32_0 += 2; + x32_1 += 2; + bytes -= 64; + } while (bytes > 0); + return; } -#endif // LEO_TARGET_MOBILE +#endif // LEO_TRY_AVX2 - // Handle multiples of 16 bytes - while (bytes >= 16) + const LEO_M128 table_lo_y = _mm_loadu_si128(Multiply128LUT.Lo + m); + const LEO_M128 table_hi_y = _mm_loadu_si128(Multiply128LUT.Hi + m); + + const LEO_M128 clr_mask = _mm_set1_epi8(0x0f); + + LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast(vx_0); + LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast(vx_1); + + do { - // x[i] = x[i] xor y[i] - _mm_storeu_si128(x16, - _mm_xor_si128( - _mm_loadu_si128(x16), - _mm_loadu_si128(y16))); + LEO_M128 x3 = _mm_loadu_si128(x16_0 + 3); + LEO_M128 l3 = _mm_and_si128(x3, clr_mask); + x3 = _mm_srli_epi64(x3, 4); + LEO_M128 h3 = _mm_and_si128(x3, clr_mask); + l3 = _mm_shuffle_epi8(table_lo_y, l3); + h3 = _mm_shuffle_epi8(table_hi_y, h3); - bytes -= 16, ++x16, ++y16; - } + LEO_M128 x2 = _mm_loadu_si128(x16_0 + 2); + LEO_M128 l2 = _mm_and_si128(x2, clr_mask); + x2 = _mm_srli_epi64(x2, 4); + LEO_M128 h2 = _mm_and_si128(x2, 
clr_mask); + l2 = _mm_shuffle_epi8(table_lo_y, l2); + h2 = _mm_shuffle_epi8(table_hi_y, h2); - uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); - const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); + LEO_M128 x1 = _mm_loadu_si128(x16_0 + 1); + LEO_M128 l1 = _mm_and_si128(x1, clr_mask); + x1 = _mm_srli_epi64(x1, 4); + LEO_M128 h1 = _mm_and_si128(x1, clr_mask); + l1 = _mm_shuffle_epi8(table_lo_y, l1); + h1 = _mm_shuffle_epi8(table_hi_y, h1); - // Handle a block of 8 bytes - const unsigned eight = bytes & 8; - if (eight) - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); - *x8 ^= *y8; - } + LEO_M128 x0 = _mm_loadu_si128(x16_0); + LEO_M128 l0 = _mm_and_si128(x0, clr_mask); + x0 = _mm_srli_epi64(x0, 4); + LEO_M128 h0 = _mm_and_si128(x0, clr_mask); + l0 = _mm_shuffle_epi8(table_lo_y, l0); + h0 = _mm_shuffle_epi8(table_hi_y, h0); - // Handle a block of 4 bytes - const unsigned four = bytes & 4; - if (four) - { - uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); - const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); - *x4 ^= *y4; - } + _mm_storeu_si128(x16_0 + 3, _mm_xor_si128(l3, h3)); + _mm_storeu_si128(x16_0 + 2, _mm_xor_si128(l2, h2)); + _mm_storeu_si128(x16_0 + 1, _mm_xor_si128(l1, h1)); + _mm_storeu_si128(x16_0, _mm_xor_si128(l0, h0)); - // Handle final bytes - const unsigned offset = eight + four; - switch (bytes & 3) - { - case 3: x1[offset + 2] ^= y1[offset + 2]; - case 2: x1[offset + 1] ^= y1[offset + 1]; - case 1: x1[offset] ^= y1[offset]; - default: - break; - } + // FIXME: Add second one here + + x16_0 += 4; + x16_1 += 4; + bytes -= 64; + } while (bytes > 0); } //------------------------------------------------------------------------------ -// Formal Derivative +// FFT Operations -// Formal derivative of polynomial in the new basis -static void formal_derivative(GFSymbol* cos, const unsigned size) +// x[] ^= y[] * m, y[] ^= x[] +void fft_butterfly( + void * LEO_RESTRICT x, void * LEO_RESTRICT y, + ffe_t m, uint64_t bytes) { - for (unsigned i = 1; i < size; ++i) - { - const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; - // If a large number of values are being XORed: - if (leng >= 8) - xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); - else - for (unsigned j = i - leng; j < i; j++) - cos[j] ^= cos[j + leng]; - } +} + +// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] +void fft_butterfly2( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + ffe_t m, uint64_t bytes) +{ + +} + +// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] +void fft_butterfly3( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, + ffe_t m, uint64_t bytes) +{ - for (unsigned i = size; i < kFieldSize; i <<= 1) - xor_mem(cos, cos + i, size * sizeof(GFSymbol)); } //------------------------------------------------------------------------------ -// Fast Fourier Transform +// IFFT Operations -static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT - -// IFFT in the proposed basis -static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) +// y[] ^= x[], x[] ^= y[] * m +void ifft_butterfly( + void * LEO_RESTRICT x, void * LEO_RESTRICT y, + ffe_t m, uint64_t bytes) { - for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - // If a large number of 
values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - } - } } -// FFT in the proposed basis -static void FLT(GFSymbol* data, const unsigned size, const unsigned index) +// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m +void ifft_butterfly2( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + ffe_t m, uint64_t bytes) { - for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - const GFSymbol skew = skewVec[j + index - 1]; - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); +} + +// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m +void ifft_butterfly3( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, + ffe_t m, uint64_t bytes) +{ - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - } - } } //------------------------------------------------------------------------------ -// FFT Initialization +// FFT -static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative -static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial +static ffe_t FFTSkew[kFieldModulus]; // twisted factors used in FFT +static ffe_t LogWalsh[kOrder]; // factors used in the evaluation of the error locator polynomial -// Initialize skewVec[], B[], log_walsh[] -static void InitFieldOperations() +void FFTInitialize() { - GFSymbol temp[kGFBits - 1]; + ffe_t temp[kBits - 1]; - for (unsigned i = 1; i < kGFBits; ++i) - temp[i - 1] = (GFSymbol)((unsigned)1 << i); + for (unsigned i = 1; i < kBits; ++i) + temp[i - 1] = (ffe_t)((unsigned)1 << i); - for (unsigned m = 0; m < (kGFBits - 1); ++m) + for (unsigned m = 0; m < (kBits - 1); ++m) { const unsigned step = (unsigned)1 << (m + 1); - skewVec[((unsigned)1 << m) - 1] = 0; + FFTSkew[((unsigned)1 << m) - 1] = 0; - for (unsigned i = m; i < (kGFBits - 1); ++i) + for (unsigned i = m; i < (kBits - 1); ++i) { const unsigned s = ((unsigned)1 << (i + 1)); for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) - skewVec[j + s] = skewVec[j] ^ temp[i]; + FFTSkew[j + s] = FFTSkew[j] ^ temp[i]; } - temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; + // TBD: This can be cleaned up + temp[m] = kFieldModulus - LogLUT[FFEMultiply(temp[m], temp[m] ^ 1)]; - for (unsigned i = m + 1; i < (kGFBits - 1); ++i) - temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); + for (unsigned i = m + 1; i < (kBits - 1); ++i) + temp[i] = FFEMultiplyLog(temp[i], (LogLUT[temp[i] ^ 1] + temp[m]) % kFieldModulus); } - for (unsigned i = 0; i < kFieldSize; ++i) - skewVec[i] = GFLog[skewVec[i]]; + for (unsigned i = 0; i < kOrder; ++i) + FFTSkew[i] = LogLUT[FFTSkew[i]]; temp[0] = kFieldModulus - temp[0]; - for (unsigned i = 1; i < (kGFBits - 1); ++i) + for (unsigned i = 1; i < (kBits - 1); ++i) temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; - B[0] = 
0; - for (unsigned i = 0; i < (kGFBits - 1); ++i) - { - const unsigned depart = ((unsigned)1 << i); + for (unsigned i = 0; i < kOrder; ++i) + LogWalsh[i] = LogLUT[i]; - for (unsigned j = 0; j < depart; ++j) - B[j + depart] = (B[j] + temp[i]) % kFieldModulus; - } + LogWalsh[0] = 0; - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh[i] = GFLog[i]; - - log_walsh[0] = 0; - - FWHT(log_walsh, kGFBits); + FWHT(LogWalsh, kBits); } //------------------------------------------------------------------------------ -// Encoder +// Encode -// Encoding alg for k/n<0.5: message is a power of two -static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) +void Encode( + uint64_t buffer_bytes, + unsigned original_count, + unsigned recovery_count, + unsigned m, + void* const * const data, + void** work) { - memcpy(codeword, data, sizeof(GFSymbol) * k); + // work <- data - IFLT(codeword, k, 0); + // FIXME: Unroll first loop to eliminate this + for (unsigned i = 0; i < m; ++i) + memcpy(work[i], data[i], buffer_bytes); - for (unsigned i = k; i < kFieldSize; i += k) + // work <- IFFT(data, m, m) + + for (unsigned width = 1; width < m; width <<= 1) { - memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); - - FLT(&codeword[i], k, i); - } - - memcpy(codeword, data, sizeof(GFSymbol) * k); -} - -// Encoding alg for k/n>0.5: parity is a power of two. -// data: message array. parity: parity array. mem: buffer(size>= n-k) -static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) -{ - const unsigned t = kFieldSize - k; - - memset(parity, 0, sizeof(GFSymbol) * t); - - for (unsigned i = t; i < kFieldSize; i += t) - { - memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); - - IFLT(mem, t, i); - - xor_mem(parity, mem, t * sizeof(GFSymbol)); - } - - FLT(parity, t, 0); -} - - -//------------------------------------------------------------------------------ -// Decoder - -static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) -{ - fwht_t log_walsh2[kFieldSize]; - - // Compute the evaluations of the error locator polynomial - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = erasure[i] ? 1 : 0; - - FWHT(log_walsh2, kGFBits); - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; - - FWHT(log_walsh2, kGFBits); - - // k2 can be replaced with k - const unsigned k2 = kFieldSize; - //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? 
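// A hedged sketch of what the IFFT pass being assembled here performs.
// Per the contracts in LeopardFF16.h, ifft_butterfly(x, y, skew) applies
// y ^= x and then x ^= y * Exp(skew); when skew equals kFieldModulus (the
// log-of-zero sentinel) the multiply is skipped, which is why the loop
// falls back to plain xor_mem(). Unrolled for m = 4 with index offset m,
// using skew index j + m - 1 at each pair start j, it performs exactly:
//
//     width = 1:  ifft_butterfly(work[0], work[1], FFTSkew[4], buffer_bytes);
//                 ifft_butterfly(work[2], work[3], FFTSkew[6], buffer_bytes);
//     width = 2:  ifft_butterfly(work[0], work[2], FFTSkew[5], buffer_bytes);
//                 ifft_butterfly(work[1], work[3], FFTSkew[5], buffer_bytes);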
- - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) + for (unsigned j = width; j < m; j += (width << 1)) { - codeword[i] = 0; - } - else - { - codeword[i] = mulE(codeword[i], log_walsh2[i]); - } - } + const ffe_t skew = FFTSkew[j + m - 1]; - IFLT(codeword, kFieldSize, 0); - - // formal derivative - for (unsigned i = 0; i < kFieldSize; i += 2) - { - codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); - } - - formal_derivative(codeword, k2); - - for (unsigned i = 0; i < k2; i += 2) - { - codeword[i] = mulE(codeword[i], B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); - } - - FLT(codeword, k2, 0); - - for (unsigned i = 0; i < k2; ++i) - { - if (erasure[i]) - { - codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); - } - } -} - - -//------------------------------------------------------------------------------ -// Test Application - -void test(unsigned k, unsigned seed) -{ - srand(seed); - - //-----------Generating message---------- - - // Message array - GFSymbol data[kFieldSize] = {0}; - - // Filled with random numbers - for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) - data[i] = (GFSymbol)rand(); - - - //---------encoding---------- - - GFSymbol codeword[kFieldSize]; - encodeH(&data[kFieldSize - k], k, data, codeword); - //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? - - memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); - - - //--------erasure simulation--------- - - // Array indicating erasures - bool erasure[kFieldSize] = { - false - }; - - for (unsigned i = k; i < kFieldSize; ++i) - erasure[i] = true; - - // permuting the erasure array - for (unsigned i = kFieldSize - 1; i > 0; --i) - { - unsigned pos = rand() % (i + 1); - - if (i != pos) - { - bool tmp = erasure[i]; - erasure[i] = erasure[pos]; - erasure[pos] = tmp; - } - } - - // erasure codeword symbols - for (unsigned i = 0; i < kFieldSize; ++i) - if (erasure[i]) - codeword[i] = 0; - - - //---------main processing---------- - decode(codeword, k, erasure); - - // Check the correctness of the result - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i] == 1) - { - if (data[i] != codeword[i]) + if (skew != kFieldModulus) { - printf("Decoding Error with seed = %d!\n", seed); - LEO_DEBUG_BREAK; - return; + for (unsigned i = j - width; i < j; ++i) + ifft_butterfly(work[i], work[i + width], skew, buffer_bytes); + } + else + { + for (unsigned i = j - width; i < j; ++i) + xor_mem(work[i + width], work[i], buffer_bytes); } } } - //printf("Decoding is successful!\n"); + for (unsigned i = m; i + m <= original_count; i += m) + { + // temp <- data + i + + void** temp = work + m; + + // FIXME: Unroll first loop to eliminate this + for (unsigned j = 0; j < m; ++j) + memcpy(temp[j], data[j], buffer_bytes); + + // temp <- IFFT(temp, m, m + i) + + for (unsigned width = 1; width < m; width <<= 1) + { + for (unsigned j = width; j < m; j += (width << 1)) + { + const ffe_t skew = FFTSkew[j + m + i - 1]; + + if (skew != kFieldModulus) + { + for (unsigned k = j - width; k < j; ++k) + ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes); + } + else + { + for (unsigned k = j - width; k < j; ++k) + xor_mem(temp[k + width], temp[k], buffer_bytes); + } + } + } + + // work <- work XOR temp + + // FIXME: Unroll last loop to eliminate this + for (unsigned j = 0; j < m; ++j) + xor_mem(work[j], temp[j], buffer_bytes); + } + + const unsigned last_count = original_count 
% m;
+    if (last_count != 0)
+    {
+        const unsigned i = original_count - last_count;
+
+        // temp <- data + i
+
+        void** temp = work + m;
+
+        for (unsigned j = 0; j < last_count; ++j)
+            memcpy(temp[j], data[i + j], buffer_bytes);
+        for (unsigned j = last_count; j < m; ++j)
+            memset(temp[j], 0, buffer_bytes);
+
+        // temp <- IFFT(temp, m, m + i)
+
+        for (unsigned width = 1, shift = 1; width < m; width <<= 1, ++shift)
+        {
+            // Calculate the stop offset, since the data to the right is all zeroes
+            const unsigned stop = ((last_count + width - 1) >> shift) << shift;
+
+            for (unsigned j = width; j < stop; j += (width << 1))
+            {
+                const ffe_t skew = FFTSkew[j + m + i - 1];
+
+                if (skew != kFieldModulus)
+                {
+                    for (unsigned k = j - width; k < j; ++k)
+                        ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes);
+                }
+                else
+                {
+                    for (unsigned k = j - width; k < j; ++k)
+                        xor_mem(temp[k + width], temp[k], buffer_bytes);
+                }
+            }
+        }
+
+        // work <- work XOR temp
+
+        // FIXME: Unroll last loop to eliminate this
+        for (unsigned j = 0; j < m; ++j)
+            xor_mem(work[j], temp[j], buffer_bytes);
+    }
+
+    // work <- FFT(work, m, 0)
+
+    for (unsigned width = (m >> 1); width > 0; width >>= 1)
+    {
+        const ffe_t* skewLUT = FFTSkew + width - 1;
+        const unsigned range = width << 1;
+
+        for (unsigned j = 0; j < m; j += range)
+        {
+            const ffe_t skew = skewLUT[j];
+
+            if (skew != kFieldModulus)
+            {
+                for (unsigned k = j, count = j + width; k < count; ++k)
+                    fft_butterfly(work[k], work[k + width], skew, buffer_bytes);
+            }
+            else
+            {
+                for (unsigned k = j, count = j + width; k < count; ++k)
+                    xor_mem(work[k + width], work[k], buffer_bytes);
+            }
+        }
+    }
 }
 
 
 //------------------------------------------------------------------------------
-// Entrypoint
+// Decode
 
-int main(int argc, char **argv)
+void Decode(
+    uint64_t buffer_bytes,
+    unsigned original_count,
+    unsigned recovery_count,
+    unsigned m, // NextPow2(recovery_count)
+    unsigned n, // NextPow2(m + original_count) = work_count
+    void* const * const original, // original_count entries
+    void* const * const recovery, // recovery_count entries
+    void** work) // n entries
 {
-    // Initialize architecture-specific code
-    leo_architecture_init();
+    // Fill in error locations
 
-    // Fill GFLog table and GFExp table
-    InitField();
+    ffe_t ErrorLocations[kOrder];
+    for (unsigned i = 0; i < recovery_count; ++i)
+        ErrorLocations[i] = recovery[i] ? 0 : 1;
+    for (unsigned i = recovery_count; i < m; ++i)
+        ErrorLocations[i] = 1;
+    for (unsigned i = 0; i < original_count; ++i)
+        ErrorLocations[i + m] = original[i] ? 0 : 1;
+    memset(ErrorLocations + m + original_count, 0, (n - original_count - m) * sizeof(ffe_t));
 
-    // Compute factors used in erasure decoder
-    InitFieldOperations();
+    // Evaluate error locator polynomial
 
-    unsigned seed = (unsigned)time(NULL);
-    for (;;)
+    FWHT(ErrorLocations, kBits);
+
+    for (unsigned i = 0; i < kOrder; ++i)
+        ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kFieldModulus;
+
+    FWHT(ErrorLocations, kBits);
+
+    // work <- recovery data
+
+    for (unsigned i = 0; i < recovery_count; ++i)
    {
-        // test(int k), k: message size
-        /*
-            EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc,
-            s.t. 
the number of recovery pieces is a power of two - */ - test(kFieldSize / 2, seed); + if (recovery[i]) + mul_mem_set(work[i], recovery[i], ErrorLocations[i], buffer_bytes); + else + memset(work[i], 0, buffer_bytes); + } + for (unsigned i = recovery_count; i < m; ++i) + memset(work[i], 0, buffer_bytes); - ++seed; + // work <- original data + + for (unsigned i = 0; i < original_count; ++i) + { + if (original[i]) + mul_mem_set(work[m + i], original[i], ErrorLocations[m + i], buffer_bytes); + else + memset(work[m + i], 0, buffer_bytes); + } + for (unsigned i = m + original_count; i < n; ++i) + memset(work[i], 0, buffer_bytes); + + // work <- IFFT(work, n, 0) + + for (unsigned width = 1; width < n; width <<= 1) + { + for (unsigned j = width; j < n; j += (width << 1)) + { + const ffe_t skew = FFTSkew[j - 1]; + + if (skew != kFieldModulus) + { + for (unsigned i = j - width; i < j; ++i) + ifft_butterfly(work[i], work[i + width], skew, buffer_bytes); + } + else + { + for (unsigned i = j - width; i < j; ++i) + xor_mem(work[i + width], work[i], buffer_bytes); + } + } } - return 0; + // work <- FormalDerivative(work, n) + + for (unsigned i = 1; i < n; ++i) + { + const unsigned width = ((i ^ (i - 1)) + 1) >> 1; + + // If a large number of values are being XORed: + for (unsigned j = i - width; j < i; ++j) + xor_mem(work[j], work[j + width], buffer_bytes); + } + + // work <- FFT(work, n, 0) truncated to m + original_count + + const unsigned output_count = m + original_count; + for (unsigned width = (n >> 1); width > 0; width >>= 1) + { + const ffe_t* skewLUT = FFTSkew + width - 1; + const unsigned range = width << 1; + + for (unsigned j = (m < range) ? 0 : m; j < output_count; j += range) + { + const ffe_t skew = skewLUT[j]; + + if (skew != kFieldModulus) + { + for (unsigned i = j; i < j + width; ++i) + fft_butterfly(work[i], work[i + width], skew, buffer_bytes); + } + else + { + for (unsigned i = j; i < j + width; ++i) + xor_mem(work[i + width], work[i], buffer_bytes); + } + } + } + + // Reveal erasures + + for (unsigned i = 0; i < original_count; ++i) + if (!original[i]) + mul_mem_set(work[i], work[i + m], kFieldModulus - ErrorLocations[i], buffer_bytes); } + + +//------------------------------------------------------------------------------ +// API + +static bool IsInitialized = false; + +bool Initialize() +{ + if (IsInitialized) + return true; + + if (!CpuHasSSSE3) + return false; + + InitializeLogarithmTables(); + FFTInitialize(); + + IsInitialized = true; + return true; +} + + +}} // namespace leopard::ff16 diff --git a/LeopardFF16.h b/LeopardFF16.h index 71d22e2..981b9a9 100644 --- a/LeopardFF16.h +++ b/LeopardFF16.h @@ -9,7 +9,7 @@ * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of LHC-RS nor the names of its contributors may be + * Neither the name of Leopard-RS nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. @@ -26,1195 +26,133 @@ POSSIBILITY OF SUCH DAMAGE. */ -#include -#include -#include -#include -#include +#pragma once +#include "LeopardCommon.h" /* - TODO: - + Write C API and unit tester - + Limit input to multiples of 64 bytes - + Replace GFSymbol with a file data pointer - + New 16-bit Muladd inner loops - + Class to contain the (large) muladd tables - + Preliminary benchmarks for large data! 
- + New 8-bit Muladd inner loops - + Benchmarks for smaller data! - + Refactor software - + Pick a name for the software better than LEO_RS - + I think it should be split up into several C++ modules - + Write detailed comments for all the routines - + Look into getting EncodeL working so we can support smaller data (Ask Lin) - + Look into using k instead of k2 to speed up decoder (Ask Lin) - + Avoid performing FFT/IFFT intermediate calculations we're not going to use - + Benchmarks, fun! - + Add multi-threading to split up long parallelizable calculations - + Final benchmarks! - + Finish up documentation - + Release version 1 + 16-bit Finite Field Math - - Muladd implementation notes: - - Specialize for 1-3 rows at a time since often times we're multiplying by - the same (skew) value repeatedly, as the ISA-L library does here: - - https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258 - - Except we should be doing it for 16-bit Galois Field. - To implement that use the ALTMAP trick from Jerasure: - - http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140 - - Except we should also support AVX2 since that is a 40% perf boost, so put - the high and low bytes 32 bytes instead of 16 bytes apart. - - Also I think we should go ahead and precompute the multiply tables since - it avoids a bunch of memory lookups for each muladd, and only costs 8 MB. + This finite field contains 65536 elements and so each element is one byte. + This library is designed for data that is a multiple of 64 bytes in size. */ - -//------------------------------------------------------------------------------ -// Debug - -// Some bugs only repro in release mode, so this can be helpful -//#define LEO_DEBUG_IN_RELEASE - -#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) - #define LEO_DEBUG - #ifdef _WIN32 - #define LEO_DEBUG_BREAK __debugbreak() - #else - #define LEO_DEBUG_BREAK __builtin_trap() - #endif - #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } -#else - #define LEO_DEBUG_BREAK ; - #define LEO_DEBUG_ASSERT(cond) ; -#endif +namespace leopard { namespace ff16 { //------------------------------------------------------------------------------ -// Platform/Architecture +// Datatypes and Constants -#if defined(ANDROID) || defined(IOS) - #define LEO_TARGET_MOBILE -#endif // ANDROID +// Finite field element type +typedef uint16_t ffe_t; -#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) - #define LEO_TRY_AVX2 /* 256-bit */ - #include - #define LEO_ALIGN_BYTES 32 -#else // __AVX2__ - #define LEO_ALIGN_BYTES 16 -#endif // __AVX2__ +// Number of bits per element +static const unsigned kBits = 16; -#if !defined(LEO_TARGET_MOBILE) - // Note: MSVC currently only supports SSSE3 but not AVX2 - #include // SSSE3: _mm_shuffle_epi8 - #include // SSE2 -#endif // LEO_TARGET_MOBILE - -#if defined(HAVE_ARM_NEON_H) - #include -#endif // HAVE_ARM_NEON_H - -#if defined(LEO_TARGET_MOBILE) - - #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */ - -# if defined(HAVE_ARM_NEON_H) - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 uint8x16_t - #define LEO_TRY_NEON -#else - #define LEO_M128 uint64_t -# endif - -#else // LEO_TARGET_MOBILE - - // Compiler-specific 128-bit SIMD register keyword - #define LEO_M128 __m128i - -#endif // LEO_TARGET_MOBILE - -#ifdef LEO_TRY_AVX2 - // Compiler-specific 256-bit SIMD register keyword - #define LEO_M256 __m256i -#endif - -// Compiler-specific C++11 restrict 
keyword -#define LEO_RESTRICT __restrict - -// Compiler-specific force inline keyword -#ifdef _MSC_VER - #define LEO_FORCE_INLINE inline __forceinline -#else - #define LEO_FORCE_INLINE inline __attribute__((always_inline)) -#endif - -// Compiler-specific alignment keyword -// Note: Alignment only matters for ARM NEON where it should be 16 -#ifdef _MSC_VER - #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES)) -#else // _MSC_VER - #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES))) -#endif // _MSC_VER +// Finite field order: Number of elements in the field +static const unsigned kOrder = 65536; //------------------------------------------------------------------------------ -// Runtime CPU Architecture Check -// -// Feature checks stolen shamelessly from -// https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c +// Fast Walsh-Hadamard Transform (FWHT) (mod kModulus) -#if defined(HAVE_ANDROID_GETCPUFEATURES) - #include -#endif +// Transform for a variable number of bits (up to kOrder) +void FWHT(ffe_t* data, const unsigned bits); -#if defined(LEO_TRY_NEON) -# if defined(IOS) && defined(__ARM_NEON__) - // Requires iPhone 5S or newer - static const bool CpuHasNeon = true; - static const bool CpuHasNeon64 = true; -# else - // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures - static bool CpuHasNeon = false; // V6 / V7 - static bool CpuHasNeon64 = false; // 64-bit -# endif -#endif - - -#if !defined(LEO_TARGET_MOBILE) - -#ifdef _MSC_VER - #include // __cpuid - #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX -#endif - -#ifdef LEO_TRY_AVX2 -static bool CpuHasAVX2 = false; -#endif -static bool CpuHasSSSE3 = false; - -#define CPUID_EBX_AVX2 0x00000020 -#define CPUID_ECX_SSSE3 0x00000200 - -static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) -{ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) - __cpuid((int *) cpu_info, cpu_info_type); -#else //if defined(HAVE_CPUID) - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -# ifdef __i386__ - __asm__ __volatile__ ("pushfl; pushfl; " - "popl %0; " - "movl %0, %1; xorl %2, %0; " - "pushl %0; " - "popfl; pushfl; popl %0; popfl" : - "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : - "i" (0x200000)); - if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { - return; /* LCOV_EXCL_LINE */ - } -# endif -# ifdef __i386__ - __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# elif defined(__x86_64__) - __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : - "=a" (cpu_info[0]), "=&r" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# else - __asm__ __volatile__ ("cpuid" : - "=a" (cpu_info[0]), "=b" (cpu_info[1]), - "=c" (cpu_info[2]), "=d" (cpu_info[3]) : - "0" (cpu_info_type), "2" (0U)); -# endif -#endif -} - -#endif // defined(LEO_TARGET_MOBILE) - - -static void leo_architecture_init() -{ -#if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) - AndroidCpuFamily family = android_getCpuFamily(); - if (family == ANDROID_CPU_FAMILY_ARM) - { - if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) - CpuHasNeon = true; - } - else if (family == ANDROID_CPU_FAMILY_ARM64) - { - CpuHasNeon = true; - if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) - CpuHasNeon64 = true; - } -#endif - -#if 
!defined(LEO_TARGET_MOBILE) - unsigned int cpu_info[4]; - - _cpuid(cpu_info, 1); - CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); - -#if defined(LEO_TRY_AVX2) - _cpuid(cpu_info, 7); - CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); -#endif // LEO_TRY_AVX2 - -#endif // LEO_TARGET_MOBILE -} +// Transform specialized for the finite field order +void FWHT(ffe_t data[kOrder]); //------------------------------------------------------------------------------ -// SIMD-Safe Aligned Memory Allocations +// Multiplies -static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES; +// x[] = y[] * m +void mul_mem_set( + void * LEO_RESTRICT x, const void * LEO_RESTRICT y, + ffe_t m, uint64_t bytes); -LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset) -{ - return (offset + kAlignmentBytes - 1) & ~(kAlignmentBytes - 1); -} - -static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size) -{ - uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size); - if (!data) - return nullptr; - unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes); - data += kAlignmentBytes - offset; - data[-1] = (uint8_t)offset; - return data; -} - -static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr) -{ - if (!ptr) - return; - uint8_t* data = (uint8_t*)ptr; - unsigned offset = data[-1]; - if (offset >= kAlignmentBytes) - { - LEO_DEBUG_BREAK; // Should never happen - return; - } - data -= kAlignmentBytes - offset; - free(data); -} +// For i = {0, 1}: x_i[] *= m +void mul_mem2_inplace( + void * LEO_RESTRICT x_0, + void * LEO_RESTRICT x_1, + ffe_t m, uint64_t bytes); //------------------------------------------------------------------------------ -// Field +// FFT Operations -//#define LEO_SHORT_FIELD +// x[] ^= y[] * m, y[] ^= x[] +void fft_butterfly( + void * LEO_RESTRICT x, void * LEO_RESTRICT y, + ffe_t m, uint64_t bytes); -#ifdef LEO_SHORT_FIELD -typedef uint8_t GFSymbol; -static const unsigned kGFBits = 8; -static const unsigned kGFPolynomial = 0x11D; -GFSymbol kGFBasis[kGFBits] = { - 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis -}; -#else -typedef uint16_t GFSymbol; -static const unsigned kGFBits = 16; -static const unsigned kGFPolynomial = 0x1002D; -GFSymbol kGFBasis[kGFBits] = { - 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis - 0xC582, 0xED2E, 0x914C, 0x4012, - 0x6C98, 0x10D8, 0x6A72, 0xB900, - 0xFDB8, 0xFB34, 0xFF38, 0x991E -}; -#endif +// For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] +void fft_butterfly2( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + ffe_t m, uint64_t bytes); -/* - Cantor Basis introduced by: - D. G. Cantor, "On arithmetical algorithms over finite fields", - Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. 
-*/ - -static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size -static const unsigned kFieldModulus = kFieldSize - 1; - -static GFSymbol GFLog[kFieldSize]; -static GFSymbol GFExp[kFieldSize]; - -// Initialize GFLog[], GFExp[] -static void InitField() -{ - unsigned state = 1; - for (unsigned i = 0; i < kFieldModulus; ++i) - { - GFExp[state] = static_cast(i); - state <<= 1; - if (state >= kFieldSize) - state ^= kGFPolynomial; - } - GFExp[0] = kFieldModulus; - - // Conversion to chosen basis: - - GFLog[0] = 0; - for (unsigned i = 0; i < kGFBits; ++i) - { - const GFSymbol basis = kGFBasis[i]; - const unsigned width = (unsigned)(1UL << i); - - for (unsigned j = 0; j < width; ++j) - GFLog[j + width] = GFLog[j] ^ basis; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - GFLog[i] = GFExp[GFLog[i]]; - - for (unsigned i = 0; i < kFieldSize; ++i) - GFExp[GFLog[i]] = i; - - GFExp[kFieldModulus] = GFExp[0]; -} +// For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] +void fft_butterfly3( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, + ffe_t m, uint64_t bytes); //------------------------------------------------------------------------------ -// Mod Q Field Operations -// -// Q is the maximum symbol value, e.g. 255 or 65535. +// IFFT Operations -// z = x + y (mod Q) -static inline GFSymbol AddModQ(GFSymbol a, GFSymbol b) -{ - const unsigned sum = (unsigned)a + b; +// y[] ^= x[], x[] ^= y[] * m +void ifft_butterfly( + void * LEO_RESTRICT x, void * LEO_RESTRICT y, + ffe_t m, uint64_t bytes); - // Partial reduction step, allowing for Q to be returned - return static_cast(sum + (sum >> kGFBits)); -} +// For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m +void ifft_butterfly2( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + ffe_t m, uint64_t bytes); -// z = x - y (mod Q) -static inline GFSymbol SubModQ(GFSymbol a, GFSymbol b) -{ - const unsigned dif = (unsigned)a - b; - - // Partial reduction step, allowing for Q to be returned - return static_cast(dif + (dif >> kGFBits)); -} - -// vx[] += vy[] * z -static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) -{ - for (unsigned i = 0; i < symbolCount; ++i) - { - const GFSymbol a = vy[i]; - if (a == 0) - continue; - - GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); - GFSymbol value1 = GFExp[sum1]; - if ((a & 0x0f) == 0) - { - value1 = 0; - } - GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); - GFSymbol value2 = GFExp[sum2]; - if ((a & 0xf0) == 0) - { - value2 = 0; - } - GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); - GFSymbol value3 = GFExp[sum3]; - if ((a & 0x0f00) == 0) - { - value3 = 0; - } - GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); - GFSymbol value4 = GFExp[sum4]; - if ((a & 0xf000) == 0) - { - value4 = 0; - } - - vx[i] ^= value1; - vx[i] ^= value2; - vx[i] ^= value3; - vx[i] ^= value4; - } -} - -// return a*GFExp[b] over GF(2^r) -static GFSymbol mulE(GFSymbol a, GFSymbol b) -{ - if (a == 0) - return 0; - - const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); - return GFExp[sum]; -} +// For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m +void ifft_butterfly3( + void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, + void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, + void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, + ffe_t m, uint64_t bytes); 
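// A reference sketch (not part of the public header) of the butterfly
// contracts declared above, written out for a single scalar symbol.
// FFEMultiplyLog(a, log_m) = a * Exp(log_m) is assumed visible here for
// illustration only; in this codebase it is a static helper inside
// LeopardFF16.cpp.
//
// static inline void fft_butterfly_ref(ffe_t& x, ffe_t& y, ffe_t log_m)
// {
//     x ^= FFEMultiplyLog(y, log_m); // x' = x + y * m
//     y ^= x;                        // y' = y + x'
// }
//
// static inline void ifft_butterfly_ref(ffe_t& x, ffe_t& y, ffe_t log_m)
// {
//     y ^= x;                        // y' + x' = y
//     x ^= FFEMultiplyLog(y, log_m); // x' + y * m = x
// }
//
// The pair is a round trip: applying ifft_butterfly_ref after
// fft_butterfly_ref restores the original (x, y), since adding y * m a
// second time cancels it over GF(2^16).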
//------------------------------------------------------------------------------ -// Fast Walsh-Hadamard Transform (FWHT) Mod Q -// -// Q is the maximum symbol value, e.g. 255 or 65535. +// Encode -// Define this to enable the optimized version of FWHT() -#define LEO_FWHT_OPTIMIZED - -typedef GFSymbol fwht_t; - -// {a, b} = {a + b, a - b} (Mod Q) -static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) -{ - const fwht_t sum = AddModQ(a, b); - const fwht_t dif = SubModQ(a, b); - a = sum; - b = dif; -} - -/* - FWHT is a minor slice of the runtime and does not grow with data size, - but I did attempt a few additional optimizations that failed: - - I've attempted to vectorize (with partial reductions) FWHT_4(data, s), - which is 70% of the algorithm, but it was slower. Left in _attic_. - - I've attempted to avoid reductions in all or parts of the FWHT. - The final modular reduction ends up being slower than the savings. - Specifically I tried doing it for the whole FWHT and also I tried - doing it just for the FWHT_2 loop in the main routine, but both - approaches are slower than partial reductions. - - Replacing word reads with wider reads does speed up the operation, but - at too high a complexity cost relative to minor perf improvement. -*/ - -#ifndef LEO_FWHT_OPTIMIZED - -// Reference implementation -static void FWHT(fwht_t* data, const unsigned bits) -{ - const unsigned size = (unsigned)(1UL << bits); - for (unsigned width = 1; width < size; width <<= 1) - for (unsigned i = 0; i < size; i += (width << 1)) - for (unsigned j = i; j < (width + i); ++j) - FWHT_2(data[j], data[j + width]); -} - -#else - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; -} - -static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) -{ - unsigned x = 0; - fwht_t t0 = data[x]; x += s; - fwht_t t1 = data[x]; x += s; - fwht_t t2 = data[x]; x += s; - fwht_t t3 = data[x]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - unsigned y = 0; - data[y] = t0; y += s; - data[y] = t1; y += s; - data[y] = t2; y += s; - data[y] = t3; -} - -static inline void FWHT_8(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; -} - -static inline void FWHT_16(fwht_t* data) -{ - fwht_t t0 = data[0]; - fwht_t t1 = data[1]; - fwht_t t2 = data[2]; - fwht_t t3 = data[3]; - fwht_t t4 = data[4]; - fwht_t t5 = data[5]; - fwht_t t6 = data[6]; - fwht_t t7 = data[7]; - fwht_t t8 = data[8]; - fwht_t t9 = data[9]; - fwht_t t10 = data[10]; - fwht_t t11 = data[11]; - fwht_t t12 = data[12]; - fwht_t t13 = data[13]; - fwht_t t14 = data[14]; - fwht_t t15 = data[15]; - FWHT_2(t0, t1); - FWHT_2(t2, t3); - FWHT_2(t4, t5); - FWHT_2(t6, t7); - FWHT_2(t8, t9); - FWHT_2(t10, t11); - FWHT_2(t12, t13); - FWHT_2(t14, t15); - FWHT_2(t0, t2); - FWHT_2(t1, t3); - FWHT_2(t4, t6); - FWHT_2(t5, t7); - 
FWHT_2(t8, t10); - FWHT_2(t9, t11); - FWHT_2(t12, t14); - FWHT_2(t13, t15); - FWHT_2(t0, t4); - FWHT_2(t1, t5); - FWHT_2(t2, t6); - FWHT_2(t3, t7); - FWHT_2(t8, t12); - FWHT_2(t9, t13); - FWHT_2(t10, t14); - FWHT_2(t11, t15); - FWHT_2(t0, t8); - FWHT_2(t1, t9); - FWHT_2(t2, t10); - FWHT_2(t3, t11); - FWHT_2(t4, t12); - FWHT_2(t5, t13); - FWHT_2(t6, t14); - FWHT_2(t7, t15); - data[0] = t0; - data[1] = t1; - data[2] = t2; - data[3] = t3; - data[4] = t4; - data[5] = t5; - data[6] = t6; - data[7] = t7; - data[8] = t8; - data[9] = t9; - data[10] = t10; - data[11] = t11; - data[12] = t12; - data[13] = t13; - data[14] = t14; - data[15] = t15; -} - -static void FWHT_SmallData(fwht_t* data, unsigned ldn) -{ - const unsigned n = (1UL << ldn); - - if (n <= 2) - { - if (n == 2) - FWHT_2(data[0], data[1]); - return; - } - - for (unsigned ldm = ldn; ldm > 3; ldm -= 2) - { - unsigned m = (1UL << ldm); - unsigned m4 = (m >> 2); - for (unsigned r = 0; r < n; r += m) - for (unsigned j = 0; j < m4; j++) - FWHT_4(data + j + r, m4); - } - - if (ldn & 1) - { - for (unsigned i0 = 0; i0 < n; i0 += 8) - FWHT_8(data + i0); - } - else - { - for (unsigned i0 = 0; i0 < n; i0 += 4) - FWHT_4(data + i0); - } -} - -// Decimation in time (DIT) version -static void FWHT(fwht_t* data, const unsigned ldn) -{ - if (ldn <= 13) - { - FWHT_SmallData(data, ldn); - return; - } - - FWHT_2(data[2], data[3]); - FWHT_4(data + 4); - FWHT_8(data + 8); - FWHT_16(data + 16); - for (unsigned ldm = 5; ldm < ldn; ++ldm) - FWHT(data + (unsigned)(1UL << ldm), ldm); - - for (unsigned ldm = 0; ldm < ldn; ++ldm) - { - const unsigned mh = (1UL << ldm); - for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) - FWHT_2(data[t1], data[t2]); - } -} - -#endif +void Encode( + uint64_t buffer_bytes, + unsigned original_count, + unsigned recovery_count, + unsigned m, // = NextPow2(recovery_count) * 2 = work_count + void* const * const data, + void** work); // Size of GetEncodeWorkCount() //------------------------------------------------------------------------------ -// Memory Buffer XOR +// Decode -static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) -{ - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); - -#if defined(LEO_TARGET_MOBILE) -# if defined(LEO_TRY_NEON) - // Handle multiples of 64 bytes - if (CpuHasNeon) - { - while (bytes >= 64) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 x1 = vld1q_u8(x16 + 1); - LEO_M128 x2 = vld1q_u8(x16 + 2); - LEO_M128 x3 = vld1q_u8(x16 + 3); - LEO_M128 y0 = vld1q_u8(y16); - LEO_M128 y1 = vld1q_u8(y16 + 1); - LEO_M128 y2 = vld1q_u8(y16 + 2); - LEO_M128 y3 = vld1q_u8(y16 + 3); - - vst1q_u8(x16, veorq_u8(x0, y0)); - vst1q_u8(x16 + 1, veorq_u8(x1, y1)); - vst1q_u8(x16 + 2, veorq_u8(x2, y2)); - vst1q_u8(x16 + 3, veorq_u8(x3, y3)); - - bytes -= 64, x16 += 4, y16 += 4; - } - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - LEO_M128 x0 = vld1q_u8(x16); - LEO_M128 y0 = vld1q_u8(y16); - - vst1q_u8(x16, veorq_u8(x0, y0)); - - bytes -= 16, ++x16, ++y16; - } - } - else -# endif // LEO_TRY_NEON - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); - - const unsigned count = (unsigned)bytes / 8; - for (unsigned ii = 0; ii < count; ++ii) - x8[ii] ^= y8[ii]; - - x16 = reinterpret_cast(x8 + count); - y16 = reinterpret_cast(y8 + count); - } -#else // LEO_TARGET_MOBILE -# if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) - { - LEO_M256 * LEO_RESTRICT x32 = 
reinterpret_cast(x16); - const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); - - while (bytes >= 128) - { - LEO_M256 x0 = _mm256_loadu_si256(x32); - LEO_M256 y0 = _mm256_loadu_si256(y32); - x0 = _mm256_xor_si256(x0, y0); - LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); - LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); - x1 = _mm256_xor_si256(x1, y1); - LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); - LEO_M256 y2 = _mm256_loadu_si256(y32 + 2); - x2 = _mm256_xor_si256(x2, y2); - LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); - LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); - x3 = _mm256_xor_si256(x3, y3); - - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - _mm256_storeu_si256(x32 + 2, x2); - _mm256_storeu_si256(x32 + 3, x3); - - bytes -= 128, x32 += 4, y32 += 4; - } - - // Handle multiples of 32 bytes - while (bytes >= 32) - { - // x[i] = x[i] xor y[i] - _mm256_storeu_si256(x32, - _mm256_xor_si256( - _mm256_loadu_si256(x32), - _mm256_loadu_si256(y32))); - - bytes -= 32, ++x32, ++y32; - } - - x16 = reinterpret_cast(x32); - y16 = reinterpret_cast(y32); - } - else -# endif // LEO_TRY_AVX2 - { - while (bytes >= 64) - { - LEO_M128 x0 = _mm_loadu_si128(x16); - LEO_M128 y0 = _mm_loadu_si128(y16); - x0 = _mm_xor_si128(x0, y0); - LEO_M128 x1 = _mm_loadu_si128(x16 + 1); - LEO_M128 y1 = _mm_loadu_si128(y16 + 1); - x1 = _mm_xor_si128(x1, y1); - LEO_M128 x2 = _mm_loadu_si128(x16 + 2); - LEO_M128 y2 = _mm_loadu_si128(y16 + 2); - x2 = _mm_xor_si128(x2, y2); - LEO_M128 x3 = _mm_loadu_si128(x16 + 3); - LEO_M128 y3 = _mm_loadu_si128(y16 + 3); - x3 = _mm_xor_si128(x3, y3); - - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); - - bytes -= 64, x16 += 4, y16 += 4; - } - } -#endif // LEO_TARGET_MOBILE - - // Handle multiples of 16 bytes - while (bytes >= 16) - { - // x[i] = x[i] xor y[i] - _mm_storeu_si128(x16, - _mm_xor_si128( - _mm_loadu_si128(x16), - _mm_loadu_si128(y16))); - - bytes -= 16, ++x16, ++y16; - } - - uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); - const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); - - // Handle a block of 8 bytes - const unsigned eight = bytes & 8; - if (eight) - { - uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); - const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); - *x8 ^= *y8; - } - - // Handle a block of 4 bytes - const unsigned four = bytes & 4; - if (four) - { - uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); - const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); - *x4 ^= *y4; - } - - // Handle final bytes - const unsigned offset = eight + four; - switch (bytes & 3) - { - case 3: x1[offset + 2] ^= y1[offset + 2]; - case 2: x1[offset + 1] ^= y1[offset + 1]; - case 1: x1[offset] ^= y1[offset]; - default: - break; - } -} +void Decode( + uint64_t buffer_bytes, + unsigned original_count, + unsigned recovery_count, + unsigned m, // = NextPow2(recovery_count) + unsigned n, // = NextPow2(m + original_count) = work_count + void* const * const original, // original_count entries + void* const * const recovery, // recovery_count entries + void** work); // n entries //------------------------------------------------------------------------------ -// Formal Derivative +// API -// Formal derivative of polynomial in the new basis -static void formal_derivative(GFSymbol* cos, const unsigned size) -{ - for (unsigned i = 1; i < size; ++i) - { - const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; +// Returns false if the self-test fails +bool Initialize(); - // If a large 
number of values are being XORed: - if (leng >= 8) - xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); - else - for (unsigned j = i - leng; j < i; j++) - cos[j] ^= cos[j + leng]; - } - for (unsigned i = size; i < kFieldSize; i <<= 1) - xor_mem(cos, cos + i, size * sizeof(GFSymbol)); -} - - -//------------------------------------------------------------------------------ -// Fast Fourier Transform - -static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT - -// IFFT in the proposed basis -static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - } - } -} - -// FFT in the proposed basis -static void FLT(GFSymbol* data, const unsigned size, const unsigned index) -{ - for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) - { - for (unsigned j = depart_no; j < size; j += (depart_no << 1)) - { - const GFSymbol skew = skewVec[j + index - 1]; - - if (skew != kFieldModulus) - muladd_mem(data + j - depart_no, data + j, skew, depart_no); - - // If a large number of values are being XORed: - if (depart_no >= 8) - xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); - else - for (unsigned i = j - depart_no; i < j; ++i) - data[i + depart_no] ^= data[i]; - } - } -} - - -//------------------------------------------------------------------------------ -// FFT Initialization - -static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative -static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial - -// Initialize skewVec[], B[], log_walsh[] -static void InitFieldOperations() -{ - GFSymbol temp[kGFBits - 1]; - - for (unsigned i = 1; i < kGFBits; ++i) - temp[i - 1] = (GFSymbol)((unsigned)1 << i); - - for (unsigned m = 0; m < (kGFBits - 1); ++m) - { - const unsigned step = (unsigned)1 << (m + 1); - - skewVec[((unsigned)1 << m) - 1] = 0; - - for (unsigned i = m; i < (kGFBits - 1); ++i) - { - const unsigned s = ((unsigned)1 << (i + 1)); - - for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) - skewVec[j + s] = skewVec[j] ^ temp[i]; - } - - temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; - - for (unsigned i = m + 1; i < (kGFBits - 1); ++i) - temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); - } - - for (unsigned i = 0; i < kFieldSize; ++i) - skewVec[i] = GFLog[skewVec[i]]; - - temp[0] = kFieldModulus - temp[0]; - - for (unsigned i = 1; i < (kGFBits - 1); ++i) - temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; - - B[0] = 0; - for (unsigned i = 0; i < (kGFBits - 1); ++i) - { - const unsigned depart = ((unsigned)1 << i); - - for (unsigned j = 0; j < depart; ++j) - B[j + depart] = (B[j] + temp[i]) % kFieldModulus; - } - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh[i] = GFLog[i]; - - log_walsh[0] = 0; - - FWHT(log_walsh, kGFBits); -} - - -//------------------------------------------------------------------------------ -// Encoder - -// Encoding alg for k/n<0.5: message is a power of two -static void 
encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) -{ - memcpy(codeword, data, sizeof(GFSymbol) * k); - - IFLT(codeword, k, 0); - - for (unsigned i = k; i < kFieldSize; i += k) - { - memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); - - FLT(&codeword[i], k, i); - } - - memcpy(codeword, data, sizeof(GFSymbol) * k); -} - -// Encoding alg for k/n>0.5: parity is a power of two. -// data: message array. parity: parity array. mem: buffer(size>= n-k) -static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) -{ - const unsigned t = kFieldSize - k; - - memset(parity, 0, sizeof(GFSymbol) * t); - - for (unsigned i = t; i < kFieldSize; i += t) - { - memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); - - IFLT(mem, t, i); - - xor_mem(parity, mem, t * sizeof(GFSymbol)); - } - - FLT(parity, t, 0); -} - - -//------------------------------------------------------------------------------ -// Decoder - -static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) -{ - fwht_t log_walsh2[kFieldSize]; - - // Compute the evaluations of the error locator polynomial - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = erasure[i] ? 1 : 0; - - FWHT(log_walsh2, kGFBits); - - for (unsigned i = 0; i < kFieldSize; ++i) - log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; - - FWHT(log_walsh2, kGFBits); - - // k2 can be replaced with k - const unsigned k2 = kFieldSize; - //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? - - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i]) - { - codeword[i] = 0; - } - else - { - codeword[i] = mulE(codeword[i], log_walsh2[i]); - } - } - - IFLT(codeword, kFieldSize, 0); - - // formal derivative - for (unsigned i = 0; i < kFieldSize; i += 2) - { - codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); - } - - formal_derivative(codeword, k2); - - for (unsigned i = 0; i < k2; i += 2) - { - codeword[i] = mulE(codeword[i], B[i >> 1]); - codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); - } - - FLT(codeword, k2, 0); - - for (unsigned i = 0; i < k2; ++i) - { - if (erasure[i]) - { - codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); - } - } -} - - -//------------------------------------------------------------------------------ -// Test Application - -void test(unsigned k, unsigned seed) -{ - srand(seed); - - //-----------Generating message---------- - - // Message array - GFSymbol data[kFieldSize] = {0}; - - // Filled with random numbers - for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) - data[i] = (GFSymbol)rand(); - - - //---------encoding---------- - - GFSymbol codeword[kFieldSize]; - encodeH(&data[kFieldSize - k], k, data, codeword); - //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? 
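// A note on the encodeH() call above (a reading of the code, not part of the
// original): the k message symbols occupy the top of data[], the t = kFieldSize - k
// parity symbols are written into the bottom of data[], and codeword[] is only
// scratch at this point. Because the IFFT is linear, encodeH() XOR-accumulates
// IFLT(block, offset) over the message taken t symbols at a time, then applies
// a single FLT to the sum to produce the parity.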
- - memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); - - - //--------erasure simulation--------- - - // Array indicating erasures - bool erasure[kFieldSize] = { - false - }; - - for (unsigned i = k; i < kFieldSize; ++i) - erasure[i] = true; - - // permuting the erasure array - for (unsigned i = kFieldSize - 1; i > 0; --i) - { - unsigned pos = rand() % (i + 1); - - if (i != pos) - { - bool tmp = erasure[i]; - erasure[i] = erasure[pos]; - erasure[pos] = tmp; - } - } - - // erasure codeword symbols - for (unsigned i = 0; i < kFieldSize; ++i) - if (erasure[i]) - codeword[i] = 0; - - - //---------main processing---------- - decode(codeword, k, erasure); - - // Check the correctness of the result - for (unsigned i = 0; i < kFieldSize; ++i) - { - if (erasure[i] == 1) - { - if (data[i] != codeword[i]) - { - printf("Decoding Error with seed = %d!\n", seed); - LEO_DEBUG_BREAK; - return; - } - } - } - - //printf("Decoding is successful!\n"); -} - - -//------------------------------------------------------------------------------ -// Entrypoint - -int main(int argc, char **argv) -{ - // Initialize architecture-specific code - leo_architecture_init(); - - // Fill GFLog table and GFExp table - InitField(); - - // Compute factors used in erasure decoder - InitFieldOperations(); - - unsigned seed = (unsigned)time(NULL); - for (;;) - { - // test(int k), k: message size - /* - EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, - s.t. the number of recovery pieces is a power of two - */ - test(kFieldSize / 2, seed); - - ++seed; - } - - return 0; -} +}} // namespace leopard::ff16 diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp index 030a555..1e7d7cd 100644 --- a/LeopardFF8.cpp +++ b/LeopardFF8.cpp @@ -9,7 +9,7 @@ * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of LHC-RS nor the names of its contributors may be + * Neither the name of Leopard-RS nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
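The hunks below reorder the multiply machinery and add a log-argument variant of the field multiply. The two forms agree whenever the second operand is first converted to log form, which makes for a cheap table self-check. A minimal sketch (hypothetical helper that would live inside LeopardFF8.cpp, assuming LogLUT[]/ExpLUT[] have already been initialized):

static bool FFEMultiplySelfCheck()
{
    // Skip zero operands: multiply by zero is special-cased, and LogLUT[0]
    // is not a meaningful logarithm
    for (unsigned a = 1; a < kOrder; ++a)
        for (unsigned b = 1; b < kOrder; ++b)
            if (FFEMultiply((ffe_t)a, (ffe_t)b) != FFEMultiplyLog((ffe_t)a, LogLUT[b]))
                return false; // tables or basis conversion disagree
    return true;
}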
@@ -27,6 +27,10 @@ */ #include "LeopardFF8.h" +#include + +// Define this to enable the optimized version of FWHT() +#define LEO_FF8_FWHT_OPTIMIZED namespace leopard { namespace ff8 { @@ -34,6 +38,9 @@ namespace leopard { namespace ff8 { //------------------------------------------------------------------------------ // Datatypes and Constants +// Modulus for field operations +static const ffe_t kModulus = 255; + // LFSR Polynomial that generates the field elements static const unsigned kPolynomial = 0x11D; @@ -47,9 +54,6 @@ static const ffe_t kBasis[kBits] = { //------------------------------------------------------------------------------ // Field Operations -// Modulus for field operations -static const ffe_t kModulus = 255; - // z = x + y (mod kModulus) static inline ffe_t AddMod(const ffe_t a, const ffe_t b) { @@ -69,50 +73,6 @@ static inline ffe_t SubMod(const ffe_t a, const ffe_t b) } -//------------------------------------------------------------------------------ -// Logarithm Tables - -static ffe_t LogLUT[kOrder]; -static ffe_t ExpLUT[kOrder]; - - -// Initialize LogLUT[], ExpLUT[] -static void InitializeLogarithmTables() -{ - // LFSR table generation: - - unsigned state = 1; - for (unsigned i = 0; i < kModulus; ++i) - { - ExpLUT[state] = static_cast(i); - state <<= 1; - if (state >= kOrder) - state ^= kPolynomial; - } - ExpLUT[0] = kModulus; - - // Conversion to chosen basis: - - LogLUT[0] = 0; - for (unsigned i = 0; i < kBits; ++i) - { - const ffe_t basis = kBasis[i]; - const unsigned width = static_cast(1UL << i); - - for (unsigned j = 0; j < width; ++j) - LogLUT[j + width] = LogLUT[j] ^ basis; - } - - for (unsigned i = 0; i < kOrder; ++i) - LogLUT[i] = ExpLUT[LogLUT[i]]; - - for (unsigned i = 0; i < kOrder; ++i) - ExpLUT[LogLUT[i]] = i; - - ExpLUT[kModulus] = ExpLUT[0]; -} - - //------------------------------------------------------------------------------ // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus) @@ -248,234 +208,47 @@ void FWHT(ffe_t data[kOrder]) //------------------------------------------------------------------------------ -// XOR Memory +// Logarithm Tables -void xor_mem( - void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, - unsigned bytes) +static ffe_t LogLUT[kOrder]; +static ffe_t ExpLUT[kOrder]; + + +// Initialize LogLUT[], ExpLUT[] +static void InitializeLogarithmTables() { -#if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) - { - LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(vx); - const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(vy); - do - { - const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32)); - const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1)); - const LEO_M256 x2 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 2), _mm256_loadu_si256(y32 + 2)); - const LEO_M256 x3 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 3), _mm256_loadu_si256(y32 + 3)); - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - _mm256_storeu_si256(x32 + 2, x2); - _mm256_storeu_si256(x32 + 3, x3); - bytes -= 128, x32 += 4, y32 += 4; - } while (bytes >= 128); - if (bytes > 0) - { - const LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32), _mm256_loadu_si256(y32)); - const LEO_M256 x1 = _mm256_xor_si256(_mm256_loadu_si256(x32 + 1), _mm256_loadu_si256(y32 + 1)); - _mm256_storeu_si256(x32, x0); - _mm256_storeu_si256(x32 + 1, x1); - } - return; - } -#endif // LEO_TRY_AVX2 - LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); - const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); - do - 
{ - const LEO_M128 x0 = _mm_xor_si128(_mm_loadu_si128(x16), _mm_loadu_si128(y16)); - const LEO_M128 x1 = _mm_xor_si128(_mm_loadu_si128(x16 + 1), _mm_loadu_si128(y16 + 1)); - const LEO_M128 x2 = _mm_xor_si128(_mm_loadu_si128(x16 + 2), _mm_loadu_si128(y16 + 2)); - const LEO_M128 x3 = _mm_xor_si128(_mm_loadu_si128(x16 + 3), _mm_loadu_si128(y16 + 3)); - _mm_storeu_si128(x16, x0); - _mm_storeu_si128(x16 + 1, x1); - _mm_storeu_si128(x16 + 2, x2); - _mm_storeu_si128(x16 + 3, x3); - bytes -= 64, x16 += 4, y16 += 4; - } while (bytes > 0); -} + // LFSR table generation: -void xor_mem2( - void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0, - void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1, - unsigned bytes) -{ -#if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) + unsigned state = 1; + for (unsigned i = 0; i < kModulus; ++i) { - LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast (vx_0); - const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast(vy_0); - LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast (vx_1); - const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast(vy_1); - do - { - const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); - const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); - const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2)); - const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3)); - const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); - const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); - const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2)); - const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3)); - _mm256_storeu_si256(x32_0, x0_0); - _mm256_storeu_si256(x32_0 + 1, x1_0); - _mm256_storeu_si256(x32_0 + 2, x2_0); - _mm256_storeu_si256(x32_0 + 3, x3_0); - _mm256_storeu_si256(x32_1, x0_1); - _mm256_storeu_si256(x32_1 + 1, x1_1); - _mm256_storeu_si256(x32_1 + 2, x2_1); - _mm256_storeu_si256(x32_1 + 3, x3_1); - x32_0 += 4, y32_0 += 4; - x32_1 += 4, y32_1 += 4; - bytes -= 128; - } while (bytes >= 128); - if (bytes > 0) - { - const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); - const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); - const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); - const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); - _mm256_storeu_si256(x32_0, x0_0); - _mm256_storeu_si256(x32_0 + 1, x1_0); - _mm256_storeu_si256(x32_1, x0_1); - _mm256_storeu_si256(x32_1 + 1, x1_1); - } - return; + ExpLUT[state] = static_cast(i); + state <<= 1; + if (state >= kOrder) + state ^= kPolynomial; } -#endif // LEO_TRY_AVX2 - LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast (vx_0); - const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast(vy_0); - LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast (vx_1); - const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast(vy_1); - do - { - const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0)); - const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1)); - const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2)); - 
const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3)); - const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1)); - const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1)); - const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2)); - const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3)); - _mm_storeu_si128(x16_0, x0_0); - _mm_storeu_si128(x16_0 + 1, x1_0); - _mm_storeu_si128(x16_0 + 2, x2_0); - _mm_storeu_si128(x16_0 + 3, x3_0); - _mm_storeu_si128(x16_1, x0_1); - _mm_storeu_si128(x16_1 + 1, x1_1); - _mm_storeu_si128(x16_1 + 2, x2_1); - _mm_storeu_si128(x16_1 + 3, x3_1); - x16_0 += 4, y16_0 += 4; - x16_1 += 4, y16_1 += 4; - bytes -= 64; - } while (bytes > 0); -} + ExpLUT[0] = kModulus; -void xor_mem3( - void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0, - void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1, - void * LEO_RESTRICT vx_2, const void * LEO_RESTRICT vy_2, - unsigned bytes) -{ -#if defined(LEO_TRY_AVX2) - if (CpuHasAVX2) + // Conversion to chosen basis: + + LogLUT[0] = 0; + for (unsigned i = 0; i < kBits; ++i) { - LEO_M256 * LEO_RESTRICT x32_0 = reinterpret_cast (vx_0); - const LEO_M256 * LEO_RESTRICT y32_0 = reinterpret_cast(vy_0); - LEO_M256 * LEO_RESTRICT x32_1 = reinterpret_cast (vx_1); - const LEO_M256 * LEO_RESTRICT y32_1 = reinterpret_cast(vy_1); - LEO_M256 * LEO_RESTRICT x32_2 = reinterpret_cast (vx_2); - const LEO_M256 * LEO_RESTRICT y32_2 = reinterpret_cast(vy_2); - do - { - const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); - const LEO_M256 x1_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); - const LEO_M256 x2_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 2), _mm256_loadu_si256(y32_0 + 2)); - const LEO_M256 x3_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 3), _mm256_loadu_si256(y32_0 + 3)); - const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); - const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); - const LEO_M256 x2_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 2), _mm256_loadu_si256(y32_1 + 2)); - const LEO_M256 x3_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 3), _mm256_loadu_si256(y32_1 + 3)); - const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2)); - const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1)); - const LEO_M256 x2_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 2), _mm256_loadu_si256(y32_2 + 2)); - const LEO_M256 x3_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 3), _mm256_loadu_si256(y32_2 + 3)); - _mm256_storeu_si256(x32_0, x0_0); - _mm256_storeu_si256(x32_0 + 1, x1_0); - _mm256_storeu_si256(x32_0 + 2, x2_0); - _mm256_storeu_si256(x32_0 + 3, x3_0); - _mm256_storeu_si256(x32_1, x0_1); - _mm256_storeu_si256(x32_1 + 1, x1_1); - _mm256_storeu_si256(x32_1 + 2, x2_1); - _mm256_storeu_si256(x32_1 + 3, x3_1); - _mm256_storeu_si256(x32_2, x0_2); - _mm256_storeu_si256(x32_2 + 1, x1_2); - _mm256_storeu_si256(x32_2 + 2, x2_2); - _mm256_storeu_si256(x32_2 + 3, x3_2); - x32_0 += 4, y32_0 += 4; - x32_1 += 4, y32_1 += 4; - x32_2 += 4, y32_2 += 4; - bytes -= 128; - } while (bytes >= 128); - if (bytes > 0) - { - const LEO_M256 x0_0 = _mm256_xor_si256(_mm256_loadu_si256(x32_0), _mm256_loadu_si256(y32_0)); - const LEO_M256 x1_0 
= _mm256_xor_si256(_mm256_loadu_si256(x32_0 + 1), _mm256_loadu_si256(y32_0 + 1)); - const LEO_M256 x0_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1), _mm256_loadu_si256(y32_1)); - const LEO_M256 x1_1 = _mm256_xor_si256(_mm256_loadu_si256(x32_1 + 1), _mm256_loadu_si256(y32_1 + 1)); - const LEO_M256 x0_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2), _mm256_loadu_si256(y32_2)); - const LEO_M256 x1_2 = _mm256_xor_si256(_mm256_loadu_si256(x32_2 + 1), _mm256_loadu_si256(y32_2 + 1)); - _mm256_storeu_si256(x32_0, x0_0); - _mm256_storeu_si256(x32_0 + 1, x1_0); - _mm256_storeu_si256(x32_1, x0_1); - _mm256_storeu_si256(x32_1 + 1, x1_1); - _mm256_storeu_si256(x32_2, x0_2); - _mm256_storeu_si256(x32_2 + 1, x1_2); - } - return; + const ffe_t basis = kBasis[i]; + const unsigned width = static_cast(1UL << i); + + for (unsigned j = 0; j < width; ++j) + LogLUT[j + width] = LogLUT[j] ^ basis; } -#endif // LEO_TRY_AVX2 - LEO_M128 * LEO_RESTRICT x16_0 = reinterpret_cast (vx_0); - const LEO_M128 * LEO_RESTRICT y16_0 = reinterpret_cast(vy_0); - LEO_M128 * LEO_RESTRICT x16_1 = reinterpret_cast (vx_1); - const LEO_M128 * LEO_RESTRICT y16_1 = reinterpret_cast(vy_1); - LEO_M128 * LEO_RESTRICT x16_2 = reinterpret_cast (vx_2); - const LEO_M128 * LEO_RESTRICT y16_2 = reinterpret_cast(vy_2); - do - { - const LEO_M128 x0_0 = _mm_xor_si128(_mm_loadu_si128(x16_0), _mm_loadu_si128(y16_0)); - const LEO_M128 x1_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 1), _mm_loadu_si128(y16_0 + 1)); - const LEO_M128 x2_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 2), _mm_loadu_si128(y16_0 + 2)); - const LEO_M128 x3_0 = _mm_xor_si128(_mm_loadu_si128(x16_0 + 3), _mm_loadu_si128(y16_0 + 3)); - const LEO_M128 x0_1 = _mm_xor_si128(_mm_loadu_si128(x16_1), _mm_loadu_si128(y16_1)); - const LEO_M128 x1_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 1), _mm_loadu_si128(y16_1 + 1)); - const LEO_M128 x2_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 2), _mm_loadu_si128(y16_1 + 2)); - const LEO_M128 x3_1 = _mm_xor_si128(_mm_loadu_si128(x16_1 + 3), _mm_loadu_si128(y16_1 + 3)); - const LEO_M128 x0_2 = _mm_xor_si128(_mm_loadu_si128(x16_2), _mm_loadu_si128(y16_2)); - const LEO_M128 x1_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 1), _mm_loadu_si128(y16_2 + 1)); - const LEO_M128 x2_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 2), _mm_loadu_si128(y16_2 + 2)); - const LEO_M128 x3_2 = _mm_xor_si128(_mm_loadu_si128(x16_2 + 3), _mm_loadu_si128(y16_2 + 3)); - _mm_storeu_si128(x16_0, x0_0); - _mm_storeu_si128(x16_0 + 1, x1_0); - _mm_storeu_si128(x16_0 + 2, x2_0); - _mm_storeu_si128(x16_0 + 3, x3_0); - _mm_storeu_si128(x16_1, x0_1); - _mm_storeu_si128(x16_1 + 1, x1_1); - _mm_storeu_si128(x16_1 + 2, x2_1); - _mm_storeu_si128(x16_1 + 3, x3_1); - _mm_storeu_si128(x16_2, x0_2); - _mm_storeu_si128(x16_2 + 1, x1_2); - _mm_storeu_si128(x16_2 + 2, x2_2); - _mm_storeu_si128(x16_2 + 3, x3_2); - x16_0 += 4, y16_0 += 4; - x16_1 += 4, y16_1 += 4; - x16_2 += 4, y16_2 += 4; - bytes -= 64; - } while (bytes > 0); -} + for (unsigned i = 0; i < kOrder; ++i) + LogLUT[i] = ExpLUT[LogLUT[i]]; + + for (unsigned i = 0; i < kOrder; ++i) + ExpLUT[LogLUT[i]] = i; + + ExpLUT[kModulus] = ExpLUT[0]; +} //------------------------------------------------------------------------------ // Multiplies @@ -485,12 +258,12 @@ void xor_mem3( struct { LEO_ALIGNED LEO_M128 Lo[256]; LEO_ALIGNED LEO_M128 Hi[256]; -} Multiply128LUT; +} static Multiply128LUT; #if defined(LEO_TRY_AVX2) struct { LEO_ALIGNED LEO_M256 Lo[256]; LEO_ALIGNED LEO_M256 Hi[256]; -} Multiply256LUT; +} static Multiply256LUT; #endif // LEO_TRY_AVX2 // Returns 
a * b @@ -501,14 +274,19 @@ static ffe_t FFEMultiply(ffe_t a, ffe_t b) { return ExpLUT[AddMod(LogLUT[a], LogLUT[b])]; } +// Returns a * Log(b) +static ffe_t FFEMultiplyLog(ffe_t a, ffe_t log_b) +{ + if (a == 0) + return 0; + return ExpLUT[AddMod(LogLUT[a], log_b)]; +} + bool InitializeMultiplyTables() { - // Reuse aligned self test buffers to load table data - uint8_t* lo = m_SelfTestBuffers.A; - uint8_t* hi = m_SelfTestBuffers.B; - for (int y = 0; y < 256; ++y) { + uint8_t lo[16], hi[16]; for (unsigned char x = 0; x < 16; ++x) { lo[x] = FFEMultiply(x, static_cast<ffe_t>(y)); @@ -517,15 +295,17 @@ bool InitializeMultiplyTables() const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo); const LEO_M128 table_hi = _mm_loadu_si128((LEO_M128*)hi); + _mm_storeu_si128(Multiply128LUT.Lo + y, table_lo); _mm_storeu_si128(Multiply128LUT.Hi + y, table_hi); + #if defined(LEO_TRY_AVX2) if (CpuHasAVX2) { - const LEO_M256 table_lo2 = _mm256_broadcastsi128_si256(table_lo); - const LEO_M256 table_hi2 = _mm256_broadcastsi128_si256(table_hi); - _mm256_storeu_si256(Multiply256LUT.Lo + y, table_lo2); - _mm256_storeu_si256(Multiply256LUT.Hi + y, table_hi2); + _mm256_storeu_si256(Multiply256LUT.Lo + y, + _mm256_broadcastsi128_si256(table_lo)); + _mm256_storeu_si256(Multiply256LUT.Hi + y, + _mm256_broadcastsi128_si256(table_hi)); } #endif // LEO_TRY_AVX2 } @@ -536,7 +316,7 @@ bool InitializeMultiplyTables() // vx[] = vy[] * m void mul_mem_set( void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { if (m <= 1) { @@ -633,7 +413,7 @@ void mul_mem_set( void mul_mem2_inplace( void * LEO_RESTRICT vx_0, void * LEO_RESTRICT vx_1, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { if (m <= 1) { @@ -759,28 +539,28 @@ void mul_mem2_inplace( // FFT Operations // x[] ^= y[] * m, y[] ^= x[] -void mul_fft( +void fft_butterfly( void * LEO_RESTRICT x, void * LEO_RESTRICT y, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { } // For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] -void mul_fft2( +void fft_butterfly2( void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { } // For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] -void mul_fft3( +void fft_butterfly3( void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { } @@ -790,33 +570,348 @@ void mul_fft3( // IFFT Operations // y[] ^= x[], x[] ^= y[] * m -void mul_ifft( +void ifft_butterfly( void * LEO_RESTRICT x, void * LEO_RESTRICT y, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { } // For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m -void mul_ifft2( +void ifft_butterfly2( void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { } // For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m -void mul_ifft3( +void ifft_butterfly3( void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, - ffe_t m, unsigned bytes) + ffe_t m, uint64_t bytes) { } +//------------------------------------------------------------------------------ +// FFT + +static ffe_t FFTSkew[kFieldModulus]; // twisted factors used in FFT +static ffe_t LogWalsh[kOrder]; // factors used in the evaluation of the error locator polynomial
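The fft_butterfly*() and ifft_butterfly*() routines above are stubs at this stage; their header comments fix the intended semantics. Per symbol they reduce to the two-line kernels of the scalar prototype in tests/experiments.cpp, shown here as a sketch (the _ref names are placeholders, and m is a log-domain skew as stored in FFTSkew[]):

// FFT butterfly on one symbol pair: x ^= y * exp(m), then y ^= x
static LEO_FORCE_INLINE void fft_butterfly_ref(ffe_t& x, ffe_t& y, ffe_t m)
{
    x ^= FFEMultiplyLog(y, m);
    y ^= x;
}

// IFFT butterfly on one symbol pair: y ^= x first, then x ^= y * exp(m)
static LEO_FORCE_INLINE void ifft_butterfly_ref(ffe_t& x, ffe_t& y, ffe_t m)
{
    y ^= x;
    x ^= FFEMultiplyLog(y, m);
}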
+ +void FFTInitialize() +{ + ffe_t temp[kBits - 1]; + + for (unsigned i = 1; i < kBits; ++i) + temp[i - 1] = (ffe_t)((unsigned)1 << i); + + for (unsigned m = 0; m < (kBits - 1); ++m) + { + const unsigned step = (unsigned)1 << (m + 1); + + FFTSkew[((unsigned)1 << m) - 1] = 0; + + for (unsigned i = m; i < (kBits - 1); ++i) + { + const unsigned s = ((unsigned)1 << (i + 1)); + + for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) + FFTSkew[j + s] = FFTSkew[j] ^ temp[i]; + } + + // TBD: This can be cleaned up + temp[m] = kFieldModulus - LogLUT[FFEMultiply(temp[m], temp[m] ^ 1)]; + + for (unsigned i = m + 1; i < (kBits - 1); ++i) + temp[i] = FFEMultiplyLog(temp[i], (LogLUT[temp[i] ^ 1] + temp[m]) % kFieldModulus); + } + + for (unsigned i = 0; i < kOrder; ++i) + FFTSkew[i] = LogLUT[FFTSkew[i]]; + + // Precalculate FWHT(Log[i]): + + for (unsigned i = 0; i < kOrder; ++i) + LogWalsh[i] = LogLUT[i]; + LogWalsh[0] = 0; + FWHT(LogWalsh, kBits); +} + + +//------------------------------------------------------------------------------ +// Encode + +void Encode( + uint64_t buffer_bytes, + unsigned original_count, + unsigned recovery_count, + unsigned m, + void* const * const data, + void** work) +{ + // work <- data + + // FIXME: Unroll first loop to eliminate this + for (unsigned i = 0; i < m; ++i) + memcpy(work[i], data[i], buffer_bytes); + + // work <- IFFT(data, m, m) + + for (unsigned width = 1; width < m; width <<= 1) + { + for (unsigned j = width; j < m; j += (width << 1)) + { + const ffe_t skew = FFTSkew[j + m - 1]; + + if (skew != kFieldModulus) + { + for (unsigned i = j - width; i < j; ++i) + ifft_butterfly(work[i], work[i + width], skew, buffer_bytes); + } + else + { + for (unsigned i = j - width; i < j; ++i) + xor_mem(work[i + width], work[i], buffer_bytes); + } + } + } + + for (unsigned i = m; i + m <= original_count; i += m) + { + // temp <- data + i + + void** temp = work + m; + + // FIXME: Unroll first loop to eliminate this + for (unsigned j = 0; j < m; ++j) + memcpy(temp[j], data[i + j], buffer_bytes); + + // temp <- IFFT(temp, m, m + i) + + for (unsigned width = 1; width < m; width <<= 1) + { + for (unsigned j = width; j < m; j += (width << 1)) + { + const ffe_t skew = FFTSkew[j + m + i - 1]; + + if (skew != kFieldModulus) + { + for (unsigned k = j - width; k < j; ++k) + ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes); + } + else + { + for (unsigned k = j - width; k < j; ++k) + xor_mem(temp[k + width], temp[k], buffer_bytes); + } + } + } + + // work <- work XOR temp + + // FIXME: Unroll last loop to eliminate this + for (unsigned j = 0; j < m; ++j) + xor_mem(work[j], temp[j], buffer_bytes); + } + + const unsigned last_count = original_count % m; + if (last_count != 0) + { + const unsigned i = original_count - last_count; + + // temp <- data + i + + void** temp = work + m; + + for (unsigned j = 0; j < last_count; ++j) + memcpy(temp[j], data[i + j], buffer_bytes); + for (unsigned j = last_count; j < m; ++j) + memset(temp[j], 0, buffer_bytes); + + // temp <- IFFT(temp, m, m + i) + + for (unsigned width = 1, shift = 1; width < m; width <<= 1, ++shift) + { + // Calculate stop considering that the right is all zeroes + const unsigned stop = ((last_count + width - 1) >> shift) << shift; + + for (unsigned j = width; j < stop; j += (width << 1)) + { + const ffe_t skew = FFTSkew[j + m + i - 1]; + + if (skew != kFieldModulus) + { + for (unsigned k = j - width; k < j; ++k) + ifft_butterfly(temp[k], temp[k + width], skew, buffer_bytes); + } + else + { + for (unsigned k 
= j - width; k < j; ++k) + xor_mem(temp[k + width], temp[k], buffer_bytes); + } + } + } + + // work <- work XOR temp + + // FIXME: Unroll last loop to eliminate this + for (unsigned j = 0; j < m; ++j) + xor_mem(work[j], temp[j], buffer_bytes); + } + + // work <- FFT(work, m, 0) + + for (unsigned width = (m >> 1); width > 0; width >>= 1) + { + const ffe_t* skewLUT = FFTSkew + width - 1; + const unsigned range = width << 1; + + for (unsigned j = 0; j < m; j += range) + { + const ffe_t skew = skewLUT[j]; + + if (skew != kFieldModulus) + { + for (unsigned k = j, count = j + width; k < count; ++k) + fft_butterfly(work[k], work[k + width], skew, buffer_bytes); + } + else + { + for (unsigned k = j, count = j + width; k < count; ++k) + xor_mem(work[k + width], work[k], buffer_bytes); + } + } + } +} + + +//------------------------------------------------------------------------------ +// Decode + +void Decode( + uint64_t buffer_bytes, + unsigned original_count, + unsigned recovery_count, + unsigned m, // NextPow2(recovery_count) + unsigned n, // NextPow2(m + original_count) = work_count + void* const * const original, // original_count entries + void* const * const recovery, // recovery_count entries + void** work) // n entries +{ + // Fill in error locations + + ffe_t ErrorLocations[kOrder]; + for (unsigned i = 0; i < recovery_count; ++i) + ErrorLocations[i] = recovery[i] ? 0 : 1; + for (unsigned i = recovery_count; i < m; ++i) + ErrorLocations[i] = 1; + for (unsigned i = 0; i < original_count; ++i) + ErrorLocations[i + m] = original[i] ? 0 : 1; + // Zero through kOrder (not just n) so the FWHT below reads initialized data + memset(ErrorLocations + m + original_count, 0, (kOrder - original_count - m) * sizeof(ffe_t)); + + // Evaluate error locator polynomial + + FWHT(ErrorLocations, kBits); + + for (unsigned i = 0; i < kOrder; ++i) + ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kFieldModulus; + + FWHT(ErrorLocations, kBits); + + // work <- recovery data + + for (unsigned i = 0; i < recovery_count; ++i) + { + if (recovery[i]) + mul_mem_set(work[i], recovery[i], ErrorLocations[i], buffer_bytes); + else + memset(work[i], 0, buffer_bytes); + } + for (unsigned i = recovery_count; i < m; ++i) + memset(work[i], 0, buffer_bytes); + + // work <- original data + + for (unsigned i = 0; i < original_count; ++i) + { + if (original[i]) + mul_mem_set(work[m + i], original[i], ErrorLocations[m + i], buffer_bytes); + else + memset(work[m + i], 0, buffer_bytes); + } + for (unsigned i = m + original_count; i < n; ++i) + memset(work[i], 0, buffer_bytes); + + // work <- IFFT(work, n, 0) + + for (unsigned width = 1; width < n; width <<= 1) + { + for (unsigned j = width; j < n; j += (width << 1)) + { + const ffe_t skew = FFTSkew[j - 1]; + + if (skew != kFieldModulus) + { + for (unsigned i = j - width; i < j; ++i) + ifft_butterfly(work[i], work[i + width], skew, buffer_bytes); + } + else + { + for (unsigned i = j - width; i < j; ++i) + xor_mem(work[i + width], work[i], buffer_bytes); + } + } + } + + // work <- FormalDerivative(work, n) + + for (unsigned i = 1; i < n; ++i) + { + const unsigned width = ((i ^ (i - 1)) + 1) >> 1; + + // These spans can be wide, so xor_mem() is used unconditionally + for (unsigned j = i - width; j < i; ++j) + xor_mem(work[j], work[j + width], buffer_bytes); + } + + // work <- FFT(work, n, 0) truncated to m + original_count + + const unsigned output_count = m + original_count; + for (unsigned width = (n >> 1); width > 0; width >>= 1) + { + const ffe_t* skewLUT = FFTSkew + width - 1; + const unsigned range = width << 1; + + for (unsigned j = (m < range) ? 
0 : m; j < output_count; j += range) + { + const ffe_t skew = skewLUT[j]; + + if (skew != kFieldModulus) + { + for (unsigned i = j; i < j + width; ++i) + fft_butterfly(work[i], work[i + width], skew, buffer_bytes); + } + else + { + for (unsigned i = j; i < j + width; ++i) + xor_mem(work[i + width], work[i], buffer_bytes); + } + } + } + + // Reveal erasures + + for (unsigned i = 0; i < original_count; ++i) + if (!original[i]) + mul_mem_set(work[i], work[i + m], kFieldModulus - ErrorLocations[i], buffer_bytes); +} + + //------------------------------------------------------------------------------ // API @@ -831,6 +926,7 @@ bool Initialize() return false; InitializeLogarithmTables(); + FFTInitialize(); IsInitialized = true; return true; diff --git a/LeopardFF8.h b/LeopardFF8.h index 1ef933b..88efa3f 100644 --- a/LeopardFF8.h +++ b/LeopardFF8.h @@ -56,9 +56,6 @@ static const unsigned kOrder = 256; //------------------------------------------------------------------------------ // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus) -// Define this to enable the optimized version of FWHT() -#define LEO_FF8_FWHT_OPTIMIZED - // Transform for a variable number of bits (up to kOrder) void FWHT(ffe_t* data, const unsigned bits); @@ -66,85 +63,89 @@ void FWHT(ffe_t* data, const unsigned bits); void FWHT(ffe_t data[kOrder]); -//------------------------------------------------------------------------------ -// XOR Memory - -// x[] ^= y[] -void xor_mem( - void * LEO_RESTRICT x, const void * LEO_RESTRICT y, - unsigned bytes); - -// For i = {0, 1}: x_i[] ^= x_i[] -void xor_mem2( - void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0, - void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1, - unsigned bytes); - -// For i = {0, 1, 2}: x_i[] ^= x_i[] -void xor_mem3( - void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0, - void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1, - void * LEO_RESTRICT x_2, const void * LEO_RESTRICT y_2, - unsigned bytes); - - //------------------------------------------------------------------------------ // Multiplies // x[] = y[] * m void mul_mem_set( void * LEO_RESTRICT x, const void * LEO_RESTRICT y, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); // For i = {0, 1}: x_i[] *= m void mul_mem2_inplace( void * LEO_RESTRICT x_0, void * LEO_RESTRICT x_1, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); //------------------------------------------------------------------------------ // FFT Operations // x[] ^= y[] * m, y[] ^= x[] -void mul_fft( +void fft_butterfly( void * LEO_RESTRICT x, void * LEO_RESTRICT y, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); // For i = {0, 1}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] -void mul_fft2( +void fft_butterfly2( void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); // For i = {0, 1, 2}: x_i[] ^= y_i[] * m, y_i[] ^= x_i[] -void mul_fft3( +void fft_butterfly3( void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); //------------------------------------------------------------------------------ // IFFT Operations // y[] ^= x[], x[] ^= y[] * m -void mul_ifft( +void ifft_butterfly( void * LEO_RESTRICT x, void * LEO_RESTRICT y, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); // For i = {0, 1}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m -void mul_ifft2( +void ifft_butterfly2( 
void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); // For i = {0, 1, 2}: y_i[] ^= x_i[], x_i[] ^= y_i[] * m -void mul_ifft3( +void ifft_butterfly3( void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0, void * LEO_RESTRICT x_1, void * LEO_RESTRICT y_1, void * LEO_RESTRICT x_2, void * LEO_RESTRICT y_2, - ffe_t m, unsigned bytes); + ffe_t m, uint64_t bytes); + + +//------------------------------------------------------------------------------ +// Encode + +void Encode( + uint64_t buffer_bytes, + unsigned original_count, + unsigned recovery_count, + unsigned m, // = NextPow2(recovery_count) * 2 = work_count + void* const * const data, + void** work); // Size of GetEncodeWorkCount() + + +//------------------------------------------------------------------------------ +// Decode + +void Decode( + uint64_t buffer_bytes, + unsigned original_count, + unsigned recovery_count, + unsigned m, // = NextPow2(recovery_count) + unsigned n, // = NextPow2(m + original_count) = work_count + void* const * const original, // original_count entries + void* const * const recovery, // recovery_count entries + void** work); // n entries //------------------------------------------------------------------------------ diff --git a/docs/HighRateDecoder.pdf b/docs/HighRateDecoder.pdf new file mode 100644 index 0000000..6ce5054 Binary files /dev/null and b/docs/HighRateDecoder.pdf differ diff --git a/docs/LowRateDecoder.pdf b/docs/LowRateDecoder.pdf new file mode 100644 index 0000000..93ba65e Binary files /dev/null and b/docs/LowRateDecoder.pdf differ diff --git a/leopard.cpp b/leopard.cpp index 5c694fd..51850f9 100644 --- a/leopard.cpp +++ b/leopard.cpp @@ -27,8 +27,8 @@ */ #include "leopard.h" -#include "FecalEncoder.h" -#include "FecalDecoder.h" +#include "LeopardFF8.h" +#include "LeopardFF16.h" extern "C" { @@ -38,134 +38,152 @@ extern "C" { static bool m_Initialized = false; -FECAL_EXPORT int fecal_init_(int version) +LEO_EXPORT int leo_init_(int version) { - if (version != FECAL_VERSION) - return Fecal_InvalidInput; + if (version != LEO_VERSION) + return Leopard_InvalidInput; - if (0 != gf256_init()) - return Fecal_Platform; + if (!leopard::ff8::Initialize()) + return Leopard_Platform; + + if (!leopard::ff16::Initialize()) + return Leopard_Platform; m_Initialized = true; - return Fecal_Success; + return Leopard_Success; } //------------------------------------------------------------------------------ // Encoder API -FECAL_EXPORT FecalEncoder fecal_encoder_create(unsigned input_count, void* const * const input_data, uint64_t total_bytes) +LEO_EXPORT unsigned leo_encode_work_count( + unsigned original_count, + unsigned recovery_count) { - if (input_count <= 0 || !input_data || total_bytes < input_count) - { - FECAL_DEBUG_BREAK; // Invalid input - return nullptr; - } - - FECAL_DEBUG_ASSERT(m_Initialized); // Must call fecal_init() first - if (!m_Initialized) - return nullptr; - - fecal::Encoder* encoder = new(std::nothrow) fecal::Encoder; - if (!encoder) - { - FECAL_DEBUG_BREAK; // Out of memory - return nullptr; - } - - if (Fecal_Success != encoder->Initialize(input_count, input_data, total_bytes)) - { - delete encoder; - return nullptr; - } - - return reinterpret_cast( encoder ); + return leopard::NextPow2(recovery_count) * 2; } -FECAL_EXPORT int fecal_encode(FecalEncoder encoder_v, FecalSymbol* symbol) +LEO_EXPORT LeopardResult leo_encode( + uint64_t buffer_bytes, // Number of bytes in each data buffer + unsigned 
original_count, // Number of original_data[] buffer pointers + unsigned recovery_count, // Number of recovery_data[] buffer pointers + unsigned work_count, // Number of work_data[] buffer pointers, from leo_encode_work_count() + void* const * const original_data, // Array of pointers to original data buffers + void** work_data, // Array of work buffers + unsigned flags) // Operation flags { - fecal::Encoder* encoder = reinterpret_cast( encoder_v ); - if (!encoder || !symbol) - return Fecal_InvalidInput; + if (buffer_bytes <= 0 || buffer_bytes % 64 != 0) + return Leopard_InvalidSize; - return encoder->Encode(*symbol); -} + if (recovery_count <= 0 || recovery_count > original_count) + return Leopard_InvalidCounts; -FECAL_EXPORT void fecal_free(void* codec_v) -{ - if (codec_v) + if (!original_data || !work_data) + return Leopard_InvalidInput; + + const unsigned m = leopard::NextPow2(recovery_count); + const unsigned n = leopard::NextPow2(m + original_count); + + if (work_count != m * 2) + return Leopard_InvalidCounts; + + const bool mt = (flags & LeopardFlags_Multithreaded) != 0; + + if (n <= leopard::ff8::kOrder) { - fecal::ICodec* icodec = reinterpret_cast( codec_v ); - delete icodec; + leopard::ff8::Encode( + buffer_bytes, + original_count, + recovery_count, + m, + original_data, + work_data); } + else if (n <= leopard::ff16::kOrder) + { + leopard::ff16::Encode( + buffer_bytes, + original_count, + recovery_count, + m, + original_data, + work_data); + } + else + return Leopard_TooMuchData; + + return Leopard_Success; } //------------------------------------------------------------------------------ // Decoder API -FECAL_EXPORT FecalDecoder fecal_decoder_create(unsigned input_count, uint64_t total_bytes) +LEO_EXPORT unsigned leo_decode_work_count( + unsigned original_count, + unsigned recovery_count) { - if (input_count <= 0 || total_bytes < input_count) + const unsigned m = leopard::NextPow2(recovery_count); + const unsigned n = leopard::NextPow2(m + original_count); + return n; +} + +LEO_EXPORT LeopardResult leo_decode( + uint64_t buffer_bytes, // Number of bytes in each data buffer + unsigned original_count, // Number of original_data[] buffer pointers + unsigned recovery_count, // Number of recovery_data[] buffer pointers + unsigned work_count, // Number of buffer pointers in work_data[] + void* const * const original_data, // Array of original data buffers + void* const * const recovery_data, // Array of recovery data buffers + void** work_data, // Array of work data buffers + unsigned flags) // Operation flags +{ + if (buffer_bytes <= 0 || buffer_bytes % 64 != 0) + return Leopard_InvalidSize; + + if (recovery_count <= 0 || recovery_count > original_count) + return Leopard_InvalidCounts; + + if (!original_data || !recovery_data || !work_data) + return Leopard_InvalidInput; + + const unsigned m = leopard::NextPow2(recovery_count); + const unsigned n = leopard::NextPow2(m + original_count); + + if (work_count != n) + return Leopard_InvalidCounts; + + const bool mt = (flags & LeopardFlags_Multithreaded) != 0; + + if (n <= leopard::ff8::kOrder) { - FECAL_DEBUG_BREAK; // Invalid input - return nullptr; + leopard::ff8::Decode( + buffer_bytes, + original_count, + recovery_count, + m, + n, + original_data, + recovery_data, + work_data); } - - FECAL_DEBUG_ASSERT(m_Initialized); // Must call fecal_init() first - if (!m_Initialized) - return nullptr; - - fecal::Decoder* decoder = new(std::nothrow) fecal::Decoder; - if (!decoder) + else if (n <= leopard::ff16::kOrder) { - FECAL_DEBUG_BREAK; // Out 
of memory - return nullptr; + leopard::ff16::Decode( + buffer_bytes, + original_count, + recovery_count, + m, + n, + original_data, + recovery_data, + work_data); } + else + return Leopard_TooMuchData; - if (Fecal_Success != decoder->Initialize(input_count, total_bytes)) - { - delete decoder; - return nullptr; - } - - return reinterpret_cast( decoder ); -} - -FECAL_EXPORT int fecal_decoder_add_original(FecalDecoder decoder_v, const FecalSymbol* symbol) -{ - fecal::Decoder* decoder = reinterpret_cast( decoder_v ); - if (!decoder || !symbol) - return Fecal_InvalidInput; - - return decoder->AddOriginal(*symbol); -} - -FECAL_EXPORT int fecal_decoder_add_recovery(FecalDecoder decoder_v, const FecalSymbol* symbol) -{ - fecal::Decoder* decoder = reinterpret_cast( decoder_v ); - if (!decoder || !symbol) - return Fecal_InvalidInput; - - return decoder->AddRecovery(*symbol); -} - -FECAL_EXPORT int fecal_decode(FecalDecoder decoder_v, RecoveredSymbols* symbols) -{ - fecal::Decoder* decoder = reinterpret_cast( decoder_v ); - if (!decoder || !symbols) - return Fecal_InvalidInput; - - return decoder->Decode(*symbols); -} - -FECAL_EXPORT int fecal_decoder_get(FecalDecoder decoder_v, unsigned input_index, FecalSymbol* symbol) -{ - fecal::Decoder* decoder = reinterpret_cast( decoder_v ); - if (!decoder || !symbol) - return Fecal_InvalidInput; - - return decoder->GetOriginal(input_index, *symbol); + return Leopard_Success; } diff --git a/leopard.h b/leopard.h index 8c0e85f..e8a6b4f 100644 --- a/leopard.h +++ b/leopard.h @@ -59,6 +59,7 @@ # endif #endif +#include #ifdef __cplusplus extern "C" { @@ -90,14 +91,13 @@ typedef enum LeopardResultT Leopard_Success = 0, // Operation succeeded Leopard_TooMuchData = -1, // Buffer counts are too high - Leopard_InvalidBlockSize = -2, // Buffer size must be a multiple of 64 bytes - Leopard_InvalidInput = -3, // A function parameter was invalid - Leopard_Platform = -4, // Platform is unsupported - Leopard_OutOfMemory = -5, // Out of memory error occurred - Leopard_Unexpected = -6, // Unexpected error - Software bug? + Leopard_InvalidSize = -2, // Buffer size must be a multiple of 64 bytes + Leopard_InvalidCounts = -3, // Invalid counts provided + Leopard_InvalidInput = -4, // A function parameter was invalid + Leopard_Platform = -5, // Platform is unsupported } LeopardResult; -// Results +// Flags typedef enum LeopardFlagsT { LeopardFlags_Defaults = 0, // Default settings @@ -119,7 +119,6 @@ typedef enum LeopardFlagsT Returns the work_count value to pass into leo_encode(). Returns 0 on invalid input. */ - LEO_EXPORT unsigned leo_encode_work_count( unsigned original_count, unsigned recovery_count); @@ -138,6 +137,8 @@ LEO_EXPORT unsigned leo_encode_work_count( flags: Flags for encoding e.g. LeopardFlag_Multithreaded The sum of original_count + recovery_count must not exceed 65536. + The recovery_count <= original_count. + The buffer_bytes must be a multiple of 64. Each buffer should have the same number of bytes. Even the last piece must be rounded up to the block size. @@ -153,15 +154,11 @@ LEO_EXPORT unsigned leo_encode_work_count( ((uint64_t)total_bytes + original_count - 1) / original_count); Returns Leopard_Success on success. - The first set of recovery_count buffers in work_data will be the result. - - Returns Leopard_TooMuchData if the data is too large. - Returns Leopard_InvalidBlockSize if the data is the wrong size. - Returns Leopard_InvalidInput on invalid input. + * The first set of recovery_count buffers in work_data will be the result. 
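    For example, a minimal call sequence (a sketch: allocation and error
    handling omitted, buffer setup assumed):

        leo_init_(LEO_VERSION);
        const unsigned work_count = leo_encode_work_count(original_count, recovery_count);
        // allocate work_count work buffers of buffer_bytes each, alongside
        // the original_count original buffers
        LeopardResult r = leo_encode(
            buffer_bytes, original_count, recovery_count, work_count,
            original_data, work_data, LeopardFlags_Defaults);
        // on success, work_data[0..recovery_count-1] hold the recovery pieces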
Returns other values on errors. */ LEO_EXPORT LeopardResult leo_encode( - unsigned buffer_bytes, // Number of bytes in each data buffer + uint64_t buffer_bytes, // Number of bytes in each data buffer unsigned original_count, // Number of original_data[] buffer pointers unsigned recovery_count, // Number of recovery_data[] buffer pointers unsigned work_count, // Number of work_data[] buffer pointers, from leo_encode_work_count() @@ -183,7 +180,6 @@ LEO_EXPORT LeopardResult leo_encode( Returns the work_count value to pass into leo_encode(). Returns 0 on invalid input. */ - LEO_EXPORT unsigned leo_decode_work_count( unsigned original_count, unsigned recovery_count); @@ -211,7 +207,7 @@ LEO_EXPORT unsigned leo_decode_work_count( Returns other values on errors. */ LEO_EXPORT LeopardResult leo_decode( - unsigned buffer_bytes, // Number of bytes in each data buffer + uint64_t buffer_bytes, // Number of bytes in each data buffer unsigned original_count, // Number of original_data[] buffer pointers unsigned recovery_count, // Number of recovery_data[] buffer pointers unsigned work_count, // Number of buffer pointers in work_data[] diff --git a/proj/Leopard.sln b/proj/Leopard.sln index bafad8e..daa9f58 100644 --- a/proj/Leopard.sln +++ b/proj/Leopard.sln @@ -1,12 +1,14 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 15 -VisualStudioVersion = 15.0.26127.3 +# Visual Studio 14 +VisualStudioVersion = 14.0.25420.1 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Leopard", "Leopard.vcxproj", "{32176592-2F30-4BD5-B645-EB11C8D3453E}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LeopardBenchmark", "..\tests\proj\Benchmark.vcxproj", "{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LeopardExperiments", "..\tests\proj\Experiments.vcxproj", "{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Win32 = Debug|Win32 @@ -31,6 +33,14 @@ Global {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|Win32.Build.0 = Release|Win32 {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|x64.ActiveCfg = Release|x64 {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}.Release|x64.Build.0 = Release|x64 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Debug|Win32.ActiveCfg = Debug|Win32 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Debug|Win32.Build.0 = Debug|Win32 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Debug|x64.ActiveCfg = Debug|x64 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Debug|x64.Build.0 = Debug|x64 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Release|Win32.ActiveCfg = Release|Win32 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Release|Win32.Build.0 = Release|Win32 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Release|x64.ActiveCfg = Release|x64 + {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/proj/Leopard.vcxproj b/proj/Leopard.vcxproj index da9a8ad..c5c69b5 100644 --- a/proj/Leopard.vcxproj +++ b/proj/Leopard.vcxproj @@ -21,16 +21,12 @@ - - - - @@ -38,34 +34,33 @@ {32176592-2F30-4BD5-B645-EB11C8D3453E} GF65536 Leopard - 10.0.14393.0 StaticLibrary true MultiByte - v141 + v140 StaticLibrary true MultiByte - v141 + v140 StaticLibrary false true MultiByte - v141 + v140 StaticLibrary false true MultiByte - v141 + v140 diff --git a/proj/Leopard.vcxproj.filters b/proj/Leopard.vcxproj.filters index 079edb1..df7d586 100644 --- 
a/proj/Leopard.vcxproj.filters +++ b/proj/Leopard.vcxproj.filters @@ -21,12 +21,6 @@ Source Files - - Source Files - - - Source Files - Source Files @@ -35,12 +29,6 @@ - - Source Files - - - Source Files - Source Files diff --git a/tests/experiments.cpp b/tests/experiments.cpp new file mode 100644 index 0000000..f2c6a4e --- /dev/null +++ b/tests/experiments.cpp @@ -0,0 +1,615 @@ +/* + Copyright (c) 2017 Christopher A. Taylor. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of LHC-RS nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include +#include +#include +#include +#include + + +//------------------------------------------------------------------------------ +// Debug + +// Some bugs only repro in release mode, so this can be helpful +//#define LEO_DEBUG_IN_RELEASE + +#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE) + #define LEO_DEBUG + #ifdef _WIN32 + #define LEO_DEBUG_BREAK __debugbreak() + #else + #define LEO_DEBUG_BREAK __builtin_trap() + #endif + #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } } +#else + #define LEO_DEBUG_BREAK ; + #define LEO_DEBUG_ASSERT(cond) ; +#endif + + +//------------------------------------------------------------------------------ +// Platform/Architecture + +// Compiler-specific C++11 restrict keyword +#define LEO_RESTRICT __restrict + +// Compiler-specific force inline keyword +#ifdef _MSC_VER + #define LEO_FORCE_INLINE inline __forceinline +#else + #define LEO_FORCE_INLINE inline __attribute__((always_inline)) +#endif + + + + +//------------------------------------------------------------------------------ +// Field + +//#define LEO_SHORT_FIELD + +#ifdef LEO_SHORT_FIELD +typedef uint8_t ffe_t; +static const unsigned kGFBits = 8; +static const unsigned kGFPolynomial = 0x11D; +ffe_t kGFBasis[kGFBits] = { + 1, 214, 152, 146, 86, 200, 88, 230 // Cantor basis +}; +#else +typedef uint16_t ffe_t; +static const unsigned kGFBits = 16; +static const unsigned kGFPolynomial = 0x1002D; +ffe_t kGFBasis[kGFBits] = { + 0x0001, 0xACCA, 0x3C0E, 0x163E, // Cantor basis + 0xC582, 0xED2E, 0x914C, 0x4012, + 0x6C98, 0x10D8, 0x6A72, 0xB900, + 0xFDB8, 0xFB34, 0xFF38, 0x991E +}; +#endif + +/* + Cantor Basis introduced by: + D. G. Cantor, "On arithmetical algorithms over finite fields", + Journal of Combinatorial Theory, Series A, vol. 50, no. 2, pp. 285-300, 1989. +*/ + +static const unsigned kFieldSize = (unsigned)1 << kGFBits; //Field size +static const unsigned kFieldModulus = kFieldSize - 1; + +static ffe_t GFLog[kFieldSize]; +static ffe_t GFExp[kFieldSize]; + +// Initialize GFLog[], GFExp[] +static void InitField() +{ + unsigned state = 1; + for (unsigned i = 0; i < kFieldModulus; ++i) + { + GFExp[state] = static_cast(i); + state <<= 1; + if (state >= kFieldSize) + state ^= kGFPolynomial; + } + GFExp[0] = kFieldModulus; + + // Conversion to chosen basis: + + GFLog[0] = 0; + for (unsigned i = 0; i < kGFBits; ++i) + { + const ffe_t basis = kGFBasis[i]; + const unsigned width = (unsigned)(1UL << i); + + for (unsigned j = 0; j < width; ++j) + GFLog[j + width] = GFLog[j] ^ basis; + } + + for (unsigned i = 0; i < kFieldSize; ++i) + GFLog[i] = GFExp[GFLog[i]]; + + for (unsigned i = 0; i < kFieldSize; ++i) + GFExp[GFLog[i]] = i; + + GFExp[kFieldModulus] = GFExp[0]; +} + + +//------------------------------------------------------------------------------ +// Mod Q Field Operations +// +// Q is the maximum symbol value, e.g. 255 or 65535. 
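// Worked example of the partial reduction below, for kGFBits = 8 (Q = 255):
// a = 200, b = 100 gives sum = 300 = 0x12C; (sum >> 8) = 1 folds the carry
// back in, so the uint8_t result is (300 + 1) & 0xFF = 45 = 300 mod 255.
// The reduction is only partial because a sum of exactly 255 comes back as
// 255 rather than 0, which is why the tables treat index Q as valid.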
+ +// z = x + y (mod Q) +static inline ffe_t AddModQ(ffe_t a, ffe_t b) +{ + const unsigned sum = (unsigned)a + b; + + // Partial reduction step, allowing for Q to be returned + return static_cast(sum + (sum >> kGFBits)); +} + +// z = x - y (mod Q) +static inline ffe_t SubModQ(ffe_t a, ffe_t b) +{ + const unsigned dif = (unsigned)a - b; + + // Partial reduction step, allowing for Q to be returned + return static_cast(dif + (dif >> kGFBits)); +} + +// return a*GFExp[b] over GF(2^r) +static ffe_t mulE(ffe_t a, ffe_t b) +{ + if (a == 0) + return 0; + + const ffe_t sum = static_cast(AddModQ(GFLog[a], b)); + return GFExp[sum]; +} + + +//------------------------------------------------------------------------------ +// Fast Walsh-Hadamard Transform (FWHT) Mod Q +// +// Q is the maximum symbol value, e.g. 255 or 65535. + +// Define this to enable the optimized version of FWHT() +#define LEO_FWHT_OPTIMIZED + +typedef ffe_t fwht_t; + +// {a, b} = {a + b, a - b} (Mod Q) +static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) +{ + const fwht_t sum = AddModQ(a, b); + const fwht_t dif = SubModQ(a, b); + a = sum; + b = dif; +} + +// Reference implementation +static void FWHT(fwht_t* data, const unsigned bits) +{ + const unsigned size = (unsigned)(1UL << bits); + for (unsigned width = 1; width < size; width <<= 1) + for (unsigned i = 0; i < size; i += (width << 1)) + for (unsigned j = i; j < (width + i); ++j) + FWHT_2(data[j], data[j + width]); +} + + +//------------------------------------------------------------------------------ +// Formal Derivative + +// Formal derivative of polynomial in the new basis +static void formal_derivative(ffe_t* cos, const unsigned size) +{ + /* + Left to right xoring data ahead into data behind. + + If the data ends in all zeroes, this can simply stop. 
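    The XOR width at step i, ((i ^ (i - 1)) + 1) >> 1, is exactly the lowest
    set bit of i: for i = 6 (binary 110), 6 ^ 5 = 3 and (3 + 1) >> 1 = 2.
    Each step therefore folds a span whose width matches the alignment of i
    into the data just behind it.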
+
+//------------------------------------------------------------------------------
+// Formal Derivative
+
+// Formal derivative of polynomial in the new basis
+static void formal_derivative(ffe_t* cos, const unsigned size)
+{
+    /*
+        Left-to-right, XOR data ahead into data behind.
+
+        If the data ends in all zeroes, this can simply stop.
+    */
+    for (unsigned i = 1; i < size; ++i)
+    {
+        // leng is the lowest set bit of i: the run length to XOR at this step
+        const unsigned leng = ((i ^ (i - 1)) + 1) >> 1;
+
+        for (unsigned j = i - leng; j < i; ++j)
+            cos[j] ^= cos[j + leng];
+    }
+
+    // Doesn't seem to be needed
+#if 0
+    /*
+        Same here - Zeroes on the right are preserved
+    */
+    for (unsigned i = size; i < kFieldSize; i <<= 1)
+    {
+        for (unsigned j = 0; j < size; ++j)
+            cos[j] ^= cos[j + i];
+    }
+#endif
+}
+
+
+//------------------------------------------------------------------------------
+// Fast Fourier Transform
+
+// Twisted factors used in FFT.
+// Note: Sized kFieldSize (not kFieldModulus) because the initialization loop
+// in InitFieldOperations() below writes index kFieldModulus.
+static ffe_t skewVec[kFieldSize];
+
+static LEO_FORCE_INLINE void ifft_butterfly(ffe_t& a, ffe_t& b, ffe_t skew)
+{
+    b ^= a;
+    a ^= mulE(b, skew);
+}
+
+// IFFT in the proposed basis
+static void IFLT(ffe_t* data, const unsigned size, const unsigned index)
+{
+    for (unsigned width = 1; width < size; width <<= 1)
+    {
+        for (unsigned j = width; j < size; j += (width << 1))
+        {
+            const ffe_t skew = skewVec[j + index - 1];
+
+            if (skew != kFieldModulus)
+            {
+                for (unsigned i = j - width; i < j; ++i)
+                    ifft_butterfly(data[i], data[i + width], skew);
+            }
+            else
+            {
+                // A skew of kFieldModulus encodes log(0): the multiply
+                // contributes nothing, so only the XOR half is applied
+                for (unsigned i = j - width; i < j; ++i)
+                    data[i + width] ^= data[i];
+            }
+        }
+    }
+}
+
+static LEO_FORCE_INLINE void fft_butterfly(ffe_t& a, ffe_t& b, ffe_t skew)
+{
+    a ^= mulE(b, skew);
+    b ^= a;
+}
+
+// FFT in the proposed basis
+static void FLT(ffe_t* data, const unsigned size, const unsigned skewIndex, const unsigned output_elements)
+{
+    for (unsigned width = (size >> 1); width > 0; width >>= 1)
+    {
+        const ffe_t* skewLUT = skewVec + width + skewIndex - 1;
+
+        for (unsigned j = 0; j < output_elements; j += (width << 1))
+        {
+            const ffe_t skew = skewLUT[j];
+
+            if (skew != kFieldModulus)
+            {
+                for (unsigned i = j; i < j + width; ++i)
+                    fft_butterfly(data[i], data[i + width], skew);
+            }
+            else
+            {
+                for (unsigned i = j; i < j + width; ++i)
+                    data[i + width] ^= data[i];
+            }
+        }
+    }
+}
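+
+// Editor's sketch (hypothetical helper, not in the original file): FLT()
+// inverts IFLT() when given the same size and skew index, since the two
+// butterflies above are inverses and the width loops run in opposite orders.
+// Requires InitField() and InitFieldOperations() to have been called first.
+static bool TestFFTRoundTrip()
+{
+    ffe_t buf[64], ref[64];
+    for (unsigned i = 0; i < 64; ++i)
+        ref[i] = buf[i] = (ffe_t)rand();
+
+    IFLT(buf, 64, 0);    // evaluations -> coefficients
+    FLT(buf, 64, 0, 64); // coefficients -> evaluations
+
+    return 0 == memcmp(buf, ref, sizeof(buf));
+}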
+
+//------------------------------------------------------------------------------
+// FFT Initialization
+
+//static ffe_t B[kFieldSize >> 1]; // factors used in formal derivative
+static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial
+
+// Initialize skewVec[], log_walsh[] (and B[] when it is enabled)
+static void InitFieldOperations()
+{
+    ffe_t temp[kGFBits - 1];
+
+    for (unsigned i = 1; i < kGFBits; ++i)
+        temp[i - 1] = (ffe_t)((unsigned)1 << i);
+
+    for (unsigned m = 0; m < (kGFBits - 1); ++m)
+    {
+        const unsigned step = (unsigned)1 << (m + 1);
+
+        skewVec[((unsigned)1 << m) - 1] = 0;
+
+        for (unsigned i = m; i < (kGFBits - 1); ++i)
+        {
+            const unsigned s = ((unsigned)1 << (i + 1));
+
+            for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step)
+                skewVec[j + s] = skewVec[j] ^ temp[i];
+        }
+
+        temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])];
+
+        for (unsigned i = m + 1; i < (kGFBits - 1); ++i)
+            temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus);
+    }
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        skewVec[i] = GFLog[skewVec[i]];
+
+#if 0
+    temp[0] = kFieldModulus - temp[0];
+
+    for (unsigned i = 1; i < (kGFBits - 1); ++i)
+        temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus;
+
+    B[0] = 0;
+    for (unsigned i = 0; i < (kGFBits - 1); ++i)
+    {
+        const unsigned depart = ((unsigned)1 << i);
+
+        for (unsigned j = 0; j < depart; ++j)
+            B[j + depart] = (B[j] + temp[i]) % kFieldModulus;
+    }
+#endif
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        log_walsh[i] = GFLog[i];
+
+    log_walsh[0] = 0;
+
+    FWHT(log_walsh, kGFBits);
+}
+
+
+//------------------------------------------------------------------------------
+// Encoder
+
+// Encoding algorithm for k/n < 0.5: the message length k is a power of two
+static void encodeL(ffe_t* data, const unsigned k, ffe_t* codeword)
+{
+    memcpy(codeword, data, sizeof(ffe_t) * k);
+
+    IFLT(codeword, k, 0);
+
+    for (unsigned i = k; i < kFieldSize; i += k)
+    {
+        memcpy(&codeword[i], codeword, sizeof(ffe_t) * k);
+
+        FLT(&codeword[i], k, i, k);
+    }
+
+    memcpy(codeword, data, sizeof(ffe_t) * k);
+}
+
+// Encoding algorithm for k/n > 0.5: the parity count m is a power of two.
+// data: message array. parity: parity array. mem: scratch buffer (size >= m)
+static void encodeH(const ffe_t* data, const unsigned m, const unsigned original_count, ffe_t* parity, ffe_t* mem)
+{
+    // Note: Assumes data is padded with zeroes out to the next multiple of m
+
+    memcpy(parity, data, m * sizeof(ffe_t));
+    IFLT(parity, m, m);
+
+    for (unsigned i = m; i < original_count; i += m)
+    {
+        memcpy(mem, data + i, m * sizeof(ffe_t));
+        IFLT(mem, m, m + i);
+        for (unsigned j = 0; j < m; ++j)
+            parity[j] ^= mem[j];
+    }
+
+    FLT(parity, m, 0, m);
+}
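+
+// Usage sketch (editor's addition; ExampleEncode is hypothetical): produce
+// m = 16 recovery symbols for a message. Assumes the field tables have been
+// initialized and `message` is zero-padded out to a multiple of 16 symbols.
+static void ExampleEncode(const ffe_t* message, unsigned original_count, ffe_t parity[16])
+{
+    ffe_t work[16]; // scratch buffer of m entries
+    encodeH(message, 16, original_count, parity, work);
+    // A full codeword is the 16 parity symbols followed by the message
+    // symbols -- the same layout test() builds below.
+}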
+
+//------------------------------------------------------------------------------
+// Decoder
+
+static void decode(ffe_t* codeword, const unsigned m, const unsigned original_count, const unsigned n, const bool* erasure)
+{
+    fwht_t log_walsh2[kFieldSize];
+
+    // Compute the evaluations of the error locator polynomial
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        log_walsh2[i] = erasure[i] ? 1 : 0;
+
+    FWHT(log_walsh2, kGFBits);
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+        log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus;
+
+    FWHT(log_walsh2, kGFBits);
+
+    // k2 can be replaced with k
+    //const unsigned k2 = kFieldSize;
+    //const unsigned k2 = k; // cannot actually be replaced with k. maybe for encodeL() only?
+
+    for (unsigned i = 0; i < m + original_count; ++i)
+    {
+        if (erasure[i])
+        {
+            codeword[i] = 0;
+        }
+        else
+        {
+            codeword[i] = mulE(codeword[i], log_walsh2[i]);
+        }
+    }
+    for (unsigned i = m + original_count; i < n; ++i)
+        codeword[i] = 0;
+
+    IFLT(codeword, n, 0);
+
+    // Note: This is not needed to recover successfully...
+#if 0
+    // formal derivative
+    // Note: Preserves zeroes on the right
+    for (unsigned i = 0; i < m + original_count; i += 2)
+    {
+        codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]);
+        codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]);
+    }
+#endif
+
+    formal_derivative(codeword, n);
+
+#if 0
+    // Note: Preserves zeroes on the right
+    for (unsigned i = 0; i < m + original_count; i += 2)
+    {
+        codeword[i] = mulE(codeword[i], B[i >> 1]);
+        codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]);
+    }
+#endif
+
+    FLT(codeword, n, 0, m + original_count);
+
+    for (unsigned i = 0; i < kFieldSize; ++i)
+    {
+        if (erasure[i])
+        {
+            codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]);
+        }
+    }
+}
+
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+// Returns the index (0..63) of the highest nonzero bit
+// Precondition: x != 0
+LEO_FORCE_INLINE unsigned LastNonzeroBit64(uint64_t x)
+{
+#ifdef _MSC_VER
+#ifdef _WIN64
+    unsigned long index;
+    // Note: Ignoring result because x != 0
+    _BitScanReverse64(&index, x);
+    return (unsigned)index;
+#else
+    unsigned long index;
+    // Scan the high 32 bits first, since we want the highest set bit
+    if (0 != _BitScanReverse(&index, (uint32_t)(x >> 32)))
+        return (unsigned)index + 32;
+    // Note: Ignoring result because x != 0
+    _BitScanReverse(&index, (uint32_t)x);
+    return (unsigned)index;
+#endif
+#else
+    // Note: Ignoring return value of 0 because x != 0
+    return 63 - (unsigned)__builtin_clzll(x);
+#endif
+}
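+
+// Editor's note: test() below uses LastNonzeroBit64 to round up to a power
+// of two, via  m = 2UL << LastNonzeroBit64(recovery_count - 1).
+// Worked example: recovery_count = 20 -> 19 = 0b10011, highest bit index 4,
+// m = 2 << 4 = 32, the least power of two >= 20. (This assumes the count is
+// at least 2, since LastNonzeroBit64(0) violates the precondition.)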
+
+//------------------------------------------------------------------------------
+// Test Application
+
+void test(unsigned original_count, unsigned recovery_count, unsigned seed)
+{
+    // Round the parity count and codeword length up to powers of two
+    unsigned m = 2UL << LastNonzeroBit64(recovery_count - 1);
+    unsigned n = 2UL << LastNonzeroBit64(m + original_count - 1);
+
+    srand(seed);
+
+    //-----------Generating message----------
+
+    // Message array
+    ffe_t data[kFieldSize] = {0};
+
+    // Filled with random numbers
+    for (unsigned i = m; i < m + original_count; ++i)
+        data[i] = (ffe_t)rand();
+
+
+    //---------encoding----------
+
+    ffe_t codeword[kFieldSize] = {};
+    // The first m codeword positions hold the parity data
+    encodeH(data + m, m, original_count, data, codeword);
+    //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change?
+
+    memcpy(codeword, data, sizeof(ffe_t) * kFieldSize);
+
+
+    //--------erasure simulation---------
+
+    // Array indicating erasures
+    bool erasure[kFieldSize] = {
+        false
+    };
+
+    // Mark "recovery_count" positions as erasures, starting at offset m
+    for (unsigned i = m; i < m + recovery_count; ++i)
+        erasure[i] = true;
+
+    // Shuffle the erasure positions across the codeword (Fisher-Yates)
+    for (unsigned i = m + original_count - 1; i > 0; --i)
+    {
+        unsigned pos = rand() % (i + 1);
+
+        if (i != pos)
+        {
+            bool tmp = erasure[i];
+            erasure[i] = erasure[pos];
+            erasure[pos] = tmp;
+        }
+    }
+
+
+    //---------main processing----------
+    decode(codeword, m, original_count, n, erasure);
+
+    // Check the correctness of the result
+    for (unsigned i = 0; i < kFieldSize; ++i)
+    {
+        if (erasure[i])
+        {
+            if (data[i] != codeword[i])
+            {
+                printf("Decoding Error with seed = %u!\n", seed);
+                LEO_DEBUG_BREAK;
+                return;
+            }
+        }
+    }
+
+    printf(":D ");
+}
+
+
+//------------------------------------------------------------------------------
+// Entrypoint
+
+int main(int argc, char **argv)
+{
+    // Fill GFLog table and GFExp table
+    InitField();
+
+    // Compute factors used in erasure decoder
+    InitFieldOperations();
+
+    unsigned seed = (unsigned)time(NULL);
+    for (;;)
+    {
+#ifdef LEO_SHORT_FIELD
+        const unsigned input_count = 100;
+        const unsigned recovery_count = 20;
+#else // LEO_SHORT_FIELD
+        const unsigned input_count = 10000;
+        const unsigned recovery_count = 2000;
+#endif // LEO_SHORT_FIELD
+
+        test(input_count, recovery_count, seed);
+
+        ++seed;
+    }
+
+    return 0;
+}
diff --git a/tests/proj/Benchmark.vcxproj b/tests/proj/Benchmark.vcxproj
index 6c008f5..41583ff 100644
--- a/tests/proj/Benchmark.vcxproj
+++ b/tests/proj/Benchmark.vcxproj
@@ -20,36 +20,35 @@
     <ProjectGuid>{97FCA15F-EAF3-4F1A-AFF8-83E693DA9D45}</ProjectGuid>
-    <RootNamespace>Fecal</RootNamespace>
+    <RootNamespace>Leopard</RootNamespace>
     <ProjectName>LeopardBenchmark</ProjectName>
-    <WindowsTargetPlatformVersion>10.0.14393.0</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
+    <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
diff --git a/tests/proj/Experiments.filters b/tests/proj/Experiments.filters
new file mode 100644
index 0000000..50a05dd
--- /dev/null
+++ b/tests/proj/Experiments.filters
@@ -0,0 +1,22 @@
+
+
+
+
+    {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
+    cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx
+
+
+    {93995380-89BD-4b04-88EB-625FBE52EBFB}
+    h;hh;hpp;hxx;hm;inl;inc;xsd
+
+
+    {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
+    rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
+
+
+
+
+    Source Files
+
+
\ No newline at end of file
diff --git a/tests/proj/Experiments.vcxproj b/tests/proj/Experiments.vcxproj
new file mode 100644
index 0000000..187d804
--- /dev/null
+++ b/tests/proj/Experiments.vcxproj
@@ -0,0 +1,181 @@
+
+
+
+
+    Debug
+    Win32
+
+
+    Debug
+    x64
+
+
+    Release
+    Win32
+
+
+    Release
+    x64
+
+
+
+    {97FCA15F-EAF3-4F1A-AFF8-83E693DA9D65}
+    Leopard
+    LeopardExperiments
+
+
+
+    Application
+    true
+    MultiByte
+    v140
+
+
+    Application
+    true
+    MultiByte
+    v140
+
+
+    Application
+    false
+    true
+    MultiByte
+    v140
+
+
+    Application
+    false
+    true
+    MultiByte
+    v140
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    Output/$(ProjectName)/$(Configuration)/$(Platform)/
+    Obj/$(ProjectName)/$(Configuration)/$(Platform)/
+
+
+    Output/$(ProjectName)/$(Configuration)/$(Platform)/
+    Obj/$(ProjectName)/$(Configuration)/$(Platform)/
+
+
+    Output/$(ProjectName)/$(Configuration)/$(Platform)/
+    Obj/$(ProjectName)/$(Configuration)/$(Platform)/
+
+
+    Output/$(ProjectName)/$(Configuration)/$(Platform)/
+    Obj/$(ProjectName)/$(Configuration)/$(Platform)/
+
+
+ Level3 + Disabled + true + MultiThreadedDebug + _MBCS;%(PreprocessorDefinitions) + + + true + + + + + + + + + + + Level3 + Disabled + true + MultiThreadedDebug + _MBCS;%(PreprocessorDefinitions) + + + true + + + + + + + + + + + Level3 + MaxSpeed + true + true + true + AnySuitable + Speed + false + MultiThreaded + true + _MBCS;%(PreprocessorDefinitions) + + + true + true + true + + + + + + + + + + + Level3 + MaxSpeed + true + true + true + AnySuitable + Speed + false + MultiThreaded + true + _MBCS;%(PreprocessorDefinitions) + + + true + true + true + + + + + + + + + + + + + + + + + \ No newline at end of file