/* Copyright (c) 2017 Christopher A. Taylor. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Leopard-RS nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "LeopardCommon.h" namespace leopard { //------------------------------------------------------------------------------ // Runtime CPU Architecture Check // // Feature checks stolen shamelessly from // https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c #if defined(HAVE_ANDROID_GETCPUFEATURES) #include #endif #if defined(LEO_TRY_NEON) # if defined(IOS) && defined(__ARM_NEON__) // Requires iPhone 5S or newer # else // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures bool CpuHasNeon = false; // V6 / V7 bool CpuHasNeon64 = false; // 64-bit # endif #endif #if !defined(LEO_TARGET_MOBILE) #ifdef _MSC_VER #include // __cpuid #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX #endif #ifdef LEO_TRY_AVX2 bool CpuHasAVX2 = false; #endif bool CpuHasSSSE3 = false; #define CPUID_EBX_AVX2 0x00000020 #define CPUID_ECX_SSSE3 0x00000200 static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) { #if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) __cpuid((int *) cpu_info, cpu_info_type); #else //if defined(HAVE_CPUID) cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; # ifdef __i386__ __asm__ __volatile__ ("pushfl; pushfl; " "popl %0; " "movl %0, %1; xorl %2, %0; " "pushl %0; " "popfl; pushfl; popl %0; popfl" : "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : "i" (0x200000)); if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { return; /* LCOV_EXCL_LINE */ } # endif # ifdef __i386__ __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : "=a" (cpu_info[0]), "=&r" (cpu_info[1]), "=c" (cpu_info[2]), "=d" (cpu_info[3]) : "0" (cpu_info_type), "2" (0U)); # elif defined(__x86_64__) __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : "=a" (cpu_info[0]), "=&r" (cpu_info[1]), "=c" (cpu_info[2]), "=d" (cpu_info[3]) : "0" (cpu_info_type), "2" (0U)); # else __asm__ __volatile__ ("cpuid" : "=a" (cpu_info[0]), "=b" (cpu_info[1]), "=c" (cpu_info[2]), "=d" (cpu_info[3]) : "0" (cpu_info_type), "2" (0U)); # endif #endif } #endif // defined(LEO_TARGET_MOBILE) void InitializeCPUArch() { #if defined(LEO_TRY_NEON) && defined(HAVE_ANDROID_GETCPUFEATURES) AndroidCpuFamily family = android_getCpuFamily(); if (family == ANDROID_CPU_FAMILY_ARM) { if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) CpuHasNeon = true; } else if (family == ANDROID_CPU_FAMILY_ARM64) { CpuHasNeon = true; if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) CpuHasNeon64 = true; } #endif #if !defined(LEO_TARGET_MOBILE) unsigned int cpu_info[4]; _cpuid(cpu_info, 1); CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); #if defined(LEO_TRY_AVX2) _cpuid(cpu_info, 7); CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); #endif // LEO_TRY_AVX2 #endif // LEO_TARGET_MOBILE } // vx[] += vy[] * z static void muladd_mem(GFSymbol * LEO_RESTRICT vx, const GFSymbol * LEO_RESTRICT vy, GFSymbol z, unsigned symbolCount) { for (unsigned i = 0; i < symbolCount; ++i) { const GFSymbol a = vy[i]; if (a == 0) continue; GFSymbol sum1 = static_cast(AddModQ(GFLog[a & 0x0f], z)); GFSymbol value1 = GFExp[sum1]; if ((a & 0x0f) == 0) { value1 = 0; } GFSymbol sum2 = static_cast(AddModQ(GFLog[a & 0xf0], z)); GFSymbol value2 = GFExp[sum2]; if ((a & 0xf0) == 0) { value2 = 0; } GFSymbol sum3 = static_cast(AddModQ(GFLog[a & 0x0f00], z)); GFSymbol value3 = GFExp[sum3]; if ((a & 0x0f00) == 0) { value3 = 0; } GFSymbol sum4 = static_cast(AddModQ(GFLog[a & 0xf000], z)); GFSymbol value4 = GFExp[sum4]; if ((a & 0xf000) == 0) { value4 = 0; } vx[i] ^= value1; vx[i] ^= value2; vx[i] ^= value3; vx[i] ^= value4; } } // return a*GFExp[b] over GF(2^r) static GFSymbol mulE(GFSymbol a, GFSymbol b) { if (a == 0) return 0; const GFSymbol sum = static_cast(AddModQ(GFLog[a], b)); return GFExp[sum]; } //------------------------------------------------------------------------------ // Fast Walsh-Hadamard Transform (FWHT) Mod Q // // Q is the maximum symbol value, e.g. 255 or 65535. // Define this to enable the optimized version of FWHT() #define LEO_FWHT_OPTIMIZED typedef GFSymbol fwht_t; // {a, b} = {a + b, a - b} (Mod Q) static LEO_FORCE_INLINE void FWHT_2(fwht_t& LEO_RESTRICT a, fwht_t& LEO_RESTRICT b) { const fwht_t sum = AddModQ(a, b); const fwht_t dif = SubModQ(a, b); a = sum; b = dif; } /* FWHT is a minor slice of the runtime and does not grow with data size, but I did attempt a few additional optimizations that failed: I've attempted to vectorize (with partial reductions) FWHT_4(data, s), which is 70% of the algorithm, but it was slower. Left in _attic_. I've attempted to avoid reductions in all or parts of the FWHT. The final modular reduction ends up being slower than the savings. Specifically I tried doing it for the whole FWHT and also I tried doing it just for the FWHT_2 loop in the main routine, but both approaches are slower than partial reductions. Replacing word reads with wider reads does speed up the operation, but at too high a complexity cost relative to minor perf improvement. */ #ifndef LEO_FWHT_OPTIMIZED // Reference implementation static void FWHT(fwht_t* data, const unsigned bits) { const unsigned size = (unsigned)(1UL << bits); for (unsigned width = 1; width < size; width <<= 1) for (unsigned i = 0; i < size; i += (width << 1)) for (unsigned j = i; j < (width + i); ++j) FWHT_2(data[j], data[j + width]); } #else static LEO_FORCE_INLINE void FWHT_4(fwht_t* data) { fwht_t t0 = data[0]; fwht_t t1 = data[1]; fwht_t t2 = data[2]; fwht_t t3 = data[3]; FWHT_2(t0, t1); FWHT_2(t2, t3); FWHT_2(t0, t2); FWHT_2(t1, t3); data[0] = t0; data[1] = t1; data[2] = t2; data[3] = t3; } static LEO_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s) { unsigned x = 0; fwht_t t0 = data[x]; x += s; fwht_t t1 = data[x]; x += s; fwht_t t2 = data[x]; x += s; fwht_t t3 = data[x]; FWHT_2(t0, t1); FWHT_2(t2, t3); FWHT_2(t0, t2); FWHT_2(t1, t3); unsigned y = 0; data[y] = t0; y += s; data[y] = t1; y += s; data[y] = t2; y += s; data[y] = t3; } static inline void FWHT_8(fwht_t* data) { fwht_t t0 = data[0]; fwht_t t1 = data[1]; fwht_t t2 = data[2]; fwht_t t3 = data[3]; fwht_t t4 = data[4]; fwht_t t5 = data[5]; fwht_t t6 = data[6]; fwht_t t7 = data[7]; FWHT_2(t0, t1); FWHT_2(t2, t3); FWHT_2(t4, t5); FWHT_2(t6, t7); FWHT_2(t0, t2); FWHT_2(t1, t3); FWHT_2(t4, t6); FWHT_2(t5, t7); FWHT_2(t0, t4); FWHT_2(t1, t5); FWHT_2(t2, t6); FWHT_2(t3, t7); data[0] = t0; data[1] = t1; data[2] = t2; data[3] = t3; data[4] = t4; data[5] = t5; data[6] = t6; data[7] = t7; } static inline void FWHT_16(fwht_t* data) { fwht_t t0 = data[0]; fwht_t t1 = data[1]; fwht_t t2 = data[2]; fwht_t t3 = data[3]; fwht_t t4 = data[4]; fwht_t t5 = data[5]; fwht_t t6 = data[6]; fwht_t t7 = data[7]; fwht_t t8 = data[8]; fwht_t t9 = data[9]; fwht_t t10 = data[10]; fwht_t t11 = data[11]; fwht_t t12 = data[12]; fwht_t t13 = data[13]; fwht_t t14 = data[14]; fwht_t t15 = data[15]; FWHT_2(t0, t1); FWHT_2(t2, t3); FWHT_2(t4, t5); FWHT_2(t6, t7); FWHT_2(t8, t9); FWHT_2(t10, t11); FWHT_2(t12, t13); FWHT_2(t14, t15); FWHT_2(t0, t2); FWHT_2(t1, t3); FWHT_2(t4, t6); FWHT_2(t5, t7); FWHT_2(t8, t10); FWHT_2(t9, t11); FWHT_2(t12, t14); FWHT_2(t13, t15); FWHT_2(t0, t4); FWHT_2(t1, t5); FWHT_2(t2, t6); FWHT_2(t3, t7); FWHT_2(t8, t12); FWHT_2(t9, t13); FWHT_2(t10, t14); FWHT_2(t11, t15); FWHT_2(t0, t8); FWHT_2(t1, t9); FWHT_2(t2, t10); FWHT_2(t3, t11); FWHT_2(t4, t12); FWHT_2(t5, t13); FWHT_2(t6, t14); FWHT_2(t7, t15); data[0] = t0; data[1] = t1; data[2] = t2; data[3] = t3; data[4] = t4; data[5] = t5; data[6] = t6; data[7] = t7; data[8] = t8; data[9] = t9; data[10] = t10; data[11] = t11; data[12] = t12; data[13] = t13; data[14] = t14; data[15] = t15; } static void FWHT_SmallData(fwht_t* data, unsigned ldn) { const unsigned n = (1UL << ldn); if (n <= 2) { if (n == 2) FWHT_2(data[0], data[1]); return; } for (unsigned ldm = ldn; ldm > 3; ldm -= 2) { unsigned m = (1UL << ldm); unsigned m4 = (m >> 2); for (unsigned r = 0; r < n; r += m) for (unsigned j = 0; j < m4; j++) FWHT_4(data + j + r, m4); } if (ldn & 1) { for (unsigned i0 = 0; i0 < n; i0 += 8) FWHT_8(data + i0); } else { for (unsigned i0 = 0; i0 < n; i0 += 4) FWHT_4(data + i0); } } // Decimation in time (DIT) version static void FWHT(fwht_t* data, const unsigned ldn) { if (ldn <= 13) { FWHT_SmallData(data, ldn); return; } FWHT_2(data[2], data[3]); FWHT_4(data + 4); FWHT_8(data + 8); FWHT_16(data + 16); for (unsigned ldm = 5; ldm < ldn; ++ldm) FWHT(data + (unsigned)(1UL << ldm), ldm); for (unsigned ldm = 0; ldm < ldn; ++ldm) { const unsigned mh = (1UL << ldm); for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2) FWHT_2(data[t1], data[t2]); } } #endif //------------------------------------------------------------------------------ // Memory Buffer XOR static void xor_mem(void * LEO_RESTRICT vx, const void * LEO_RESTRICT vy, unsigned bytes) { LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast(vx); const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast(vy); #if defined(LEO_TARGET_MOBILE) # if defined(LEO_TRY_NEON) // Handle multiples of 64 bytes if (CpuHasNeon) { while (bytes >= 64) { LEO_M128 x0 = vld1q_u8(x16); LEO_M128 x1 = vld1q_u8(x16 + 1); LEO_M128 x2 = vld1q_u8(x16 + 2); LEO_M128 x3 = vld1q_u8(x16 + 3); LEO_M128 y0 = vld1q_u8(y16); LEO_M128 y1 = vld1q_u8(y16 + 1); LEO_M128 y2 = vld1q_u8(y16 + 2); LEO_M128 y3 = vld1q_u8(y16 + 3); vst1q_u8(x16, veorq_u8(x0, y0)); vst1q_u8(x16 + 1, veorq_u8(x1, y1)); vst1q_u8(x16 + 2, veorq_u8(x2, y2)); vst1q_u8(x16 + 3, veorq_u8(x3, y3)); bytes -= 64, x16 += 4, y16 += 4; } // Handle multiples of 16 bytes while (bytes >= 16) { LEO_M128 x0 = vld1q_u8(x16); LEO_M128 y0 = vld1q_u8(y16); vst1q_u8(x16, veorq_u8(x0, y0)); bytes -= 16, ++x16, ++y16; } } else # endif // LEO_TRY_NEON { uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x16); const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y16); const unsigned count = (unsigned)bytes / 8; for (unsigned ii = 0; ii < count; ++ii) x8[ii] ^= y8[ii]; x16 = reinterpret_cast(x8 + count); y16 = reinterpret_cast(y8 + count); } #else // LEO_TARGET_MOBILE # if defined(LEO_TRY_AVX2) if (CpuHasAVX2) { LEO_M256 * LEO_RESTRICT x32 = reinterpret_cast(x16); const LEO_M256 * LEO_RESTRICT y32 = reinterpret_cast(y16); while (bytes >= 128) { LEO_M256 x0 = _mm256_loadu_si256(x32); LEO_M256 y0 = _mm256_loadu_si256(y32); x0 = _mm256_xor_si256(x0, y0); LEO_M256 x1 = _mm256_loadu_si256(x32 + 1); LEO_M256 y1 = _mm256_loadu_si256(y32 + 1); x1 = _mm256_xor_si256(x1, y1); LEO_M256 x2 = _mm256_loadu_si256(x32 + 2); LEO_M256 y2 = _mm256_loadu_si256(y32 + 2); x2 = _mm256_xor_si256(x2, y2); LEO_M256 x3 = _mm256_loadu_si256(x32 + 3); LEO_M256 y3 = _mm256_loadu_si256(y32 + 3); x3 = _mm256_xor_si256(x3, y3); _mm256_storeu_si256(x32, x0); _mm256_storeu_si256(x32 + 1, x1); _mm256_storeu_si256(x32 + 2, x2); _mm256_storeu_si256(x32 + 3, x3); bytes -= 128, x32 += 4, y32 += 4; } // Handle multiples of 32 bytes while (bytes >= 32) { // x[i] = x[i] xor y[i] _mm256_storeu_si256(x32, _mm256_xor_si256( _mm256_loadu_si256(x32), _mm256_loadu_si256(y32))); bytes -= 32, ++x32, ++y32; } x16 = reinterpret_cast(x32); y16 = reinterpret_cast(y32); } else # endif // LEO_TRY_AVX2 { while (bytes >= 64) { LEO_M128 x0 = _mm_loadu_si128(x16); LEO_M128 y0 = _mm_loadu_si128(y16); x0 = _mm_xor_si128(x0, y0); LEO_M128 x1 = _mm_loadu_si128(x16 + 1); LEO_M128 y1 = _mm_loadu_si128(y16 + 1); x1 = _mm_xor_si128(x1, y1); LEO_M128 x2 = _mm_loadu_si128(x16 + 2); LEO_M128 y2 = _mm_loadu_si128(y16 + 2); x2 = _mm_xor_si128(x2, y2); LEO_M128 x3 = _mm_loadu_si128(x16 + 3); LEO_M128 y3 = _mm_loadu_si128(y16 + 3); x3 = _mm_xor_si128(x3, y3); _mm_storeu_si128(x16, x0); _mm_storeu_si128(x16 + 1, x1); _mm_storeu_si128(x16 + 2, x2); _mm_storeu_si128(x16 + 3, x3); bytes -= 64, x16 += 4, y16 += 4; } } #endif // LEO_TARGET_MOBILE // Handle multiples of 16 bytes while (bytes >= 16) { // x[i] = x[i] xor y[i] _mm_storeu_si128(x16, _mm_xor_si128( _mm_loadu_si128(x16), _mm_loadu_si128(y16))); bytes -= 16, ++x16, ++y16; } uint8_t * LEO_RESTRICT x1 = reinterpret_cast(x16); const uint8_t * LEO_RESTRICT y1 = reinterpret_cast(y16); // Handle a block of 8 bytes const unsigned eight = bytes & 8; if (eight) { uint64_t * LEO_RESTRICT x8 = reinterpret_cast(x1); const uint64_t * LEO_RESTRICT y8 = reinterpret_cast(y1); *x8 ^= *y8; } // Handle a block of 4 bytes const unsigned four = bytes & 4; if (four) { uint32_t * LEO_RESTRICT x4 = reinterpret_cast(x1 + eight); const uint32_t * LEO_RESTRICT y4 = reinterpret_cast(y1 + eight); *x4 ^= *y4; } // Handle final bytes const unsigned offset = eight + four; switch (bytes & 3) { case 3: x1[offset + 2] ^= y1[offset + 2]; case 2: x1[offset + 1] ^= y1[offset + 1]; case 1: x1[offset] ^= y1[offset]; default: break; } } //------------------------------------------------------------------------------ // Formal Derivative // Formal derivative of polynomial in the new basis static void formal_derivative(GFSymbol* cos, const unsigned size) { for (unsigned i = 1; i < size; ++i) { const unsigned leng = ((i ^ (i - 1)) + 1) >> 1; // If a large number of values are being XORed: if (leng >= 8) xor_mem(cos + i - leng, cos + i, leng * sizeof(GFSymbol)); else for (unsigned j = i - leng; j < i; j++) cos[j] ^= cos[j + leng]; } for (unsigned i = size; i < kFieldSize; i <<= 1) xor_mem(cos, cos + i, size * sizeof(GFSymbol)); } //------------------------------------------------------------------------------ // Fast Fourier Transform static GFSymbol skewVec[kFieldModulus]; // twisted factors used in FFT // IFFT in the proposed basis static void IFLT(GFSymbol* data, const unsigned size, const unsigned index) { for (unsigned depart_no = 1; depart_no < size; depart_no <<= 1) { for (unsigned j = depart_no; j < size; j += (depart_no << 1)) { // If a large number of values are being XORed: if (depart_no >= 8) xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); else for (unsigned i = j - depart_no; i < j; ++i) data[i + depart_no] ^= data[i]; const GFSymbol skew = skewVec[j + index - 1]; if (skew != kFieldModulus) muladd_mem(data + j - depart_no, data + j, skew, depart_no); } } } // FFT in the proposed basis static void FLT(GFSymbol* data, const unsigned size, const unsigned index) { for (unsigned depart_no = (size >> 1); depart_no > 0; depart_no >>= 1) { for (unsigned j = depart_no; j < size; j += (depart_no << 1)) { const GFSymbol skew = skewVec[j + index - 1]; if (skew != kFieldModulus) muladd_mem(data + j - depart_no, data + j, skew, depart_no); // If a large number of values are being XORed: if (depart_no >= 8) xor_mem(data + j, data + j - depart_no, depart_no * sizeof(GFSymbol)); else for (unsigned i = j - depart_no; i < j; ++i) data[i + depart_no] ^= data[i]; } } } //------------------------------------------------------------------------------ // FFT Initialization static GFSymbol B[kFieldSize >> 1]; // factors used in formal derivative static fwht_t log_walsh[kFieldSize]; // factors used in the evaluation of the error locator polynomial // Initialize skewVec[], B[], log_walsh[] static void InitFieldOperations() { GFSymbol temp[kGFBits - 1]; for (unsigned i = 1; i < kGFBits; ++i) temp[i - 1] = (GFSymbol)((unsigned)1 << i); for (unsigned m = 0; m < (kGFBits - 1); ++m) { const unsigned step = (unsigned)1 << (m + 1); skewVec[((unsigned)1 << m) - 1] = 0; for (unsigned i = m; i < (kGFBits - 1); ++i) { const unsigned s = ((unsigned)1 << (i + 1)); for (unsigned j = ((unsigned)1 << m) - 1; j < s; j += step) skewVec[j + s] = skewVec[j] ^ temp[i]; } temp[m] = kFieldModulus - GFLog[mulE(temp[m], GFLog[temp[m] ^ 1])]; for (unsigned i = m + 1; i < (kGFBits - 1); ++i) temp[i] = mulE(temp[i], (GFLog[temp[i] ^ 1] + temp[m]) % kFieldModulus); } for (unsigned i = 0; i < kFieldSize; ++i) skewVec[i] = GFLog[skewVec[i]]; temp[0] = kFieldModulus - temp[0]; for (unsigned i = 1; i < (kGFBits - 1); ++i) temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus; B[0] = 0; for (unsigned i = 0; i < (kGFBits - 1); ++i) { const unsigned depart = ((unsigned)1 << i); for (unsigned j = 0; j < depart; ++j) B[j + depart] = (B[j] + temp[i]) % kFieldModulus; } for (unsigned i = 0; i < kFieldSize; ++i) log_walsh[i] = GFLog[i]; log_walsh[0] = 0; FWHT(log_walsh, kGFBits); } //------------------------------------------------------------------------------ // Encoder // Encoding alg for k/n<0.5: message is a power of two static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword) { memcpy(codeword, data, sizeof(GFSymbol) * k); IFLT(codeword, k, 0); for (unsigned i = k; i < kFieldSize; i += k) { memcpy(&codeword[i], codeword, sizeof(GFSymbol) * k); FLT(&codeword[i], k, i); } memcpy(codeword, data, sizeof(GFSymbol) * k); } // Encoding alg for k/n>0.5: parity is a power of two. // data: message array. parity: parity array. mem: buffer(size>= n-k) static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem) { const unsigned t = kFieldSize - k; memset(parity, 0, sizeof(GFSymbol) * t); for (unsigned i = t; i < kFieldSize; i += t) { memcpy(mem, &data[i - t], sizeof(GFSymbol) * t); IFLT(mem, t, i); xor_mem(parity, mem, t * sizeof(GFSymbol)); } FLT(parity, t, 0); } //------------------------------------------------------------------------------ // Decoder static void decode(GFSymbol* codeword, unsigned k, const bool* erasure) { fwht_t log_walsh2[kFieldSize]; // Compute the evaluations of the error locator polynomial for (unsigned i = 0; i < kFieldSize; ++i) log_walsh2[i] = erasure[i] ? 1 : 0; FWHT(log_walsh2, kGFBits); for (unsigned i = 0; i < kFieldSize; ++i) log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus; FWHT(log_walsh2, kGFBits); // k2 can be replaced with k const unsigned k2 = kFieldSize; //const unsigned k2 = k; // cannot actually be replaced with k. what else need to change? for (unsigned i = 0; i < kFieldSize; ++i) { if (erasure[i]) { codeword[i] = 0; } else { codeword[i] = mulE(codeword[i], log_walsh2[i]); } } IFLT(codeword, kFieldSize, 0); // formal derivative for (unsigned i = 0; i < kFieldSize; i += 2) { codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]); codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]); } formal_derivative(codeword, k2); for (unsigned i = 0; i < k2; i += 2) { codeword[i] = mulE(codeword[i], B[i >> 1]); codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]); } FLT(codeword, k2, 0); for (unsigned i = 0; i < k2; ++i) { if (erasure[i]) { codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]); } } } //------------------------------------------------------------------------------ // Test Application void test(unsigned k, unsigned seed) { srand(seed); //-----------Generating message---------- // Message array GFSymbol data[kFieldSize] = {0}; // Filled with random numbers for (unsigned i = kFieldSize - k; i < kFieldSize; ++i) data[i] = (GFSymbol)rand(); //---------encoding---------- GFSymbol codeword[kFieldSize]; encodeH(&data[kFieldSize - k], k, data, codeword); //encodeL(data, k, codeword); // does not seem to work with any input? what else needs to change? memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize); //--------erasure simulation--------- // Array indicating erasures bool erasure[kFieldSize] = { false }; for (unsigned i = k; i < kFieldSize; ++i) erasure[i] = true; // permuting the erasure array for (unsigned i = kFieldSize - 1; i > 0; --i) { unsigned pos = rand() % (i + 1); if (i != pos) { bool tmp = erasure[i]; erasure[i] = erasure[pos]; erasure[pos] = tmp; } } // erasure codeword symbols for (unsigned i = 0; i < kFieldSize; ++i) if (erasure[i]) codeword[i] = 0; //---------main processing---------- decode(codeword, k, erasure); // Check the correctness of the result for (unsigned i = 0; i < kFieldSize; ++i) { if (erasure[i] == 1) { if (data[i] != codeword[i]) { printf("Decoding Error with seed = %d!\n", seed); LEO_DEBUG_BREAK; return; } } } //printf("Decoding is successful!\n"); } //------------------------------------------------------------------------------ // Entrypoint int main(int argc, char **argv) { // Initialize architecture-specific code leo_architecture_init(); // Fill GFLog table and GFExp table InitField(); // Compute factors used in erasure decoder InitFieldOperations(); unsigned seed = (unsigned)time(NULL); for (;;) { // test(int k), k: message size /* EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc, s.t. the number of recovery pieces is a power of two */ test(kFieldSize / 2, seed); ++seed; } return 0; } } // namespace leopard