/*
    Copyright (c) 2017 Christopher A. Taylor. All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice,
      this list of conditions and the following disclaimer in the documentation
      and/or other materials provided with the distribution.
    * Neither the name of Leopard-RS nor the names of its contributors may be
      used to endorse or promote products derived from this software without
      specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
*/

#pragma once

/*
    TODO:

    + Look into 12-bit fields as a performance optimization
    + Unroll first/final butterflies to avoid extra copies/xors in encoder
    + Skip a lot of the initial FWHT() layers that are just operating on zeroes
    + For the actual FFT(), I should be unrolling the bottom two layers
      and performing them in a specialized function that does 2 <=> 2 and
      then 1 <=> 1, 1 <=> 1 operations in local registers/cache
    + Multithreading
    + Evaluate the error locator polynomial based on fast polynomial interpolations in O(k log^2 k)
    + Look into getting EncodeL working so we can support larger recovery sets
    + Implement the decoder algorithm from {3} based on the Forney algorithm
*/

/*
    FFT Data Layout:

    We pack the data into memory in this order:

    [Recovery Data (Power of Two = M)] [Original Data] [Zero Padding out to 65536]

    For encoding, this placement is implied rather than realized in memory.
    For decoding, the layout is explicitly used.
*/
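
/*
    Illustrative sketch (not part of the library): computing the chunk offsets
    of the layout above as used for decoding.  The variable names here are
    hypothetical; NextPow2() is the helper declared later in this header.

        const unsigned m = NextPow2(recovery_count); // chunks [0, m):     recovery data
        const unsigned k = original_count;           // chunks [m, m + k):  original data
        const unsigned n = NextPow2(m + k);          // chunks [m + k, n):  zero padding
*/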

/*
    Encoder algorithm:

    The encoder is described in {3}.  Operations are done in O(K Log M),
    where K is the original data size, and M is up to twice the
    size of the recovery set.

    Roughly in brief:

        Recovery = FFT( IFFT(Data_0) xor IFFT(Data_1) xor ... )

    It walks the original data M chunks at a time, performing the IFFT.
    Each IFFT intermediate result is XORed together into the first M chunks of
    the data layout.  Finally the FFT is performed.

    Encoder optimizations:
    * The first IFFT can be performed directly in the first M chunks.
    * The zero padding can be skipped while performing the final IFFT.
      Unrolling is used in the code to accomplish both of these optimizations.
    * The final FFT can also be truncated if the recovery set is not a power of two.
      It is easy to truncate the FFT by ending the inner loop early.
*/
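
/*
    Illustrative sketch of the encoder flow described above (not the real
    implementation: IFFT_M(), FFT_M(), XorInto() and the buffer names are
    hypothetical stand-ins for this library's skew-factor-aware transforms):

        // work[0..M) accumulates the XOR of the IFFT of each M-chunk slice
        IFFT_M(&data[0], work);             // first IFFT lands directly in work[0..M)
        for (unsigned i = M; i < K; i += M)
        {
            IFFT_M(&data[i], temp);         // last slice may be short: padding skipped
            XorInto(work, temp, M);         // accumulate into the first M chunks
        }
        FFT_M(work, recovery);              // final (possibly truncated) FFT
*/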

/*
    Decoder algorithm:

    The decoder is described in {1}.  Operations are done in O(N Log N), where N is up
    to twice the size of the original data as described below.

    Roughly in brief:

        Original = -ErrLocator * FFT( Derivative( IFFT( ErrLocator * ReceivedData ) ) )


    Precalculations:
    ---------------

    At startup initialization, FFTInitialize() precalculates FWHT(L) as
    described by equation (92) in {1}, where L = Log[i] for i = 0..Order,
    with Order = 256 or 65536 for FF8/16.  This is stored in the LogWalsh vector.

    It also precalculates the FFT skew factors (s_i) as described by
    equation (28).  These are stored in the FFTSkew vector.

    For memory workspace, N data chunks are needed, where N is a power of two
    at or above M + K.  K is the original data size, and M is the next power
    of two at or above the recovery data size.  For example, for K = 200 pieces
    of data and 10% redundancy, there are 20 redundant pieces, which round up
    to M = 32.  M + K = 232 pieces, so N rounds up to 256.


    Online calculations:
    -------------------

    At runtime, the error locator polynomial is evaluated using the
    Fast Walsh-Hadamard transform as described in {1} equation (92).

    At runtime the data is explicitly laid out in workspace memory like this:
    [Recovery Data (Power of Two = M)] [Original Data (K)] [Zero Padding out to N]

    Data that was lost is replaced with zeroes.
    Data that was received, including recovery data, is multiplied by the error
    locator polynomial as it is copied into the workspace.

    The IFFT is applied to the entire workspace of N chunks.
    Since the IFFT starts with pairs of inputs and doubles in width at each
    iteration, the IFFT is optimized by skipping the zero padding at the end
    until it starts mixing with non-zero data.

    The formal derivative is applied to the entire workspace of N chunks.

    The FFT is applied to the entire workspace of N chunks.
    The FFT is optimized by only performing the intermediate calculations
    required to recover lost data.  Since it starts wide and ends up working on
    adjacent pairs, at some point the intermediate results are not needed for
    data that will not be read by the application.  This optimization is
    implemented by the ErrorBitfield class.

    Finally, only the recovered data is multiplied by the negative of the
    error locator polynomial as it is copied into the front of the
    workspace for the application to retrieve.
*/
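
/*
    Illustrative sketch of the decoder pipeline described above.  The helper
    names here are hypothetical stand-ins for the real routines:

        EvaluateErrorLocator(lost, err_locator);  // FWHT per {1} equation (92)
        for (unsigned i = 0; i < N; ++i)
            if (!lost[i])                         // lost chunks stay zeroed
                MulChunk(work[i], received[i], err_locator[i]);
        IFFT_N(work);                             // zero tail skipped until it mixes in
        FormalDerivative(work, N);
        FFT_N(work);                              // pruned by the ErrorBitfield class
        for (unsigned i = 0; i < K; ++i)
            if (lost[i])                          // negation is the identity in GF(2^m),
                MulChunk(output[i], work[i], err_locator[i]);  // so -ErrLocator == ErrLocator
*/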

/*
    Finite field arithmetic optimizations:

    For faster finite field multiplication, large tables are precomputed and
    applied during encoding/decoding on 64 bytes of data at a time using
    SSSE3 or AVX2 vector instructions and the ALTMAP approach from Jerasure.

    Addition in this finite field is XOR, so a vectorized memory XOR routine
    is also used.
*/
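
/*
    Illustrative example: in GF(2^8) and GF(2^16), addition and subtraction
    are both bytewise XOR, which is why a bulk memory XOR routine doubles as
    the field adder:

        uint8_t a = 0x57, b = 0xA3;
        uint8_t sum  = (uint8_t)(a ^ b);   // field addition
        uint8_t diff = (uint8_t)(a ^ b);   // subtraction: identical in characteristic 2
*/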

#include "leopard.h"

#include <stdint.h>


//------------------------------------------------------------------------------
// Constants

// Define this to enable the optimized version of FWHT()
#define LEO_FWHT_OPT

// Avoid scheduling FFT operations that are unused
#define LEO_SCHEDULE_OPT

// Avoid calculating final FFT values in decoder using bitfield
#define LEO_ERROR_BITFIELD_OPT

// Optimize M=1 case
#define LEO_M1_OPT

// Unroll inner loops 4 times
#define LEO_USE_VECTOR4_OPT


//------------------------------------------------------------------------------
// Debug

// Some bugs only repro in release mode, so this can be helpful
//#define LEO_DEBUG_IN_RELEASE

#if defined(_DEBUG) || defined(DEBUG) || defined(LEO_DEBUG_IN_RELEASE)
    #define LEO_DEBUG
    #ifdef _WIN32
        #define LEO_DEBUG_BREAK __debugbreak()
    #else
        #define LEO_DEBUG_BREAK __builtin_trap()
    #endif
    #define LEO_DEBUG_ASSERT(cond) { if (!(cond)) { LEO_DEBUG_BREAK; } }
#else
    #define LEO_DEBUG_BREAK ;
    #define LEO_DEBUG_ASSERT(cond) ;
#endif
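
/*
    Illustrative usage: in debug builds LEO_DEBUG_ASSERT() traps on a failed
    condition; in release builds (without LEO_DEBUG_IN_RELEASE) it compiles
    to an empty statement:

        LEO_DEBUG_ASSERT(data != nullptr);
        LEO_DEBUG_ASSERT(bytes > 0);
*/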


//------------------------------------------------------------------------------
// Platform/Architecture

#ifdef _MSC_VER
    #include <intrin.h>
#endif

#if defined(ANDROID) || defined(IOS)
    #define LEO_TARGET_MOBILE
#endif // ANDROID

#if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900)
    #define LEO_TRY_AVX2 /* 256-bit */
    #include <immintrin.h>
    #define LEO_ALIGN_BYTES 32
#else // __AVX2__
    #define LEO_ALIGN_BYTES 16
#endif // __AVX2__

#if !defined(LEO_TARGET_MOBILE)
    // Note: MSVC currently only supports SSSE3 but not AVX2
    #include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8
    #include <emmintrin.h> // SSE2
#endif // LEO_TARGET_MOBILE

#if defined(HAVE_ARM_NEON_H)
    #include <arm_neon.h>
#endif // HAVE_ARM_NEON_H

#if defined(LEO_TARGET_MOBILE)

    #define LEO_ALIGNED_ACCESSES /* Inputs must be aligned to LEO_ALIGN_BYTES */

    #if defined(HAVE_ARM_NEON_H)
        // Compiler-specific 128-bit SIMD register keyword
        #define LEO_M128 uint8x16_t
        #define LEO_TRY_NEON
    #else
        #define LEO_M128 uint64_t
    #endif

#else // LEO_TARGET_MOBILE

    // Compiler-specific 128-bit SIMD register keyword
    #define LEO_M128 __m128i

#endif // LEO_TARGET_MOBILE

#ifdef LEO_TRY_AVX2
    // Compiler-specific 256-bit SIMD register keyword
    #define LEO_M256 __m256i
#endif

// Compiler-specific restrict keyword (a vendor extension, not standard C++)
#define LEO_RESTRICT __restrict

// Compiler-specific force inline keyword
#ifdef _MSC_VER
    #define LEO_FORCE_INLINE inline __forceinline
#else
    #define LEO_FORCE_INLINE inline __attribute__((always_inline))
#endif

// Compiler-specific alignment keyword
// Note: Alignment only matters for ARM NEON, where it should be 16
#ifdef _MSC_VER
    #define LEO_ALIGNED __declspec(align(LEO_ALIGN_BYTES))
#else // _MSC_VER
    #define LEO_ALIGNED __attribute__((aligned(LEO_ALIGN_BYTES)))
#endif // _MSC_VER


namespace leopard {


//------------------------------------------------------------------------------
// Runtime CPU Architecture Check

// Initialize CPU architecture flags
void InitializeCPUArch();


#if defined(LEO_TRY_NEON)
    #if defined(IOS) && defined(__ARM_NEON__)
        // Does device support NEON?
        static const bool CpuHasNeon = true;
        static const bool CpuHasNeon64 = true;
    #else
        // Does device support NEON?
        // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
        extern bool CpuHasNeon; // V6 / V7
        extern bool CpuHasNeon64; // 64-bit
    #endif
#endif

#if !defined(LEO_TARGET_MOBILE)
    #if defined(LEO_TRY_AVX2)
        // Does CPU support AVX2?
        extern bool CpuHasAVX2;
    #endif
    // Does CPU support SSSE3?
    extern bool CpuHasSSSE3;
#endif // LEO_TARGET_MOBILE


//------------------------------------------------------------------------------
// Portable Intrinsics

// Returns the bit index (0..31) of the highest nonzero bit in x
// Precondition: x != 0
LEO_FORCE_INLINE unsigned LastNonzeroBit32(unsigned x)
{
#ifdef _MSC_VER
    unsigned long index;
    // Note: Ignoring result because x != 0
    _BitScanReverse(&index, (uint32_t)x);
    return (unsigned)index;
#else
    // Note: Ignoring the zero-input case because x != 0 by precondition.
    // Use __builtin_clz (not clzl): it matches the 32-bit `unsigned` argument,
    // while clzl counts leading zeros of a (possibly 64-bit) unsigned long.
    return 31 - (unsigned)__builtin_clz(x);
#endif
}

// Returns next power of two at or above given value
// Precondition: n >= 2 (LastNonzeroBit32() requires a nonzero argument)
LEO_FORCE_INLINE unsigned NextPow2(unsigned n)
{
    return 2UL << LastNonzeroBit32(n - 1);
}
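
/*
    Illustrative example, reusing the sizing walkthrough from the decoder
    notes above (K = 200 original pieces with 20 recovery pieces):

        const unsigned m = NextPow2(20);       // 32: recovery region (power of two)
        const unsigned n = NextPow2(200 + m);  // 256: total workspace chunks
*/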


//------------------------------------------------------------------------------
// XOR Memory
//
// This works for both 8-bit and 16-bit finite fields

// x[] ^= y[]
void xor_mem(
    void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
    uint64_t bytes);

#ifdef LEO_M1_OPT

// x[] ^= y[] ^ z[]
void xor_mem_2to1(
    void * LEO_RESTRICT x,
    const void * LEO_RESTRICT y,
    const void * LEO_RESTRICT z,
    uint64_t bytes);

#endif // LEO_M1_OPT

#ifdef LEO_USE_VECTOR4_OPT

// For i = {0, 1, 2, 3}: x_i[] ^= y_i[]
void xor_mem4(
    void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
    void * LEO_RESTRICT x_1, const void * LEO_RESTRICT y_1,
    void * LEO_RESTRICT x_2, const void * LEO_RESTRICT y_2,
    void * LEO_RESTRICT x_3, const void * LEO_RESTRICT y_3,
    uint64_t bytes);

#endif // LEO_USE_VECTOR4_OPT

// For i = 0..count-1: x[i][] ^= y[i][]
void VectorXOR(
    const uint64_t bytes,
    unsigned count,
    void** x,
    void** y);
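
/*
    Illustrative usage sketch for the batched XOR routine (buffer setup is
    hypothetical; every buffer must be at least `bytes` long):

        void* x[4] = { x0, x1, x2, x3 };
        void* y[4] = { y0, y1, y2, y3 };
        VectorXOR(bytes, 4, x, y);   // x[i][] ^= y[i][] for i = 0..3
*/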


//------------------------------------------------------------------------------
// XORSummer

class XORSummer
{
public:
    // Set the addition destination and byte count
    LEO_FORCE_INLINE void Initialize(void* dest, uint64_t bytes)
    {
        DestBuffer = dest;
        Bytes = bytes;
        Waiting = nullptr;
    }

    // Accumulate some source data
    LEO_FORCE_INLINE void Add(const void* src)
    {
#ifdef LEO_M1_OPT
        // Defer every other source so that pairs can be folded in with a
        // single xor_mem_2to1() pass over the destination
        if (Waiting)
        {
            xor_mem_2to1(DestBuffer, src, Waiting, Bytes);
            Waiting = nullptr;
        }
        else
            Waiting = src;
#else // LEO_M1_OPT
        xor_mem(DestBuffer, src, Bytes);
#endif // LEO_M1_OPT
    }

    // Finalize in the destination buffer
    LEO_FORCE_INLINE void Finalize()
    {
#ifdef LEO_M1_OPT
        if (Waiting)
            xor_mem(DestBuffer, Waiting, Bytes);
#endif // LEO_M1_OPT
    }

protected:
    void* DestBuffer;
    uint64_t Bytes;
    const void* Waiting;
};
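
/*
    Illustrative usage sketch: XOR several source buffers into one destination.
    With LEO_M1_OPT enabled, sources are paired internally so that
    xor_mem_2to1() folds two of them per pass over the destination:

        XORSummer summer;
        summer.Initialize(dest, bytes);  // accumulates into dest's existing contents
        summer.Add(src_a);
        summer.Add(src_b);
        summer.Add(src_c);
        summer.Finalize();               // flushes any unpaired source
*/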


} // namespace leopard