This commit is contained in:
Christopher Taylor 2017-05-30 02:23:33 -07:00
parent c53b075eda
commit be5c625d3d
3 changed files with 28 additions and 32 deletions

View File

@ -28,6 +28,17 @@
#pragma once
/*
TODO:
+ Multithreading
+ Look into 12-bit fields as a performance optimization
+ Look into shortening the FWHT() since it takes a lot of decoder runtime
+ Unroll first/final butterflies to avoid extra copies/xors in encoder
+ Look into getting EncodeL working so we can support smaller data
+ Implement the faster decoder algorithm from {3}
*/
/*
FFT Data Layout:

View File

@ -445,7 +445,7 @@ void mul_mem(
do
{
#define LEO_MUL_256(x_ptr, y_ptr) { \
const LEO_M256 A_lo = _mm256_loadu_si256(y_ptr); \
const LEO_M256 A_lo = _mm256_loadu_si256(y_ptr); \
const LEO_M256 A_hi = _mm256_loadu_si256(y_ptr + 1); \
LEO_M256 data_0 = _mm256_and_si256(A_lo, clr_mask); \
LEO_M256 data_1 = _mm256_srli_epi64(A_lo, 4); \
@ -494,7 +494,7 @@ void mul_mem(
do
{
#define LEO_MUL_128(x_ptr, y_ptr) { \
const LEO_M128 A_lo = _mm_loadu_si128(y_ptr); \
const LEO_M128 A_lo = _mm_loadu_si128(y_ptr); \
const LEO_M128 A_hi = _mm_loadu_si128(y_ptr + 2); \
LEO_M128 data_0 = _mm_and_si128(A_lo, clr_mask); \
LEO_M128 data_1 = _mm_srli_epi64(A_lo, 4); \
@ -542,7 +542,7 @@ void fft_butterfly(
do
{
#define LEO_FFTB_256(x_ptr, y_ptr) { \
LEO_M256 y_lo = _mm256_loadu_si256(y_ptr); \
LEO_M256 y_lo = _mm256_loadu_si256(y_ptr); \
LEO_M256 data_0 = _mm256_and_si256(y_lo, clr_mask); \
LEO_M256 data_1 = _mm256_srli_epi64(y_lo, 4); \
data_1 = _mm256_and_si256(data_1, clr_mask); \
@ -558,7 +558,7 @@ void fft_butterfly(
prod_lo = _mm256_xor_si256(prod_lo, _mm256_shuffle_epi8(T3_lo, data_1)); \
prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T2_hi, data_0)); \
prod_hi = _mm256_xor_si256(prod_hi, _mm256_shuffle_epi8(T3_hi, data_1)); \
LEO_M256 x_lo = _mm256_loadu_si256(x_ptr); \
LEO_M256 x_lo = _mm256_loadu_si256(x_ptr); \
LEO_M256 x_hi = _mm256_loadu_si256(x_ptr + 1); \
x_lo = _mm256_xor_si256(prod_lo, x_lo); \
_mm256_storeu_si256(x_ptr, x_lo); \
@ -589,7 +589,7 @@ void fft_butterfly(
do
{
#define LEO_FFTB_128(x_ptr, y_ptr) { \
LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
LEO_M128 data_0 = _mm_and_si128(y_lo, clr_mask); \
LEO_M128 data_1 = _mm_srli_epi64(y_lo, 4); \
data_1 = _mm_and_si128(data_1, clr_mask); \
@ -605,7 +605,7 @@ void fft_butterfly(
prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(T3_lo, data_1)); \
prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T2_hi, data_0)); \
prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(T3_hi, data_1)); \
LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
LEO_M128 x_hi = _mm_loadu_si128(x_ptr + 2); \
x_lo = _mm_xor_si128(prod_lo, x_lo); \
_mm_storeu_si128(x_ptr, x_lo); \
@ -728,8 +728,8 @@ void ifft_butterfly(
do
{
#define LEO_IFFTB_256(x_ptr, y_ptr) { \
LEO_M256 x_lo = _mm256_loadu_si256(x_ptr); \
LEO_M256 y_lo = _mm256_loadu_si256(y_ptr); \
LEO_M256 x_lo = _mm256_loadu_si256(x_ptr); \
LEO_M256 y_lo = _mm256_loadu_si256(y_ptr); \
y_lo = _mm256_xor_si256(y_lo, x_lo); \
_mm256_storeu_si256(y_ptr, y_lo); \
LEO_M256 data_0 = _mm256_and_si256(y_lo, clr_mask); \
@ -775,8 +775,8 @@ void ifft_butterfly(
do
{
#define LEO_IFFTB_128(x_ptr, y_ptr) { \
LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
LEO_M128 x_lo = _mm_loadu_si128(x_ptr); \
LEO_M128 y_lo = _mm_loadu_si128(y_ptr); \
y_lo = _mm_xor_si128(y_lo, x_lo); \
_mm_storeu_si128(y_ptr, y_lo); \
LEO_M128 data_0 = _mm_and_si128(y_lo, clr_mask); \

View File

@ -47,7 +47,7 @@
{1} S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
"Novel Polynomial Basis with Fast Fourier Transform
and Its Application to Reed-Solomon Erasure Codes"
and Its Application to Reed-Solomon Erasure Codes"
IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
{2} D. G. Cantor, "On arithmetical algorithms over finite fields",
@ -58,23 +58,8 @@
IEEE Commun. Lett., vol.16, no.12, pp. 2036-2039, Dec. 2012.
{4} Plank, J. S., Greenan, K. M., Miller, E. L., "Screaming fast Galois Field
arithmetic using Intel SIMD instructions." In: FAST-2013: 11th Usenix
Conference on File and Storage Technologies, San Jose, 2013
*/
/*
TODO:
+ Add multi-threading to split up long parallelizable calculations
+ Final benchmarks!
+ Release version 1
+ Finish up documentation
TBD:
+ Look into 12-bit fields as a performance optimization
+ Look into shortening the FWHT() since it takes a lot of decoder runtime
+ Unroll first/final butterflies to avoid extra copies/xors in encoder
+ Look into getting EncodeL working so we can support smaller data (Ask Lin)
+ Look into using FFT_m instead of FFT_n for decoder
arithmetic using Intel SIMD instructions." In: FAST-2013: 11th Usenix
Conference on File and Storage Technologies, San Jose, 2013
*/
// Library version
@ -160,11 +145,11 @@ typedef enum LeopardFlagsT
/*
leo_encode_work_count()
Calculate the number of work_data buffers to provide to leo_encode().
Calculate the number of work_data buffers to provide to leo_encode().
The sum of original_count + recovery_count must not exceed 65536.
Returns the work_count value to pass into leo_encode().
Returns the work_count value to pass into leo_encode().
Returns 0 on invalid input.
*/
LEO_EXPORT unsigned leo_encode_work_count(
@ -221,11 +206,11 @@ LEO_EXPORT LeopardResult leo_encode(
/*
leo_decode_work_count()
Calculate the number of work_data buffers to provide to leo_decode().
Calculate the number of work_data buffers to provide to leo_decode().
The sum of original_count + recovery_count must not exceed 65536.
Returns the work_count value to pass into leo_decode().
Returns the work_count value to pass into leo_decode().
Returns 0 on invalid input.
*/
LEO_EXPORT unsigned leo_decode_work_count(