From 4d785616897a9d40989f0f918716063ec865fecd Mon Sep 17 00:00:00 2001
From: Christopher Taylor <christopher.taylor@oculus.com>
Date: Wed, 24 May 2017 01:23:19 -0700
Subject: [PATCH] Small improvements, add TODOs

---
 lhc_rs.cpp | 269 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 177 insertions(+), 92 deletions(-)

diff --git a/lhc_rs.cpp b/lhc_rs.cpp
index 6552593..76cb178 100644
--- a/lhc_rs.cpp
+++ b/lhc_rs.cpp
@@ -12,6 +12,50 @@
 #include <stdlib.h>
 
 
+/*
+    TODO:
+    + Write C API and unit tester
+        + Limit input to multiples of 64 bytes
+    + Replace GFSymbol with a file data pointer
+    + New 16-bit Muladd inner loops
+        + Class to contain the (large) muladd tables
+    + Preliminary benchmarks for large data!
+    + New 8-bit Muladd inner loops
+    + Benchmarks for smaller data!
+    + Refactor software
+        + Pick a name for the software better than LHC_RS
+        + I think it should be split up into several C++ modules
+    + Write detailed comments for all the routines
+    + Look into getting EncodeL working so we can support smaller data (Ask Lin)
+    + Look into using k instead of k2 to speed up decoder (Ask Lin)
+    + Avoid performing FFT/IFFT intermediate calculations we're not going to use
+    + Benchmarks, fun!
+    + Add multi-threading to split up long parallelizable calculations
+    + Final benchmarks!
+    + Finish up documentation
+    + Release version 1
+
+
+    Muladd implementation notes:
+
+    Specialize for 1-3 rows at a time since often times we're multiplying by
+    the same (skew) value repeatedly, as the ISA-L library does here:
+
+    https://github.com/01org/isa-l/blob/master/erasure_code/gf_3vect_mad_avx.asm#L258
+
+    Except we should be doing it for 16-bit Galois Field.
+    To implement that use the ALTMAP trick from Jerasure:
+
+    http://lab.jerasure.org/jerasure/gf-complete/blob/master/src/gf_w16.c#L1140
+
+    Except we should also support AVX2 since that is a 40% perf boost, so put
+    the high and low bytes 32 bytes instead of 16 bytes apart.
+
+    Also I think we should go ahead and precompute the multiply tables since
+    it avoids a bunch of memory lookups for each muladd, and only costs 8 MB.
+*/
+
+
 //------------------------------------------------------------------------------
 // Debug
 
@@ -250,7 +294,9 @@ static LHC_FORCE_INLINE void SIMDSafeFree(void* ptr)
 //------------------------------------------------------------------------------
 // Field
 
-#if 1
+//#define LHC_SHORT_FIELD
+
+#ifdef LHC_SHORT_FIELD
 typedef uint8_t GFSymbol;
 static const unsigned kGFBits = 8;
 static const unsigned kGFPolynomial = 0x11D;
@@ -390,6 +436,7 @@ static GFSymbol mulE(GFSymbol a, GFSymbol b)
     return GFExp[sum];
 }
 
+
 //------------------------------------------------------------------------------
 // Fast Walsh-Hadamard Transform (FWHT) Mod Q
 //
@@ -398,34 +445,54 @@ static GFSymbol mulE(GFSymbol a, GFSymbol b)
 // Define this to enable the optimized version of FWHT()
 #define LHC_FWHT_OPTIMIZED
 
+typedef GFSymbol fwht_t;
+
+// {a, b} = {a + b, a - b} (Mod Q)
+static LHC_FORCE_INLINE void FWHT_2(fwht_t& LHC_RESTRICT a, fwht_t& LHC_RESTRICT b)
+{
+    const fwht_t sum = AddModQ(a, b);
+    const fwht_t dif = SubModQ(a, b);
+    a = sum;
+    b = dif;
+}
+
+/*
+    FWHT is a minor slice of the runtime and does not grow with data size,
+    but I did attempt a few additional optimizations that failed:
+
+    I've attempted to vectorize (with partial reductions) FWHT_4(data, s),
+    which is 70% of the algorithm, but it was slower.  Left in _attic_.
+
+    I've attempted to avoid reductions in all or parts of the FWHT.
+    The final modular reduction ends up being slower than the savings.
+    Specifically I tried doing it for the whole FWHT and also I tried
+    doing it just for the FWHT_2 loop in the main routine, but both
+    approaches are slower than partial reductions.
+
+    Replacing word reads with wider reads does speed up the operation, but
+    at too high a complexity cost relative to minor perf improvement.
+*/
+
 #ifndef LHC_FWHT_OPTIMIZED
 
 // Reference implementation
-static void FWHT(GFSymbol* data, const unsigned bits)
+static void FWHT(fwht_t* data, const unsigned bits)
 {
     const unsigned size = (unsigned)(1UL << bits);
     for (unsigned width = 1; width < size; width <<= 1)
         for (unsigned i = 0; i < size; i += (width << 1))
             for (unsigned j = i; j < (width + i); ++j)
-                CrossAddSubModQ(data[j], data[j + width]);
+                FWHT_2(data[j], data[j + width]);
 }
 
 #else
 
-// {a, b} = {a + b, a - b} (mod Q)
-static inline void FWHT_2(GFSymbol& a, GFSymbol& b)
+static LHC_FORCE_INLINE void FWHT_4(fwht_t* data)
 {
-    const GFSymbol dif = SubModQ(a, b);
-    const GFSymbol sum = AddModQ(a, b);
-    a = sum, b = dif;
-}
-
-static inline void FWHT_4(GFSymbol* data)
-{
-    GFSymbol t0 = data[0];
-    GFSymbol t1 = data[1];
-    GFSymbol t2 = data[2];
-    GFSymbol t3 = data[3];
+    fwht_t t0 = data[0];
+    fwht_t t1 = data[1];
+    fwht_t t2 = data[2];
+    fwht_t t3 = data[3];
     FWHT_2(t0, t1);
     FWHT_2(t2, t3);
     FWHT_2(t0, t2);
@@ -436,13 +503,13 @@ static inline void FWHT_4(GFSymbol* data)
     data[3] = t3;
 }
 
-static inline void FWHT_4(GFSymbol* data, unsigned s)
+static LHC_FORCE_INLINE void FWHT_4(fwht_t* data, unsigned s)
 {
     unsigned x = 0;
-    GFSymbol t0 = data[x];  x += s;
-    GFSymbol t1 = data[x];  x += s;
-    GFSymbol t2 = data[x];  x += s;
-    GFSymbol t3 = data[x];
+    fwht_t t0 = data[x];  x += s;
+    fwht_t t1 = data[x];  x += s;
+    fwht_t t2 = data[x];  x += s;
+    fwht_t t3 = data[x];
     FWHT_2(t0, t1);
     FWHT_2(t2, t3);
     FWHT_2(t0, t2);
@@ -454,16 +521,16 @@ static inline void FWHT_4(GFSymbol* data, unsigned s)
     data[y] = t3;
 }
 
-static inline void FWHT_8(GFSymbol* data)
+static inline void FWHT_8(fwht_t* data)
 {
-    GFSymbol t0 = data[0];
-    GFSymbol t1 = data[1];
-    GFSymbol t2 = data[2];
-    GFSymbol t3 = data[3];
-    GFSymbol t4 = data[4];
-    GFSymbol t5 = data[5];
-    GFSymbol t6 = data[6];
-    GFSymbol t7 = data[7];
+    fwht_t t0 = data[0];
+    fwht_t t1 = data[1];
+    fwht_t t2 = data[2];
+    fwht_t t3 = data[3];
+    fwht_t t4 = data[4];
+    fwht_t t5 = data[5];
+    fwht_t t6 = data[6];
+    fwht_t t7 = data[7];
     FWHT_2(t0, t1);
     FWHT_2(t2, t3);
     FWHT_2(t4, t5);
@@ -486,24 +553,24 @@ static inline void FWHT_8(GFSymbol* data)
     data[7] = t7;
 }
 
-static inline void FWHT_16(GFSymbol* data)
+static inline void FWHT_16(fwht_t* data)
 {
-    GFSymbol t0 = data[0];
-    GFSymbol t1 = data[1];
-    GFSymbol t2 = data[2];
-    GFSymbol t3 = data[3];
-    GFSymbol t4 = data[4];
-    GFSymbol t5 = data[5];
-    GFSymbol t6 = data[6];
-    GFSymbol t7 = data[7];
-    GFSymbol t8 = data[8];
-    GFSymbol t9 = data[9];
-    GFSymbol t10 = data[10];
-    GFSymbol t11 = data[11];
-    GFSymbol t12 = data[12];
-    GFSymbol t13 = data[13];
-    GFSymbol t14 = data[14];
-    GFSymbol t15 = data[15];
+    fwht_t t0 = data[0];
+    fwht_t t1 = data[1];
+    fwht_t t2 = data[2];
+    fwht_t t3 = data[3];
+    fwht_t t4 = data[4];
+    fwht_t t5 = data[5];
+    fwht_t t6 = data[6];
+    fwht_t t7 = data[7];
+    fwht_t t8 = data[8];
+    fwht_t t9 = data[9];
+    fwht_t t10 = data[10];
+    fwht_t t11 = data[11];
+    fwht_t t12 = data[12];
+    fwht_t t13 = data[13];
+    fwht_t t14 = data[14];
+    fwht_t t15 = data[15];
     FWHT_2(t0, t1);
     FWHT_2(t2, t3);
     FWHT_2(t4, t5);
@@ -554,7 +621,7 @@ static inline void FWHT_16(GFSymbol* data)
     data[15] = t15;
 }
 
-void FWHT_SmallData(GFSymbol* data, unsigned ldn)
+static void FWHT_SmallData(fwht_t* data, unsigned ldn)
 {
     const unsigned n = (1UL << ldn);
 
@@ -586,8 +653,8 @@ void FWHT_SmallData(GFSymbol* data, unsigned ldn)
     }
 }
 
-// Decimation in time version of the transform
-static void FWHT(GFSymbol* data, const unsigned ldn)
+// Decimation in time (DIT) version
+static void FWHT(fwht_t* data, const unsigned ldn)
 {
     if (ldn <= 13)
     {
@@ -602,10 +669,9 @@ static void FWHT(GFSymbol* data, const unsigned ldn)
     for (unsigned ldm = 5; ldm < ldn; ++ldm)
         FWHT(data + (unsigned)(1UL << ldm), ldm);
 
-    for (unsigned ldm = 1; ldm <= ldn; ++ldm)
+    for (unsigned ldm = 0; ldm < ldn; ++ldm)
     {
-        const unsigned m = (1UL << ldm);
-        const unsigned mh = (m >> 1);
+        const unsigned mh = (1UL << ldm);
         for (unsigned t1 = 0, t2 = mh; t1 < mh; ++t1, ++t2)
             FWHT_2(data[t1], data[t2]);
     }
@@ -866,12 +932,12 @@ static void FLT(GFSymbol* data, const unsigned size, const unsigned index)
 // FFT Initialization
 
 static GFSymbol B[kFieldSize >> 1];     // factors used in formal derivative
-static GFSymbol log_walsh[kFieldSize];  // factors used in the evaluation of the error locator polynomial
+static fwht_t log_walsh[kFieldSize];  // factors used in the evaluation of the error locator polynomial
 
 // Initialize skewVec[], B[], log_walsh[]
 static void InitFieldOperations()
 {
-	GFSymbol temp[kGFBits - 1];
+    GFSymbol temp[kGFBits - 1];
 
     for (unsigned i = 1; i < kGFBits; ++i)
         temp[i - 1] = (GFSymbol)((unsigned)1 << i);
@@ -904,7 +970,7 @@ static void InitFieldOperations()
     for (unsigned i = 1; i < (kGFBits - 1); ++i)
         temp[i] = (kFieldModulus - temp[i] + temp[i - 1]) % kFieldModulus;
 
-	B[0] = 0;
+    B[0] = 0;
     for (unsigned i = 0; i < (kGFBits - 1); ++i)
     {
         const unsigned depart = ((unsigned)1 << i);
@@ -916,7 +982,7 @@ static void InitFieldOperations()
     for (unsigned i = 0; i < kFieldSize; ++i)
         log_walsh[i] = GFLog[i];
 
-	log_walsh[0] = 0;
+    log_walsh[0] = 0;
 
     FWHT(log_walsh, kGFBits);
 }
@@ -928,9 +994,9 @@ static void InitFieldOperations()
 // Encoding alg for k/n<0.5: message is a power of two
 static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword)
 {
-	memcpy(codeword, data, sizeof(GFSymbol) * k);
+    memcpy(codeword, data, sizeof(GFSymbol) * k);
 
-	IFLT(codeword, k, 0);
+    IFLT(codeword, k, 0);
 
     for (unsigned i = k; i < kFieldSize; i += k)
     {
@@ -946,7 +1012,7 @@ static void encodeL(GFSymbol* data, const unsigned k, GFSymbol* codeword)
 // data: message array. parity: parity array. mem: buffer(size>= n-k)
 static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GFSymbol* mem)
 {
-	const unsigned t = kFieldSize - k;
+    const unsigned t = kFieldSize - k;
 
     memset(parity, 0, sizeof(GFSymbol) * t);
 
@@ -966,9 +1032,9 @@ static void encodeH(const GFSymbol* data, const unsigned k, GFSymbol* parity, GF
 //------------------------------------------------------------------------------
 // Decoder
 
-static void decode(GFSymbol* codeword, const bool* erasure)
+static void decode(GFSymbol* codeword, unsigned k, const bool* erasure)
 {
-    GFSymbol log_walsh2[kFieldSize];
+    fwht_t log_walsh2[kFieldSize];
 
     // Compute the evaluations of the error locator polynomial
     for (unsigned i = 0; i < kFieldSize; ++i)
@@ -977,30 +1043,36 @@ static void decode(GFSymbol* codeword, const bool* erasure)
     FWHT(log_walsh2, kGFBits);
 
     for (unsigned i = 0; i < kFieldSize; ++i)
-        log_walsh2[i] = ((unsigned)log_walsh2[i] * log_walsh[i]) % kFieldModulus;
+        log_walsh2[i] = ((unsigned)log_walsh2[i] * (unsigned)log_walsh[i]) % kFieldModulus;
 
     FWHT(log_walsh2, kGFBits);
 
-    for (unsigned i = 0; i < kFieldSize; ++i)
-        if (erasure[i])
-            log_walsh2[i] = kFieldModulus - log_walsh2[i];
-
     // k2 can be replaced with k
-	const unsigned k2 = kFieldSize;
+    const unsigned k2 = kFieldSize;
+    //const unsigned k2 = k; // cannot actually be replaced with k.  what else need to change?
 
     for (unsigned i = 0; i < kFieldSize; ++i)
-        codeword[i] = erasure[i] ? 0 : mulE(codeword[i], log_walsh2[i]);
+    {
+        if (erasure[i])
+        {
+            codeword[i] = 0;
+        }
+        else
+        {
+            codeword[i] = mulE(codeword[i], log_walsh2[i]);
+        }
+    }
 
     IFLT(codeword, kFieldSize, 0);
 
     // formal derivative
     for (unsigned i = 0; i < kFieldSize; i += 2)
     {
-		codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]);
-		codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]);
-	}
+        codeword[i] = mulE(codeword[i], kFieldModulus - B[i >> 1]);
+        codeword[i + 1] = mulE(codeword[i + 1], kFieldModulus - B[i >> 1]);
+    }
 
-	formal_derivative(codeword, k2);
+    formal_derivative(codeword, k2);
 
     for (unsigned i = 0; i < k2; i += 2)
     {
@@ -1008,41 +1080,48 @@ static void decode(GFSymbol* codeword, const bool* erasure)
         codeword[i + 1] = mulE(codeword[i + 1], B[i >> 1]);
     }
 
-	FLT(codeword, k2, 0);
+    FLT(codeword, k2, 0);
 
     for (unsigned i = 0; i < k2; ++i)
-        codeword[i] = erasure[i] ? mulE(codeword[i], log_walsh2[i]) : 0;
+    {
+        if (erasure[i])
+        {
+            codeword[i] = mulE(codeword[i], kFieldModulus - log_walsh2[i]);
+        }
+    }
 }
 
 
 //------------------------------------------------------------------------------
 // Test Application
 
-void test(unsigned k)
+void test(unsigned k, unsigned seed)
 {
-	//-----------Generating message----------
+    srand(seed);
+
+    //-----------Generating message----------
 
     // Message array
-	GFSymbol data[kFieldSize] = {0};
+    GFSymbol data[kFieldSize] = {0};
 
     // Filled with random numbers
     for (unsigned i = kFieldSize - k; i < kFieldSize; ++i)
         data[i] = (GFSymbol)rand();
 
 
-	//---------encoding----------
+    //---------encoding----------
 
-	GFSymbol codeword[kFieldSize];
-	encodeH(&data[kFieldSize - k], k, data, codeword);
-	//encodeL(data, k, codeword);
+    GFSymbol codeword[kFieldSize];
+    encodeH(&data[kFieldSize - k], k, data, codeword);
+    //encodeL(data, k, codeword); // does not seem to work with any input?  what else needs to change?
 
-	memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize);
+    memcpy(codeword, data, sizeof(GFSymbol) * kFieldSize);
 
 
-	//--------erasure simulation---------
+    //--------erasure simulation---------
 
     // Array indicating erasures
-	bool erasure[kFieldSize] = {
+    bool erasure[kFieldSize] = {
         false
     };
 
@@ -1068,8 +1147,8 @@ void test(unsigned k)
             codeword[i] = 0;
 
 
-	//---------main processing----------
-    decode(codeword, erasure);
+    //---------main processing----------
+    decode(codeword, k, erasure);
 
     // Check the correctness of the result
     for (unsigned i = 0; i < kFieldSize; ++i)
@@ -1078,13 +1157,14 @@ void test(unsigned k)
         {
             if (data[i] != codeword[i])
             {
-                printf("Decoding Error!\n");
+                printf("Decoding Error with seed = %d!\n", seed);
+                LHC_DEBUG_BREAK;
                 return;
             }
         }
     }
 
-    printf("Decoding is successful!\n");
+    //printf("Decoding is successful!\n");
 }
 
 
@@ -1093,8 +1173,6 @@ void test(unsigned k)
 
 int main(int argc, char **argv)
 {
-    srand((unsigned)time(NULL));
-
     // Initialize architecture-specific code
     lhc_architecture_init();
 
@@ -1104,10 +1182,17 @@ int main(int argc, char **argv)
     // Compute factors used in erasure decoder
     InitFieldOperations();
 
+    unsigned seed = (unsigned)time(NULL);
     for (;;)
     {
         // test(int k), k: message size
-        test(kFieldSize / 2);
+        /*
+            EncodeH works for kFieldSize / 2 and kFieldSize * 3 / 4, etc,
+            s.t. the number of recovery pieces is a power of two
+        */
+        test(kFieldSize / 2, seed);
+
+        ++seed;
     }
 
     return 0;