diff --git a/LeopardCommon.h b/LeopardCommon.h
index 04abdf6..62cb94f 100644
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@@ -77,6 +77,9 @@
     Unrolling is used in the code to accomplish both these optimizations.
     * The final FFT can be truncated also if recovery set is not a power of 2.
     It is easy to truncate the FFT by ending the inner loop early.
+    * The FFT operations can be unrolled two layers at a time so that instead
+    of writing the result of the first layer out and reading it back in for
+    the second layer, the second layer can consume them directly from registers.
 */
 
 /*
@@ -126,6 +129,7 @@
     it starts mixing with non-zero data.
 
     The formal derivative is applied to the entire workspace of N chunks.
+    This is a massive XOR loop that runs 4 columns in parallel for speed.
 
     The FFT is applied to the entire workspace of N chunks.
     The FFT is optimized by only performing intermediate calculations required
@@ -165,20 +169,23 @@
 // Define this to enable the optimized version of FWHT()
 #define LEO_FWHT_OPT
 
-// Avoid scheduling FFT operations that are unused
-#define LEO_SCHEDULE_OPT
-
 // Avoid calculating final FFT values in decoder using bitfield
 #define LEO_ERROR_BITFIELD_OPT
 
 // Optimize M=1 case
 #define LEO_M1_OPT
 
+// Interleave butterfly operations between layer pairs in FFT
+#define LEO_INTERLEAVE_BUTTERFLY4_OPT
+
+
+// FIXME: Remove these when FF16 is done
+
 // Unroll inner loops 4 times
 #define LEO_USE_VECTOR4_OPT
 
-// Interleave butterfly operations between layer pairs in FFT
-#define LEO_INTERLEAVE_BUTTERFLY4_OPT
+// Avoid scheduling FFT operations that are unused
+#define LEO_SCHEDULE_OPT
 
 
 //------------------------------------------------------------------------------
@@ -380,41 +387,39 @@ class XORSummer
 {
 public:
-    // Set the addition destination and byte count
+    // Set the addition destination
-    LEO_FORCE_INLINE void Initialize(void* dest, uint64_t bytes)
+    LEO_FORCE_INLINE void Initialize(void* dest)
     {
         DestBuffer = dest;
-        Bytes = bytes;
         Waiting = nullptr;
     }
 
     // Accumulate some source data
-    LEO_FORCE_INLINE void Add(const void* src)
+    LEO_FORCE_INLINE void Add(const void* src, const uint64_t bytes)
     {
 #ifdef LEO_M1_OPT
         if (Waiting)
         {
-            xor_mem_2to1(DestBuffer, src, Waiting, Bytes);
+            xor_mem_2to1(DestBuffer, src, Waiting, bytes);
             Waiting = nullptr;
         }
         else
             Waiting = src;
 #else // LEO_M1_OPT
-        xor_mem(DestBuffer, src, Bytes);
+        xor_mem(DestBuffer, src, bytes);
 #endif // LEO_M1_OPT
     }
 
     // Finalize in the destination buffer
-    LEO_FORCE_INLINE void Finalize()
+    LEO_FORCE_INLINE void Finalize(const uint64_t bytes)
     {
 #ifdef LEO_M1_OPT
         if (Waiting)
-            xor_mem(DestBuffer, Waiting, Bytes);
+            xor_mem(DestBuffer, Waiting, bytes);
 #endif // LEO_M1_OPT
     }
 
 protected:
     void* DestBuffer;
-    uint64_t Bytes;
     const void* Waiting;
 };
 
diff --git a/leopard.cpp b/leopard.cpp
index 4baed74..37bd8e4 100644
--- a/leopard.cpp
+++ b/leopard.cpp
@@ -111,12 +111,12 @@ static void EncodeM1(
     memcpy(recovery_data, original_data[0], buffer_bytes);
 
     leopard::XORSummer summer;
-    summer.Initialize(recovery_data, buffer_bytes);
+    summer.Initialize(recovery_data);
 
     for (unsigned i = 1; i < original_count; ++i)
-        summer.Add(original_data[i]);
+        summer.Add(original_data[i], buffer_bytes);
 
-    summer.Finalize();
+    summer.Finalize(buffer_bytes);
 }
 
 LEO_EXPORT LeopardResult leo_encode(
@@ -223,13 +223,13 @@ static void DecodeM1(
     memcpy(work_data, recovery_data, buffer_bytes);
 
     leopard::XORSummer summer;
-    summer.Initialize(work_data, buffer_bytes);
+    summer.Initialize(work_data);
 
     for (unsigned i = 0; i < original_count; ++i)
         if (original_data[i])
-            summer.Add(original_data[i]);
+            summer.Add(original_data[i], buffer_bytes);
 
-    summer.Finalize();
+    summer.Finalize(buffer_bytes);
 }
 
 LEO_EXPORT LeopardResult leo_decode(