diff --git a/LeopardCommon.cpp b/LeopardCommon.cpp
index 141e03d..6f03b5b 100644
--- a/LeopardCommon.cpp
+++ b/LeopardCommon.cpp
@@ -43,11 +43,11 @@ namespace leopard {
 
 #if defined(LEO_TRY_NEON)
 # if defined(IOS) && defined(__ARM_NEON__)
-// Requires iPhone 5S or newer
+    // Requires iPhone 5S or newer
 # else
-// Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
-bool CpuHasNeon = false; // V6 / V7
-bool CpuHasNeon64 = false; // 64-bit
+    // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
+    bool CpuHasNeon = false; // V6 / V7
+    bool CpuHasNeon64 = false; // 64-bit
 # endif
 #endif
 
@@ -60,8 +60,9 @@ bool CpuHasNeon64 = false; // 64-bit
 #endif
 
 #ifdef LEO_TRY_AVX2
-bool CpuHasAVX2 = false;
+    bool CpuHasAVX2 = false;
 #endif
+    bool CpuHasSSSE3 = false;
 
 #define CPUID_EBX_AVX2 0x00000020
 
@@ -191,6 +192,8 @@ void xor_mem(
     } while (bytes > 0);
 }
 
+#ifdef LEO_USE_VECTOR4_OPT
+
 void xor_mem4(
     void * LEO_RESTRICT vx_0, const void * LEO_RESTRICT vy_0,
     void * LEO_RESTRICT vx_1, const void * LEO_RESTRICT vy_1,
@@ -321,12 +324,15 @@ void xor_mem4(
     } while (bytes > 0);
 }
 
+#endif // LEO_USE_VECTOR4_OPT
+
 void VectorXOR(
     const uint64_t bytes,
     unsigned count,
     void** x,
     void** y)
 {
+#ifdef LEO_USE_VECTOR4_OPT
     while (count >= 4)
     {
         xor_mem4(
@@ -338,9 +344,10 @@ void VectorXOR(
         x += 4, y += 4;
         count -= 4;
     }
+#endif // LEO_USE_VECTOR4_OPT
 
     for (unsigned i = 0; i < count; ++i)
-        xor_mem(y[i], x[i], bytes);
+        xor_mem(x[i], y[i], bytes);
 }
 
 
diff --git a/LeopardCommon.h b/LeopardCommon.h
index 7cc9764..7535854 100644
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@@ -51,6 +51,16 @@
 
 #include
 
+//------------------------------------------------------------------------------
+// Constants
+
+// Unroll inner loops 4 times
+//#define LEO_USE_VECTOR4_OPT
+
+// Define this to enable the optimized version of FWHT()
+//#define LEO_FWHT_OPT
+
+
 //------------------------------------------------------------------------------
 // Debug
 
@@ -152,26 +162,27 @@ namespace leopard {
 
 // Initialize CPU architecture flags
 void InitializeCPUArch();
 
+
 #if defined(LEO_TRY_NEON)
 # if defined(IOS) && defined(__ARM_NEON__)
-// Does device support NEON?
-static const bool CpuHasNeon = true;
-static const bool CpuHasNeon64 = true;
+    // Does device support NEON?
+    static const bool CpuHasNeon = true;
+    static const bool CpuHasNeon64 = true;
 # else
-// Does device support NEON?
-// Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
-extern bool CpuHasNeon; // V6 / V7
-extern bool CpuHasNeon64; // 64-bit
+    // Does device support NEON?
+    // Remember to add LOCAL_STATIC_LIBRARIES := cpufeatures
+    extern bool CpuHasNeon; // V6 / V7
+    extern bool CpuHasNeon64; // 64-bit
 # endif
 #endif
 
 #if !defined(LEO_TARGET_MOBILE)
 # if defined(LEO_TRY_AVX2)
-// Does CPU support AVX2?
-extern bool CpuHasAVX2;
+    // Does CPU support AVX2?
+    extern bool CpuHasAVX2;
 # endif
-// Does CPU support SSSE3?
-extern bool CpuHasSSSE3;
+    // Does CPU support SSSE3?
+    extern bool CpuHasSSSE3;
 #endif // LEO_TARGET_MOBILE
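
A note on the VectorXOR fix in LeopardCommon.cpp above: xor_mem(x, y, bytes) computes x[] ^= y[], and xor_mem4 already treats the first pointer of each pair as the destination, so the old scalar tail xor_mem(y[i], x[i], bytes) XORed in the opposite direction from the unrolled xor_mem4 loop, and results depended on whether count was a multiple of 4. Together with the swapped VectorXOR(bytes, count, y, x) call sites later in this patch, both paths now perform y[i] ^= x[i] at those call sites while matching the documented convention. A minimal scalar sketch of that convention, useful when auditing the fix (scalar_xor_mem and scalar_vector_xor are hypothetical names, not library functions):

#include <cstdint>

// Reference model of xor_mem's contract: x[] ^= y[].
static void scalar_xor_mem(void* vx, const void* vy, uint64_t bytes)
{
    uint8_t* x = static_cast<uint8_t*>(vx);
    const uint8_t* y = static_cast<const uint8_t*>(vy);
    for (uint64_t i = 0; i < bytes; ++i)
        x[i] ^= y[i]; // the first argument is always the destination
}

// Shape of VectorXOR's tail loop after the fix: row x[i] accumulates y[i].
static void scalar_vector_xor(uint64_t bytes, unsigned count, void** x, void** y)
{
    for (unsigned i = 0; i < count; ++i)
        scalar_xor_mem(x[i], y[i], bytes);
}
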
@@ -210,6 +221,8 @@ void xor_mem(
     void * LEO_RESTRICT x, const void * LEO_RESTRICT y,
     uint64_t bytes);
 
+#ifdef LEO_USE_VECTOR4_OPT
+
 // For i = {0, 1, 2, 3}: x_i[] ^= y_i[]
 void xor_mem4(
     void * LEO_RESTRICT x_0, const void * LEO_RESTRICT y_0,
@@ -218,6 +231,8 @@ void xor_mem4(
     void * LEO_RESTRICT x_3, const void * LEO_RESTRICT y_3,
     uint64_t bytes);
 
+#endif // LEO_USE_VECTOR4_OPT
+
 // x[] ^= y[]
 void VectorXOR(
     const uint64_t bytes,
diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp
index 66ccf04..7cfb9b4 100644
--- a/LeopardFF16.cpp
+++ b/LeopardFF16.cpp
@@ -32,9 +32,6 @@
 
 #include
 
-// Define this to enable the optimized version of FWHT()
-#define LEO_FF16_FWHT_OPTIMIZED
-
 namespace leopard { namespace ff16 {
 
@@ -84,7 +81,7 @@ static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
 //------------------------------------------------------------------------------
 // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
 
-#if defined(LEO_FF16_FWHT_OPTIMIZED)
+#if defined(LEO_FWHT_OPT)
 
 // {a, b} = {a + b, a - b} (Mod Q)
 static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
@@ -285,7 +282,7 @@ static void FWHT(ffe_t* data, const unsigned ldn)
     }
 }
 
-#else // LEO_FF16_FWHT_OPTIMIZED
+#else // LEO_FWHT_OPT
 
 // Reference implementation
 void FWHT(ffe_t* data, const unsigned bits)
@@ -297,7 +294,7 @@ void FWHT(ffe_t* data, const unsigned bits)
             FWHT_2(data[j], data[j + width]);
 }
 
-#endif // LEO_FF16_FWHT_OPTIMIZED
+#endif // LEO_FWHT_OPT
 
 // Transform specialized for the finite field order
 void FWHT(ffe_t data[kOrder])
diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp
index ca8a05b..eccc5c4 100644
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@@ -32,9 +32,6 @@
 
 #include
 
-// Define this to enable the optimized version of FWHT()
-#define LEO_FF8_FWHT_OPTIMIZED
-
 namespace leopard { namespace ff8 {
 
@@ -81,8 +78,6 @@ static inline ffe_t SubMod(const ffe_t a, const ffe_t b)
 //------------------------------------------------------------------------------
 // Fast Walsh-Hadamard Transform (FWHT) (mod kModulus)
 
-#if defined(LEO_FF8_FWHT_OPTIMIZED)
-
 // {a, b} = {a + b, a - b} (Mod Q)
 static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b)
 {
@@ -92,6 +87,8 @@ static LEO_FORCE_INLINE void FWHT_2(ffe_t& LEO_RESTRICT a, ffe_t& LEO_RESTRICT b
     b = dif;
 }
 
+#if defined(LEO_FWHT_OPT)
+
 static LEO_FORCE_INLINE void FWHT_4(ffe_t* data)
 {
     ffe_t t0 = data[0];
@@ -191,7 +188,7 @@ static void FWHT(ffe_t* data, const unsigned ldn)
     }
 }
 
-#else // LEO_FF8_FWHT_OPTIMIZED
+#else // LEO_FWHT_OPT
 
 // Reference implementation
 void FWHT(ffe_t* data, const unsigned bits)
@@ -203,7 +200,7 @@ void FWHT(ffe_t* data, const unsigned bits)
             FWHT_2(data[j], data[j + width]);
 }
 
-#endif // LEO_FF8_FWHT_OPTIMIZED
+#endif // LEO_FWHT_OPT
 
 // Transform specialized for the finite field order
 void FWHT(ffe_t data[kOrder])
@@ -272,7 +269,7 @@ struct {
 #endif // LEO_TRY_AVX2
 
 // Returns a * Log(b)
-static ffe_t FFEMultiplyLog(ffe_t a, ffe_t log_b)
+static ffe_t MultiplyLog(ffe_t a, ffe_t log_b)
 {
     if (a == 0)
         return 0;
@@ -285,10 +282,10 @@ void InitializeMultiplyTables()
     for (int log_y = 0; log_y < 256; ++log_y)
     {
         uint8_t lo[16], hi[16];
-        for (unsigned char x = 0; x < 16; ++x)
+        for (uint8_t x = 0; x < 16; ++x)
         {
-            lo[x] = FFEMultiplyLog(x, static_cast<ffe_t>(log_y));
-            hi[x] = FFEMultiplyLog(x << 4, static_cast<ffe_t>(log_y));
+            lo[x] = MultiplyLog(x, static_cast<ffe_t>(log_y));
+            hi[x] = MultiplyLog(x << 4, static_cast<ffe_t>(log_y));
         }
 
         const LEO_M128 table_lo = _mm_loadu_si128((LEO_M128*)lo);
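
For context on the InitializeMultiplyTables() hunk above: multiplication by a fixed y in GF(2^8) is linear over GF(2), so it distributes across the XOR of the two nibbles of x, which is why a pair of 16-entry tables per log_y can replace a full 256-entry row. A hedged scalar model of the lookup the SSSE3 PSHUFB path evaluates (the function name is hypothetical; lo and hi are the tables filled above):

#include <cstdint>

// Scalar model of the nibble-table multiply (illustrative only).
// Multiplication by a fixed y in GF(2^8) is GF(2)-linear, so:
//   x * y == ((x & 0x0F) * y) ^ ((x & 0xF0) * y)
// lo[n] holds n * y and hi[n] holds (n << 4) * y for n = 0..15,
// exactly as MultiplyLog() computes them in the hunk above.
static uint8_t MulViaNibbleTables(uint8_t x,
                                  const uint8_t lo[16],
                                  const uint8_t hi[16])
{
    return lo[x & 0x0F] ^ hi[x >> 4];
}

The SSSE3 kernel applies the same pair of lookups to 16 bytes per shuffle, which is why the tables are loaded straight into LEO_M128 registers after being filled.
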
@@ -454,6 +451,7 @@ void fft_butterfly(
     } while (bytes > 0);
 }
 
+#ifdef LEO_USE_VECTOR4_OPT
 
 void fft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
@@ -548,6 +546,8 @@ void fft_butterfly4(
     } while (bytes > 0);
 }
 
+#endif // LEO_USE_VECTOR4_OPT
+
 
 //------------------------------------------------------------------------------
 // IFFT Operations
 
@@ -626,6 +626,7 @@ void ifft_butterfly(
     } while (bytes > 0);
 }
 
+#ifdef LEO_USE_VECTOR4_OPT
 
 void ifft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
@@ -720,6 +721,8 @@ void ifft_butterfly4(
     } while (bytes > 0);
 }
 
+#endif // LEO_USE_VECTOR4_OPT
+
 
 //------------------------------------------------------------------------------
 // FFT
 
@@ -751,12 +754,12 @@ static void FFTInitialize()
             FFTSkew[j + s] = FFTSkew[j] ^ temp[i];
     }
 
-    temp[m] = kModulus - LogLUT[FFEMultiplyLog(temp[m], LogLUT[temp[m] ^ 1])];
+    temp[m] = kModulus - LogLUT[MultiplyLog(temp[m], LogLUT[temp[m] ^ 1])];
 
     for (unsigned i = m + 1; i < (kBits - 1); ++i)
     {
         const ffe_t sum = AddMod(LogLUT[temp[i] ^ 1], temp[m]);
-        temp[i] = FFEMultiplyLog(temp[i], sum);
+        temp[i] = MultiplyLog(temp[i], sum);
     }
 }
 
@@ -780,10 +783,11 @@ void VectorFFTButterfly(
 {
     if (skew == kModulus)
     {
-        VectorXOR(bytes, count, x, y);
+        VectorXOR(bytes, count, y, x);
         return;
     }
 
+#ifdef LEO_USE_VECTOR4_OPT
     while (count >= 4)
     {
         fft_butterfly4(
@@ -795,6 +799,7 @@ void VectorFFTButterfly(
         x += 4, y += 4;
         count -= 4;
     }
+#endif // LEO_USE_VECTOR4_OPT
 
     for (unsigned i = 0; i < count; ++i)
         fft_butterfly(x[i], y[i], skew, bytes);
@@ -809,10 +814,11 @@ void VectorIFFTButterfly(
 {
     if (skew == kModulus)
     {
-        VectorXOR(bytes, count, x, y);
+        VectorXOR(bytes, count, y, x);
         return;
     }
 
+#ifdef LEO_USE_VECTOR4_OPT
     while (count >= 4)
    {
         ifft_butterfly4(
@@ -824,6 +830,7 @@ void VectorIFFTButterfly(
         x += 4, y += 4;
         count -= 4;
     }
+#endif // LEO_USE_VECTOR4_OPT
 
     for (unsigned i = 0; i < count; ++i)
         ifft_butterfly(x[i], y[i], skew, bytes);
diff --git a/LeopardFF8.h b/LeopardFF8.h
index bcd0200..4bc7115 100644
--- a/LeopardFF8.h
+++ b/LeopardFF8.h
@@ -86,6 +86,8 @@ void fft_butterfly(
     void * LEO_RESTRICT x, void * LEO_RESTRICT y,
     ffe_t log_m, uint64_t bytes);
 
+#ifdef LEO_USE_VECTOR4_OPT
+
 // Unroll 4 rows at a time
 void fft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
@@ -94,6 +96,8 @@ void fft_butterfly4(
     void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
     ffe_t log_m, uint64_t bytes);
 
+#endif // LEO_USE_VECTOR4_OPT
+
 
 //------------------------------------------------------------------------------
 // IFFT Operations
 
@@ -107,6 +111,8 @@ void ifft_butterfly(
     void * LEO_RESTRICT x, void * LEO_RESTRICT y,
     ffe_t log_m, uint64_t bytes);
 
+#ifdef LEO_USE_VECTOR4_OPT
+
 // Unroll 4 rows at a time
 void ifft_butterfly4(
     void * LEO_RESTRICT x_0, void * LEO_RESTRICT y_0,
@@ -115,6 +121,12 @@ void ifft_butterfly4(
     void * LEO_RESTRICT x_3, void * LEO_RESTRICT y_3,
     ffe_t log_m, uint64_t bytes);
 
+#endif // LEO_USE_VECTOR4_OPT
+
+
+//------------------------------------------------------------------------------
+// FFT
+
 void VectorFFTButterfly(
     const uint64_t bytes,
     unsigned count,
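
The header guards above mirror the source files: the single-row kernels stay unconditional, the 4-row variants exist only under LEO_USE_VECTOR4_OPT, and every Vector* wrapper keeps the same shape, an optional unrolled main loop followed by an always-present scalar tail. A compilable sketch of that dispatch pattern, with hypothetical kernel names standing in for xor_mem/fft_butterfly and their 4-row variants:

#include <cstdint>

// Hypothetical single-row kernel standing in for xor_mem / fft_butterfly.
static void process_row(void* vx, const void* vy, uint64_t bytes)
{
    uint8_t* x = static_cast<uint8_t*>(vx);
    const uint8_t* y = static_cast<const uint8_t*>(vy);
    for (uint64_t i = 0; i < bytes; ++i)
        x[i] ^= y[i];
}

#ifdef LEO_USE_VECTOR4_OPT
// Hypothetical 4-row kernel standing in for xor_mem4 / fft_butterfly4.
static void process_row4(void** x, void** y, uint64_t bytes)
{
    for (unsigned i = 0; i < 4; ++i)
        process_row(x[i], y[i], bytes);
}
#endif

// Same shape as VectorXOR / VectorFFTButterfly after this patch: the
// unrolled loop disappears entirely when the define is off, and the
// scalar tail handles whatever rows remain either way.
static void process_rows(uint64_t bytes, unsigned count, void** x, void** y)
{
#ifdef LEO_USE_VECTOR4_OPT
    while (count >= 4)
    {
        process_row4(x, y, bytes);
        x += 4, y += 4;
        count -= 4;
    }
#endif
    for (unsigned i = 0; i < count; ++i)
        process_row(x[i], y[i], bytes);
}
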
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
index 0856089..5fcc1c1 100644
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -552,6 +552,8 @@ static void BasicTest(const TestParameters& params)
 //------------------------------------------------------------------------------
 // Parallel XOR Benchmark
 
+#ifdef LEO_USE_VECTOR4_OPT
+
 // Demonstrate about 10% performance boost by doing parallel rows for XORs
 void ParallelXORBenchmark()
 {
@@ -605,12 +607,16 @@ void ParallelXORBenchmark()
     t_4.Print(iteration_count);
 }
 
+#endif // LEO_USE_VECTOR4_OPT
+
 
 //------------------------------------------------------------------------------
 // Parallel Butterfly8 Benchmark
 
 #ifdef LEO_HAS_FF8
 
+#ifdef LEO_USE_VECTOR4_OPT
+
 // Demonstrate performance boost by doing parallel rows for Butterfly8s
 void ParallelButterfly8Benchmark()
 {
@@ -670,6 +676,8 @@ void ParallelButterfly8Benchmark()
     t_4.Print(iteration_count);
 }
 
+#endif // LEO_USE_VECTOR4_OPT
+
 #endif // LEO_HAS_FF8
 
 
@@ -678,6 +686,8 @@ void ParallelButterfly8Benchmark()
 
 #ifdef LEO_HAS_FF16
 
+#ifdef LEO_USE_VECTOR4_OPT
+
 // Demonstrate performance boost by doing parallel rows for Butterfly16s
 void ParallelButterfly16Benchmark()
 {
@@ -737,6 +747,8 @@ void ParallelButterfly16Benchmark()
     t_4.Print(iteration_count);
 }
 
+#endif // LEO_USE_VECTOR4_OPT
+
 #endif // LEO_HAS_FF16
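
One caveat: these hunks fence only the benchmark definitions. Any call sites in the test driver (not shown in this patch) need matching guards, or builds with LEO_USE_VECTOR4_OPT undefined will fail to compile. A hypothetical sketch of guarded declarations and calls, assumed wiring rather than part of the patch:

// Hypothetical call-site wiring (not part of this patch): declarations and
// calls must be fenced by the same defines as the definitions above.
#ifdef LEO_USE_VECTOR4_OPT
void ParallelXORBenchmark();
# ifdef LEO_HAS_FF8
void ParallelButterfly8Benchmark();
# endif
# ifdef LEO_HAS_FF16
void ParallelButterfly16Benchmark();
# endif
#endif // LEO_USE_VECTOR4_OPT

static void RunParallelRowBenchmarks()
{
#ifdef LEO_USE_VECTOR4_OPT
    ParallelXORBenchmark();
# ifdef LEO_HAS_FF8
    ParallelButterfly8Benchmark();
# endif
# ifdef LEO_HAS_FF16
    ParallelButterfly16Benchmark();
# endif
#endif // LEO_USE_VECTOR4_OPT
}
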