diff --git a/LeopardCommon.cpp b/LeopardCommon.cpp
index 4b78854..02aea18 100644
--- a/LeopardCommon.cpp
+++ b/LeopardCommon.cpp
@@ -175,6 +175,7 @@ void xor_mem(
         return;
     }
 #endif // LEO_TRY_AVX2
+
     LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(vx);
     const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(vy);
     do
@@ -223,6 +224,7 @@ void xor_mem_2to1(
             x32 += 4, y32 += 4, z32 += 4;
             bytes -= 128;
         };
+
         if (bytes > 0)
         {
             LEO_M256 x0 = _mm256_xor_si256(_mm256_loadu_si256(x32),     _mm256_loadu_si256(y32));
@@ -232,9 +234,11 @@ void xor_mem_2to1(
             _mm256_storeu_si256(x32, x0);
             _mm256_storeu_si256(x32 + 1, x1);
         }
+
         return;
     }
 #endif // LEO_TRY_AVX2
+
     LEO_M128 * LEO_RESTRICT x16 = reinterpret_cast<LEO_M128 *>(x);
     const LEO_M128 * LEO_RESTRICT y16 = reinterpret_cast<const LEO_M128 *>(y);
     const LEO_M128 * LEO_RESTRICT z16 = reinterpret_cast<const LEO_M128 *>(z);
diff --git a/LeopardCommon.h b/LeopardCommon.h
index 44fd146..715c1c7 100644
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@@ -159,6 +159,10 @@
 //------------------------------------------------------------------------------
 // Constants
 
+// Enable 8-bit or 16-bit fields
+#define LEO_HAS_FF8
+#define LEO_HAS_FF16
+
 // Define this to enable the optimized version of FWHT()
 #define LEO_FWHT_OPT
 
diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp
index 4e7909a..263bdb8 100644
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@@ -228,11 +228,36 @@ struct {
 } static Multiply256LUT[kOrder];
 #endif // LEO_TRY_AVX2
 
+static ffe_t Multiply8LUT[256 * 256];
+
 
 void InitializeMultiplyTables()
 {
     if (!CpuHasSSSE3)
+    {
+        for (unsigned x = 0; x < 256; ++x) // x is the byte value being multiplied
+        {
+            ffe_t* lut = Multiply8LUT + x; // entry for x in the first 256-entry row
+
+            if (x == 0)
+            {
+                for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
+                    *lut = 0; // lut already advances one row per pass; indexing by log_y too would hit the diagonal
+            }
+            else
+            {
+                const ffe_t log_x = LogLUT[x];
+
+                for (unsigned log_y = 0; log_y < 256; ++log_y, lut += 256)
+                {
+                    const ffe_t prod = ExpLUT[AddMod(log_x, log_y)]; // stored at Multiply8LUT[log_y * 256 + x]
+                    *lut = prod;
+                }
+            }
+        }
+
         return;
+    }
 
     // For each value we could multiply by:
     for (unsigned log_m = 0; log_m < kOrder; ++log_m)
@@ -334,16 +359,16 @@ void mul_mem(
     }
 
     // Reference version:
+    const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256;
     ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
     const ffe_t * LEO_RESTRICT y1 = reinterpret_cast<const ffe_t *>(y);
 
     do
     {
         for (unsigned j = 0; j < 64; ++j)
-            x1[j] = MultiplyLog(y1[j], log_m);
+            x1[j] = lut[y1[j]];
 
-        x1 += 64;
-        y1 += 64;
+        x1 += 64, y1 += 64;
         bytes -= 64;
     } while (bytes > 0);
 }
@@ -567,25 +592,47 @@ void ifft_butterfly(
     }
 
     // Reference version:
+    const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256;
+
+    xor_mem(y, x, bytes);
+
+#ifdef LEO_TARGET_MOBILE
     ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
     ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);
 
     do
     {
         for (unsigned j = 0; j < 64; ++j)
-        {
-            ffe_t x_0 = x1[j];
-            ffe_t y_0 = y1[j];
-            y_0 ^= x_0;
-            x_0 ^= MultiplyLog(y_0, log_m);
-            x1[j] = x_0;
-            y1[j] = y_0;
-        }
+            x1[j] ^= lut[y1[j]];
 
-        x1 += 64;
-        y1 += 64;
+        x1 += 64, y1 += 64;
         bytes -= 64;
     } while (bytes > 0);
+#else // Word-wise variant: eight table lookups are folded into one 64-bit XOR per store
+    uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x);
+    ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);
+
+    do
+    {
+        for (unsigned j = 0; j < 8; ++j) // 8 words of 8 bytes = 64 bytes per outer pass
+        {
+            uint64_t x_0 = x8[j]; // NOTE(review): bit 8*k equals memory byte k only on little-endian targets - confirm for non-mobile builds
+            x_0 ^= (uint64_t)lut[y1[0]]; // product for byte k is placed at bit 8*k
+            x_0 ^= (uint64_t)lut[y1[1]] << 8;
+            x_0 ^= (uint64_t)lut[y1[2]] << 16;
+            x_0 ^= (uint64_t)lut[y1[3]] << 24;
+            x_0 ^= (uint64_t)lut[y1[4]] << 32;
+            x_0 ^= (uint64_t)lut[y1[5]] << 40;
+            x_0 ^= (uint64_t)lut[y1[6]] << 48;
+            x_0 ^= (uint64_t)lut[y1[7]] << 56;
+            x8[j] = x_0;
+            y1 += 8; // keep the byte cursor in step with the word index j
+        }
+
+        x8 += 8;
+        bytes -= 64;
+    } while (bytes > 0);
+#endif
 }
 
 // 4-way butterfly
@@ -783,6 +830,10 @@ void IFFT_DIT(
             memset(work[i], 0, bytes);
     }
 
+    // I tried splitting up the first few layers into L3-cache sized blocks but
+    // found that it only provides about 5% performance boost, which is not
+    // worth the extra complexity.
+
     // Decimation in time: Unroll 2 layers at a time
     unsigned dist = 1, dist4 = 4;
     for (; dist4 <= m; dist = dist4, dist4 <<= 2)
@@ -974,6 +1025,9 @@ void fft_butterfly(
     }
 
     // Reference version:
+    const ffe_t* LEO_RESTRICT lut = Multiply8LUT + log_m * 256;
+
+#ifdef LEO_TARGET_MOBILE
     ffe_t * LEO_RESTRICT x1 = reinterpret_cast<ffe_t *>(x);
     ffe_t * LEO_RESTRICT y1 = reinterpret_cast<ffe_t *>(y);
 
@@ -983,15 +1037,40 @@ void fft_butterfly(
         {
             ffe_t x_0 = x1[j];
             ffe_t y_0 = y1[j];
-            x_0 ^= MultiplyLog(y_0, log_m);
+            x_0 ^= lut[y_0];
             x1[j] = x_0;
             y1[j] = y_0 ^ x_0;
         }
 
-        x1 += 64;
-        y1 += 64;
+        x1 += 64, y1 += 64;
         bytes -= 64;
     } while (bytes > 0);
+#else // Word-wise variant: eight table lookups are folded into one 64-bit XOR per store
+    uint64_t * LEO_RESTRICT x8 = reinterpret_cast<uint64_t *>(x);
+    uint64_t * LEO_RESTRICT y8 = reinterpret_cast<uint64_t *>(y);
+    const ffe_t * y1 = reinterpret_cast<const ffe_t *>(y8); // byte view derived from y8: a second restrict pointer to the same buffer would break the no-alias promise
+
+    do
+    {
+        for (unsigned j = 0; j < 8; ++j) // 8 words of 8 bytes = 64 bytes per outer pass
+        {
+            uint64_t x_0 = x8[j], y_0 = y8[j]; // NOTE(review): bit 8*k equals memory byte k only on little-endian targets - confirm for non-mobile builds
+            x_0 ^= (uint64_t)lut[y1[0]]; // product for byte k is placed at bit 8*k
+            x_0 ^= (uint64_t)lut[y1[1]] << 8;
+            x_0 ^= (uint64_t)lut[y1[2]] << 16;
+            x_0 ^= (uint64_t)lut[y1[3]] << 24;
+            x_0 ^= (uint64_t)lut[y1[4]] << 32;
+            x_0 ^= (uint64_t)lut[y1[5]] << 40;
+            x_0 ^= (uint64_t)lut[y1[6]] << 48;
+            x_0 ^= (uint64_t)lut[y1[7]] << 56;
+            x8[j] = x_0, y8[j] = y_0 ^ x_0; // y bytes were all read above, before this store overwrites them
+            y1 += 8; // keep the byte cursor in step with the word index j
+        }
+
+        x8 += 8, y8 += 8;
+        bytes -= 64;
+    } while (bytes > 0);
+#endif
 }
 
 #ifdef LEO_USE_VECTOR4_OPT
@@ -1153,8 +1232,8 @@ static void FFT_DIT4(
 
             _mm256_storeu_si256(work0, work0_reg);
             _mm256_storeu_si256(work1, work1_reg);
+            work0++, work1++;
 
-            // First layer:
             if (log_m23 != kModulus)
             {
                 LEO_FFTB4_256(work2_reg, work3_reg, t23_lo, t23_hi);
@@ -1163,8 +1242,7 @@ static void FFT_DIT4(
 
             _mm256_storeu_si256(work2, work2_reg);
             _mm256_storeu_si256(work3, work3_reg);
-
-            work0++, work1++, work2++, work3++;
+            work2++, work3++;
 
             bytes -= 32;
         } while (bytes > 0);
@@ -1221,8 +1299,8 @@ static void FFT_DIT4(
 
             _mm_storeu_si128(work0, work0_reg);
             _mm_storeu_si128(work1, work1_reg);
+            work0++, work1++;
 
-            // First layer:
             if (log_m23 != kModulus)
             {
                 LEO_FFTB4_128(work2_reg, work3_reg, t23_lo, t23_hi);
@@ -1231,8 +1309,7 @@ static void FFT_DIT4(
 
             _mm_storeu_si128(work2, work2_reg);
             _mm_storeu_si128(work3, work3_reg);
-
-            work0++, work1++, work2++, work3++;
+            work2++, work3++;
 
             bytes -= 16;
         } while (bytes > 0);
diff --git a/leopard.cpp b/leopard.cpp
index 3a70269..4163e15 100644
--- a/leopard.cpp
+++ b/leopard.cpp
@@ -27,6 +27,7 @@
 */
 
 #include "leopard.h"
+#include "LeopardCommon.h"
 
 #ifdef LEO_HAS_FF8
     #include "LeopardFF8.h"
diff --git a/leopard.h b/leopard.h
index 1d9e036..cf6a3be 100644
--- a/leopard.h
+++ b/leopard.h
@@ -65,10 +65,6 @@
 // Library version
 #define LEO_VERSION 1
 
-// Enable 8-bit or 16-bit fields
-#define LEO_HAS_FF8
-#define LEO_HAS_FF16
-
 // Tweak if the functions are exported or statically linked
 //#define LEO_DLL /* Defined when building/linking as DLL */
 //#define LEO_BUILDING /* Defined by the library makefile */