diff --git a/LeopardCommon.h b/LeopardCommon.h
index ec59611..da6fe8a 100644
--- a/LeopardCommon.h
+++ b/LeopardCommon.h
@@ -34,7 +34,6 @@
     + Look into 12-bit fields as a performance optimization
     + Unroll first/final butterflies to avoid extra copies/xors in encoder
     + Skip a lot of the initial FWHT() layers that are just operating on zeroes
-    + Skip a lot of the final FWHT() layers that are not needed for calculation
     + For the actual FFT(), I should be unrolling the bottom two layers and
         performing them in a specialized function that does 2 <=> 2 and then
         1<=>1, 1<=>1 operations in local registers/cache
@@ -160,7 +159,7 @@
 // Define this to enable the optimized version of FWHT()
 #define LEO_FWHT_OPT
 
-// Avoid scheduling reduced FFT operations that are unneeded
+// Avoid scheduling FFT operations that are unused
 #define LEO_SCHEDULE_OPT
 
 // Avoid calculating final FFT values in decoder using bitfield
diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp
index fa1399f..f861803 100644
--- a/LeopardFF16.cpp
+++ b/LeopardFF16.cpp
@@ -364,7 +364,7 @@ static void InitializeLogarithmTables()
 /*
     The multiplication algorithm used follows the approach outlined in {4}.
     Specifically section 7 outlines the algorithm used here for 16-bit fields.
-    I use the ALTMAP memory layout since I do not need to convert in/out of it.
+    The ALTMAP memory layout is used since there is no need to convert in or out of it.
 */
 
 struct {
@@ -903,6 +903,7 @@ static ffe_t FFTSkew[kModulus];
 // Factors used in the evaluation of the error locator polynomial
 static ffe_t LogWalsh[kOrder];
 
+
 static void FFTInitialize()
 {
     ffe_t temp[kBits - 1];
@@ -1328,12 +1329,12 @@ void ReedSolomonDecode(
 
     // Evaluate error locator polynomial
 
-    FWHT(ErrorLocations, kBits);
+    FWHT(ErrorLocations);
 
     for (unsigned i = 0; i < kOrder; ++i)
         ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kModulus;
 
-    FWHT(ErrorLocations, kBits);
+    FWHT(ErrorLocations);
 
     // work <- recovery data
diff --git a/LeopardFF8.cpp b/LeopardFF8.cpp
index 2f3f1a3..9fc1650 100644
--- a/LeopardFF8.cpp
+++ b/LeopardFF8.cpp
@@ -121,38 +121,6 @@ static LEO_FORCE_INLINE void FWHT_4(ffe_t* data, unsigned s)
     data[y] = t3;
 }
 
-static void FWHT_8(ffe_t* data)
-{
-    ffe_t t0 = data[0];
-    ffe_t t1 = data[1];
-    ffe_t t2 = data[2];
-    ffe_t t3 = data[3];
-    ffe_t t4 = data[4];
-    ffe_t t5 = data[5];
-    ffe_t t6 = data[6];
-    ffe_t t7 = data[7];
-    FWHT_2(t0, t1);
-    FWHT_2(t2, t3);
-    FWHT_2(t4, t5);
-    FWHT_2(t6, t7);
-    FWHT_2(t0, t2);
-    FWHT_2(t1, t3);
-    FWHT_2(t4, t6);
-    FWHT_2(t5, t7);
-    FWHT_2(t0, t4);
-    FWHT_2(t1, t5);
-    FWHT_2(t2, t6);
-    FWHT_2(t3, t7);
-    data[0] = t0;
-    data[1] = t1;
-    data[2] = t2;
-    data[3] = t3;
-    data[4] = t4;
-    data[5] = t5;
-    data[6] = t6;
-    data[7] = t7;
-}
-
 // Decimation in time (DIT) version
 static void FWHT(ffe_t* data, const unsigned bits)
 {
@@ -174,16 +142,8 @@ static void FWHT(ffe_t* data, const unsigned bits)
             FWHT_4(data + j + r, m4);
     }
 
-    if (bits & 1)
-    {
-        for (unsigned i0 = 0; i0 < n; i0 += 8)
-            FWHT_8(data + i0);
-    }
-    else
-    {
-        for (unsigned i0 = 0; i0 < n; i0 += 4)
-            FWHT_4(data + i0);
-    }
+    for (unsigned i0 = 0; i0 < n; i0 += 4)
+        FWHT_4(data + i0);
 }
 
 #else // LEO_FWHT_OPT
@@ -790,230 +750,393 @@ static void FFTInitialize()
     FWHT(LogWalsh, kBits);
 }
 
-void VectorFFTButterfly(
-    const uint64_t bytes,
-    unsigned count,
-    void** x,
-    void** y,
-    const ffe_t log_m)
-{
-    if (log_m == kModulus)
-    {
-        VectorXOR(bytes, count, y, x);
-        return;
-    }
-
-#ifdef LEO_USE_VECTOR4_OPT
-    while (count >= 4)
-    {
-        fft_butterfly4(
-            x[0], y[0],
-            x[1], y[1],
-            x[2], y[2],
-            x[3], y[3],
-            log_m, bytes);
-        x += 4, y += 4;
-        count -= 4;
-    }
-#endif // LEO_USE_VECTOR4_OPT
-
-    for (unsigned i = 0; i < count; ++i)
-        fft_butterfly(x[i], y[i], log_m, bytes);
-}
-
-void VectorIFFTButterfly(
-    const uint64_t bytes,
-    unsigned count,
-    void** x,
-    void** y,
-    const ffe_t log_m)
-{
-    if (log_m == kModulus)
-    {
-        VectorXOR(bytes, count, y, x);
-        return;
-    }
-
-#ifdef LEO_USE_VECTOR4_OPT
-    while (count >= 4)
-    {
-        ifft_butterfly4(
-            x[0], y[0],
-            x[1], y[1],
-            x[2], y[2],
-            x[3], y[3],
-            log_m, bytes);
-        x += 4, y += 4;
-        count -= 4;
-    }
-#endif // LEO_USE_VECTOR4_OPT
-
-    for (unsigned i = 0; i < count; ++i)
-        ifft_butterfly(x[i], y[i], log_m, bytes);
-}
-
 
 //------------------------------------------------------------------------------
 // Reed-Solomon Encode
 
+/*
+    Decimation in time IFFT:
+
+    The decimation in time IFFT algorithm allows us to unroll 2 layers at a time,
+    performing calculations in local registers and faster cache memory.
+
+    Each ^___^ below indicates a butterfly between the associated indices.
+
+    The ifft_butterfly(x, y) operation:
+
+        y[] ^= x[]
+        if (log_m != kModulus)
+            x[] ^= exp(log(y[]) + log_m)
+
+    Layer 0:
+        0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
+        ^_^ ^_^ ^_^ ^_^ ^_^ ^_^ ^_^ ^_^
+
+    Layer 1:
+        0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
+        ^___^   ^___^   ^___^   ^___^
+          ^___^   ^___^   ^___^   ^___^
+
+    Layer 2:
+        0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
+        ^_______^       ^_______^
+          ^_______^       ^_______^
+            ^_______^       ^_______^
+              ^_______^       ^_______^
+
+    Layer 3:
+        0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
+        ^_______________^
+          ^_______________^
+            ^_______________^
+              ^_______________^
+                ^_______________^
+                  ^_______________^
+                    ^_______________^
+                      ^_______________^
+
+    DIT layer 0-1 operations, grouped 4 at a time:
+        {0-1, 2-3, 0-2, 1-3},
+        {4-5, 6-7, 4-6, 5-7},
+
+    DIT layer 1-2 operations, grouped 4 at a time:
+        {0-2, 4-6, 0-4, 2-6},
+        {1-3, 5-7, 1-5, 3-7},
+
+    DIT layer 2-3 operations, grouped 4 at a time:
+        {0-4, 0'-4', 0-0', 4-4'},
+        {1-5, 1'-5', 1-1', 5-5'},
+*/
+
+// 4-way butterfly
+static void IFFT_DIT4(
+    const uint64_t bytes,
+    void** work,
+    unsigned dist,
+    const ffe_t log_m01,
+    const ffe_t log_m23,
+    const ffe_t log_m02)
+{
+    // FIXME: Interleave these
+
+    // First layer:
+    if (log_m01 == kModulus)
+        xor_mem(work[dist], work[0], bytes);
+    else
+        ifft_butterfly(work[0], work[dist], log_m01, bytes);
+
+    if (log_m23 == kModulus)
+        xor_mem(work[dist * 3], work[dist * 2], bytes);
+    else
+        ifft_butterfly(work[dist * 2], work[dist * 3], log_m23, bytes);
+
+    // Second layer:
+    if (log_m02 == kModulus)
+    {
+        xor_mem(work[dist * 2], work[0], bytes);
+        xor_mem(work[dist * 3], work[dist], bytes);
+    }
+    else
+    {
+        ifft_butterfly(work[0], work[dist * 2], log_m02, bytes);
+        ifft_butterfly(work[dist], work[dist * 3], log_m02, bytes);
+    }
+}
+
+void IFFT_DIT(
+    const uint64_t bytes,
+    void* const* data,
+    const unsigned m_truncated,
+    void** work,
+    void** xor_result,
+    const unsigned m,
+    const ffe_t* skewLUT)
+{
+    // FIXME: Roll into first layer
+    if (data)
+    {
+        for (unsigned i = 0; i < m_truncated; ++i)
+            memcpy(work[i], data[i], bytes);
+        for (unsigned i = m_truncated; i < m; ++i)
+            memset(work[i], 0, bytes);
+    }
+
+    // Decimation in time: Unroll 2 layers at a time
+    unsigned dist = 1, dist4 = 4;
+    for (; dist4 <= m; dist = dist4, dist4 <<= 2)
+    {
+        // FIXME: Walk this in reverse order every other pair of layers for better cache locality
+        // FIXME: m_truncated
+
+        // For each set of dist*4 elements:
+        for (unsigned r = 0; r < m_truncated; r += dist4)
+        {
+            const ffe_t log_m01 = skewLUT[r + dist];
+            const ffe_t log_m23 = skewLUT[r + dist * 3];
+            const ffe_t log_m02 = skewLUT[r + dist * 2];
+
+            // For each set of dist elements:
+            for (unsigned i = r; i < r + dist; ++i)
+            {
+                IFFT_DIT4(
+                    bytes,
+                    work + i,
+                    dist,
+                    log_m01,
+                    log_m23,
+                    log_m02);
+            }
+        }
+
+        data = nullptr;
+    }
+
+    // If there is one layer left:
+    if (dist < m)
+    {
+        const ffe_t log_m = skewLUT[dist];
+
+        if (log_m == kModulus)
+        {
+            VectorXOR(bytes, dist, work + dist, work);
+        }
+        else
+        {
+            for (unsigned i = 0; i < dist; ++i)
+            {
+                ifft_butterfly(
+                    work[i],
+                    work[i + dist],
+                    log_m,
+                    bytes);
+            }
+        }
+    }
+
+    // FIXME: Roll into last layer
+    if (xor_result)
+        for (unsigned i = 0; i < m; ++i)
+            xor_mem(xor_result[i], work[i], bytes);
+}
+
+/*
+    Decimation in time FFT:
+
+    The decimation in time FFT algorithm allows us to unroll 2 layers at a time,
+    performing calculations in local registers and faster cache memory.
+
+    Each ^___^ below indicates a butterfly between the associated indices.
+
+    The fft_butterfly(x, y) operation:
+
+        if (log_m != kModulus)
+            x[] ^= exp(log(y[]) + log_m)
+        y[] ^= x[]
+
+    Layer 0:
+        0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
+        ^_______________^
+          ^_______________^
+            ^_______________^
+              ^_______________^
+                ^_______________^
+                  ^_______________^
+                    ^_______________^
+                      ^_______________^
+
+    Layer 1:
+        0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
+        ^_______^       ^_______^
+          ^_______^       ^_______^
+            ^_______^       ^_______^
+              ^_______^       ^_______^
+
+    Layer 2:
+        0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
+        ^___^   ^___^   ^___^   ^___^
+          ^___^   ^___^   ^___^   ^___^
+
+    Layer 3:
+        0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
+        ^_^ ^_^ ^_^ ^_^ ^_^ ^_^ ^_^ ^_^
+
+    DIT layer 0-1 operations, grouped 4 at a time:
+        {0-0', 4-4', 0-4, 0'-4'},
+        {1-1', 5-5', 1-5, 1'-5'},
+
+    DIT layer 1-2 operations, grouped 4 at a time:
+        {0-4, 2-6, 0-2, 4-6},
+        {1-5, 3-7, 1-3, 5-7},
+
+    DIT layer 2-3 operations, grouped 4 at a time:
+        {0-2, 1-3, 0-1, 2-3},
+        {4-6, 5-7, 4-5, 6-7},
+*/
+
+static void FFT_DIT4(
+    const uint64_t bytes,
+    void** work,
+    const unsigned dist,
+    const ffe_t log_m01,
+    const ffe_t log_m23,
+    const ffe_t log_m02)
+{
+    // FIXME: Interleave
+
+    // First layer:
+    if (log_m02 == kModulus)
+    {
+        xor_mem(work[dist * 2], work[0], bytes);
+        xor_mem(work[dist * 3], work[dist], bytes);
+    }
+    else
+    {
+        fft_butterfly(work[0], work[dist * 2], log_m02, bytes);
+        fft_butterfly(work[dist], work[dist * 3], log_m02, bytes);
+    }
+
+    // Second layer:
+    if (log_m01 == kModulus)
+        xor_mem(work[dist], work[0], bytes);
+    else
+        fft_butterfly(work[0], work[dist], log_m01, bytes);
+
+    if (log_m23 == kModulus)
+        xor_mem(work[dist * 3], work[dist * 2], bytes);
+    else
+        fft_butterfly(work[dist * 2], work[dist * 3], log_m23, bytes);
+}
+
+void FFT_DIT(
+    const uint64_t bytes,
+    void** work,
+    const unsigned m_truncated,
+    const unsigned m,
+    const ffe_t* skewLUT)
+{
+    // Decimation in time: Unroll 2 layers at a time
+    unsigned dist4 = m, dist = m >> 2;
+    for (; dist != 0; dist4 = dist, dist >>= 2)
+    {
+        // FIXME: Walk this in reverse order every other pair of layers for better cache locality
+        // FIXME: m_truncated
+
+        // For each set of dist*4 elements:
+        for (unsigned r = 0; r < m_truncated; r += dist4)
+        {
+            const ffe_t log_m01 = skewLUT[r + dist];
+            const ffe_t log_m23 = skewLUT[r + dist * 3];
+            const ffe_t log_m02 = skewLUT[r + dist * 2];
+
+            // For each set of dist elements:
+            for (unsigned i = r; i < r + dist; ++i)
+            {
+                FFT_DIT4(
+                    bytes,
+                    work + i,
+                    dist,
+                    log_m01,
+                    log_m23,
+                    log_m02);
+            }
+        }
+    }
+
+    // If there is one layer left:
+    if (dist4 == 2)
+    {
+        for (unsigned r = 0; r < m_truncated; r += 2)
+        {
+            const ffe_t log_m = skewLUT[r + 1];
+
+            if (log_m == kModulus)
+                xor_mem(work[r + 1], work[r], bytes);
+            else
+            {
+                fft_butterfly(
+                    work[r],
+                    work[r + 1],
+                    log_m,
+                    bytes);
+            }
+        }
+    }
+}
+
 void ReedSolomonEncode(
     uint64_t buffer_bytes,
     unsigned original_count,
     unsigned recovery_count,
     unsigned m,
-    void* const * data,
+    void* const* data,
     void** work)
 {
-    // work <- data
-
-    // TBD: Unroll first loop to eliminate this
-    unsigned first_end = m;
-    if (original_count < m)
-    {
-        first_end = original_count;
-        for (unsigned i = original_count; i < m; ++i)
-            memset(work[i], 0, buffer_bytes);
-    }
-    for (unsigned i = 0; i < first_end; ++i)
-        memcpy(work[i], data[i], buffer_bytes);
-
     // work <- IFFT(data, m, m)
 
-    for (unsigned width = 1; width < m; width <<= 1)
-    {
-        const unsigned range = width << 1;
-        const ffe_t* skewLUT = FFTSkew + width + m - 1;
+    const ffe_t* skewLUT = FFTSkew + m - 1;
 
-#ifdef LEO_SCHEDULE_OPT
-        for (unsigned j = 0; j < first_end; j += range)
-#else
-        for (unsigned j = 0; j < m; j += range)
-#endif
-        {
-            VectorIFFTButterfly(
-                buffer_bytes,
-                width,
-                work + j,
-                work + j + width,
-                skewLUT[j]);
-        }
-    }
+    IFFT_DIT(
+        buffer_bytes,
+        data,
+        original_count < m ? original_count : m,
+        work,
+        nullptr, // No xor output
+        m,
+        skewLUT);
 
     if (m >= original_count)
         goto skip_body;
 
+    // For sets of m data pieces:
     for (unsigned i = m; i + m <= original_count; i += m)
     {
-        // temp <- data + i
-
         data += m;
-        void** temp = work + m;
+        skewLUT += m;
 
-        // TBD: Unroll first loop to eliminate this
-        for (unsigned j = 0; j < m; ++j)
-            memcpy(temp[j], data[j], buffer_bytes);
+        // work <- work xor IFFT(data + i, m, m + i)
 
-        // temp <- IFFT(temp, m, m + i)
-
-        const ffe_t* skewLUT = FFTSkew + m + i - 1;
-
-        for (unsigned width = 1; width < m; width <<= 1)
-        {
-            const unsigned range = width << 1;
-
-            for (unsigned j = width; j < m; j += range)
-            {
-                VectorIFFTButterfly(
-                    buffer_bytes,
-                    width,
-                    temp + j - width,
-                    temp + j,
-                    skewLUT[j]);
-            }
-        }
-
-        // work <- work XOR temp
-
-        // TBD: Unroll last loop to eliminate this
-        VectorXOR(
+        IFFT_DIT(
             buffer_bytes,
+            data, // data source
             m,
-            work,
-            temp);
+            work + m, // temporary workspace
+            work, // xor destination
+            m,
+            skewLUT);
     }
 
+    // Handle final partial set of m pieces:
     const unsigned last_count = original_count % m;
     if (last_count != 0)
     {
         const unsigned i = original_count - last_count;
 
-        // temp <- data + i
-
         data += m;
-        void** temp = work + m;
+        skewLUT += m;
 
-        for (unsigned j = 0; j < last_count; ++j)
-            memcpy(temp[j], data[j], buffer_bytes);
-        for (unsigned j = last_count; j < m; ++j)
-            memset(temp[j], 0, buffer_bytes);
+        // work <- work xor IFFT(data + i, m, m + i)
 
-        // temp <- IFFT(temp, m, m + i)
-
-        for (unsigned width = 1, shift = 1; width < m; width <<= 1, ++shift)
-        {
-            const unsigned range = width << 1;
-            const ffe_t* skewLUT = FFTSkew + width + m + i - 1;
-
-#ifdef LEO_SCHEDULE_OPT
-            // Calculate stop considering that the right is all zeroes
-            const unsigned stop = ((last_count + range - 1) >> shift) << shift;
-            for (unsigned j = 0; j < stop; j += range)
-#else
-            for (unsigned j = 0; j < m; j += range)
-#endif
-            {
-                VectorIFFTButterfly(
-                    buffer_bytes,
-                    width,
-                    temp + j,
-                    temp + j + width,
-                    skewLUT[j]);
-            }
-        }
-
-        // work <- work XOR temp
-
-        // TBD: Unroll last loop to eliminate this
-        VectorXOR(
+        IFFT_DIT(
             buffer_bytes,
+            data, // data source
+            last_count,
+            work + m, // temporary workspace
+            work, // xor destination
             m,
-            work,
-            temp);
+            skewLUT);
     }
 
 skip_body:
 
     // work <- FFT(work, m, 0)
 
-    for (unsigned width = (m >> 1); width > 0; width >>= 1)
-    {
-        const ffe_t* skewLUT = FFTSkew + width - 1;
-        const unsigned range = width << 1;
-
-#ifdef LEO_SCHEDULE_OPT
-        for (unsigned j = 0; j < recovery_count; j += range)
-#else
-        for (unsigned j = 0; j < m; j += range)
-#endif
-        {
-            VectorFFTButterfly(
-                buffer_bytes,
-                width,
-                work + j,
-                work + j + width,
-                skewLUT[j]);
-        }
-    }
+    FFT_DIT(
+        buffer_bytes,
+        work,
+        recovery_count,
+        m,
+        FFTSkew - 1);
 }
@@ -1088,6 +1211,68 @@ void ErrorBitfield::Prepare()
 //------------------------------------------------------------------------------
 // Reed-Solomon Decode
 
+void VectorFFTButterfly(
+    const uint64_t bytes,
+    unsigned count,
+    void** x,
+    void** y,
+    const ffe_t log_m)
+{
+    if (log_m == kModulus)
+    {
+        VectorXOR(bytes, count, y, x);
+        return;
+    }
+
+#ifdef LEO_USE_VECTOR4_OPT
+    while (count >= 4)
+    {
+        fft_butterfly4(
+            x[0], y[0],
+            x[1], y[1],
+            x[2], y[2],
+            x[3], y[3],
+            log_m, bytes);
+        x += 4, y += 4;
+        count -= 4;
+    }
+#endif // LEO_USE_VECTOR4_OPT
+
+    for (unsigned i = 0; i < count; ++i)
+        fft_butterfly(x[i], y[i], log_m, bytes);
+}
+
+void VectorIFFTButterfly(
+    const uint64_t bytes,
+    unsigned count,
+    void** x,
+    void** y,
+    const ffe_t log_m)
+{
+    if (log_m == kModulus)
+    {
+        VectorXOR(bytes, count, y, x);
+        return;
+    }
+
+#ifdef LEO_USE_VECTOR4_OPT
+    while (count >= 4)
+    {
+        ifft_butterfly4(
+            x[0], y[0],
+            x[1], y[1],
+            x[2], y[2],
+            x[3], y[3],
+            log_m, bytes);
+        x += 4, y += 4;
+        count -= 4;
+    }
+#endif // LEO_USE_VECTOR4_OPT
+
+    for (unsigned i = 0; i < count; ++i)
+        ifft_butterfly(x[i], y[i], log_m, bytes);
+}
+
 void ReedSolomonDecode(
     uint64_t buffer_bytes,
     unsigned original_count,
@@ -1127,12 +1312,12 @@
 
     // Evaluate error locator polynomial
 
-    FWHT(ErrorLocations, kBits);
+    FWHT(ErrorLocations);
 
     for (unsigned i = 0; i < kOrder; ++i)
         ErrorLocations[i] = ((unsigned)ErrorLocations[i] * (unsigned)LogWalsh[i]) % kModulus;
 
-    FWHT(ErrorLocations, kBits);
+    FWHT(ErrorLocations);
 
     // work <- recovery data
diff --git a/LeopardFF8.h b/LeopardFF8.h
index 51187f6..33f790f 100644
--- a/LeopardFF8.h
+++ b/LeopardFF8.h
@@ -134,34 +134,6 @@ void ifft_butterfly4(
 
 #endif // LEO_USE_VECTOR4_OPT
 
 
-//------------------------------------------------------------------------------
-// FFT
-
-/*
-    if (log_m != kModulus)
-        x[] ^= exp(log(y[]) + log_m)
-    y[] ^= x[]
-*/
-void VectorFFTButterfly(
-    const uint64_t bytes,
-    unsigned count,
-    void** x,
-    void** y,
-    const ffe_t log_m);
-
-/*
-    y[] ^= x[]
-    if (log_m != kModulus)
-        x[] ^= exp(log(y[]) + log_m)
-*/
-void VectorIFFTButterfly(
-    const uint64_t bytes,
-    unsigned count,
-    void** x,
-    void** y,
-    const ffe_t log_m);
-
-
 //------------------------------------------------------------------------------
 // Reed-Solomon Encode
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
index 86c779c..7b05b05 100644
--- a/tests/benchmark.cpp
+++ b/tests/benchmark.cpp
@@ -42,14 +42,14 @@ using namespace std;
 struct TestParameters
 {
 #ifdef LEO_HAS_FF16
-    unsigned original_count = 1000; // under 65536
-    unsigned recovery_count = 200; // under 65536 - original_count
+    unsigned original_count = 100; // under 65536
+    unsigned recovery_count = 20; // under 65536 - original_count
 #else
     unsigned original_count = 128; // under 65536
     unsigned recovery_count = 128; // under 65536 - original_count
 #endif
-    unsigned buffer_bytes = 2560; // multiple of 64 bytes
-    unsigned loss_count = 500; // some fraction of original_count
+    unsigned buffer_bytes = 64; // multiple of 64 bytes
+    unsigned loss_count = 32768; // some fraction of original_count
     unsigned seed = 2;
     bool multithreaded = true;
 };
@@ -399,7 +399,7 @@ static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
 
 static bool BasicTest(const TestParameters& params)
 {
-    const unsigned kTrials = params.original_count > 8000 ? 1 : 10;
+    const unsigned kTrials = params.original_count > 8000 ? 1 : 100000;
 
     std::vector<uint8_t*> original_data(params.original_count);
@@ -807,8 +807,8 @@ int main(int argc, char **argv)
 
     if (!BasicTest(params))
        goto Failed;
-
-    static const unsigned kMaxRandomData = 32768;
+#if 0
+    static const unsigned kMaxRandomData = 128;
 
     prng.Seed(params.seed, 8);
     for (;; ++params.seed)
@@ -822,8 +822,9 @@ int main(int argc, char **argv)
         if (!BasicTest(params))
             goto Failed;
     }
+#endif
 
-#ifdef LEO_BENCH_ALL_256_PARAMS
+#if 1
    for (unsigned original_count = 1; original_count <= 256; ++original_count)
    {
        for (unsigned recovery_count = 1; recovery_count <= original_count; ++recovery_count)
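
For readers tracing the new encoder structure, the standalone sketch below shows the two-layers-at-a-time decimation-in-time grouping that the patch's IFFT_DIT4() and IFFT_DIT() implement. It is illustrative only and not part of the patch: the GF(2^8) multiply by the per-layer skew (and the log_m == kModulus special case) is replaced by a plain XOR butterfly, so the arithmetic is not Leopard's actual transform; only the loop structure and indexing match the functions above. The names butterfly, ifft_dit4 and ifft_dit are invented for the example.

// Illustrative sketch of the 4-way decimation-in-time grouping (assumptions noted above).
#include <cstdio>
#include <cstdint>
#include <vector>

// Simplified butterfly: the field multiply by the layer skew is omitted,
// so only the XOR half of the real ifft_butterfly() remains.
static void butterfly(uint8_t& x, uint8_t& y)
{
    y ^= x; // y[] ^= x[]
}

// Two DIT layers fused into one pass, mirroring IFFT_DIT4():
// first layer pairs (0,1) and (2,3) at stride dist, second layer pairs
// (0,2) and (1,3) at stride 2*dist, while all four values stay hot.
static void ifft_dit4(uint8_t* work, unsigned dist)
{
    // First layer:
    butterfly(work[0],        work[dist]);
    butterfly(work[dist * 2], work[dist * 3]);
    // Second layer:
    butterfly(work[0],    work[dist * 2]);
    butterfly(work[dist], work[dist * 3]);
}

// Walk the layers two at a time, exactly like the dist/dist4 loop in IFFT_DIT().
static void ifft_dit(uint8_t* work, unsigned m) // m must be a power of two
{
    unsigned dist = 1, dist4 = 4;
    for (; dist4 <= m; dist = dist4, dist4 <<= 2)
        for (unsigned r = 0; r < m; r += dist4)      // each group of dist*4 elements
            for (unsigned i = r; i < r + dist; ++i)  // each interleaved set of dist elements
                ifft_dit4(work + i, dist);

    // If log2(m) is odd there is one unpaired layer left:
    if (dist < m)
        for (unsigned i = 0; i < dist; ++i)
            butterfly(work[i], work[i + dist]);
}

int main()
{
    std::vector<uint8_t> work = { 1, 2, 3, 4, 5, 6, 7, 8 };
    ifft_dit(work.data(), (unsigned)work.size());
    for (uint8_t v : work)
        std::printf("%d ", (int)v);
    std::printf("\n");
    return 0;
}

The payoff of this grouping is the one stated in the patch's own comments: fusing two layers halves the number of passes over the working buffers, so each 4-element set is transformed while it is still in registers or L1 cache instead of being streamed from memory once per layer.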