diff --git a/LeopardCommon.h b/LeopardCommon.h index 581e815..f6c16c5 100644 --- a/LeopardCommon.h +++ b/LeopardCommon.h @@ -32,11 +32,10 @@ TODO: Short-term: - + Unroll first/final butterflies to avoid extra copies/xors in encoder (17% of encode time) - + Add compile-time selectable XOR-only rowops instead of MULADD + Multithreading Mid-term: + + Add compile-time selectable XOR-only rowops instead of MULADD + Look into 12-bit fields as a performance optimization Long-term: diff --git a/LeopardFF16.cpp b/LeopardFF16.cpp index 903da9d..1888b4b 100644 --- a/LeopardFF16.cpp +++ b/LeopardFF16.cpp @@ -832,11 +832,10 @@ static void IFFT_DIT_Encoder( } } + // I tried unrolling this but it does not provide more than 5% performance + // improvement for 16-bit finite fields, so it's not worth the complexity. if (xor_result) - { - for (unsigned i = 0; i < m; ++i) - xor_mem(xor_result[i], work[i], bytes); - } + VectorXOR(bytes, m, xor_result, work); }