Implements fast division for long divisors (#7)

2025-02-23 07:18:14 +00:00 · 2021-06-28 12:42:14 +01:00 · 2021-06-28 12:42:14 +01:00 · ba81164330
commit ba81164330
parent 244bfe8740
13 changed files with 1120 additions and 99 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,6 +3,7 @@
 *_test
 *_bench
 *_debug
+*_tune
 *.prof
 *.out
 *.log
--- a/6
+++ b/6
@ -875,7 +875,7 @@ EXCLUDE                = inc \
                         tmp \
                         src/test_util.h \
                         src/bench_util.h \
-                         src/debug_util.h \
+                         src/debug_util.h

 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
@ -891,7 +891,9 @@ EXCLUDE_SYMLINKS       = NO
 # Note that the wildcards are matched against the file with absolute path, so to
 # exclude all test directories for example use the pattern */test/*

-EXCLUDE_PATTERNS       =
+EXCLUDE_PATTERNS       = *_test.c \
+                         *_bench.c \
+                         *_tune.c

 # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
 # (namespaces, classes, functions, etc.) that should be excluded from the
--- a/src/Makefile
+++ b/src/Makefile
@ -1,6 +1,7 @@
 TESTS = bls12_381_test das_extension_test c_kzg_util_test fft_common_test fft_fr_test fft_g1_test \
 	fk20_proofs_test kzg_proofs_test poly_test recover_test utility_test zero_poly_test
-BENCH = fft_fr_bench fft_g1_bench recover_bench zero_poly_bench kzg_proofs_bench
+BENCH = fft_fr_bench fft_g1_bench recover_bench zero_poly_bench kzg_proofs_bench poly_bench
+TUNE = poly_mul_tune poly_div_tune
 LIB_SRC = bls12_381.c c_kzg_util.c das_extension.c fft_common.c fft_fr.c fft_g1.c fk20_proofs.c kzg_proofs.c poly.c recover.c utility.c zero_poly.c
 LIB_OBJ = $(LIB_SRC:.c=.o)

@ -30,6 +31,12 @@ libckzg.a: $(LIB_OBJ) Makefile
 	clang -Wall -I$(INCLUDE_DIRS) $(CFLAGS) $(KZG_CFLAGS) -o $@ $@.c bench_util.o test_util.o $(LIB_OBJ) -L../lib -lblst
 	./$@

+# Tuning
+%_tune: KZG_CFLAGS += -O
+%_tune: %_tune.c bench_util.o test_util.o $(LIB_OBJ) Makefile
+	clang -Wall -I$(INCLUDE_DIRS) $(CFLAGS) $(KZG_CFLAGS) -o $@ $@.c bench_util.o test_util.o $(LIB_OBJ) -L../lib -lblst
+	./$@
+
 lib: KZG_CFLAGS += -O
 lib: clean libckzg.a

@ -45,5 +52,6 @@ clean:
 	rm -f libckzg.a
 	rm -f $(TESTS)
 	rm -f $(BENCH)
+	rm -f $(TUNE)
 	rm -f *_debug
 	rm -f a.out
--- a/src/c_kzg_util.h
+++ b/src/c_kzg_util.h
@ -20,6 +20,10 @@
 #include "c_kzg.h"
 #include "poly.h"

+/** @def min_u64
+ *
+ * Safely find the minimum of two uint_64s.
+ */
 #define min_u64(a, b)                                                                                                  \
    ({                                                                                                                 \
        uint64_t _a = (a);                                                                                             \
@ -27,6 +31,17 @@
        _a < _b ? _a : _b;                                                                                             \
    })

+/** @def max_u64
+ *
+ * Safely find the maximum of two uint_64s.
+ */
+#define max_u64(a, b)                                                                                                  \
+    ({                                                                                                                 \
+        uint64_t _a = (a);                                                                                             \
+        uint64_t _b = (b);                                                                                             \
+        _a > _b ? _a : _b;                                                                                             \
+    })
+
 C_KZG_RET c_kzg_malloc(void **p, size_t n);
 C_KZG_RET new_uint64_array(uint64_t **x, size_t n);
 C_KZG_RET new_fr_array(fr_t **x, size_t n);
--- a/src/kzg_proofs.c
+++ b/src/kzg_proofs.c
@ -124,7 +124,7 @@ C_KZG_RET compute_proof_multi(g1_t *out, const poly *p, const fr_t *x0, uint64_t
    divisor.coeffs[n] = fr_one;

    // Calculate q = p / (x^n - x0^n)
-    TRY(new_poly_long_div(&q, p, &divisor));
+    TRY(new_poly_div(&q, p, &divisor));

    TRY(commit_to_poly(out, &q, ks));

--- a/src/poly.c
+++ b/src/poly.c
@ -15,12 +15,13 @@
 */

 /**
- *  @file poly.c
+ * @file poly.c
 *
 * Operations on polynomials defined over the finite field.
 */

 #include "c_kzg_util.h"
+#include "utility.h"
 #include "poly.h"

 /**
@ -34,6 +35,23 @@ static uint64_t poly_quotient_length(const poly *dividend, const poly *divisor)
    return dividend->length >= divisor->length ? dividend->length - divisor->length + 1 : 0;
 }

+/**
+ * Return a copy of a polynomial ensuring that the order is correct.
+ *
+ * @param[in] p A pointer to the polynomial to be normalised
+ * @return The normalised polynomial
+ */
+static poly poly_norm(const poly *p) {
+    poly ret = *p;
+    while (ret.length > 0 && fr_is_zero(&ret.coeffs[ret.length - 1])) {
+        ret.length--;
+    }
+    if (ret.length == 0) {
+        ret.coeffs = NULL;
+    }
+    return ret;
+}
+
 /**
 * Evaluate a polynomial over the finite field at a point.
 *
@ -67,35 +85,46 @@ void eval_poly(fr_t *out, const poly *p, const fr_t *x) {
 }

 /**
- * Polynomial division in the finite field.
+ * Polynomial division in the finite field via long division.
 *
 * Returns the polynomial resulting from dividing @p dividend by @p divisor.
 *
- * @remark @p out must be an uninitialised #poly. Space is allocated for it here, which
- * must be later reclaimed by calling #free_poly().
+ * Should be O(m.n) where m is the length of the dividend, and n the length of the divisor.
 *
- * @param[out] out      An uninitialised poly type that will contain the result of the division
+ * @remark @p out must be sized large enough for the resulting polynomial.
+ *
+ * @remark For some ranges of @p dividend and @p divisor, #poly_fast_div is much, much faster.
+ *
+ * @param[out] out      An appropriately sized poly type that will contain the result of the division
 * @param[in]  dividend The dividend polynomial
 * @param[in]  divisor  The divisor polynomial
 * @retval C_CZK_OK      All is well
 * @retval C_CZK_BADARGS Invalid parameters were supplied
 * @retval C_CZK_MALLOC  Memory allocation failed
 */
-C_KZG_RET new_poly_long_div(poly *out, const poly *dividend, const poly *divisor) {
+C_KZG_RET poly_long_div(poly *out, const poly *dividend, const poly *divisor) {
    uint64_t a_pos = dividend->length - 1;
    uint64_t b_pos = divisor->length - 1;
    uint64_t diff = a_pos - b_pos;
-    fr_t a[dividend->length];
+    fr_t *a;

    // Dividing by zero is undefined
    CHECK(divisor->length > 0);

-    // Initialise the output polynomial
-    TRY(new_poly(out, poly_quotient_length(dividend, divisor)));
+    // The divisor's highest coefficient must be non-zero
+    CHECK(!fr_is_zero(&divisor->coeffs[divisor->length - 1]));
+
+    // Deal with the size of the output polynomial
+    uint64_t out_length = poly_quotient_length(dividend, divisor);
+    CHECK(out->length >= out_length);
+    out->length = out_length;

    // If the divisor is larger than the dividend, the result is zero-length
-    if (out->length == 0) return C_KZG_OK;
+    if (out_length == 0) {
+        return C_KZG_OK;
+    }

+    TRY(new_fr_array(&a, dividend->length));
    for (uint64_t i = 0; i < dividend->length; i++) {
        a[i] = dividend->coeffs[i];
    }
@ -113,9 +142,393 @@ C_KZG_RET new_poly_long_div(poly *out, const poly *dividend, const poly *divisor
    }
    fr_div(&out->coeffs[0], &a[a_pos], &divisor->coeffs[b_pos]);

+    free(a);
    return C_KZG_OK;
 }

+/**
+ * Calculate the (possibly truncated) product of two polynomials.
+ *
+ * The size of the output polynomial determines the number of coefficients returned.
+ *
+ * @param[in,out] out The result of the division - its size determines the number of coefficients returned
+ * @param[in]     a   The muliplicand polynomial
+ * @param[in]     b   The multiplier polynomial
+ * @retval C_CZK_OK   All is well
+ */
+C_KZG_RET poly_mul_direct(poly *out, const poly *a, const poly *b) {
+
+    uint64_t a_degree = a->length - 1;
+    uint64_t b_degree = b->length - 1;
+
+    for (uint64_t k = 0; k < out->length; k++) {
+        out->coeffs[k] = fr_zero;
+    }
+
+    // Truncate the output to the length of the output polynomial
+    for (uint64_t i = 0; i <= a_degree; i++) {
+        for (uint64_t j = 0; j <= b_degree && i + j < out->length; j++) {
+            fr_t tmp;
+            fr_mul(&tmp, &a->coeffs[i], &b->coeffs[j]);
+            fr_add(&out->coeffs[i + j], &out->coeffs[i + j], &tmp);
+        }
+    }
+
+    return C_KZG_OK;
+}
+/**
+ * Pad with zeros or truncate an array of field elements to a specific size.
+ *
+ * @param[out] out   The padded/truncated array
+ * @param[in]  in    The original array to be padded
+ * @param[in]  n_in  The number of elements of @p in to take
+ * @param[in]  n_out The length of @p out
+ */
+void pad(fr_t *out, const fr_t *in, uint64_t n_in, uint64_t n_out) {
+    uint64_t num = min_u64(n_in, n_out);
+    for (int i = 0; i < num; i++) {
+        out[i] = in[i];
+    }
+    for (int i = num; i < n_out; i++) {
+        out[i] = fr_zero;
+    }
+}
+
+/**
+ * Calculate the (possibly truncated) product of two polynomials.
+ *
+ * The size of the output polynomial determines the number of coefficients returned.
+ *
+ * @remark This version uses FFTs to calculate the product via convolution, and is very efficient for large
+ * calculations. If @p fs is supplied as NULL, then the FFTSettings are allocated internally, otherwise the supplied
+ * settings are used, which must be sufficiently sized for the calculation.
+ *
+ * @param[in,out] out The result of the division - its size determines the number of coefficients returned
+ * @param[in]     a   The muliplicand polynomial
+ * @param[in]     b   The multiplier polynomial
+ * @param[in]     fs_ Either NULL or a sufficiently sized FFTSettings structure
+ * @retval C_CZK_OK   All is well
+ * @retval C_CZK_BADARGS Invalid parameters were supplied
+ * @retval C_CZK_ERROR   An internal error occurred
+ * @retval C_CZK_MALLOC  Memory allocation failed
+ */
+C_KZG_RET poly_mul_fft(poly *out, const poly *a, const poly *b, FFTSettings *fs_) {
+
+    // Truncate a and b so as not to do excess work for the number of coefficients required.
+    uint64_t a_len = min_u64(a->length, out->length);
+    uint64_t b_len = min_u64(b->length, out->length);
+    uint64_t length = next_power_of_two(a_len + b_len - 1);
+
+    // If the FFT settings are NULL then make a local set, otherwise use the ones passed in.
+    FFTSettings fs, *fs_p;
+    if (fs_ != NULL) {
+        fs_p = fs_;
+    } else {
+        fs_p = &fs;
+        int scale = log2_pow2(length); // TODO only good up to length < 32 bits
+        TRY(new_fft_settings(fs_p, scale));
+    }
+    CHECK(length <= fs_p->max_width);
+
+    fr_t *a_pad, *b_pad, *a_fft, *b_fft;
+    TRY(new_fr_array(&a_pad, length));
+    TRY(new_fr_array(&b_pad, length));
+    pad(a_pad, a->coeffs, a_len, length);
+    pad(b_pad, b->coeffs, b_len, length);
+
+    TRY(new_fr_array(&a_fft, length));
+    TRY(new_fr_array(&b_fft, length));
+    TRY(fft_fr(a_fft, a_pad, false, length, fs_p));
+    TRY(fft_fr(b_fft, b_pad, false, length, fs_p));
+
+    fr_t *ab_fft = a_pad; // reuse the a_pad array
+    fr_t *ab = b_pad;     // reuse the b_pad array
+    for (uint64_t i = 0; i < length; i++) {
+        fr_mul(&ab_fft[i], &a_fft[i], &b_fft[i]);
+    }
+    TRY(fft_fr(ab, ab_fft, true, length, fs_p));
+
+    // Copy result to output
+    uint64_t data_len = min_u64(out->length, length);
+    for (uint64_t i = 0; i < data_len; i++) {
+        out->coeffs[i] = ab[i];
+    }
+    for (uint64_t i = data_len; i < out->length; i++) {
+        out->coeffs[i] = fr_zero;
+    }
+
+    free(a_pad);
+    free(b_pad);
+    free(a_fft);
+    free(b_fft);
+    if (fs_p == &fs) {
+        free_fft_settings(fs_p);
+    }
+
+    return C_KZG_OK;
+}
+
+/**
+ * Calculate terms in the inverse of a polynomial.
+ *
+ * Returns terms in the expansion of `1 / b(x)` (aka the Maclaurin series).
+ *
+ * The size of @p out determines the number of terms returned.
+ *
+ * This is a non-recursive version of the algorithm in https://tc-arg.tk/pdfs/2020/fft.pdf theorem 3.4.
+ *
+ * @remark The constant term of @p b must be nonzero.
+ *
+ * @param[in, out] out A poly whose length determines the number of terms returned
+ * @param[in]      b   The polynomial to be inverted
+ * @retval C_CZK_OK      All is well
+ * @retval C_CZK_BADARGS Invalid parameters were supplied
+ * @retval C_CZK_ERROR   An internal error occurred
+ * @retval C_CZK_MALLOC  Memory allocation failed
+ */
+C_KZG_RET poly_inverse(poly *out, poly *b) {
+    poly tmp0, tmp1;
+    fr_t fr_two;
+
+    CHECK(out->length > 0);
+    CHECK(b->length > 0);
+    CHECK(!fr_is_zero(&b->coeffs[0]));
+
+    // If the input polynomial is constant, the remainder of the series is zero
+    if (b->length == 1) {
+        fr_inv(&out->coeffs[0], &b->coeffs[0]);
+        for (uint64_t i = 1; i < out->length; i++) {
+            out->coeffs[i] = fr_zero;
+        }
+        return C_KZG_OK;
+    }
+
+    uint64_t length = out->length;
+    uint64_t maxd = length - 1;
+    uint64_t d = 0;
+
+    // Max space for multiplications is (2 * length - 1)
+    int scale = log2_pow2(next_power_of_two(2 * length - 1));
+    FFTSettings fs;
+    TRY(new_fft_settings(&fs, scale));
+
+    // To store intermediate results
+    TRY(new_poly(&tmp0, length));
+    TRY(new_poly(&tmp1, length));
+
+    // Base case for d == 0
+    fr_inv(&out->coeffs[0], &b->coeffs[0]);
+    out->length = 1;
+
+    uint64_t mask = (uint64_t)1 << log2_u64(maxd);
+    while (mask) {
+
+        d = 2 * d + ((maxd & mask) != 0);
+        mask >>= 1;
+
+        // b.c -> tmp0 (we're using out for c)
+        tmp0.length = min_u64(d + 1, b->length + out->length - 1);
+        TRY(poly_mul_(&tmp0, b, out, &fs));
+
+        // 2 - b.c -> tmp0
+        for (int i = 0; i < tmp0.length; i++) {
+            fr_negate(&tmp0.coeffs[i], &tmp0.coeffs[i]);
+        }
+        fr_from_uint64(&fr_two, 2);
+        fr_add(&tmp0.coeffs[0], &tmp0.coeffs[0], &fr_two);
+
+        // c.(2 - b.c) -> tmp1;
+        tmp1.length = d + 1;
+        TRY(poly_mul_(&tmp1, out, &tmp0, &fs));
+
+        // tmp1 -> c
+        out->length = tmp1.length;
+        for (uint64_t i = 0; i < out->length; i++) {
+            out->coeffs[i] = tmp1.coeffs[i];
+        }
+    }
+    ASSERT(d + 1 == length);
+
+    free_poly(&tmp0);
+    free_poly(&tmp1);
+    free_fft_settings(&fs);
+
+    return C_KZG_OK;
+}
+
+/**
+ * Reverse the order of the coefficients of a polynomial.
+ *
+ * Corresponds to returning x^n.p(1/x).
+ *
+ * @param[out] out The flipped polynomial. Its size must be the same as the size of @p in
+ * @param[in]  in  The polynomial to be flipped
+ * @retval C_CZK_OK      All is well
+ * @retval C_CZK_BADARGS Invalid parameters were supplied
+ */
+C_KZG_RET poly_flip(poly *out, const poly *in) {
+    CHECK(out->length == in->length);
+    for (uint64_t i = 0; i < in->length; i++) {
+        out->coeffs[out->length - i - 1] = in->coeffs[i];
+    }
+    return C_KZG_OK;
+}
+
+/**
+ * Fast polynomial division in the finite field.
+ *
+ * Returns the polynomial resulting from dividing @p dividend by @p divisor.
+ *
+ * Implements https://tc-arg.tk/pdfs/2020/fft.pdf theorem 3.5.
+ *
+ * Should be O(m.log(m)) where m is the length of the dividend.
+ *
+ * @remark @p out must be sized large enough for the resulting polynomial.
+ *
+ * @remark For some ranges of @p dividend and @p divisor, #poly_long_div may be a little faster.
+ *
+ * @param[out] out      An appropriately sized poly type that will contain the result of the division
+ * @param[in]  dividend The dividend polynomial
+ * @param[in]  divisor  The divisor polynomial
+ * @retval C_CZK_OK      All is well
+ * @retval C_CZK_BADARGS Invalid parameters were supplied
+ * @retval C_CZK_ERROR   An internal error occurred
+ * @retval C_CZK_MALLOC  Memory allocation failed
+ */
+C_KZG_RET poly_fast_div(poly *out, const poly *dividend, const poly *divisor) {
+
+    // Dividing by zero is undefined
+    CHECK(divisor->length > 0);
+
+    // The divisor's highest coefficient must be non-zero
+    CHECK(!fr_is_zero(&divisor->coeffs[divisor->length - 1]));
+
+    uint64_t m = dividend->length - 1;
+    uint64_t n = divisor->length - 1;
+
+    // If the divisor is larger than the dividend, the result is zero-length
+    if (n > m) {
+        out->length = 0;
+        return C_KZG_OK;
+    }
+
+    // Ensure the output poly has enough space allocated
+    CHECK(out->length >= m - n + 1);
+
+    // Ensure that the divisor is well-formed for the inverse operation
+    CHECK(!fr_is_zero(&divisor->coeffs[divisor->length - 1]));
+
+    // Special case for divisor.length == 1 (it's a constant)
+    if (divisor->length == 1) {
+        out->length = dividend->length;
+        for (uint64_t i = 0; i < out->length; i++) {
+            fr_div(&out->coeffs[i], &dividend->coeffs[i], &divisor->coeffs[0]);
+        }
+        return C_KZG_OK;
+    }
+
+    poly a_flip, b_flip;
+    TRY(new_poly(&a_flip, dividend->length));
+    TRY(new_poly(&b_flip, divisor->length));
+    TRY(poly_flip(&a_flip, dividend));
+    TRY(poly_flip(&b_flip, divisor));
+
+    poly inv_b_flip;
+    TRY(new_poly(&inv_b_flip, m - n + 1));
+    TRY(poly_inverse(&inv_b_flip, &b_flip));
+
+    poly q_flip;
+    // We need only m - n + 1 coefficients of q_flip
+    TRY(new_poly(&q_flip, m - n + 1));
+    TRY(poly_mul(&q_flip, &a_flip, &inv_b_flip));
+
+    out->length = m - n + 1;
+    TRY(poly_flip(out, &q_flip));
+
+    free_poly(&a_flip);
+    free_poly(&b_flip);
+    free_poly(&inv_b_flip);
+    free_poly(&q_flip);
+
+    return C_KZG_OK;
+}
+
+/**
+ * Calculate the (possibly truncated) product of two polynomials.
+ *
+ * This is just a wrapper around #poly_mul_direct and #poly_mul_fft that selects the faster based on the size of the
+ * problem.
+ *
+ * @param[in,out] out The result of the division - its size determines the number of coefficients returned
+ * @param[in]     a   The muliplicand polynomial
+ * @param[in]     b   The multiplier polynomial
+ * @param[in]     fs  Either NULL or a sufficiently sized FFTSettings structure
+ * @retval C_CZK_OK   All is well
+ * @retval C_CZK_BADARGS Invalid parameters were supplied
+ * @retval C_CZK_ERROR   An internal error occurred
+ * @retval C_CZK_MALLOC  Memory allocation failed
+ *
+ * @todo Implement routine selection
+ */
+C_KZG_RET poly_mul_(poly *out, const poly *a, const poly *b, FFTSettings *fs) {
+    if (a->length < 64 || b->length < 64 || out->length < 128) { // Tunable parameter
+        return poly_mul_direct(out, a, b);
+    } else {
+        return poly_mul_fft(out, a, b, fs);
+    }
+}
+
+/**
+ * Calculate the (possibly truncated) product of two polynomials.
+ *
+ * This is just a wrapper around #poly_mul_direct and #poly_mul_fft that selects the faster based on the size of the
+ * problem.
+ *
+ * @param[in,out] out The result of the division - its size determines the number of coefficients returned
+ * @param[in]     a   The muliplicand polynomial
+ * @param[in]     b   The multiplier polynomial
+ * @retval C_CZK_OK   All is well
+ * @retval C_CZK_BADARGS Invalid parameters were supplied
+ * @retval C_CZK_ERROR   An internal error occurred
+ * @retval C_CZK_MALLOC  Memory allocation failed
+ *
+ * @todo Implement routine selection
+ */
+C_KZG_RET poly_mul(poly *out, const poly *a, const poly *b) {
+    return poly_mul_(out, a, b, NULL);
+}
+
+/**
+ * Polynomial division in the finite field.
+ *
+ * Returns the polynomial resulting from dividing @p dividend_ by @p divisor_.
+ *
+ * This is a wrapper around #poly_long_div and #poly_fast_div that chooses the fastest based on problem size.
+ *
+ * @remark @p out must be an uninitialised #poly. Space is allocated for it here, which
+ * must be later reclaimed by calling #free_poly().
+ *
+ * @param[out] out       An uninitialised poly type that will contain the result of the division
+ * @param[in]  dividend_ The dividend polynomial
+ * @param[in]  divisor_  The divisor polynomial
+ * @retval C_CZK_OK      All is well
+ * @retval C_CZK_BADARGS Invalid parameters were supplied
+ * @retval C_CZK_ERROR   An internal error occurred
+ * @retval C_CZK_MALLOC  Memory allocation failed
+ */
+C_KZG_RET new_poly_div(poly *out, const poly *dividend_, const poly *divisor_) {
+
+    poly dividend = poly_norm(dividend_);
+    poly divisor = poly_norm(divisor_);
+
+    TRY(new_poly(out, poly_quotient_length(&dividend, &divisor)));
+    if (divisor.length >= dividend.length || divisor.length < 128) { // Tunable paramter
+        return poly_long_div(out, &dividend, &divisor);
+    } else {
+        return poly_fast_div(out, &dividend, &divisor);
+    }
+}
+
 /**
 * Initialise an empty polynomial of the given size.
 *
--- a/src/poly.h
+++ b/src/poly.h
@ -20,6 +20,7 @@
 #define POLY_H

 #include "c_kzg.h"
+#include "fft_fr.h"

 /**
 * Defines a polynomial whose coefficients are members of the finite field F_r.
@ -32,7 +33,14 @@ typedef struct {
 } poly;

 void eval_poly(fr_t *out, const poly *p, const fr_t *x);
-C_KZG_RET new_poly_long_div(poly *out, const poly *dividend, const poly *divisor);
+C_KZG_RET poly_long_div(poly *out, const poly *dividend, const poly *divisor);
+C_KZG_RET poly_mul_direct(poly *out, const poly *a, const poly *b);
+C_KZG_RET poly_mul_fft(poly *out, const poly *a, const poly *b, FFTSettings *fs);
+C_KZG_RET poly_inverse(poly *out, poly *b);
+C_KZG_RET poly_fast_div(poly *out, const poly *dividend, const poly *divisor);
+C_KZG_RET poly_mul(poly *out, const poly *a, const poly *b);
+C_KZG_RET poly_mul_(poly *out, const poly *a, const poly *b, FFTSettings *fs);
+C_KZG_RET new_poly_div(poly *out, const poly *dividend, const poly *divisor);
 C_KZG_RET new_poly(poly *out, uint64_t length);
 C_KZG_RET new_poly_with_coeffs(poly *out, const fr_t *coeffs, uint64_t length);
 void free_poly(poly *p);
--- a/src/poly_bench.c
+++ b/src/poly_bench.c
@ -0,0 +1,97 @@
+/*
+ * Copyright 2021 Benjamin Edgington
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdlib.h> // malloc(), free(), atoi()
+#include <stdio.h>  // printf()
+#include <assert.h> // assert()
+#include <unistd.h> // EXIT_SUCCESS/FAILURE
+#include "bench_util.h"
+#include "test_util.h"
+#include "poly.h"
+
+// Run the benchmark for `max_seconds` and return the time per iteration in nanoseconds.
+long run_bench(int scale, int max_seconds) {
+    timespec_t t0, t1;
+    unsigned long total_time = 0, nits = 0;
+
+    uint64_t width = (uint64_t)1 << scale;
+
+    int dividend_length = width;
+    int divisor_length = width / 2; // What would be a relevant value of kzg multi-proofs?
+
+    poly dividend, divisor, q;
+    new_poly(&dividend, dividend_length);
+    new_poly(&divisor, divisor_length);
+
+    for (int i = 0; i < dividend_length; i++) {
+        dividend.coeffs[i] = rand_fr();
+    }
+    for (int i = 0; i < divisor_length; i++) {
+        divisor.coeffs[i] = rand_fr();
+    }
+
+    // Ensure that the polynomials' orders corresponds to their lengths
+    if (fr_is_zero(&dividend.coeffs[dividend.length - 1])) {
+        dividend.coeffs[dividend.length - 1] = fr_one;
+    }
+    if (fr_is_zero(&divisor.coeffs[divisor.length - 1])) {
+        divisor.coeffs[divisor.length - 1] = fr_one;
+    }
+
+    while (total_time < max_seconds * NANO) {
+        clock_gettime(CLOCK_REALTIME, &t0);
+
+        assert(C_KZG_OK == new_poly_div(&q, &dividend, &divisor));
+
+        clock_gettime(CLOCK_REALTIME, &t1);
+        nits++;
+        total_time += tdiff(t0, t1);
+
+        free_poly(&q);
+    }
+
+    free_poly(&dividend);
+    free_poly(&divisor);
+
+    return total_time / nits;
+}
+
+int main(int argc, char *argv[]) {
+    int nsec = 0;
+
+    switch (argc) {
+    case 1:
+        nsec = NSEC;
+        break;
+    case 2:
+        nsec = atoi(argv[1]);
+        break;
+    default:
+        break;
+    };
+
+    if (nsec == 0) {
+        printf("Usage: %s [test time in seconds > 0]\n", argv[0]);
+        exit(EXIT_FAILURE);
+    }
+
+    printf("*** Benchmarking Polynomial Division, %d second%s per test.\n", nsec, nsec == 1 ? "" : "s");
+    for (int scale = 6; scale <= 15; scale++) {
+        printf("new_poly_div/scale_%d %lu ns/op\n", scale, run_bench(scale, nsec));
+    }
+
+    return EXIT_SUCCESS;
+}
--- a/src/poly_div_tune.c
+++ b/src/poly_div_tune.c
@ -0,0 +1,119 @@
+/*
+ * Copyright 2021 Benjamin Edgington
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdlib.h> // malloc(), free(), atoi()
+#include <stdio.h>  // printf()
+#include <assert.h> // assert()
+#include <unistd.h> // EXIT_SUCCESS/FAILURE
+#include "bench_util.h"
+#include "test_util.h"
+#include "poly.h"
+
+// Run the benchmark for `max_seconds` and return the time per iteration in nanoseconds.
+long run_bench(int scale_0, int scale_1, int max_seconds) {
+    timespec_t t0, t1;
+    unsigned long total_time = 0, nits = 0;
+
+    uint64_t width_0 = (uint64_t)1 << scale_0;
+    // uint64_t width_1 = (uint64_t)1 << scale_1;
+    uint64_t width_1 = width_0 - ((uint64_t)1 << scale_1);
+
+    int dividend_length = width_0;
+    int divisor_length = width_1;
+
+    poly dividend, divisor, q;
+    new_poly(&dividend, dividend_length);
+    new_poly(&divisor, divisor_length);
+
+    for (int i = 0; i < dividend_length; i++) {
+        dividend.coeffs[i] = rand_fr();
+    }
+    for (int i = 0; i < divisor_length; i++) {
+        divisor.coeffs[i] = rand_fr();
+    }
+
+    // Ensure that the polynomials' orders corresponds to their lengths
+    if (fr_is_zero(&dividend.coeffs[dividend.length - 1])) {
+        dividend.coeffs[dividend.length - 1] = fr_one;
+    }
+    if (fr_is_zero(&divisor.coeffs[divisor.length - 1])) {
+        divisor.coeffs[divisor.length - 1] = fr_one;
+    }
+
+    new_poly(&q, dividend.length - divisor.length + 1);
+
+    while (total_time < max_seconds * NANO) {
+        clock_gettime(CLOCK_REALTIME, &t0);
+
+#if 0
+        assert(C_KZG_OK == poly_long_div(&q, &dividend, &divisor));
+#else
+        assert(C_KZG_OK == poly_fast_div(&q, &dividend, &divisor));
+#endif
+        clock_gettime(CLOCK_REALTIME, &t1);
+        nits++;
+        total_time += tdiff(t0, t1);
+    }
+
+    free_poly(&dividend);
+    free_poly(&divisor);
+    free_poly(&q);
+
+    return total_time / nits;
+}
+
+int main(int argc, char *argv[]) {
+    int nsec = 0;
+
+    switch (argc) {
+    case 1:
+        nsec = NSEC;
+        break;
+    case 2:
+        nsec = atoi(argv[1]);
+        break;
+    default:
+        break;
+    };
+
+    if (nsec == 0) {
+        printf("Usage: %s [test time in seconds > 0]\n", argv[0]);
+        exit(EXIT_FAILURE);
+    }
+
+    int scale_min = 5;
+    int scale_max = 14;
+
+#if 0
+    printf("*** Benchmarking poly_long_div() %d second%s per test.\n", nsec, nsec == 1 ? "" : "s");
+#else
+    printf("*** Benchmarking poly_fast_div() %d second%s per test.\n", nsec, nsec == 1 ? "" : "s");
+#endif
+    printf(",");
+    for (int i = /* scale_min */ 0; i <= scale_max; i++) {
+        printf("%d,", i);
+    }
+    printf("\n");
+    for (int scale_0 = scale_min; scale_0 <= scale_max; scale_0++) {
+        printf("%d,", scale_0);
+        for (int scale_1 = /* scale_min */ 0; scale_1 < scale_0; scale_1++) {
+            printf("%lu,", run_bench(scale_0, scale_1, nsec));
+        }
+        printf("\n");
+    }
+
+    return EXIT_SUCCESS;
+}
--- a/src/poly_mul_tune.c
+++ b/src/poly_mul_tune.c
@ -0,0 +1,116 @@
+/*
+ * Copyright 2021 Benjamin Edgington
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdlib.h> // malloc(), free(), atoi()
+#include <stdio.h>  // printf()
+#include <assert.h> // assert()
+#include <unistd.h> // EXIT_SUCCESS/FAILURE
+#include "bench_util.h"
+#include "test_util.h"
+#include "poly.h"
+
+// Run the benchmark for `max_seconds` and return the time per iteration in nanoseconds.
+long run_bench(int scale_0, int scale_1, int max_seconds) {
+    timespec_t t0, t1;
+    unsigned long total_time = 0, nits = 0;
+
+    uint64_t width_0 = (uint64_t)1 << scale_0;
+    uint64_t width_1 = (uint64_t)1 << scale_1;
+
+    poly multiplicand, multiplier, r;
+    new_poly(&multiplicand, width_0);
+    new_poly(&multiplier, width_1);
+
+    for (int i = 0; i < width_0; i++) {
+        multiplicand.coeffs[i] = rand_fr();
+    }
+    for (int i = 0; i < width_1; i++) {
+        multiplier.coeffs[i] = rand_fr();
+    }
+
+    // Ensure that the polynomials' orders corresponds to their lengths
+    if (fr_is_zero(&multiplicand.coeffs[multiplicand.length - 1])) {
+        multiplicand.coeffs[multiplicand.length - 1] = fr_one;
+    }
+    if (fr_is_zero(&multiplier.coeffs[multiplier.length - 1])) {
+        multiplier.coeffs[multiplier.length - 1] = fr_one;
+    }
+
+    new_poly(&r, multiplicand.length + multiplier.length - 1);
+
+    while (total_time < max_seconds * NANO) {
+        clock_gettime(CLOCK_REALTIME, &t0);
+
+#if 0
+        assert(C_KZG_OK == poly_mul_fft(&r, &multiplicand, &multiplier, NULL));
+#else
+        assert(C_KZG_OK == poly_mul(&r, &multiplicand, &multiplier));
+#endif
+
+        clock_gettime(CLOCK_REALTIME, &t1);
+        nits++;
+        total_time += tdiff(t0, t1);
+    }
+
+    free_poly(&multiplicand);
+    free_poly(&multiplier);
+    free_poly(&r);
+
+    return total_time / nits;
+}
+
+int main(int argc, char *argv[]) {
+    int nsec = 0;
+
+    switch (argc) {
+    case 1:
+        nsec = NSEC;
+        break;
+    case 2:
+        nsec = atoi(argv[1]);
+        break;
+    default:
+        break;
+    };
+
+    if (nsec == 0) {
+        printf("Usage: %s [test time in seconds > 0]\n", argv[0]);
+        exit(EXIT_FAILURE);
+    }
+
+    int scale_min = 5;
+    int scale_max = 12;
+
+#if 1
+    printf("*** Benchmarking poly_mul_fft() %d second%s per test.\n", nsec, nsec == 1 ? "" : "s");
+#else
+    printf("*** Benchmarking poly_mul_direct() %d second%s per test.\n", nsec, nsec == 1 ? "" : "s");
+#endif
+    printf(",");
+    for (int i = scale_min; i <= scale_max; i++) {
+        printf("%d,", i);
+    }
+    printf("\n");
+    for (int scale_0 = scale_min; scale_0 <= scale_max; scale_0++) {
+        printf("%d,", scale_0);
+        for (int scale_1 = scale_min; scale_1 <= scale_max; scale_1++) {
+            printf("%lu,", run_bench(scale_0, scale_1, nsec));
+        }
+        printf("\n");
+    }
+
+    return EXIT_SUCCESS;
+}
--- a/src/poly_test.c
+++ b/src/poly_test.c
@ -18,97 +18,93 @@
 #include "test_util.h"
 #include "poly.h"

-void poly_div_0(void) {
-    fr_t a[3], b[2], expected[2];
-    poly dividend, divisor, actual;
+// If anyone has a nicer way to initialise these test data, I'd love to hear it.
+typedef struct polydata {
+    int length;
+    int coeffs[];
+} polydata;

-    // Calculate (x^2 - 1) / (x + 1) = x - 1
+// (x^2 - 1) / (x + 1) = x - 1
+polydata test_0_0 = {3, {-1, 0, 1}};
+polydata test_0_1 = {2, {1, 1}};
+polydata test_0_2 = {2, {-1, 1}};

-    // Dividend
-    fr_from_uint64(&a[0], 1);
-    fr_negate(&a[0], &a[0]);
-    fr_from_uint64(&a[1], 0);
-    fr_from_uint64(&a[2], 1);
-    dividend.length = 3;
-    dividend.coeffs = a;
+// (12x^3 - 11x^2 + 9x + 18) / (4x + 3) = 3x^2 - 5x + 6
+polydata test_1_0 = {4, {18, 9, -11, 12}};
+polydata test_1_1 = {2, {3, 4}};
+polydata test_1_2 = {3, {6, -5, 3}};

-    // Divisor
-    fr_from_uint64(&b[0], 1);
-    fr_from_uint64(&b[1], 1);
-    divisor.length = 2;
-    divisor.coeffs = b;
+// (x + 1) / (x^2 - 1) = nil
+polydata test_2_0 = {2, {1, 1}};
+polydata test_2_1 = {3, {-1, 0, 2}};
+polydata test_2_2 = {0, {}};

-    // Known result
-    fr_from_uint64(&expected[0], 1);
-    fr_negate(&expected[0], &expected[0]);
-    fr_from_uint64(&expected[1], 1);
+// (10x^2 + 20x + 30) / 10 = x^2 + 2x + 3
+polydata test_3_0 = {3, {30, 20, 10}};
+polydata test_3_1 = {1, {10}};
+polydata test_3_2 = {3, {3, 2, 1}};

-    TEST_CHECK(C_KZG_OK == new_poly_long_div(&actual, &dividend, &divisor));
-    TEST_CHECK(fr_equal(&expected[0], &actual.coeffs[0]));
-    TEST_CHECK(fr_equal(&expected[1], &actual.coeffs[1]));
+// (x^2 + x) / (x + 1) = x
+polydata test_4_0 = {3, {0, 1, 1}};
+polydata test_4_1 = {2, {1, 1}};
+polydata test_4_2 = {2, {0, 1}};

-    free_poly(&actual);
+// (x^2 + x + 1) / 1 = x^2 + x + 1
+polydata test_5_0 = {3, {1, 1, 1}};
+polydata test_5_1 = {1, {1}};
+polydata test_5_2 = {3, {1, 1, 1}};
+
+// (x^2 + x + 1) / (0x + 1) = x^2 + x + 1
+polydata test_6_0 = {3, {1, 1, 1}};
+polydata test_6_1 = {2, {1, 0}}; // The highest coefficient is zero
+polydata test_6_2 = {3, {1, 1, 1}};
+
+polydata *test[][3] = {{&test_0_0, &test_0_1, &test_0_2}, {&test_1_0, &test_1_1, &test_1_2},
+                       {&test_2_0, &test_2_1, &test_2_2}, {&test_3_0, &test_3_1, &test_3_2},
+                       {&test_4_0, &test_4_1, &test_4_2}, {&test_5_0, &test_5_1, &test_5_2},
+                       {&test_6_0, &test_6_1, &test_6_2}};
+
+/* Internal utility function */
+void new_test_poly(poly *p, polydata *data) {
+    new_poly(p, data->length);
+    for (int i = 0; i < p->length; i++) {
+        int coeff = data->coeffs[i];
+        if (coeff >= 0) {
+            fr_from_uint64(&p->coeffs[i], coeff);
+        } else {
+            fr_from_uint64(&p->coeffs[i], -coeff);
+            fr_negate(&p->coeffs[i], &p->coeffs[i]);
+        }
+    }
 }

-void poly_div_1(void) {
-    fr_t a[4], b[2], expected[3];
-    poly dividend, divisor, actual;
+void poly_test_div(void) {
+    poly dividend, divisor, expected, actual;
+    int ntest = sizeof test / sizeof test[0];

-    // Calculate (12x^3 - 11x^2 + 9x + 18) / (4x + 3) = 3x^2 - 5x + 6
+    for (int i = 0; i < ntest; i++) {
+        new_test_poly(&dividend, test[i][0]);
+        new_test_poly(&divisor, test[i][1]);
+        new_test_poly(&expected, test[i][2]);

-    // Dividend
-    fr_from_uint64(&a[0], 18);
-    fr_from_uint64(&a[1], 9);
-    fr_from_uint64(&a[2], 11);
-    fr_negate(&a[2], &a[2]);
-    fr_from_uint64(&a[3], 12);
-    dividend.length = 4;
-    dividend.coeffs = a;
+        if (TEST_CHECK(C_KZG_OK == new_poly_div(&actual, &dividend, &divisor))) {
+            if (TEST_CHECK(actual.length == expected.length)) {
+                for (int j = 0; j < actual.length; j++) {
+                    TEST_CHECK(fr_equal(&actual.coeffs[j], &expected.coeffs[j]));
+                    TEST_MSG("Failed test %d with incorrect value", i);
+                }
+            } else {
+                TEST_MSG("Failed test %d with incorrect length.", i);
+            }
+        } else {
+            TEST_MSG("Failed test %d with bad return value.", i);
+        }

-    // Divisor
-    fr_from_uint64(&b[0], 3);
-    fr_from_uint64(&b[1], 4);
-    divisor.length = 2;
-    divisor.coeffs = b;
-
-    // Known result
-    fr_from_uint64(&expected[0], 6);
-    fr_from_uint64(&expected[1], 5);
-    fr_negate(&expected[1], &expected[1]);
-    fr_from_uint64(&expected[2], 3);
-
-    TEST_CHECK(C_KZG_OK == new_poly_long_div(&actual, &dividend, &divisor));
-    TEST_CHECK(fr_equal(&expected[0], &actual.coeffs[0]));
-    TEST_CHECK(fr_equal(&expected[1], &actual.coeffs[1]));
-    TEST_CHECK(fr_equal(&expected[2], &actual.coeffs[2]));
-
-    free_poly(&actual);
-}
-
-void poly_div_2(void) {
-    fr_t a[2], b[3];
-    poly dividend, divisor, actual;
-
-    // Calculate (x + 1) / (x^2 - 1) = nil
-
-    // Dividend
-    fr_from_uint64(&a[0], 1);
-    fr_from_uint64(&a[1], 1);
-    dividend.length = 2;
-    dividend.coeffs = a;
-
-    // Divisor
-    fr_from_uint64(&b[0], 1);
-    fr_negate(&b[0], &b[0]);
-    fr_from_uint64(&b[1], 0);
-    fr_from_uint64(&b[2], 1);
-    divisor.length = 3;
-    divisor.coeffs = b;
-
-    TEST_CHECK(C_KZG_OK == new_poly_long_div(&actual, &dividend, &divisor));
-    TEST_CHECK(NULL == actual.coeffs);
-
-    free_poly(&actual);
+        free_poly(&dividend);
+        free_poly(&divisor);
+        free_poly(&expected);
+        free_poly(&actual);
+    }
 }

 void poly_div_by_zero(void) {
@ -126,7 +122,10 @@ void poly_div_by_zero(void) {
    // Divisor
    new_poly(&divisor, 0);

-    TEST_CHECK(C_KZG_BADARGS == new_poly_long_div(&dummy, &dividend, &divisor));
+    TEST_CHECK(C_KZG_BADARGS == new_poly_div(&dummy, &dividend, &divisor));
+
+    free_poly(&divisor);
+    free_poly(&dummy);
 }

 void poly_eval_check(void) {
@ -176,14 +175,242 @@ void poly_eval_nil_check(void) {
    free_poly(&p);
 }

+void poly_mul_direct_test(void) {
+
+    // Calculate (3x^2 - 5x + 6) * (4x + 3) = 12x^3 - 11x^2 + 9x + 18
+    static polydata multiplier_data = {3, {6, -5, 3}};
+    static polydata multiplicand_data = {2, {3, 4}};
+    static polydata expected_data = {4, {18, 9, -11, 12}};
+
+    poly multiplicand, multiplier, expected, actual0, actual1;
+
+    new_test_poly(&multiplicand, &multiplicand_data);
+    new_test_poly(&multiplier, &multiplier_data);
+    new_test_poly(&expected, &expected_data);
+
+    new_poly(&actual0, 4);
+    TEST_CHECK(C_KZG_OK == poly_mul_direct(&actual0, &multiplicand, &multiplier));
+    TEST_CHECK(fr_equal(&expected.coeffs[0], &actual0.coeffs[0]));
+    TEST_CHECK(fr_equal(&expected.coeffs[1], &actual0.coeffs[1]));
+    TEST_CHECK(fr_equal(&expected.coeffs[2], &actual0.coeffs[2]));
+    TEST_CHECK(fr_equal(&expected.coeffs[3], &actual0.coeffs[3]));
+
+    // Check commutativity
+    new_poly(&actual1, 4);
+    TEST_CHECK(C_KZG_OK == poly_mul_direct(&actual1, &multiplier, &multiplicand));
+    TEST_CHECK(fr_equal(&expected.coeffs[0], &actual1.coeffs[0]));
+    TEST_CHECK(fr_equal(&expected.coeffs[1], &actual1.coeffs[1]));
+    TEST_CHECK(fr_equal(&expected.coeffs[2], &actual1.coeffs[2]));
+    TEST_CHECK(fr_equal(&expected.coeffs[3], &actual1.coeffs[3]));
+
+    free_poly(&multiplicand);
+    free_poly(&multiplier);
+    free_poly(&expected);
+    free_poly(&actual0);
+    free_poly(&actual1);
+}
+
+void poly_mul_fft_test(void) {
+
+    // Calculate (3x^2 - 5x + 6) * (4x + 3) = 12x^3 - 11x^2 + 9x + 18
+    static polydata multiplier_data = {3, {6, -5, 3}};
+    static polydata multiplicand_data = {2, {3, 4}};
+    static polydata expected_data = {4, {18, 9, -11, 12}};
+
+    poly multiplicand, multiplier, expected, actual0, actual1;
+
+    new_test_poly(&multiplicand, &multiplicand_data);
+    new_test_poly(&multiplier, &multiplier_data);
+    new_test_poly(&expected, &expected_data);
+
+    new_poly(&actual0, 4);
+    TEST_CHECK(C_KZG_OK == poly_mul_fft(&actual0, &multiplicand, &multiplier, NULL));
+    TEST_CHECK(fr_equal(&expected.coeffs[0], &actual0.coeffs[0]));
+    TEST_CHECK(fr_equal(&expected.coeffs[1], &actual0.coeffs[1]));
+    TEST_CHECK(fr_equal(&expected.coeffs[2], &actual0.coeffs[2]));
+    TEST_CHECK(fr_equal(&expected.coeffs[3], &actual0.coeffs[3]));
+
+    // Check commutativity
+    new_poly(&actual1, 4);
+    TEST_CHECK(C_KZG_OK == poly_mul_fft(&actual1, &multiplier, &multiplicand, NULL));
+    TEST_CHECK(fr_equal(&expected.coeffs[0], &actual1.coeffs[0]));
+    TEST_CHECK(fr_equal(&expected.coeffs[1], &actual1.coeffs[1]));
+    TEST_CHECK(fr_equal(&expected.coeffs[2], &actual1.coeffs[2]));
+    TEST_CHECK(fr_equal(&expected.coeffs[3], &actual1.coeffs[3]));
+
+    free_poly(&multiplicand);
+    free_poly(&multiplier);
+    free_poly(&expected);
+    free_poly(&actual0);
+    free_poly(&actual1);
+}
+
+void poly_inverse_simple_0(void) {
+
+    // 1 / (1 - x) = 1 + x + x^2 + ...
+
+    poly p, q;
+    int d = 16; // The number of terms to take
+
+    new_poly(&p, 2);
+    p.coeffs[0] = fr_one;
+    p.coeffs[1] = fr_one;
+    fr_negate(&p.coeffs[1], &p.coeffs[1]);
+
+    new_poly(&q, d);
+    TEST_CHECK(C_KZG_OK == poly_inverse(&q, &p));
+
+    for (int i = 0; i < d; i++) {
+        TEST_CHECK(fr_is_one(&q.coeffs[i]));
+    }
+
+    free_poly(&p);
+    free_poly(&q);
+}
+
+void poly_inverse_simple_1(void) {
+
+    // 1 / (1 + x) = 1 - x + x^2 - ...
+
+    poly p, q;
+    int d = 16; // The number of terms to take
+
+    new_poly(&p, 2);
+    p.coeffs[0] = fr_one;
+    p.coeffs[1] = fr_one;
+
+    new_poly(&q, d);
+    TEST_CHECK(C_KZG_OK == poly_inverse(&q, &p));
+
+    for (int i = 0; i < d; i++) {
+        fr_t tmp = q.coeffs[i];
+        if (i & 1) {
+            fr_negate(&tmp, &tmp);
+        }
+        TEST_CHECK(fr_is_one(&tmp));
+    }
+
+    free_poly(&p);
+    free_poly(&q);
+}
+
+void poly_mul_random(void) {
+
+    // Compare the output of poly_mul_direct() and poly_mul_fft()
+
+    poly multiplicand, multiplier;
+    poly q0, q1;
+
+    for (int k = 0; k < 256; k++) {
+
+        int multiplicand_length = 1 + rand() % 1000;
+        int multiplier_length = 1 + rand() % 1000;
+        int out_length = 1 + rand() % 1000;
+
+        new_poly(&multiplicand, multiplicand_length);
+        new_poly(&multiplier, multiplier_length);
+
+        for (int i = 0; i < multiplicand_length; i++) {
+            multiplicand.coeffs[i] = rand_fr();
+        }
+        for (int i = 0; i < multiplier_length; i++) {
+            multiplier.coeffs[i] = rand_fr();
+        }
+
+        // Ensure that the polynomials' orders corresponds to their lengths
+        if (fr_is_zero(&multiplicand.coeffs[multiplicand.length - 1])) {
+            multiplicand.coeffs[multiplicand.length - 1] = fr_one;
+        }
+        if (fr_is_zero(&multiplier.coeffs[multiplier.length - 1])) {
+            multiplier.coeffs[multiplier.length - 1] = fr_one;
+        }
+
+        new_poly(&q0, out_length); // Truncate the result
+        TEST_CHECK(C_KZG_OK == poly_mul_direct(&q0, &multiplicand, &multiplier));
+
+        new_poly(&q1, out_length);
+        TEST_CHECK(C_KZG_OK == poly_mul_fft(&q1, &multiplicand, &multiplier, NULL));
+
+        TEST_CHECK(q1.length == q0.length);
+        for (int i = 0; i < q0.length; i++) {
+            if (!TEST_CHECK(fr_equal(&q0.coeffs[i], &q1.coeffs[i]))) {
+                TEST_MSG("round = %d, i = %d, multiplicand_length = %lu, multiplier_length = %lu, out_length = %lu", k,
+                         i, multiplicand.length, multiplier.length, q0.length);
+            }
+        }
+
+        free_poly(&multiplicand);
+        free_poly(&multiplier);
+        free_poly(&q0);
+        free_poly(&q1);
+    }
+}
+
+void poly_div_random(void) {
+
+    // Compare the output of poly_fast_div() and poly_long_div()
+
+    poly dividend, divisor;
+    poly q0, q1;
+
+    for (int k = 0; k < 256; k++) {
+
+        int dividend_length = 2 + rand() % 1000;
+        int divisor_length = 1 + rand() % dividend_length;
+
+        new_poly(&dividend, dividend_length);
+        new_poly(&divisor, divisor_length);
+
+        for (int i = 0; i < dividend_length; i++) {
+            dividend.coeffs[i] = rand_fr();
+        }
+        for (int i = 0; i < divisor_length; i++) {
+            divisor.coeffs[i] = rand_fr();
+        }
+
+        // Ensure that the polynomials' orders corresponds to their lengths
+        if (fr_is_zero(&dividend.coeffs[dividend.length - 1])) {
+            dividend.coeffs[dividend.length - 1] = fr_one;
+        }
+        if (fr_is_zero(&divisor.coeffs[divisor.length - 1])) {
+            divisor.coeffs[divisor.length - 1] = fr_one;
+        }
+
+        new_poly(&q0, dividend.length - divisor.length + 1);
+        TEST_CHECK(C_KZG_OK == poly_long_div(&q0, &dividend, &divisor));
+
+        new_poly(&q1, dividend.length - divisor.length + 1);
+        TEST_CHECK(C_KZG_OK == poly_fast_div(&q1, &dividend, &divisor));
+
+        TEST_CHECK(q1.length == q0.length);
+        for (int i = 0; i < q0.length; i++) {
+            if (!TEST_CHECK(fr_equal(&q0.coeffs[i], &q1.coeffs[i]))) {
+                TEST_MSG("round = %d, dividend_length = %lu, divisor_length = %lu, i = %d", k, dividend.length,
+                         divisor.length, i);
+            }
+        }
+
+        free_poly(&dividend);
+        free_poly(&divisor);
+        free_poly(&q0);
+        free_poly(&q1);
+    }
+}
+
 TEST_LIST = {
    {"POLY_TEST", title},
-    {"poly_div_0", poly_div_0},
-    {"poly_div_1", poly_div_1},
-    {"poly_div_2", poly_div_2},
+    {"poly_test_div", poly_test_div},
+#ifndef DEBUG
    {"poly_div_by_zero", poly_div_by_zero},
+#endif
    {"poly_eval_check", poly_eval_check},
    {"poly_eval_0_check", poly_eval_0_check},
    {"poly_eval_nil_check", poly_eval_nil_check},
+    {"poly_mul_direct_test", poly_mul_direct_test},
+    {"poly_mul_fft_test", poly_mul_fft_test},
+    {"poly_inverse_simple_0", poly_inverse_simple_0},
+    {"poly_inverse_simple_1", poly_inverse_simple_1},
+    {"poly_mul_random", poly_mul_random},
+    {"poly_div_random", poly_div_random},
    {NULL, NULL} /* zero record marks the end of the list */
 };
--- a/src/utility.c
+++ b/src/utility.c
@ -58,6 +58,20 @@ int log2_pow2(uint32_t n) {
    return r;
 }

+/**
+ * Calculate log base two of a power of two.
+ *
+ * In other words, the bit index of the highest one bit.
+ *
+ * @param[in] n The 64 bit unsigned integer to take the logarithm of
+ * @return the log base two of n
+ */
+int log2_u64(uint64_t n) {
+    int r = 0;
+    while (n >>= 1) r++;
+    return r;
+}
+
 /**
 * Return the next highest power of two.
 *
--- a/src/utility.h
+++ b/src/utility.h
@ -44,6 +44,7 @@

 bool is_power_of_two(uint64_t n);
 int log2_pow2(uint32_t n);
+int log2_u64(uint64_t n);
 uint64_t next_power_of_two(uint64_t v);
 uint32_t reverse_bits(uint32_t a);
 uint32_t reverse_bits_limited(uint32_t n, uint32_t value);