Implement in-place FFTs for field elements
This commit is contained in:
parent
e2c74624d7
commit
5cc7ea488c
|
@ -1,6 +1,6 @@
|
||||||
TESTS = bls12_381_test das_extension_test c_kzg_util_test fft_common_test fft_fr_test fft_g1_test \
|
TESTS = bls12_381_test das_extension_test c_kzg_util_test fft_common_test fft_fr_test fft_g1_test \
|
||||||
fk20_proofs_test kzg_proofs_test poly_test utility_test
|
fk20_proofs_test kzg_proofs_test poly_test utility_test
|
||||||
BENCH = fft_fr_bench fft_g1_bench
|
BENCH = fft_fr_bench fft_g1_bench fft_fr_in_place_0_bench fft_fr_in_place_1_bench
|
||||||
LIB_SRC = bls12_381.c c_kzg_util.c das_extension.c fft_common.c fft_fr.c fft_g1.c fk20_proofs.c kzg_proofs.c poly.c utility.c
|
LIB_SRC = bls12_381.c c_kzg_util.c das_extension.c fft_common.c fft_fr.c fft_g1.c fk20_proofs.c kzg_proofs.c poly.c utility.c
|
||||||
LIB_OBJ = $(LIB_SRC:.c=.o)
|
LIB_OBJ = $(LIB_SRC:.c=.o)
|
||||||
|
|
||||||
|
|
74
src/fft_fr.c
74
src/fft_fr.c
|
@ -111,3 +111,77 @@ C_KZG_RET fft_fr(fr_t *out, const fr_t *in, bool inverse, uint64_t n, const FFTS
|
||||||
}
|
}
|
||||||
return C_KZG_OK;
|
return C_KZG_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perform an in-place FFT by copying the results over the input.
|
||||||
|
*
|
||||||
|
* @remark This is almost as fast as #fft_fr. It is slower than #fft_fr_in_place_lomem, but, unlike that routine,
|
||||||
|
* allocates a extra array the same size as the input data to copy the results.
|
||||||
|
*
|
||||||
|
* @param[in,out] data The input data and output results (array of length @p n)
|
||||||
|
* @param[in] inverse `false` for forward transform, `true` for inverse transform
|
||||||
|
* @param[in] n Length of the FFT, must be a power of two
|
||||||
|
* @param[in] fs Pointer to previously initialised FFTSettings structure with `max_width` at least @p n.
|
||||||
|
* @retval C_CZK_OK All is well
|
||||||
|
* @retval C_CZK_BADARGS Invalid parameters were supplied
|
||||||
|
* @retval C_CZK_MALLOC Memory allocation failed
|
||||||
|
*/
|
||||||
|
C_KZG_RET fft_fr_in_place(fr_t *data, bool inverse, uint64_t n, const FFTSettings *fs) {
|
||||||
|
fr_t *out;
|
||||||
|
CHECK(n <= fs->max_width);
|
||||||
|
CHECK(is_power_of_two(n));
|
||||||
|
TRY(new_fr_array(&out, fs->max_width));
|
||||||
|
TRY(fft_fr(out, data, inverse, n, fs));
|
||||||
|
for (uint64_t i = 0; i < n; i++) {
|
||||||
|
data[i] = out[i];
|
||||||
|
}
|
||||||
|
free(out);
|
||||||
|
return C_KZG_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perform an in-place FFT without allocating any extra memory.
|
||||||
|
*
|
||||||
|
* @remark This is about 25% slower than #fft_fr_in_place, but does not allocate any extra memory, whereas that routine
|
||||||
|
* has a memory overhead the same size as the input array.
|
||||||
|
*
|
||||||
|
* @param[in,out] data The input data and output results (array of length @p n)
|
||||||
|
* @param[in] inverse `false` for forward transform, `true` for inverse transform
|
||||||
|
* @param[in] n Length of the FFT, must be a power of two
|
||||||
|
* @param[in] fs Pointer to previously initialised FFTSettings structure with `max_width` at least @p n.
|
||||||
|
* @retval C_CZK_OK All is well
|
||||||
|
* @retval C_CZK_BADARGS Invalid parameters were supplied
|
||||||
|
*/
|
||||||
|
C_KZG_RET fft_fr_in_place_lomem(fr_t *data, bool inverse, uint64_t n, const FFTSettings *fs) {
|
||||||
|
uint64_t stride = fs->max_width / n;
|
||||||
|
CHECK(n <= fs->max_width);
|
||||||
|
CHECK(is_power_of_two(n));
|
||||||
|
|
||||||
|
fr_t *roots = inverse ? fs->reverse_roots_of_unity : fs->expanded_roots_of_unity;
|
||||||
|
|
||||||
|
reverse_bit_order(data, sizeof data[0], n);
|
||||||
|
|
||||||
|
uint64_t m = 1;
|
||||||
|
while (m < n) {
|
||||||
|
m <<= 1;
|
||||||
|
for (uint64_t k = 0; k < n; k += m) {
|
||||||
|
for (uint64_t j = 0; j < m / 2; j++) {
|
||||||
|
fr_t t, w = roots[j * stride * n / m];
|
||||||
|
fr_mul(&t, &w, &data[k + j + m / 2]);
|
||||||
|
fr_sub(&data[k + j + m / 2], &data[k + j], &t);
|
||||||
|
fr_add(&data[k + j], &data[k + j], &t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inverse) {
|
||||||
|
fr_t inv_len;
|
||||||
|
fr_from_uint64(&inv_len, n);
|
||||||
|
fr_inv(&inv_len, &inv_len);
|
||||||
|
for (uint64_t i = 0; i < n; i++) {
|
||||||
|
fr_mul(&data[i], &data[i], &inv_len);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return C_KZG_OK;
|
||||||
|
}
|
||||||
|
|
|
@ -21,3 +21,5 @@
|
||||||
void fft_fr_slow(fr_t *out, const fr_t *in, uint64_t stride, const fr_t *roots, uint64_t roots_stride, uint64_t n);
|
void fft_fr_slow(fr_t *out, const fr_t *in, uint64_t stride, const fr_t *roots, uint64_t roots_stride, uint64_t n);
|
||||||
void fft_fr_fast(fr_t *out, const fr_t *in, uint64_t stride, const fr_t *roots, uint64_t roots_stride, uint64_t n);
|
void fft_fr_fast(fr_t *out, const fr_t *in, uint64_t stride, const fr_t *roots, uint64_t roots_stride, uint64_t n);
|
||||||
C_KZG_RET fft_fr(fr_t *out, const fr_t *in, bool inverse, uint64_t n, const FFTSettings *fs);
|
C_KZG_RET fft_fr(fr_t *out, const fr_t *in, bool inverse, uint64_t n, const FFTSettings *fs);
|
||||||
|
C_KZG_RET fft_fr_in_place(fr_t *data, bool inverse, uint64_t n, const FFTSettings *fs);
|
||||||
|
C_KZG_RET fft_fr_in_place_lomem(fr_t *data, bool inverse, uint64_t n, const FFTSettings *fs);
|
||||||
|
|
|
@ -0,0 +1,80 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2021 Benjamin Edgington
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdlib.h> // malloc(), free(), atoi()
|
||||||
|
#include <stdio.h> // printf()
|
||||||
|
#include <assert.h> // assert()
|
||||||
|
#include <unistd.h> // EXIT_SUCCESS/FAILURE
|
||||||
|
#include "bench_util.h"
|
||||||
|
#include "test_util.h"
|
||||||
|
#include "fft_fr.h"
|
||||||
|
|
||||||
|
// Run the benchmark for `max_seconds` and return the time per iteration in nanoseconds.
|
||||||
|
long run_bench(int scale, int max_seconds) {
|
||||||
|
timespec_t t0, t1;
|
||||||
|
unsigned long total_time = 0, nits = 0;
|
||||||
|
FFTSettings fs;
|
||||||
|
|
||||||
|
assert(C_KZG_OK == new_fft_settings(&fs, scale));
|
||||||
|
|
||||||
|
// Allocate on the heap to avoid stack overflow for large sizes
|
||||||
|
fr_t *data;
|
||||||
|
data = malloc(fs.max_width * sizeof(fr_t));
|
||||||
|
|
||||||
|
// Fill with randomness
|
||||||
|
for (uint64_t i = 0; i < fs.max_width; i++) {
|
||||||
|
data[i] = rand_fr();
|
||||||
|
}
|
||||||
|
|
||||||
|
while (total_time < max_seconds * NANO) {
|
||||||
|
clock_gettime(CLOCK_REALTIME, &t0);
|
||||||
|
assert(C_KZG_OK == fft_fr_in_place_lomem(data, false, fs.max_width, &fs));
|
||||||
|
clock_gettime(CLOCK_REALTIME, &t1);
|
||||||
|
nits++;
|
||||||
|
total_time += tdiff(t0, t1);
|
||||||
|
}
|
||||||
|
|
||||||
|
free(data);
|
||||||
|
|
||||||
|
return total_time / nits;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
int nsec = 0;
|
||||||
|
|
||||||
|
switch (argc) {
|
||||||
|
case 1:
|
||||||
|
nsec = NSEC;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
nsec = atoi(argv[1]);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (nsec == 0) {
|
||||||
|
printf("Usage: %s [test time in seconds > 0]\n", argv[0]);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("*** Benchmarking FFT_fr in-place, low memory, %d second%s per test.\n", nsec, nsec == 1 ? "" : "s");
|
||||||
|
for (int scale = 4; scale <= 15; scale++) {
|
||||||
|
printf("fft_fr/scale_%d %lu ns/op\n", scale, run_bench(scale, nsec));
|
||||||
|
}
|
||||||
|
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
|
@ -0,0 +1,80 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2021 Benjamin Edgington
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdlib.h> // malloc(), free(), atoi()
|
||||||
|
#include <stdio.h> // printf()
|
||||||
|
#include <assert.h> // assert()
|
||||||
|
#include <unistd.h> // EXIT_SUCCESS/FAILURE
|
||||||
|
#include "bench_util.h"
|
||||||
|
#include "test_util.h"
|
||||||
|
#include "fft_fr.h"
|
||||||
|
|
||||||
|
// Run the benchmark for `max_seconds` and return the time per iteration in nanoseconds.
|
||||||
|
long run_bench(int scale, int max_seconds) {
|
||||||
|
timespec_t t0, t1;
|
||||||
|
unsigned long total_time = 0, nits = 0;
|
||||||
|
FFTSettings fs;
|
||||||
|
|
||||||
|
assert(C_KZG_OK == new_fft_settings(&fs, scale));
|
||||||
|
|
||||||
|
// Allocate on the heap to avoid stack overflow for large sizes
|
||||||
|
fr_t *data;
|
||||||
|
data = malloc(fs.max_width * sizeof(fr_t));
|
||||||
|
|
||||||
|
// Fill with randomness
|
||||||
|
for (uint64_t i = 0; i < fs.max_width; i++) {
|
||||||
|
data[i] = rand_fr();
|
||||||
|
}
|
||||||
|
|
||||||
|
while (total_time < max_seconds * NANO) {
|
||||||
|
clock_gettime(CLOCK_REALTIME, &t0);
|
||||||
|
assert(C_KZG_OK == fft_fr_in_place(data, false, fs.max_width, &fs));
|
||||||
|
clock_gettime(CLOCK_REALTIME, &t1);
|
||||||
|
nits++;
|
||||||
|
total_time += tdiff(t0, t1);
|
||||||
|
}
|
||||||
|
|
||||||
|
free(data);
|
||||||
|
|
||||||
|
return total_time / nits;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
int nsec = 0;
|
||||||
|
|
||||||
|
switch (argc) {
|
||||||
|
case 1:
|
||||||
|
nsec = NSEC;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
nsec = atoi(argv[1]);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (nsec == 0) {
|
||||||
|
printf("Usage: %s [test time in seconds > 0]\n", argv[0]);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("*** Benchmarking FFT_fr in-place, %d second%s per test.\n", nsec, nsec == 1 ? "" : "s");
|
||||||
|
for (int scale = 4; scale <= 15; scale++) {
|
||||||
|
printf("fft_fr/scale_%d %lu ns/op\n", scale, run_bench(scale, nsec));
|
||||||
|
}
|
||||||
|
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
|
@ -128,12 +128,86 @@ void stride_fft(void) {
|
||||||
free_fft_settings(&fs2);
|
free_fft_settings(&fs2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void in_place_fft(void) {
|
||||||
|
// Initialise: ascending values of i, and arbitrary size
|
||||||
|
uint64_t size = 10;
|
||||||
|
uint64_t width = (uint64_t)1 << size;
|
||||||
|
FFTSettings fs;
|
||||||
|
TEST_CHECK(new_fft_settings(&fs, size) == C_KZG_OK);
|
||||||
|
|
||||||
|
fr_t data[width], coeffs[width];
|
||||||
|
for (int i = 0; i < width; i++) {
|
||||||
|
fr_from_uint64(data + i, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Output is separate from input
|
||||||
|
TEST_CHECK(fft_fr(coeffs, data, false, width, &fs) == C_KZG_OK);
|
||||||
|
// Output and input array is the same
|
||||||
|
TEST_CHECK(fft_fr_in_place(data, false, width, &fs) == C_KZG_OK);
|
||||||
|
|
||||||
|
for (int i = 0; i < width; i++) {
|
||||||
|
TEST_CHECK(fr_equal(&coeffs[i], &data[i]));
|
||||||
|
}
|
||||||
|
|
||||||
|
free_fft_settings(&fs);
|
||||||
|
}
|
||||||
|
|
||||||
|
void in_place_inverse_fft(void) {
|
||||||
|
// Initialise: ascending values of i, and arbitrary size
|
||||||
|
uint64_t size = 11;
|
||||||
|
uint64_t width = (uint64_t)1 << size;
|
||||||
|
FFTSettings fs;
|
||||||
|
TEST_CHECK(new_fft_settings(&fs, size + 1) == C_KZG_OK);
|
||||||
|
|
||||||
|
fr_t data[width], coeffs[width];
|
||||||
|
for (int i = 0; i < width; i++) {
|
||||||
|
fr_from_uint64(data + i, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Output is separate from input
|
||||||
|
TEST_CHECK(fft_fr(coeffs, data, true, width, &fs) == C_KZG_OK);
|
||||||
|
// Output and input array is the same
|
||||||
|
TEST_CHECK(fft_fr_in_place(data, true, width, &fs) == C_KZG_OK);
|
||||||
|
|
||||||
|
for (int i = 0; i < width; i++) {
|
||||||
|
TEST_CHECK(fr_equal(&coeffs[i], &data[i]));
|
||||||
|
}
|
||||||
|
|
||||||
|
free_fft_settings(&fs);
|
||||||
|
}
|
||||||
|
|
||||||
|
void in_place_fft_lomem(void) {
|
||||||
|
// Initialise: ascending values of i, and arbitrary size
|
||||||
|
uint64_t size = 13;
|
||||||
|
uint64_t width = (uint64_t)1 << size;
|
||||||
|
FFTSettings fs;
|
||||||
|
// Make this one strided for good measure
|
||||||
|
TEST_CHECK(new_fft_settings(&fs, size + 2) == C_KZG_OK);
|
||||||
|
|
||||||
|
fr_t data0[width], data1[width];
|
||||||
|
for (int i = 0; i < width; i++) {
|
||||||
|
fr_from_uint64(data0 + i, i);
|
||||||
|
data1[i] = data0[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_CHECK(fft_fr_in_place_lomem(data0, false, width, &fs) == C_KZG_OK);
|
||||||
|
TEST_CHECK(fft_fr_in_place(data1, false, width, &fs) == C_KZG_OK);
|
||||||
|
|
||||||
|
for (int i = 0; i < width; i++) {
|
||||||
|
TEST_CHECK(fr_equal(&data0[i], &data1[i]));
|
||||||
|
}
|
||||||
|
|
||||||
|
free_fft_settings(&fs);
|
||||||
|
}
|
||||||
|
|
||||||
TEST_LIST = {
|
TEST_LIST = {
|
||||||
{"FFT_FR_TEST", title},
|
{"FFT_FR_TEST", title},
|
||||||
{"compare_sft_fft", compare_sft_fft},
|
{"compare_sft_fft", compare_sft_fft},
|
||||||
{"roundtrip_fft", roundtrip_fft},
|
{"roundtrip_fft", roundtrip_fft},
|
||||||
{"inverse_fft", inverse_fft},
|
{"inverse_fft", inverse_fft},
|
||||||
{"stride_fft", stride_fft},
|
{"stride_fft", stride_fft},
|
||||||
|
{"in_place_fft", in_place_fft},
|
||||||
|
{"in_place_inverse_fft", in_place_inverse_fft},
|
||||||
|
{"in_place_fft_lomem", in_place_fft_lomem},
|
||||||
{NULL, NULL} /* zero record marks the end of the list */
|
{NULL, NULL} /* zero record marks the end of the list */
|
||||||
};
|
};
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
#include "../inc/acutest.h"
|
#include "../inc/acutest.h"
|
||||||
#include "test_util.h"
|
#include "test_util.h"
|
||||||
#include "utility.h"
|
#include "utility.h"
|
||||||
|
#include "c_kzg_util.h"
|
||||||
|
|
||||||
static uint32_t rev_bits_slow(uint32_t a) {
|
static uint32_t rev_bits_slow(uint32_t a) {
|
||||||
uint32_t ret = 0;
|
uint32_t ret = 0;
|
||||||
|
@ -139,6 +140,32 @@ void test_reverse_bit_order_fr(void) {
|
||||||
TEST_CHECK(true == fr_equal(&b[n - 1], &a[n - 1]));
|
TEST_CHECK(true == fr_equal(&b[n - 1], &a[n - 1]));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void test_reverse_bit_order_fr_large(void) {
|
||||||
|
int size = 22, n = 1 << size;
|
||||||
|
fr_t *a, *b;
|
||||||
|
|
||||||
|
TEST_CHECK(C_KZG_OK == new_fr_array(&a, n));
|
||||||
|
TEST_CHECK(C_KZG_OK == new_fr_array(&b, n));
|
||||||
|
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
fr_from_uint64(&a[i], i);
|
||||||
|
b[i] = a[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_CHECK(C_KZG_OK == reverse_bit_order(a, sizeof(fr_t), n));
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
TEST_CHECK(true == fr_equal(&b[reverse_bits(i) >> (32 - size)], &a[i]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hand check a few select values
|
||||||
|
TEST_CHECK(true == fr_equal(&b[0], &a[0]));
|
||||||
|
TEST_CHECK(false == fr_equal(&b[1], &a[1]));
|
||||||
|
TEST_CHECK(true == fr_equal(&b[n - 1], &a[n - 1]));
|
||||||
|
|
||||||
|
free(a);
|
||||||
|
free(b);
|
||||||
|
}
|
||||||
|
|
||||||
TEST_LIST = {
|
TEST_LIST = {
|
||||||
{"UTILITY_TEST", title},
|
{"UTILITY_TEST", title},
|
||||||
{"is_power_of_two_works", is_power_of_two_works},
|
{"is_power_of_two_works", is_power_of_two_works},
|
||||||
|
@ -150,5 +177,6 @@ TEST_LIST = {
|
||||||
{"test_reverse_bits_random", test_reverse_bits_random},
|
{"test_reverse_bits_random", test_reverse_bits_random},
|
||||||
{"test_reverse_bit_order_g1", test_reverse_bit_order_g1},
|
{"test_reverse_bit_order_g1", test_reverse_bit_order_g1},
|
||||||
{"test_reverse_bit_order_fr", test_reverse_bit_order_fr},
|
{"test_reverse_bit_order_fr", test_reverse_bit_order_fr},
|
||||||
|
{"test_reverse_bit_order_fr_large", test_reverse_bit_order_fr_large},
|
||||||
{NULL, NULL} /* zero record marks the end of the list */
|
{NULL, NULL} /* zero record marks the end of the list */
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue