Add benchmarking for FFTs

2021-02-07 20:58:19 +00:00 · 2021-02-07 20:58:19 +00:00 · b9bc4bb496
parent 80c513f653
commit b9bc4bb496
9 changed files with 216 additions and 6 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,7 +1,10 @@
 *.o
 *.a
 *_test
-a.out
+*_bench
 *.prof
 *.out
 *.log
 tmp/
 inc/blst.h
 inc/blst_aux.h
--- a/src/Makefile
+++ b/src/Makefile
@ -1,5 +1,6 @@
 TESTS = blst_util_test c_kzg_util_test fft_common_test fft_fr_test fft_g1_test \
 	kzg_proofs_test poly_test
 BENCH = fft_fr_bench fft_g1_bench
 LIB_SRC = blst_util.c c_kzg_util.c fft_common.c fft_fr.c fft_g1.c kzg_proofs.c poly.c
 LIB_OBJ = $(LIB_SRC:.c=.o)
@ -17,13 +18,25 @@ libckzg.a: $(LIB_OBJ) Makefile
 	clang -Wall $(CFLAGS) -o $@ $@.c debug_util.o libckzg.a -L../lib -lblst
 	./$@
 %_bench: %_bench.c bench_util.o $(LIB_OBJ) Makefile
 	clang -Wall $(CFLAGS) -o $@ $@.c bench_util.o $(LIB_OBJ) -L../lib -lblst
 	./$@
 lib: clean libckzg.a
-debuglib: CFLAGS += -g -DDEBUG
+debuglib: CFLAGS += -O1 -DDEBUG
 debuglib: clean libckzg.a
 optlib: CFLAGS += -O2
 optlib: clean libckzg.a
 profilelib: CFLAGS += -fprofile-instr-generate -fcoverage-mapping
 profilelib: clean libckzg.a
 test: $(TESTS)
 bench: $(BENCH)
 clean:
 	rm -f *.o
 	rm -f libckzg.a
--- a/src/bench_util.c
+++ b/src/bench_util.c
@ -0,0 +1,47 @@
 /*
 * Copyright 2021 Benjamin Edgington
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <stdlib.h> // rand()
 #include "bench_util.h"
 #include "blst_util.h"
 // Elapsed time from `start` to `end`, expressed in nanoseconds.
 // The whole-second and sub-second parts are differenced separately and then
 // recombined using NANO (nanoseconds per second).
 unsigned long tdiff(timespec start, timespec end) {
    const time_t whole_secs = end.tv_sec - start.tv_sec;
    const long frac_nanos = end.tv_nsec - start.tv_nsec;
    return whole_secs * NANO + frac_nanos;
 }
 // Return a pseudo-random 64-bit value in which every bit can be set.
 //
 // rand() only promises values in [0, RAND_MAX] with RAND_MAX >= 32767, i.e.
 // 15 usable bits per call (31 on common libcs). Gluing two calls together
 // with a 32-bit shift therefore leaves the top bit of each half permanently
 // zero. Instead, five 15-bit draws (75 bits >= 64) are shifted in; the excess
 // high bits of the earliest draw simply fall off the top of the word.
 //
 // Not cryptographically secure: intended for benchmark input only.
 uint64_t rand_uint64(void) {
    uint64_t ret = 0;
    for (int i = 0; i < 5; i++) {
        ret = (ret << 15) | ((uint64_t)rand() & 0x7fff);
    }
    return ret;
 }
 // Build a field element from four successive rand_uint64() draws, which are
 // handed to blst_fr_from_uint64() as a 4-limb array (limb 0 is drawn first).
 blst_fr rand_fr() {
    uint64_t limbs[4];
    for (int i = 0; i < 4; i++) {
        limbs[i] = rand_uint64();
    }
    blst_fr element;
    blst_fr_from_uint64(&element, limbs);
    return element;
 }
 // Produce a G1 point by multiplying the generator returned by
 // blst_p1_generator() by a scalar obtained from rand_fr().
 blst_p1 rand_g1() {
    const blst_fr scalar = rand_fr();
    blst_p1 point;
    p1_mul(&point, blst_p1_generator(), &scalar);
    return point;
 }
--- a/src/bench_util.h
+++ b/src/bench_util.h
@ -0,0 +1,27 @@
 /*
 * Copyright 2021 Benjamin Edgington
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef BENCH_UTIL_H
 #define BENCH_UTIL_H
 #include <time.h> // CLOCK_REALTIME, clock_gettime(), timespec
 #include "c_kzg.h"
 // Shorthand so that `struct timespec` can be written as plain `timespec`.
 typedef struct timespec timespec;
 // Number of nanoseconds in one second.
 #define NANO 1000000000L
 // Elapsed time between `start` and `end`, expressed in nanoseconds.
 unsigned long tdiff(timespec start, timespec end);
 // A 64-bit value assembled from the C library's rand().
 uint64_t rand_uint64(void);
 // A field element built from four rand_uint64() limbs.
 blst_fr rand_fr(void);
 // The G1 generator multiplied by a rand_fr() scalar.
 blst_p1 rand_g1(void);
 #endif // BENCH_UTIL_H
--- a/src/blst_util.c
+++ b/src/blst_util.c
@ -60,7 +60,6 @@ void fr_pow(blst_fr *out, const blst_fr *a, uint64_t n) {
    }
 }
 // TODO: Is there really no better way to do this?
 void p1_mul(blst_p1 *out, const blst_p1 *a, const blst_fr *b) {
    blst_scalar s;
    blst_scalar_from_fr(&s, b);
@ -73,7 +72,6 @@ void p1_sub(blst_p1 *out, const blst_p1 *a, const blst_p1 *b) {
    blst_p1_add_or_double(out, a, &bneg);
 }
 // TODO: Is there really no better way to do this?
 void p2_mul(blst_p2 *out, const blst_p2 *a, const blst_fr *b) {
    blst_scalar s;
    blst_scalar_from_fr(&s, b);
--- a/src/fft_fr.c
+++ b/src/fft_fr.c
@ -37,7 +37,7 @@ void fft_fr_slow(blst_fr *out, const blst_fr *in, uint64_t stride, const blst_fr
 void fft_fr_fast(blst_fr *out, const blst_fr *in, uint64_t stride, const blst_fr *roots, uint64_t roots_stride,
                 uint64_t l) {
    uint64_t half = l / 2;
-    if (half > 2) { // TODO: Tunable parameter
+    if (half > 0) { // TODO: Tunable parameter
        fft_fr_fast(out, in, stride * 2, roots, roots_stride * 2, half);
        fft_fr_fast(out + half, in + stride, stride * 2, roots, roots_stride * 2, half);
        for (uint64_t i = 0; i < half; i++) {
--- a/src/fft_fr_bench.c
+++ b/src/fft_fr_bench.c
@ -0,0 +1,61 @@
 /*
 * Copyright 2021 Benjamin Edgington
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <stdlib.h> // malloc(), free()
 #include <stdio.h>  // printf()
 #include <assert.h> // assert()
 #include "bench_util.h"
 #include "fft_fr.h"
 // Run the benchmark for `max_seconds` and return the time per iteration in nanoseconds.
 long run_bench(int scale, int max_seconds) {
    timespec t0, t1;
    unsigned long total_time = 0, nits = 0;
    FFTSettings fs;
    assert(C_KZG_OK == new_fft_settings(&fs, scale));
    // Allocate on the heap to avoid stack overflow for large sizes
    blst_fr *data, *out;
    data = malloc(fs.max_width * sizeof(blst_fr));
    out = malloc(fs.max_width * sizeof(blst_fr));
    // Fill with randomness
    for (uint64_t i = 0; i < fs.max_width; i++) {
        data[i] = rand_fr();
    }
    while (total_time < max_seconds * NANO) {
        clock_gettime(CLOCK_REALTIME, &t0);
        assert(C_KZG_OK == fft_fr(out, data, &fs, false, fs.max_width));
        clock_gettime(CLOCK_REALTIME, &t1);
        nits++;
        total_time += tdiff(t0, t1);
    }
    free(out);
    free(data);
    return total_time / nits;
 }
 // Time budget, in seconds, given to each benchmarked scale.
 #define NSEC 1
 // Benchmark fft_fr for scales 4 through 15 and print one result line per scale.
 int main(void) {
    printf("*** Benchmarking FFT_fr, %d second%s per test.\n", NSEC, NSEC == 1 ? "" : "s");
    for (int scale = 4; scale < 16; scale++) {
        // Pass NSEC so the banner above always matches the budget actually used;
        // run_bench() returns a signed long, hence %ld.
        printf("fft_fr/scale_%d %ld ns/op\n", scale, run_bench(scale, NSEC));
    }
    return 0;
 }
--- a/src/fft_g1.c
+++ b/src/fft_g1.c
@ -36,7 +36,7 @@ void fft_g1_slow(blst_p1 *out, blst_p1 *in, uint64_t stride, blst_fr *roots, uin
 // Fast Fourier Transform
 void fft_g1_fast(blst_p1 *out, blst_p1 *in, uint64_t stride, blst_fr *roots, uint64_t roots_stride, uint64_t l) {
    uint64_t half = l / 2;
-    if (half > 2) { // TODO: Tunable parameter
+    if (half > 0) { // Tunable parameter
        fft_g1_fast(out, in, stride * 2, roots, roots_stride * 2, half);
        fft_g1_fast(out + half, in + stride, stride * 2, roots, roots_stride * 2, half);
        for (uint64_t i = 0; i < half; i++) {
--- a/src/fft_g1_bench.c
+++ b/src/fft_g1_bench.c
@ -0,0 +1,61 @@
 /*
 * Copyright 2021 Benjamin Edgington
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <stdlib.h> // malloc(), free()
 #include <stdio.h>  // printf()
 #include <assert.h> // assert()
 #include "bench_util.h"
 #include "fft_g1.h"
 // Run the benchmark for `max_seconds` and return the time per iteration in nanoseconds.
 long run_bench(int scale, int max_seconds) {
    timespec t0, t1;
    unsigned long total_time = 0, nits = 0;
    FFTSettings fs;
    assert(C_KZG_OK == new_fft_settings(&fs, scale));
    // Allocate on the heap to avoid stack overflow for large sizes
    blst_p1 *data, *out;
    data = malloc(fs.max_width * sizeof(blst_p1));
    out = malloc(fs.max_width * sizeof(blst_p1));
    // Fill with randomness
    for (uint64_t i = 0; i < fs.max_width; i++) {
        data[i] = rand_g1();
    }
    while (total_time < max_seconds * NANO) {
        clock_gettime(CLOCK_REALTIME, &t0);
        assert(C_KZG_OK == fft_g1(out, data, &fs, false, fs.max_width));
        clock_gettime(CLOCK_REALTIME, &t1);
        nits++;
        total_time += tdiff(t0, t1);
    }
    free(out);
    free(data);
    return total_time / nits;
 }
 // Time budget, in seconds, given to each benchmarked scale.
 #define NSEC 1
 // Benchmark fft_g1 for scales 4 through 15 and print one result line per scale.
 int main(void) {
    printf("*** Benchmarking FFT_g1, %d second%s per test.\n", NSEC, NSEC == 1 ? "" : "s");
    for (int scale = 4; scale < 16; scale++) {
        // Pass NSEC so the banner above always matches the budget actually used;
        // run_bench() returns a signed long, hence %ld.
        printf("fft_g1/scale_%d %ld ns/op\n", scale, run_bench(scale, NSEC));
    }
    return 0;
 }