Add benchmarking for FFTs

This commit is contained in:
Ben Edgington 2021-02-07 20:58:19 +00:00
parent 80c513f653
commit b9bc4bb496
9 changed files with 216 additions and 6 deletions

5
.gitignore vendored
View File

@ -1,7 +1,10 @@
*.o *.o
*.a *.a
*_test *_test
a.out *_bench
*.prof
*.out
*.log
tmp/ tmp/
inc/blst.h inc/blst.h
inc/blst_aux.h inc/blst_aux.h

View File

@ -1,5 +1,6 @@
TESTS = blst_util_test c_kzg_util_test fft_common_test fft_fr_test fft_g1_test \ TESTS = blst_util_test c_kzg_util_test fft_common_test fft_fr_test fft_g1_test \
kzg_proofs_test poly_test kzg_proofs_test poly_test
BENCH = fft_fr_bench fft_g1_bench
LIB_SRC = blst_util.c c_kzg_util.c fft_common.c fft_fr.c fft_g1.c kzg_proofs.c poly.c LIB_SRC = blst_util.c c_kzg_util.c fft_common.c fft_fr.c fft_g1.c kzg_proofs.c poly.c
LIB_OBJ = $(LIB_SRC:.c=.o) LIB_OBJ = $(LIB_SRC:.c=.o)
@ -17,13 +18,25 @@ libckzg.a: $(LIB_OBJ) Makefile
clang -Wall $(CFLAGS) -o $@ $@.c debug_util.o libckzg.a -L../lib -lblst clang -Wall $(CFLAGS) -o $@ $@.c debug_util.o libckzg.a -L../lib -lblst
./$@ ./$@
%_bench: %_bench.c bench_util.o $(LIB_OBJ) Makefile
clang -Wall $(CFLAGS) -o $@ $@.c bench_util.o $(LIB_OBJ) -L../lib -lblst
./$@
lib: clean libckzg.a lib: clean libckzg.a
debuglib: CFLAGS += -g -DDEBUG debuglib: CFLAGS += -O1 -DDEBUG
debuglib: clean libckzg.a debuglib: clean libckzg.a
optlib: CFLAGS += -O2
optlib: clean libckzg.a
profilelib: CFLAGS += -fprofile-instr-generate -fcoverage-mapping
profilelib: clean libckzg.a
test: $(TESTS) test: $(TESTS)
bench: $(BENCH)
clean: clean:
rm -f *.o rm -f *.o
rm -f libckzg.a rm -f libckzg.a

47
src/bench_util.c Normal file
View File

@ -0,0 +1,47 @@
/*
* Copyright 2021 Benjamin Edgington
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdlib.h> // rand()
#include "bench_util.h"
#include "blst_util.h"
/**
 * Measure the interval between two timestamps.
 *
 * The whole-second and sub-second parts are differenced separately and then
 * combined, so the sub-second difference may be negative without affecting the
 * total.
 *
 * @param start Timestamp taken at the beginning of the interval
 * @param end   Timestamp taken at the end of the interval
 * @return The interval `end - start` expressed in nanoseconds
 */
unsigned long tdiff(timespec start, timespec end) {
    time_t whole_seconds = end.tv_sec - start.tv_sec;
    long leftover_nanos = end.tv_nsec - start.tv_nsec;
    return whole_seconds * NANO + leftover_nanos;
}
/**
 * Produce a pseudo-random 64-bit value using `rand()`.
 *
 * The C standard only guarantees that `RAND_MAX` is at least 32767, i.e. that
 * each call to `rand()` yields 15 usable bits. Gluing two calls together with a
 * 32-bit shift therefore leaves some bit positions permanently zero (bits 31
 * and 63 when `RAND_MAX` is 2^31 - 1, and many more when it is 32767). Instead,
 * accumulate 15 bits per call; five calls supply 75 bits, of which the oldest
 * 11 are shifted out, so every one of the 64 result bits is populated.
 *
 * The sequence is determined by the `rand()` seed; this is intended for
 * generating benchmark data and is not cryptographically secure.
 *
 * @return A pseudo-random value covering the full 64-bit range
 */
uint64_t rand_uint64() {
    uint64_t ret = 0;
    for (int i = 0; i < 5; i++) {
        ret = (ret << 15) | (uint64_t)(rand() & 0x7fff);
    }
    return ret;
}
/**
 * Produce a pseudo-random field element.
 *
 * Four 64-bit words are drawn from rand_uint64(), in index order, and handed to
 * `blst_fr_from_uint64()` to build the `blst_fr` value.
 *
 * @return The field element constructed from the four random words
 */
blst_fr rand_fr() {
    uint64_t limbs[4];
    for (int i = 0; i < 4; i++) {
        limbs[i] = rand_uint64();
    }

    blst_fr element;
    blst_fr_from_uint64(&element, limbs);
    return element;
}
/**
 * Produce a pseudo-random G1 group element.
 *
 * A random scalar is drawn with rand_fr() and the point returned by
 * `blst_p1_generator()` is multiplied by it using p1_mul().
 *
 * @return The generator multiplied by a random field element
 */
blst_p1 rand_g1() {
    blst_fr scalar = rand_fr();
    blst_p1 point;
    p1_mul(&point, blst_p1_generator(), &scalar);
    return point;
}

27
src/bench_util.h Normal file
View File

@ -0,0 +1,27 @@
/*
* Copyright 2021 Benjamin Edgington
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <time.h> // CLOCK_REALTIME, clock_gettime(), timespec
#include "c_kzg.h"
/** Shorthand so that `struct timespec` can be written simply as `timespec`. */
typedef struct timespec timespec;
/** Number of nanoseconds in one second; used to convert `tv_sec` differences to nanoseconds. */
#define NANO 1000000000L
/** Returns the time elapsed from `start` to `end`, in nanoseconds. */
unsigned long tdiff(timespec start, timespec end);
/** Returns a pseudo-random 64-bit value derived from the C library's `rand()`. */
uint64_t rand_uint64();
/** Returns a field element built from four pseudo-random 64-bit words. */
blst_fr rand_fr();
/** Returns the G1 generator multiplied by a pseudo-random field element. */
blst_p1 rand_g1();

View File

@ -60,7 +60,6 @@ void fr_pow(blst_fr *out, const blst_fr *a, uint64_t n) {
} }
} }
// TODO: Is there really no better way to do this?
void p1_mul(blst_p1 *out, const blst_p1 *a, const blst_fr *b) { void p1_mul(blst_p1 *out, const blst_p1 *a, const blst_fr *b) {
blst_scalar s; blst_scalar s;
blst_scalar_from_fr(&s, b); blst_scalar_from_fr(&s, b);
@ -73,7 +72,6 @@ void p1_sub(blst_p1 *out, const blst_p1 *a, const blst_p1 *b) {
blst_p1_add_or_double(out, a, &bneg); blst_p1_add_or_double(out, a, &bneg);
} }
// TODO: Is there really no better way to do this?
void p2_mul(blst_p2 *out, const blst_p2 *a, const blst_fr *b) { void p2_mul(blst_p2 *out, const blst_p2 *a, const blst_fr *b) {
blst_scalar s; blst_scalar s;
blst_scalar_from_fr(&s, b); blst_scalar_from_fr(&s, b);

View File

@ -37,7 +37,7 @@ void fft_fr_slow(blst_fr *out, const blst_fr *in, uint64_t stride, const blst_fr
void fft_fr_fast(blst_fr *out, const blst_fr *in, uint64_t stride, const blst_fr *roots, uint64_t roots_stride, void fft_fr_fast(blst_fr *out, const blst_fr *in, uint64_t stride, const blst_fr *roots, uint64_t roots_stride,
uint64_t l) { uint64_t l) {
uint64_t half = l / 2; uint64_t half = l / 2;
if (half > 2) { // TODO: Tunable parameter if (half > 0) { // TODO: Tunable parameter
fft_fr_fast(out, in, stride * 2, roots, roots_stride * 2, half); fft_fr_fast(out, in, stride * 2, roots, roots_stride * 2, half);
fft_fr_fast(out + half, in + stride, stride * 2, roots, roots_stride * 2, half); fft_fr_fast(out + half, in + stride, stride * 2, roots, roots_stride * 2, half);
for (uint64_t i = 0; i < half; i++) { for (uint64_t i = 0; i < half; i++) {

61
src/fft_fr_bench.c Normal file
View File

@ -0,0 +1,61 @@
/*
* Copyright 2021 Benjamin Edgington
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdlib.h> // malloc(), free()
#include <stdio.h> // printf()
#include <assert.h> // assert()
#include "bench_util.h"
#include "fft_fr.h"
// Run the benchmark for `max_seconds` and return the time per iteration in nanoseconds.
//
// Sets up FFT settings for width 2^scale (as given by `fs.max_width`), fills an input array with
// random field elements, then repeatedly times a forward `fft_fr()` over the whole array until the
// accumulated time reaches the budget.
//
// - `scale`: passed to `new_fft_settings()` to size the transform
// - `max_seconds`: time budget in seconds; a non-positive value runs no iterations
//
// Returns the mean time per FFT in nanoseconds, or 0 if no iteration was run. Any failure during
// set-up or in the FFT itself prints a message to stderr and terminates the program.
long run_bench(int scale, int max_seconds) {
    timespec t0, t1;
    unsigned long total_time = 0, nits = 0;
    // Guard against non-positive budgets, which would otherwise wrap when converted to unsigned
    const unsigned long budget = max_seconds > 0 ? (unsigned long)max_seconds * NANO : 0;
    FFTSettings fs;

    // Calls with side effects must not live inside assert(): building with -DNDEBUG would remove
    // them entirely, including the very call being benchmarked.
    if (new_fft_settings(&fs, scale) != C_KZG_OK) {
        fprintf(stderr, "run_bench: new_fft_settings() failed for scale %d\n", scale);
        exit(EXIT_FAILURE);
    }
    // NOTE(review): if new_fft_settings() allocates, `fs` should be released before returning -
    // confirm against fft_common.h.

    // Allocate on the heap to avoid stack overflow for large sizes
    blst_fr *data = malloc(fs.max_width * sizeof *data);
    blst_fr *out = malloc(fs.max_width * sizeof *out);
    if (data == NULL || out == NULL) {
        fprintf(stderr, "run_bench: out of memory for scale %d\n", scale);
        free(out);
        free(data);
        exit(EXIT_FAILURE);
    }

    // Fill with randomness
    for (uint64_t i = 0; i < fs.max_width; i++) {
        data[i] = rand_fr();
    }

    while (total_time < budget) {
        clock_gettime(CLOCK_REALTIME, &t0);
        int ok = (C_KZG_OK == fft_fr(out, data, &fs, false, fs.max_width));
        clock_gettime(CLOCK_REALTIME, &t1);
        // Check the result outside the timed region so that error handling is not measured
        if (!ok) {
            fprintf(stderr, "run_bench: fft_fr() failed for scale %d\n", scale);
            free(out);
            free(data);
            exit(EXIT_FAILURE);
        }
        nits++;
        total_time += tdiff(t0, t1);
    }

    free(out);
    free(data);

    // Avoid dividing by zero when the budget allowed no iterations
    return nits == 0 ? 0 : (long)(total_time / nits);
}
// Number of seconds to spend benchmarking each scale
#define NSEC 1

// Benchmark fft_fr for scales 4 to 15 inclusive, printing one line per scale with the mean time
// per operation in nanoseconds as returned by run_bench().
int main(void) {
    printf("*** Benchmarking FFT_fr, %d second%s per test.\n", NSEC, NSEC == 1 ? "" : "s");
    for (int scale = 4; scale < 16; scale++) {
        // Pass NSEC so that the duration announced in the banner is the one actually used;
        // run_bench() returns a long, hence %ld.
        printf("fft_fr/scale_%d %ld ns/op\n", scale, run_bench(scale, NSEC));
    }
    return 0;
}

View File

@ -36,7 +36,7 @@ void fft_g1_slow(blst_p1 *out, blst_p1 *in, uint64_t stride, blst_fr *roots, uin
// Fast Fourier Transform // Fast Fourier Transform
void fft_g1_fast(blst_p1 *out, blst_p1 *in, uint64_t stride, blst_fr *roots, uint64_t roots_stride, uint64_t l) { void fft_g1_fast(blst_p1 *out, blst_p1 *in, uint64_t stride, blst_fr *roots, uint64_t roots_stride, uint64_t l) {
uint64_t half = l / 2; uint64_t half = l / 2;
if (half > 2) { // TODO: Tunable parameter if (half > 0) { // Tunable parameter
fft_g1_fast(out, in, stride * 2, roots, roots_stride * 2, half); fft_g1_fast(out, in, stride * 2, roots, roots_stride * 2, half);
fft_g1_fast(out + half, in + stride, stride * 2, roots, roots_stride * 2, half); fft_g1_fast(out + half, in + stride, stride * 2, roots, roots_stride * 2, half);
for (uint64_t i = 0; i < half; i++) { for (uint64_t i = 0; i < half; i++) {

61
src/fft_g1_bench.c Normal file
View File

@ -0,0 +1,61 @@
/*
* Copyright 2021 Benjamin Edgington
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdlib.h> // malloc(), free()
#include <stdio.h> // printf()
#include <assert.h> // assert()
#include "bench_util.h"
#include "fft_g1.h"
// Run the benchmark for `max_seconds` and return the time per iteration in nanoseconds.
//
// Sets up FFT settings for width 2^scale (as given by `fs.max_width`), fills an input array with
// random G1 points, then repeatedly times a forward `fft_g1()` over the whole array until the
// accumulated time reaches the budget.
//
// - `scale`: passed to `new_fft_settings()` to size the transform
// - `max_seconds`: time budget in seconds; a non-positive value runs no iterations
//
// Returns the mean time per FFT in nanoseconds, or 0 if no iteration was run. Any failure during
// set-up or in the FFT itself prints a message to stderr and terminates the program.
long run_bench(int scale, int max_seconds) {
    timespec t0, t1;
    unsigned long total_time = 0, nits = 0;
    // Guard against non-positive budgets, which would otherwise wrap when converted to unsigned
    const unsigned long budget = max_seconds > 0 ? (unsigned long)max_seconds * NANO : 0;
    FFTSettings fs;

    // Calls with side effects must not live inside assert(): building with -DNDEBUG would remove
    // them entirely, including the very call being benchmarked.
    if (new_fft_settings(&fs, scale) != C_KZG_OK) {
        fprintf(stderr, "run_bench: new_fft_settings() failed for scale %d\n", scale);
        exit(EXIT_FAILURE);
    }
    // NOTE(review): if new_fft_settings() allocates, `fs` should be released before returning -
    // confirm against fft_common.h.

    // Allocate on the heap to avoid stack overflow for large sizes
    blst_p1 *data = malloc(fs.max_width * sizeof *data);
    blst_p1 *out = malloc(fs.max_width * sizeof *out);
    if (data == NULL || out == NULL) {
        fprintf(stderr, "run_bench: out of memory for scale %d\n", scale);
        free(out);
        free(data);
        exit(EXIT_FAILURE);
    }

    // Fill with randomness
    for (uint64_t i = 0; i < fs.max_width; i++) {
        data[i] = rand_g1();
    }

    while (total_time < budget) {
        clock_gettime(CLOCK_REALTIME, &t0);
        int ok = (C_KZG_OK == fft_g1(out, data, &fs, false, fs.max_width));
        clock_gettime(CLOCK_REALTIME, &t1);
        // Check the result outside the timed region so that error handling is not measured
        if (!ok) {
            fprintf(stderr, "run_bench: fft_g1() failed for scale %d\n", scale);
            free(out);
            free(data);
            exit(EXIT_FAILURE);
        }
        nits++;
        total_time += tdiff(t0, t1);
    }

    free(out);
    free(data);

    // Avoid dividing by zero when the budget allowed no iterations
    return nits == 0 ? 0 : (long)(total_time / nits);
}
// Number of seconds to spend benchmarking each scale
#define NSEC 1

// Benchmark fft_g1 for scales 4 to 15 inclusive, printing one line per scale with the mean time
// per operation in nanoseconds as returned by run_bench().
int main(void) {
    printf("*** Benchmarking FFT_g1, %d second%s per test.\n", NSEC, NSEC == 1 ? "" : "s");
    for (int scale = 4; scale < 16; scale++) {
        // Pass NSEC so that the duration announced in the banner is the one actually used;
        // run_bench() returns a long, hence %ld.
        printf("fft_g1/scale_%d %ld ns/op\n", scale, run_bench(scale, NSEC));
    }
    return 0;
}