# Constantine # Copyright (c) 2018-2019 Status Research & Development GmbH # Copyright (c) 2020-Present Mamy André-Ratsimbazafy # Licensed and distributed under either of # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. import # Internals ../constantine/math/config/curves, ../constantine/math/arithmetic, ../constantine/math/elliptic/[ ec_shortweierstrass_affine, ec_shortweierstrass_projective, ec_shortweierstrass_jacobian, ec_shortweierstrass_jacobian_extended, ec_shortweierstrass_batch_ops_parallel, ec_multi_scalar_mul, ec_scalar_mul, ec_multi_scalar_mul_parallel], ../constantine/math/constants/zoo_subgroups, # Threadpool ../constantine/threadpool/[threadpool, partitioners], # Helpers ../helpers/prng_unsafe, ./bench_elliptic_template, ./bench_blueprint export bench_elliptic_template # ############################################################ # # Parallel Benchmark definitions # # ############################################################ proc multiAddParallelBench*(EC: typedesc, numPoints: int, iters: int) = var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](numPoints) for i in 0 ..< numPoints: points[i] = rng.random_unsafe(ECP_ShortW_Aff[EC.F, EC.G]) var r{.noInit.}: EC var tp = Threadpool.new() bench("EC parallel batch add (" & align($tp.numThreads, 2) & " threads) " & $EC.G & " (" & $numPoints & " points)", EC, iters): tp.sum_reduce_vartime_parallel(r, points) tp.shutdown() proc msmParallelBench*(EC: typedesc, numPoints: int, iters: int) = const bits = EC.F.C.getCurveOrderBitwidth() var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](numPoints) var scalars = newSeq[BigInt[bits]](numPoints) # Creating millions of points and clearing their cofactor takes a long long time var tp = Threadpool.new() proc genCoefPointPairs(rngSeed: uint64, start, len: int, points: ptr ECP_ShortW_Aff[EC.F, EC.G], scalars: ptr BigInt[bits]) {.nimcall.} = let points = cast[ptr UncheckedArray[ECP_ShortW_Aff[EC.F, EC.G]]](points) # TODO use views to reduce verbosity let scalars = cast[ptr UncheckedArray[BigInt[bits]]](scalars) # RNGs are not threadsafe, create a threadlocal one seeded from the global RNG var threadRng: RngState threadRng.seed(rngSeed) for i in start ..< start + len: var tmp = threadRng.random_unsafe(EC) tmp.clearCofactor() points[i].affine(tmp) scalars[i] = rng.random_unsafe(BigInt[bits]) let chunks = balancedChunksPrioNumber(0, numPoints, tp.numThreads) syncScope: for (id, start, size) in items(chunks): tp.spawn genCoefPointPairs(rng.next(), start, size, points[0].addr, scalars[0].addr) # Even if child threads are sleeping, it seems like perf is lower when there are threads around # maybe because the kernel has more overhead or time quantum to keep track off so shut them down. tp.shutdown() var r{.noInit.}: EC var startNaive, stopNaive, startMSMbaseline, stopMSMbaseline, startMSMopt, stopMSMopt, startMSMpara, stopMSMpara: MonoTime if numPoints <= 100000: startNaive = getMonotime() bench("EC scalar muls " & align($numPoints, 10) & " (" & $bits & "-bit coefs, points)", EC, iters): var tmp: EC r.setInf() for i in 0 ..< points.len: tmp.fromAffine(points[i]) tmp.scalarMul(scalars[i]) r += tmp stopNaive = getMonotime() if numPoints <= 100000: startMSMbaseline = getMonotime() bench("EC multi-scalar-mul baseline " & align($numPoints, 10) & " (" & $bits & "-bit coefs, points)", EC, iters): r.multiScalarMul_reference_vartime(scalars, points) stopMSMbaseline = getMonotime() block: startMSMopt = getMonotime() bench("EC multi-scalar-mul optimized " & align($numPoints, 10) & " (" & $bits & "-bit coefs, points)", EC, iters): r.multiScalarMul_vartime(scalars, points) stopMSMopt = getMonotime() block: tp = Threadpool.new() startMSMpara = getMonotime() bench("EC multi-scalar-mul" & align($tp.numThreads & " threads", 11) & align($numPoints, 10) & " (" & $bits & "-bit coefs, points)", EC, iters): tp.multiScalarMul_vartime_parallel(r, scalars, points) stopMSMpara = getMonotime() tp.shutdown() let perfNaive = inNanoseconds((stopNaive-startNaive) div iters) let perfMSMbaseline = inNanoseconds((stopMSMbaseline-startMSMbaseline) div iters) let perfMSMopt = inNanoseconds((stopMSMopt-startMSMopt) div iters) let perfMSMpara = inNanoseconds((stopMSMpara-startMSMpara) div iters) if numPoints <= 100000: let speedupBaseline = float(perfNaive) / float(perfMSMbaseline) echo &"Speedup ratio baseline over naive linear combination: {speedupBaseline:>6.3f}x" let speedupOpt = float(perfNaive) / float(perfMSMopt) echo &"Speedup ratio optimized over naive linear combination: {speedupOpt:>6.3f}x" let speedupOptBaseline = float(perfMSMbaseline) / float(perfMSMopt) echo &"Speedup ratio optimized over baseline linear combination: {speedupOptBaseline:>6.3f}x" let speedupParaOpt = float(perfMSMopt) / float(perfMSMpara) echo &"Speedup ratio parallel over optimized linear combination: {speedupParaOpt:>6.3f}x"