constantine/constantine/math/elliptic/ec_multi_scalar_mul_parallel.nim
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ./ec_multi_scalar_mul_scheduler,
./ec_multi_scalar_mul,
./ec_endomorphism_accel,
../extension_fields,
../constants/zoo_endomorphisms,
../../threadpool/[threadpool, partitioners]
export bestBucketBitSize
# No exceptions allowed in core cryptographic operations
{.push raises: [].}
{.push checks: off.}
# ########################################################### #
#                                                             #
#            Parallel Multi Scalar Multiplication             #
#                                                             #
# ########################################################### #
#
# Writeup
#
# Recall the reference implementation in pseudocode
#
# func multiScalarMulImpl_reference_vartime
#
# c <- fn(numPoints) with `fn` a function that minimizes the total number of Elliptic Curve additions
# in the order of log2(numPoints) - 3
# numWindows <- ⌈coefBits/c⌉
# numBuckets <- 2ᶜ⁻¹
# r <- ∅ (The elliptic curve infinity point)
#
# miniMSMs[0..<numWindows] <- ∅
#
# // 0.a MiniMSMs accumulation
# for w in 0 ..< numWindows:
#
# // 1.a Bucket accumulation
# buckets[0..<numBuckets] <- ∅
# for j in 0 ..< numPoints:
# b <- coefs[j].getWindowAt(w*c, c)
# buckets[b] += points[j]
#
# // 1.r Bucket reduction
# accumBuckets <- ∅
# for k in countdown(numBuckets-1, 0):
# accumBuckets += buckets[k]
# miniMSMs[w] += accumBuckets
#
# // 0.r MiniMSM reduction
# for w in countdown(numWindows-1, 0):
# for _ in 0 ..< c:
# r.double()
# r += miniMSMs[w]
#
# return r
#
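# Worked example of the sizing above: for numPoints = 2¹⁶ and 255-bit coefficients
# (e.g. the BLS12-381 scalar field), the table below gives c = 14, hence
# numWindows = ⌈255/14⌉ = 19 and numBuckets = 2¹³ = 8192.
# Bucket accumulation then costs ~numPoints EC additions per window,
# and bucket reduction ~2·numBuckets EC additions per window.
#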
# A comprehensive mapping: inputSize, c, numBuckets is in ec_multi_scalar_mul_scheduler.nim
#
#  -------inputs-------   c   ----buckets----   queue length   collision map bytes   num collisions   collision %
#  2^5          32        5   2^4         16        -108                 8                -216           -675.0%
#  2^10       1024        9   2^8        256          52                32                 208             20.3%
#  2^13       8192       11   2^10      1024         180               128                1440             17.6%
#  2^16      65536       14   2^13      8192         432              1024                3456              5.3%
#  2^18     262144       16   2^15     32768         640              4096                5120              2.0%  <- ~10MB of buckets
#  2^20    1048576       17   2^16     65536         756              8192               12096              1.2%
#  2^26   67108864       23   2^22   4194304        1620            524288               25920              0.0%
#
# The coefficient bit-width is usually between 128 and 377, depending on endomorphism acceleration and the elliptic curve.
#
# Starting from 64 points, parallelism seems to always be beneficial (the serial version takes over 1 ms on a laptop).
#
# There are 3 parallelism opportunities (window-level parallelism is sketched right after this list):
# - 0.a MiniMSMs accumulation a.k.a "window-level parallelism"
#   is straightforward as there are no data dependencies at all.
# - 1.a Buckets accumulation a.k.a "bucket-level parallelism".
#   Accumulation needs to be parallelized over buckets rather than points to avoid synchronization between threads.
#   The disadvantage is that every thread scans all the points.
# - Doing separate MSMs over subsets of the points, a.k.a "msm-level parallelism".
#   As the number of points grows, the cost of scalar-mul per point diminishes at the rate O(n/log n), as we can increase the window size `c`
#   to reduce the number of operations. However, when `c` reaches 16, memory bandwidth becomes another bottleneck,
#   hence parallelizing at this level becomes interesting.
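#
# A minimal sketch of window-level parallelism (hypothetical helper names;
# the real entry points follow below):
#
#   for w in 0 ..< numWindows:
#     windowReady[w] = tp.spawnAwaitable accumReduceWindow(windowSums[w].addr, w, ...)
#   # The final reduction then syncs windowReady[w] from top to bottom,
#   # doubling the accumulator c times between windows.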
#
# We can also parallelize the reductions but they would require extra doublings to "place the reduction" at the right bits.
# Example:
# let's say we compute the binary number 0b11010110
# Each 1 is add+double, each 0 is just double.
# We can split the computation in parallel 0b1101 << 4 and 0b0110
# but now we need 4 extra doublings to shift the high part in the correct place.
# Alternatively, we can do "latency hiding": start the computation before all results are available, and only wait for the next part when it is actually needed.
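# (Check: 0b11010110 = 214 and (0b1101 shl 4) + 0b0110 = 208 + 6 = 214.)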
#
# Now, with a small c, say 1024 inputs, c=9, the window-level parallelism is large: 14 to 28 for 128-bit to 256-bit coefs.
# For large c, say 262k inputs, c=16, the window-level parallelism is small: 8 to 16 for 128-bit to 256-bit coefs.
#
# Zero Knowledge protocols need to operate on millions of points, so we want to fully occupy high-end CPUs
# - AMD EPYC 9654 96C/192T on 2 sockets hence 384 threads
# - Intel Xeon Platinum 8490H 60C/120T on 8 sockets hence 960 threads
#
# Bucket-level parallelism has a multiplicative factor on the parallelism exposed.
# Impact, in order of importance, of a high chunking factor:
# + the more parallelism opportunities we offer.
# - the more collisions we have when setting up sparse vector affine addition.
# + the more fine-grained the bucket reduction can be interleaved.
# - the more passes over the data there are.
# - the more memory we use.
#
# Hence, do we want inner parallelism to be
# - a fraction of the number of threads?
# - a multiple of the number of threads?
# - a fraction of the number of buckets?
#
# Back to latency hiding,
# 1. The reductions can be done bottom bits to top bits or top bits to bottom bits.
#    Often bits/c has a remainder, so the top window is smaller; reducing top to bottom
#    would allow starting the reduction while the rest of the windows are still being processed.
# 2. A thread processes its own tasks in LIFO order,
#    but thieves steal tasks in FIFO order.
#
# Due to 1, it's probably best to reduce in a staggered manner from top to bottom.
# But then in which order do we issue accumulations?
# 1. Do we schedule the top bits first, in the hope they will be stolen (FIFO thefts)?
# 2. Or do we schedule the top bits last, so that once we reduce, we directly dequeue a related task (LIFO dequeueing)?
#
# Lastly, we can push latency hiding further for bucket-level parallelism by using
# decreasing range sizes, so that the top ranges are ready earlier for interleaved reduction.

# Parallel MSM Jacobian Extended
# ------------------------------
proc bucketAccumReduce_jacext_zeroMem[EC, F, G; bits: static int](
windowSum: ptr EC,
buckets: ptr ECP_ShortW_JacExt[F, G] or ptr UncheckedArray[ECP_ShortW_JacExt[F, G]],
bitIndex: int, miniMsmKind: static MiniMsmKind, c: static int,
coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]], N: int) =
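  ## Zero-initialize the `numBuckets = 2ᶜ⁻¹` buckets, then accumulate the points
  ## falling into this window's buckets and reduce them into `windowSum`.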
const numBuckets = 1 shl (c-1)
let buckets = cast[ptr UncheckedArray[ECP_ShortW_JacExt[F, G]]](buckets)
zeroMem(buckets, sizeof(ECP_ShortW_JacExt[F, G]) * numBuckets)
bucketAccumReduce_jacext(windowSum[], buckets, bitIndex, miniMsmKind, c, coefs, points, N)
proc msmJacExt_vartime_parallel*[bits: static int, EC, F, G](
tp: Threadpool,
r: ptr EC,
coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
N: int, c: static int) =
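  ## Multi-scalar multiplication r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ with extended Jacobian buckets.
  ## Each c-bit window is accumulated and reduced in its own task (window-level parallelism);
  ## the top window runs on the calling thread, then windows are combined top-to-bottom.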
# Prologue
# --------
const numBuckets = 1 shl (c-1)
const numFullWindows = bits div c
const numWindows = numFullWindows + 1 # Even if `bits div c` is exact, the signed recoding needs to see an extra 0 after the MSB
  # Instead of storing the results in futures, risking them being scattered in memory,
  # we store them in a contiguous array, and the synchronizing future just returns a bool.
  # The top window is computed on this thread.
let miniMSMsResults = allocHeapArray(EC, numFullWindows)
  let miniMSMsReady   = allocStackArray(Flowvar[bool], numFullWindows)
let bucketsMatrix = allocHeapArray(ECP_ShortW_JacExt[F, G], numBuckets*numWindows)
# Algorithm
# ---------
block: # 1. Bucket accumulation and reduction
miniMSMsReady[0] = tp.spawnAwaitable bucketAccumReduce_jacext_zeroMem(
miniMSMsResults[0].addr,
bucketsMatrix[0].addr,
bitIndex = 0, kBottomWindow, c,
coefs, points, N)
for w in 1 ..< numFullWindows:
miniMSMsReady[w] = tp.spawnAwaitable bucketAccumReduce_jacext_zeroMem(
miniMSMsResults[w].addr,
bucketsMatrix[w*numBuckets].addr,
bitIndex = w*c, kFullWindow, c,
coefs, points, N)
# Last window is done sync on this thread, directly initializing r
const excess = bits mod c
const top = bits-excess
when top != 0:
when excess != 0:
bucketAccumReduce_jacext_zeroMem(
r,
bucketsMatrix[numFullWindows*numBuckets].addr,
bitIndex = top, kTopWindow, c,
coefs, points, N)
else:
r[].setInf()
  # 2. Final reduction, r is initialized to what would have been miniMSMsResults[numWindows-1]
when excess != 0:
for w in countdown(numWindows-2, 0):
for _ in 0 ..< c:
r[].double()
discard sync miniMSMsReady[w]
r[].sum_vartime(r[], miniMSMsResults[w])
elif numWindows >= 2:
discard sync miniMSMsReady[numWindows-2]
r[] = miniMSMsResults[numWindows-2]
for w in countdown(numWindows-3, 0):
for _ in 0 ..< c:
r[].double()
discard sync miniMSMsReady[w]
r[].sum_vartime(r[], miniMSMsResults[w])
# Cleanup
# -------
miniMSMsResults.freeHeap()
bucketsMatrix.freeHeap()
# Parallel MSM Affine - bucket accumulation
# -----------------------------------------
proc bucketAccumReduce_serial[bits: static int, EC, F, G](
r: ptr EC,
bitIndex: int,
miniMsmKind: static MiniMsmKind, c: static int,
coefs: ptr UncheckedArray[BigInt[bits]],
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
N: int) =
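  ## Accumulate and reduce a single c-bit window on the calling thread,
  ## using the affine scheduler over the full bucket range, then convert
  ## the extended Jacobian window sum into `r`.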
const (numBuckets, queueLen) = c.deriveSchedulerConstants()
let buckets = allocHeap(Buckets[numBuckets, F, G])
let sched = allocHeap(Scheduler[numBuckets, queueLen, F, G])
sched.init(points, buckets, 0, numBuckets.int32)
# 1. Bucket Accumulation
sched.schedAccumulate(bitIndex, miniMsmKind, c, coefs, N)
# 2. Bucket Reduction
var windowSum{.noInit.}: ECP_ShortW_JacExt[F, G]
windowSum.bucketReduce(sched.buckets)
r[].fromJacobianExtended_vartime(windowSum)
# Cleanup
# ----------------
sched.freeHeap()
buckets.freeHeap()
proc bucketAccumReduce_parallel[bits: static int, EC, F, G](
tp: Threadpool,
r: ptr EC,
bitIndex: int,
miniMsmKind: static MiniMsmKind, c: static int,
coefs: ptr UncheckedArray[BigInt[bits]],
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
N: int) =
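  ## Accumulate and reduce a single c-bit window with bucket-level parallelism:
  ## the bucket range is split into power-of-2 chunks, each with its own scheduler
  ## and task, and the reduction on this thread syncs each chunk just before
  ## consuming its bucket range (latency hiding).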
const (numBuckets, queueLen) = c.deriveSchedulerConstants()
  const windowParallelism = bits div c # This is floorDiv; ceilDiv would be exact, but the top window might be too small to matter
var bucketParallelism = 1'i32
while windowParallelism*bucketParallelism < tp.numThreads:
bucketParallelism = bucketParallelism shl 1
let numChunks = bucketParallelism
let chunkSize = int32(numBuckets) shr log2_vartime(cast[uint32](numChunks)) # Both are power of 2 so exact division
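  # Example (for illustration): bits = 255, c = 16 => windowParallelism = 15.
  # With 64 threads, bucketParallelism grows to 8 (15·8 = 120 >= 64),
  # so numChunks = 8 and chunkSize = 2¹⁵ shr 3 = 4096 buckets per chunk.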
  let chunksReadiness = allocStackArray(Flowvar[bool], numChunks-1) # Last chunk is done on this thread
let buckets = allocHeap(Buckets[numBuckets, F, G])
let scheds = allocHeapArray(Scheduler[numBuckets, queueLen, F, G], numChunks)
block: # 1. Bucket Accumulation
for chunkID in 0'i32 ..< numChunks-1:
let idx = chunkID*chunkSize
scheds[chunkID].addr.init(points, buckets, idx, idx+chunkSize)
chunksReadiness[chunkID] = tp.spawnAwaitable schedAccumulate(scheds[chunkID].addr, bitIndex, miniMsmKind, c, coefs, N)
# Last bucket is done sync on this thread
scheds[numChunks-1].addr.init(points, buckets, (numChunks-1)*chunkSize, int32 numBuckets)
scheds[numChunks-1].addr.schedAccumulate(bitIndex, miniMsmKind, c, coefs, N)
block: # 2. Bucket reduction with latency hiding
var windowSum{.noInit.}: ECP_ShortW_JacExt[F, G]
var accumBuckets{.noinit.}: ECP_ShortW_JacExt[F, G]
if kAffine in buckets.status[numBuckets-1]:
if kJacExt in buckets.status[numBuckets-1]:
accumBuckets.madd_vartime(buckets.ptJacExt[numBuckets-1], buckets.ptAff[numBuckets-1])
else:
accumBuckets.fromAffine(buckets.ptAff[numBuckets-1])
elif kJacExt in buckets.status[numBuckets-1]:
accumBuckets = buckets.ptJacExt[numBuckets-1]
else:
accumBuckets.setInf()
windowSum = accumBuckets
buckets.reset(numBuckets-1)
var nextBatch = numBuckets-1-chunkSize
var nextFutureIdx = numChunks-2
for k in countdown(numBuckets-2, 0):
if k == nextBatch:
discard sync(chunksReadiness[nextFutureIdx])
nextBatch -= chunkSize
nextFutureIdx -= 1
if kAffine in buckets.status[k]:
if kJacExt in buckets.status[k]:
var t{.noInit.}: ECP_ShortW_JacExt[F, G]
t.madd_vartime(buckets.ptJacExt[k], buckets.ptAff[k])
accumBuckets += t
else:
accumBuckets += buckets.ptAff[k]
elif kJacExt in buckets.status[k]:
accumBuckets += buckets.ptJacExt[k]
buckets.reset(k)
windowSum += accumBuckets
r[].fromJacobianExtended_vartime(windowSum)
# Cleanup
# ----------------
scheds.freeHeap()
buckets.freeHeap()
# Parallel MSM Affine - window-level only
# ---------------------------------------
proc msmAffine_vartime_parallel*[bits: static int, EC, F, G](
tp: Threadpool,
r: ptr EC,
coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
N: int, c: static int, useParallelBuckets: static bool) =
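  ## Multi-scalar multiplication r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ using affine buckets
  ## with window-level parallelism. Each window's bucket accumulation+reduction runs
  ## either serially or with bucket-level parallelism depending on `useParallelBuckets`;
  ## the top window runs on the calling thread.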
# Prologue
# --------
const numBuckets = 1 shl (c-1)
const numFullWindows = bits div c
const numWindows = numFullWindows + 1 # Even if `bits div c` is exact, the signed recoding needs to see an extra 0 after the MSB
  # Instead of storing the results in futures, risking them being scattered in memory,
  # we store them in a contiguous array, and the synchronizing future just returns a bool.
  # The top window is computed on this thread.
  let miniMSMsResults = allocHeapArray(EC, numFullWindows)
let miniMSMsReady = allocStackArray(Flowvar[bool], numFullWindows)
# Algorithm
# ---------
# 1. mini-MSMs: Bucket accumulation and reduction
when useParallelBuckets:
miniMSMsReady[0] = tp.spawnAwaitable bucketAccumReduce_parallel(
tp, miniMSMsResults[0].addr,
bitIndex = 0, kBottomWindow, c,
coefs, points, N)
for w in 1 ..< numFullWindows:
miniMSMsReady[w] = tp.spawnAwaitable bucketAccumReduce_parallel(
tp, miniMSMsResults[w].addr,
bitIndex = w*c, kFullWindow, c,
coefs, points, N)
else:
miniMSMsReady[0] = tp.spawnAwaitable bucketAccumReduce_serial(
miniMSMsResults[0].addr,
bitIndex = 0, kBottomWindow, c,
coefs, points, N)
for w in 1 ..< numFullWindows:
miniMSMsReady[w] = tp.spawnAwaitable bucketAccumReduce_serial(
miniMSMsResults[w].addr,
bitIndex = w*c, kFullWindow, c,
coefs, points, N)
# Last window is done sync on this thread, directly initializing r
const excess = bits mod c
const top = bits-excess
when top != 0:
when excess != 0:
let buckets = allocHeapArray(ECP_ShortW_JacExt[F, G], numBuckets)
zeroMem(buckets[0].addr, sizeof(ECP_ShortW_JacExt[F, G]) * numBuckets)
r[].bucketAccumReduce_jacext(buckets, bitIndex = top, kTopWindow, c,
coefs, points, N)
buckets.freeHeap()
else:
r[].setInf()
  # 2. Final reduction with latency hiding, r is initialized to what would have been miniMSMsResults[numWindows-1]
when excess != 0:
for w in countdown(numWindows-2, 0):
for _ in 0 ..< c:
r[].double()
discard sync miniMSMsReady[w]
r[].sum_vartime(r[], miniMSMsResults[w])
elif numWindows >= 2:
discard sync miniMSMsReady[numWindows-2]
r[] = miniMSMsResults[numWindows-2]
for w in countdown(numWindows-3, 0):
for _ in 0 ..< c:
r[].double()
discard sync miniMSMsReady[w]
r[].sum_vartime(r[], miniMSMsResults[w])
# Cleanup
# -------
miniMSMsResults.freeHeap()
proc msmAffine_vartime_parallel_split[bits: static int, EC, F, G](
tp: Threadpool,
r: ptr EC,
coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
N: int, c: static int, useParallelBuckets: static bool) =
# Parallelism levels:
  # - MSM-level parallelism: compute independent MSMs; this increases the total number of ops
  # - window-level parallelism: compute the MSM outer loop (windows) on different threads; this has no tradeoffs
  # - bucket-level parallelism: handle ranges of buckets on different threads; threads do superfluous overlapping memory reads
  #
  # It seems beneficial to have both msm-level and bucket-level parallelism.
  # Guaranteeing at least 2x more tasks than threads likely avoids starvation.
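  # Example (for illustration): bits = 128, c = 16 => windowParallelism = 8.
  # With 192 threads, msmParallelism doubles until 8·32 = 256 >= 192,
  # i.e. the input is split into 32 independent MSMs.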
  var windowParallelism = bits div c # This is floorDiv; ceilDiv would be exact, but the top window might be too small to matter
var msmParallelism = 1'i32
while windowParallelism*msmParallelism < tp.numThreads:
windowParallelism = bits div c # This is an approximation
msmParallelism = msmParallelism shl 1
if msmParallelism == 1:
msmAffine_vartime_parallel(tp, r, coefs, points, N, c, useParallelBuckets)
return
let chunkingDescriptor = balancedChunksPrioNumber(0, N, msmParallelism)
let splitMSMsResults = allocHeapArray(typeof(r[]), msmParallelism-1)
let splitMSMsReady = allocStackArray(Flowvar[bool], msmParallelism-1)
for (i, start, len) in items(chunkingDescriptor):
if i != msmParallelism-1:
splitMSMsReady[i] = tp.spawnAwaitable msmAffine_vartime_parallel(
tp, splitMSMsResults[i].addr,
coefs +% start, points +% start, len,
c, useParallelBuckets)
else: # Run last on this thread
msmAffine_vartime_parallel(
tp, r,
coefs +% start, points +% start, len,
c, useParallelBuckets)
for i in countdown(msmParallelism-2, 0):
discard sync splitMSMsReady[i]
r[].sum_vartime(r[], splitMSMsResults[i])
freeHeap(splitMSMsResults)
proc applyEndomorphism_parallel[bits: static int, F, G](
tp: Threadpool,
coefs: ptr UncheckedArray[BigInt[bits]],
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
N: int): auto =
## Decompose (coefs, points) into mini-scalars
## Returns a new triplet (endoCoefs, endoPoints, N)
## endoCoefs and endoPoints MUST be freed afterwards
const M = when F is Fp: 2
elif F is Fp2: 4
else: {.error: "Unconfigured".}
const L = bits.ceilDiv_vartime(M) + 1
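  # e.g. for a curve with a 255-bit order and a point over Fp (M = 2),
  # the mini-scalars are L = ⌈255/2⌉ + 1 = 129 bits.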
let splitCoefs = allocHeapArray(array[M, BigInt[L]], N)
let endoBasis = allocHeapArray(array[M, ECP_ShortW_Aff[F, G]], N)
syncScope:
tp.parallelFor i in 0 ..< N:
captures: {coefs, points, splitCoefs, endoBasis}
var negatePoints {.noinit.}: array[M, SecretBool]
splitCoefs[i].decomposeEndo(negatePoints, coefs[i], F)
if negatePoints[0].bool:
endoBasis[i][0].neg(points[i])
else:
endoBasis[i][0] = points[i]
when F is Fp:
endoBasis[i][1].x.prod(points[i].x, F.C.getCubicRootOfUnity_mod_p())
if negatePoints[1].bool:
endoBasis[i][1].y.neg(points[i].y)
else:
endoBasis[i][1].y = points[i].y
else:
staticFor m, 1, M:
endoBasis[i][m].frobenius_psi(points[i], m)
if negatePoints[m].bool:
endoBasis[i][m].neg()
let endoCoefs = cast[ptr UncheckedArray[BigInt[L]]](splitCoefs)
let endoPoints = cast[ptr UncheckedArray[ECP_ShortW_Aff[F, G]]](endoBasis)
return (endoCoefs, endoPoints, M*N)
template withEndo[bits: static int, EC, F, G](
msmProc: untyped,
tp: Threadpool,
r: ptr EC,
coefs: ptr UncheckedArray[BigInt[bits]],
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
N: int, c: static int) =
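  ## Apply endomorphism-based scalar decomposition before calling `msmProc`
  ## when the curve supports it and the coefficients fit the curve order;
  ## otherwise call `msmProc` on the inputs unchanged.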
when bits <= F.C.getCurveOrderBitwidth() and hasEndomorphismAcceleration(F.C):
let (endoCoefs, endoPoints, endoN) = applyEndomorphism_parallel(tp, coefs, points, N)
# Given that bits and N changed, we are able to use a bigger `c`
# but it has no significant impact on performance
msmProc(tp, r, endoCoefs, endoPoints, endoN, c)
freeHeap(endoCoefs)
freeHeap(endoPoints)
else:
msmProc(tp, r, coefs, points, N, c)
template withEndo[bits: static int, EC, F, G](
msmProc: untyped,
tp: Threadpool,
r: ptr EC,
coefs: ptr UncheckedArray[BigInt[bits]],
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
N: int, c: static int, useParallelBuckets: static bool) =
when bits <= F.C.getCurveOrderBitwidth() and hasEndomorphismAcceleration(F.C):
let (endoCoefs, endoPoints, endoN) = applyEndomorphism_parallel(tp, coefs, points, N)
# Given that bits and N changed, we are able to use a bigger `c`
# but it has no significant impact on performance
msmProc(tp, r, endoCoefs, endoPoints, endoN, c, useParallelBuckets)
freeHeap(endoCoefs)
freeHeap(endoPoints)
else:
msmProc(tp, r, coefs, points, N, c, useParallelBuckets)
proc multiScalarMul_dispatch_vartime_parallel[bits: static int, EC, F, G](
tp: Threadpool,
r: ptr EC, coefs: ptr UncheckedArray[BigInt[bits]],
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]], N: int) =
## Multiscalar multiplication:
## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
let c = bestBucketBitSize(N, bits, useSignedBuckets = true, useManualTuning = true)
# Given that bits and N change after applying an endomorphism,
# we are able to use a bigger `c`
# but it has no significant impact on performance
case c
of 2: withEndo(msmJacExt_vartime_parallel, tp, r, coefs, points, N, c = 2)
of 3: withEndo(msmJacExt_vartime_parallel, tp, r, coefs, points, N, c = 3)
of 4: withEndo(msmJacExt_vartime_parallel, tp, r, coefs, points, N, c = 4)
of 5: withEndo(msmJacExt_vartime_parallel, tp, r, coefs, points, N, c = 5)
of 6: withEndo(msmJacExt_vartime_parallel, tp, r, coefs, points, N, c = 6)
of 7: msmJacExt_vartime_parallel(tp, r, coefs, points, N, c = 7)
of 8: msmJacExt_vartime_parallel(tp, r, coefs, points, N, c = 8)
of 9: withEndo(msmAffine_vartime_parallel_split, tp, r, coefs, points, N, c = 9, useParallelBuckets = true)
of 10: withEndo(msmAffine_vartime_parallel_split, tp, r, coefs, points, N, c = 10, useParallelBuckets = true)
of 11: msmAffine_vartime_parallel_split(tp, r, coefs, points, N, c = 10, useParallelBuckets = true)
of 12: msmAffine_vartime_parallel_split(tp, r, coefs, points, N, c = 11, useParallelBuckets = true)
of 13: msmAffine_vartime_parallel_split(tp, r, coefs, points, N, c = 12, useParallelBuckets = true)
of 14: msmAffine_vartime_parallel_split(tp, r, coefs, points, N, c = 13, useParallelBuckets = true)
of 15: msmAffine_vartime_parallel_split(tp, r, coefs, points, N, c = 14, useParallelBuckets = true)
of 16: msmAffine_vartime_parallel_split(tp, r, coefs, points, N, c = 15, useParallelBuckets = true)
of 17: msmAffine_vartime_parallel_split(tp, r, coefs, points, N, c = 16, useParallelBuckets = true)
else:
unreachable()
proc multiScalarMul_vartime_parallel*[bits: static int, EC, F, G](
tp: Threadpool,
r: var EC,
coefs: openArray[BigInt[bits]],
points: openArray[ECP_ShortW_Aff[F, G]]) {.meter, inline.} =
## Multiscalar multiplication:
## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
## This function can be nested in another parallel function
debug: doAssert coefs.len == points.len
let N = points.len
tp.multiScalarMul_dispatch_vartime_parallel(r.addr, coefs.asUnchecked(), points.asUnchecked(), N)
proc multiScalarMul_vartime_parallel*[bits: static int, EC, F, G](
tp: Threadpool,
r: ptr EC,
coefs: ptr UncheckedArray[BigInt[bits]],
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
len: int) {.meter, inline.} =
## Multiscalar multiplication:
## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
## This function can be nested in another parallel function
tp.multiScalarMul_dispatch_vartime_parallel(r, coefs, points, len)
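
# Usage sketch (hypothetical setup; curve and type names assumed from Constantine's API,
# with coefs: seq[BigInt[255]] and points: seq[ECP_ShortW_Aff[Fp[BLS12_381], G1]]):
#
#   let tp = Threadpool.new()
#   var r {.noInit.}: ECP_ShortW_Jac[Fp[BLS12_381], G1]
#   tp.multiScalarMul_vartime_parallel(r, coefs, points)
#   tp.shutdown()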