Mamy Ratsimbazafy bf32c2d408
Parallel for (#222)
* introduce reserve threads to minimize latency and maximize throughput when awaiting a future

* introduce a ceilDiv proc

* threadpool: implement parallel-for loops

* 10x perf improvement by not waking reserveBackoff on syncAll

* bench overhead: new reserve system might introduce too much wakeup latency, 2x slower, for fine-grained parallelism

* add parallelForStrided

* Threadpool: Implement parallel reductions

* refactor parallel loop codegen: introduce descriptor, parsing and codegen stages

* parallel strided, test transpose bench

* tight loop is faster when backoff is not inline

* no POSIX stuff on windows, larger types for histogram bench

* fix tests

* max RSS overflow?

* missed an undefined var

* exit histogram on 32-bit

* forgot to return early dor 32-bit
2023-02-24 09:47:36 +01:00

892 lines
30 KiB
Nim

# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
../../platforms/abstractions,
./limbs, ./limbs_unsaturated
# No exceptions allowed
{.push raises: [].}
{.push checks: off.}
# ############################################################
#
# Primitives based on Bézout's identity
#
# ############################################################
#
# Bézout's identity is the linear Diophantine equation
# au + bv = c
#
# The solution c is gcd(a, b)
# if a and b are coprime, gcd(a, b) = 1
# au + bv = 1
#
# Hence modulo b we have
# au + bv ≡ 1 (mod b)
# au ≡ 1 (mod b)
# So u is the modular multiplicative inverse of a (mod b)
#
# As we can use the Extended Euclidean Algorithm to find
# the GCD and the Bézout coefficient, we can use it to find the
# modular multiplicaative inverse.
# ############################################################
#
# Modular inversion (mod 2ᵏ)
#
# ############################################################
func invModBitwidth(a: BaseType): BaseType =
# Modular inverse algorithm:
# Explanation p11 "Dumas iterations" based on Newton-Raphson:
# - Cetin Kaya Koc (2017), https://eprint.iacr.org/2017/411
# - Jean-Guillaume Dumas (2012), https://arxiv.org/pdf/1209.6626v2.pdf
# - Colin Plumb (1994), http://groups.google.com/groups?selm=1994Apr6.093116.27805%40mnemosyne.cs.du.edu
# Other sources:
# - https://crypto.stackexchange.com/questions/47493/how-to-determine-the-multiplicative-inverse-modulo-64-or-other-power-of-two
# - https://mumble.net/~campbell/2015/01/21/inverse-mod-power-of-two
# - http://marc-b-reynolds.github.io/math/2017/09/18/ModInverse.html
# We are in a special case
# where m = 2^WordBitWidth.
# For a and m to be coprimes, a must be odd.
#
# We have the following relation
# ax ≡ 1 (mod 2ᵏ) <=> ax(2 - ax) ≡ 1 (mod 2²ᵏ)
# which grows in O(log(log(a)))
debug: doAssert (a and 1) == 1, "a must be odd"
const k = log2_vartime(a.sizeof().uint32 * 8)
result = a # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
for _ in 0 ..< k: # at each iteration we get the inverse mod(2^2k)
result *= 2 - a * result # x' = x(2 - ax)
func invMod2powK(M0: BaseType, k: static BaseType): BaseType =
## Compute 1/M mod 2ᵏ
## from M[0]
# Algorithm: Cetin Kaya Koc (2017) p11, https://eprint.iacr.org/2017/411
# Once you have a modular inverse (mod 2ˢ) you can reduce
# (mod 2ᵏ) to have the modular inverse (mod 2ᵏ)
static: doAssert k <= WordBitWidth
const maskMod = (1 shl k)-1
M0.invModBitwidth() and maskMod
# ###############################################################
#
# Modular inversion
#
# ###############################################################
# Algorithm by Bernstein-Yang
# ------------------------------------------------------------
#
# - Original Bernstein-Yang paper, https://eprint.iacr.org/2019/266
# - Executable spec and description by Dettman-Wuille, https://github.com/bitcoin-core/secp256k1/blob/85b00a1/doc/safegcd_implementation.md
# - Formal bound verification by Wuille, https://github.com/sipa/safegcd-bounds
# - Formal verification by Hvass-Aranha-Spitters, https://eprint.iacr.org/2021/549
#
# We implement the half-delta divstep variant
type TransitionMatrix = object
## Bernstein-Yang Jumpdivstep transition matrix
## [u v]
## t = [q r]
## It it is scaled by 2ᵏ
u, v, q, r: SignedSecretWord
debug:
# Debugging helpers
func checkDeterminant(t: TransitionMatrix, u, v, q, r: SignedSecretWord, k, numIters: int): bool =
# The determinant of t must be a power of two. This guarantees that multiplication with t
# does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which
# will be divided out again).
# Each divstep's individual matrix has determinant 2⁻¹,
# the aggregate of numIters of them will have determinant 2ⁿᵘᵐᴵᵗᵉʳˢ. Multiplying with the initial
# 2ᵏ*identity (which has determinant 2²ᵏ) means the result has determinant 2²ᵏ⁻ⁿᵘᵐᴵᵗᵉʳˢ.
let
u = SecretWord u
v = SecretWord v
q = SecretWord q
r = SecretWord r
var a, b: array[2, SecretWord]
var e: array[2, SecretWord]
smul(a[1], a[0], u, r)
smul(b[1], b[0], v, q)
var borrow: Borrow
subB(borrow, a[0], a[0], b[0], Borrow(0))
subB(borrow, a[1], a[1], b[1], borrow)
let d = 2*k - numIters
b[0] = Zero; b[1] = Zero
b[d div WordBitWidth] = One shl (d mod WordBitWidth)
return bool(a == b)
func canonicalize(
a: var LimbsUnsaturated,
signMask: SignedSecretWord,
M: LimbsUnsaturated
) =
## Compute a = sign*a (mod M)
##
## with a in range (-2*M, M)
## result in range [0, M)
const
UnsatBitWidth = WordBitWidth - a.Excess
Max = SignedSecretWord(MaxWord shr a.Excess)
# Operate in registers
var z = a
# Add M if `z` is negative
# -> range (-M, M)
z.cadd(M, z.isNegMask())
# Negate if sign is negative
# -> range (-M, M)
z.cneg(signMask)
# Normalize words to range (-2^UnsatBitwidth, 2^UnsatBitwidth)
for i in 0 ..< z.words.len-1:
z[i+1] = z[i+1] + z[i].ashr(UnsatBitWidth)
z[i] = z[i] and Max
# Add M if `z` is negative
# -> range (0, M)
z.cadd(M, z.isNegMask())
# Normalize words to range (-2^UnsatBitwidth, 2^UnsatBitwidth)
for i in 0 ..< z.words.len-1:
z[i+1] = z[i+1] + z[i].ashr(UnsatBitWidth)
z[i] = z[i] and Max
a = z
proc partitionDivsteps(bits, wordBitWidth: int): tuple[totalIters, numChunks, chunkSize, cutoff: int] =
# Given the field modulus number of bits
# and the effective word size
# Returns:
# - the total number of iterations that guarantees GCD convergence
# - the number of chunks of divsteps to compute
# - the base number of divsteps per chunk
# - a cutoff chunk,
# before this chunk ID, the number of divsteps is "base number + 1"
# afterward it's "base number"
let totalIters =
if bits == 256:
# https://github.com/sipa/safegcd-bounds/tree/master/coq
# For 256-bit inputs, 590 divsteps are sufficient with hddivstep variant (half-delta divstep)
# for gcd(f, g) with 0 <= g <= f <= Modulus (inversion g == 1)
# The generic formula reports 591
590
else:
# https://github.com/sipa/safegcd-bounds/blob/master/genproofhd.md
# For any input, for gcd(f, g) with 0 <= g <= f <= Modulus with hddivstep variant (half-delta divstep)
# (inversion g == 1)
(45907*bits + 26313) div 19929
let numChunks = totalIters.ceilDiv_vartime(wordBitWidth)
let chunkSize = totalIters div numChunks
let cutoff = totalIters mod numChunks
return (totalIters, numChunks, chunkSize, cutoff)
func batchedDivsteps(
t: var TransitionMatrix,
hdelta: SignedSecretWord,
f0, g0: SignedSecretWord,
numIters: int,
k: static int
): SignedSecretWord =
## Bernstein-Yang half-delta (hdelta) batch of divsteps
##
## Output:
## - return hdelta for the next batch of divsteps
## - mutate t, the transition matrix to apply `numIters` divsteps at once
## t is scaled by 2ᵏ
##
## Input:
## - f0, bottom limb of f
## - g0, bottom limb of g
## - numIters, number of iterations requested in this batch of divsteps
## - k, the maximum batch size, transition matrix is scaled by 2ᵏ
var
u = SignedSecretWord(1 shl k)
v = SignedSecretWord(0)
q = SignedSecretWord(0)
r = SignedSecretWord(1 shl k)
f = f0
g = g0
hdelta = hdelta
for i in k-numIters ..< k:
debug:
func reportLoop() =
debugEcho " iterations: [", k-numIters, ", ", k, ")", " (", numIters, " iterations in total)"
debugEcho " i: ", i, ", hdelta: ", int(hdelta)
# debugEcho " f: 0b", BiggestInt(f).toBin(64), ", g: 0b", BiggestInt(g).toBin(64), " | f: ", int(f), ", g: ", int(g)
# debugEcho " u: 0b", BiggestInt(u).toBin(64), ", v: 0b", BiggestInt(v).toBin(64), " | u: ", int(u), ", v: ", int(v)
# debugEcho " q: 0b", BiggestInt(q).toBin(64), ", r: 0b", BiggestInt(r).toBin(64), " | q: ", int(q), ", r: ", int(r)
doAssert (BaseType(f) and 1) == 1, (reportLoop(); "f must be odd)")
doAssert bool(not(uint(u or v or q or r) and (if i == 0: high(uint) else: high(uint) shr (i - 1)))), (reportLoop(); "Min trailing zeros count decreases at each iteration")
doAssert bool(u.ashr(k-i)*f0 + v.ashr(k-i)*g0 == f.lshl(i)), (reportLoop(); "Applying the transition matrix to (f₀, g₀) returns current (f, g)")
doAssert bool(q.ashr(k-i)*f0 + r.ashr(k-i)*g0 == g.lshl(i)), (reportLoop(); "Applying the transition matrix to (f₀, g₀) returns current (f, g)")
# Conditional masks for (hdelta < 0) and g odd
let c1 = hdelta.isNegMask()
let c2 = g.isOddMask()
# x, y, z, conditional complement of f, u, v
let x = f xor c1
let y = u xor c1
let z = v xor c1
# conditional substraction from g, q, r
g.csub(x, c2)
q.csub(y, c2)
r.csub(z, c2)
# c3 = (hdelta >= 0) and g odd
let c3 = c2 and not c1
# hdelta = -hdelta or hdelta+1
hdelta = (hdelta xor c3) + SignedSecretWord(1)
# Conditional rollback substraction
f.cadd(g, c3)
u.cadd(q, c3)
v.cadd(r, c3)
# Shifts
g = g.lshr(1)
q = q.ashr(1)
r = r.ashr(1)
t.u = u
t.v = v
t.q = q
t.r = r
debug:
doAssert bool(u*f0 + v*g0 == f.lshl(k)), "Applying the final matrix to (f₀, g₀) gives the final (f, g)"
doAssert bool(q*f0 + r*g0 == g.lshl(k)), "Applying the final matrix to (f₀, g₀) gives the final (f, g)"
doAssert checkDeterminant(t, u, v, q, r, k, numIters)
return hdelta
func matVecMul_shr_k_mod_M[N, E: static int](
t: TransitionMatrix,
d, e: var LimbsUnsaturated[N, E],
k: static int,
M: LimbsUnsaturated[N, E],
invMod2powK: SecretWord
) =
## Compute
##
## [u v] [d]
## [q r]/2ᵏ.[e] mod M
##
## d, e will be in range (-2*modulus,modulus)
## and output limbs in (-2ᵏ, 2ᵏ)
static: doAssert k == WordBitWidth - E
const Max = SignedSecretWord(MaxWord shr E)
let
u = t.u
v = t.v
q = t.q
r = t.r
let sign_d = d.isNegMask()
let sign_e = e.isNegMask()
# Double-signed-word carries
var cd, ce: DSWord
# First iteration of [u v] [d]
# [q r].[e]
cd.ssumprodAccNoCarry(u, d[0], v, e[0])
ce.ssumprodAccNoCarry(q, d[0], r, e[0])
# Compute me and md, multiples of M
# such as the bottom k bits if d and e are 0
# This allows fusing division by 2ᵏ
# i.e. (mx * M) mod 2ᵏ = x mod 2ᵏ
var md, me = SignedSecretWord(0)
md.cadd(u, sign_d)
md.cadd(v, sign_e)
me.cadd(q, sign_d)
me.cadd(r, sign_e)
md = md - (SignedSecretWord(invMod2powK * SecretWord(cd.lo) + SecretWord(md)) and Max)
me = me - (SignedSecretWord(invMod2powK * SecretWord(ce.lo) + SecretWord(me)) and Max)
# First iteration of [u v] [d] [md]
# [q r].[e] + [me].M[0]
# k bottom bits are 0
cd.smulAccNoCarry(md, M[0])
ce.smulAccNoCarry(me, M[0])
cd.ashr(k)
ce.ashr(k)
for i in 1 ..< N:
cd.ssumprodAccNoCarry(u, d[i], v, e[i])
ce.ssumprodAccNoCarry(q, d[i], r, e[i])
cd.smulAccNoCarry(md, M[i])
ce.smulAccNoCarry(me, M[i])
d[i-1] = cd.lo and Max
e[i-1] = ce.lo and Max
cd.ashr(k)
ce.ashr(k)
d[N-1] = cd.lo
e[N-1] = ce.lo
template matVecMul_shr_k_impl(
t: TransitionMatrix,
f, g: var LimbsUnsaturated,
Excess: static int,
numLimbsLeft: int or static int,
k: static int
) =
## Compute
##
## [u v] [f]
## [q r].[g] / 2ᵏ
##
## Template so that it can be specialized
## when iteration number is fixed and compiler can unroll, in constant-time case
## or variable and the full buffer might not be used (vartime)
static: doAssert k == WordBitWidth - Excess
const Max = SignedSecretWord(MaxWord shr Excess)
let
u = t.u
v = t.v
q = t.q
r = t.r
# Double-signed-word carries
var cf, cg: DSWord
# First iteration of [u v] [f]
# [q r].[g]
cf.ssumprodAccNoCarry(u, f[0], v, g[0])
cg.ssumprodAccNoCarry(q, f[0], r, g[0])
# bottom k bits are zero by construction
debug:
doAssert BaseType(cf.lo and Max) == 0, "bottom k bits should be 0, cf.lo: " & $BaseType(cf.lo)
doAssert BaseType(cg.lo and Max) == 0, "bottom k bits should be 0, cg.lo: " & $BaseType(cg.lo)
cf.ashr(k)
cg.ashr(k)
for i in 1 ..< numLimbsLeft:
cf.ssumprodAccNoCarry(u, f[i], v, g[i])
cg.ssumprodAccNoCarry(q, f[i], r, g[i])
f[i-1] = cf.lo and Max
g[i-1] = cg.lo and Max
cf.ashr(k)
cg.ashr(k)
f[numLimbsLeft-1] = cf.lo
g[numLimbsLeft-1] = cg.lo
func matVecMul_shr_k[N, E: static int](t: TransitionMatrix, f, g: var LimbsUnsaturated[N, E], k: static int) =
matVecMul_shr_k_impl(t, f, g, E, N, k)
func invmodImpl[N, E](
a: var LimbsUnsaturated[N, E],
F, M: LimbsUnsaturated[N, E],
invMod2powK: SecretWord,
k, bits: static int) =
## Modular inversion using Bernstein-Yang algorithm
## r ≡ F.a⁻¹ (mod M)
# hdelta = delta-1/2, delta starts at 1/2 for the half-delta variant
var hdelta = SignedSecretWord(0)
var d{.noInit.}, e{.noInit.}: LimbsUnsaturated[N, E]
var f{.noInit.}, g{.noInit.}: LimbsUnsaturated[N, E]
d.setZero()
e = F
# g < f for partitioning / iteration count formula
f = M
g = a
const partition = partitionDivsteps(bits, k)
for i in 0 ..< partition.numChunks:
var t{.noInit.}: TransitionMatrix
let numIters = partition.chunkSize + int(i < partition.cutoff)
# Compute transition matrix and next hdelta
hdelta = t.batchedDivsteps(hdelta, f[0], g[0], numIters, k)
# Apply the transition matrix
# [u v] [d]
# [q r]/2ᵏ.[e] mod M
t.matVecMul_shr_k_mod_M(d, e, k, M, invMod2powK)
# [u v] [f]
# [q r]/2ᵏ.[g]
t.matVecMul_shr_k(f, g, k)
d.canonicalize(signMask = f.isNegMask(), M)
a = d
func invmod*(
r: var Limbs, a: Limbs,
F, M: Limbs, bits: static int) =
## Compute the scaled modular inverse of ``a`` modulo M
## r ≡ F.a⁻¹ (mod M)
##
## M MUST be odd, M does not need to be prime.
## ``a`` MUST be less than M.
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
var m2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
var factor {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
m2.fromPackedRepr(M)
factor.fromPackedRepr(F)
let m0invK = SecretWord invMod2powK(BaseType M[0], k)
var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
a2.fromPackedRepr(a)
a2.invmodImpl(factor, m2, m0invK, k, bits)
r.fromUnsatRepr(a2)
func invmod*(
r: var Limbs, a: Limbs,
F, M: static Limbs, bits: static int) =
## Compute the scaled modular inverse of ``a`` modulo M
## r ≡ F.a⁻¹ (mod M) (compile-time factor and modulus overload)
##
## with F and M known at compile-time
##
## M MUST be odd, M does not need to be prime.
## ``a`` MUST be less than M.
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
const m2 = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(M)
const factor = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(F)
const m0invK = SecretWord invMod2powK(BaseType M[0], k)
var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
a2.fromPackedRepr(a)
a2.invmodImpl(factor, m2, m0invK, k, bits)
r.fromUnsatRepr(a2)
# ############################################################
#
# Euler criterion, Legendre/Jacobi/Krönecker symbol
#
# ############################################################
#
# The Euler criterion, i.e. the quadratic residuosity test, for p an odd prime, is:
# a^((p-1)/2) ≡ 1 (mod p), iff a is a square
# ≡ -1 (mod p), iff a is quadratic non-residue
# ≡ 0 (mod p), iff a is 0
# derived from Fermat's Little Theorem
#
# The Legendre symbol is a function with p odd prime
# (a/p)ₗ ≡ 1 (mod p), iff a is a square
# ≡ -1 (mod p), iff a is quadratic non-residue
# ≡ 0 (mod p), iff a is 0
#
# The Jacobi symbol generalizes the Legendre symbol for any odd n:
# (a/n)ⱼ = ∏ᵢ (a/pᵢ)ₗ
# is the product of legendre symbol (a/pᵢ)ₗ with pᵢ the prime factors of n
#
# Those symbols can be computed either via exponentiation (Fermat's Little Theorem)
# or using slight modifications to the Extended Euclidean Algorithm for GCD.
func batchedDivstepsSymbol(
t: var TransitionMatrix,
hdelta: SignedSecretWord,
f0, g0: SignedSecretWord,
numIters: int,
k: static int
): tuple[hdelta, L: SignedSecretWord] =
## Bernstein-Yang half-delta (hdelta) batch of divsteps
## with Legendre symbol tracking
##
## Output:
## - return hdelta for the next batch of divsteps
## - Returns the intermediate Legendre symbol
## - mutate t, the transition matrix to apply `numIters` divsteps at once
## t is scaled by 2ᵏ
##
## Input:
## - f0, bottom limb of f
## - g0, bottom limb of g
## - numIters, number of iterations requested in this batch of divsteps
## - k, the maximum batch size, transition matrix is scaled by 2ᵏ
var
u = SignedSecretWord(1 shl (k-numIters))
v = SignedSecretWord(0)
q = SignedSecretWord(0)
r = SignedSecretWord(1 shl (k-numIters))
f = f0
g = g0
hdelta = hdelta
L = SignedSecretWord(0)
for i in k-numIters ..< k:
debug:
func reportLoop() =
debugEcho " iterations: [", k-numIters, ", ", k, ")", " (", numIters, " iterations in total)"
debugEcho " i: ", i, ", hdelta: ", int(hdelta)
# debugEcho " f: 0b", BiggestInt(f).toBin(64), ", g: 0b", BiggestInt(g).toBin(64), " | f: ", int(f), ", g: ", int(g)
# debugEcho " u: 0b", BiggestInt(u).toBin(64), ", v: 0b", BiggestInt(v).toBin(64), " | u: ", int(u), ", v: ", int(v)
# debugEcho " q: 0b", BiggestInt(q).toBin(64), ", r: 0b", BiggestInt(r).toBin(64), " | q: ", int(q), ", r: ", int(r)
doAssert (BaseType(f) and 1) == 1, (reportLoop(); "f must be odd)")
doAssert bool(u*f0 + v*g0 == f.lshl(i)), (reportLoop(); "Applying the transition matrix to (f₀, g₀) returns current (f, g)")
doAssert bool(q*f0 + r*g0 == g.lshl(i)), (reportLoop(); "Applying the transition matrix to (f₀, g₀) returns current (f, g)")
let fi = f
# Conditional masks for (hdelta < 0) and g odd
let c1 = hdelta.isNegMask()
let c2 = g.isOddMask()
# x, y, z, conditional negated complement of f, u, v
let x = (f xor c1) - c1
let y = (u xor c1) - c1
let z = (v xor c1) - c1
# conditional addition g, q, r
g.cadd(x, c2)
q.cadd(y, c2)
r.cadd(z, c2)
# c3 = (hdelta < 0) and g odd
let c3 = c2 and c1
# hdelta = -hdelta-2 or hdelta-1
hdelta = (hdelta xor c3) - SignedSecretWord(1)
# Conditionally rollback
f.cadd(g, c3)
u.cadd(q, c3)
v.cadd(r, c3)
# Shifts
g = g.lshr(1)
u = u.lshl(1)
v = v.lshl(1)
L = L + (((fi and f) xor f.lshr(1)) and SignedSecretWord(2))
L = L + (L.isOdd() xor v.isNeg())
L = L and SignedSecretWord(3)
t.u = u
t.v = v
t.q = q
t.r = r
debug:
doAssert bool(u*f0 + v*g0 == f.lshl(k)), "Applying the final matrix to (f₀, g₀) gives the final (f, g)"
doAssert bool(q*f0 + r*g0 == g.lshl(k)), "Applying the final matrix to (f₀, g₀) gives the final (f, g)"
doAssert checkDeterminant(t, u, v, q, r, k, numIters)
return (hdelta, L)
func legendreImpl[N, E](
a: var LimbsUnsaturated[N, E],
M: LimbsUnsaturated[N, E],
k, bits: static int): SecretWord =
## Legendre symbol / Quadratic Residuosity Test
## using Bernstein-Yang algorithm
# hdelta = delta-1/2, delta starts at 1/2 for the half-delta variant
var hdelta = SignedSecretWord(0)
var f{.noInit.}, g{.noInit.}: LimbsUnsaturated[N, E]
# g < f for partitioning / iteration count formula
f = M
g = a
const partition = partitionDivsteps(bits, k)
const UnsatBitWidth = WordBitWidth - a.Excess
var # Track and accumulate Legendre symbol transitions
accL = SignedSecretWord(0)
L = SignedSecretWord(0)
for i in 0 ..< partition.numChunks:
var t{.noInit.}: TransitionMatrix
let numIters = partition.chunkSize + int(i < partition.cutoff)
# Compute transition matrix and next hdelta
when f.words.len > 1:
(hdelta, L) = t.batchedDivstepsSymbol(
hdelta,
# the symbol computation needs to see the extra 2 next bits.
f[0] or f[1].lshl(UnsatBitWidth),
g[0] or g[1].lshl(UnsatBitWidth),
numIters, k)
else:
(hdelta, L) = t.batchedDivstepsSymbol(hdelta, f[0], g[0], numIters, k)
# [u v] [f]
# [q r]/2ᵏ.[g]
t.matVecMul_shr_k(f, g, k)
accL = (accL + L) and SignedSecretWord(3)
accL = (accL + ((accL.isOdd() xor f.isNeg()))) and SignedSecretWord(3)
accL = (accL + accL.isOdd()) and SignedSecretWord(3)
accL = SignedSecretWord(1)-accL
accL.csetZero(not f.isZeroMask())
return SecretWord(accL)
func legendre*(a, M: Limbs, bits: static int): SecretWord =
## Compute the Legendre symbol
##
## (a/p)ₗ ≡ a^((p-1)/2) ≡ 1 (mod p), iff a is a square
## ≡ -1 (mod p), iff a is quadratic non-residue
## ≡ 0 (mod p), iff a is 0
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
var m2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
m2.fromPackedRepr(M)
var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
a2.fromPackedRepr(a)
legendreImpl(a2, m2, k, bits)
func legendre*(a: Limbs, M: static Limbs, bits: static int): SecretWord =
## Compute the Legendre symbol (compile-time modulus overload)
##
## (a/p)ₗ ≡ a^((p-1)/2) ≡ 1 (mod p), iff a is a square
## ≡ -1 (mod p), iff a is quadratic non-residue
## ≡ 0 (mod p), iff a is 0
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
const m2 = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(M)
var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
a2.fromPackedRepr(a)
legendreImpl(a2, m2, k, bits)
# ############################################################
#
# Variable-time optimizations
#
# ############################################################
const NegInvMod256 = [
# Stores tab[i div 2] = -i⁻¹ (mod 256), with i odd
# See "invModBitwidth" on "Dumas iterations"
# ax ≡ 1 (mod 2ᵏ) <=> ax(2 - ax) ≡ 1 (mod 2^(2k))
# a⁻¹ (mod 256) = a(2-a²)
-1, -235, -141, -183, -57, -227, -133, -239,
-241, -91, -253, -167, -41, -83, -245, -223,
-225, -203, -109, -151, -25, -195, -101, -207,
-209, -59, -221, -135, -9, -51, -213, -191,
-193, -171, -77, -119, -249, -163, -69, -175,
-177, -27, -189, -103, -233, -19, -181, -159,
-161, -139, -45, -87, -217, -131, -37, -143,
-145, -251, -157, -71, -201, -243, -149, -127,
-129, -107, -13, -55, -185, -99, -5, -111,
-113, -219, -125, -39, -169, -211, -117, -95,
-97, -75, -237, -23, -153, -67, -229, -79,
-81, -187, -93, -7, -137, -179, -85, -63,
-65, -43, -205, -247, -121, -35, -197, -47,
-49, -155, -61, -231, -105, -147, -53, -31,
-33, -11, -173, -215, -89, -3, -165, -15,
-17, -123, -29, -199, -73, -115, -21, -255]
func batchedDivsteps_vartime(
t: var TransitionMatrix,
eta: SignedSecretWord,
f0, g0: SecretWord,
k: static int
): SignedSecretWord {.tags:[Vartime].} =
## Bernstein-Yang eta (-delta) batch of divsteps
## **Variable-Time**
##
## Output:
## - return eta for the next batch of divsteps
## - mutate t, the transition matrix to apply `numIters` divsteps at once
## t is scaled by 2ᵏ
##
## Input:
## - f0, bottom limb of f
## - g0, bottom limb of g
## - k, the maximum batch size, transition matrix is scaled by 2ᵏ
template swapNeg(a, b) =
var tmp = -a
a = b
b = tmp
var
u = One
v = Zero
q = Zero
r = One
f = f0
g = g0
eta = cast[SignedBaseType](eta)
bitsLeft = cast[SignedBaseType](k)
while true:
# Count zeros up to bitsLeft and process a batch of divsteps up to that number
let zeros = (g.BaseType or (1.BaseType shl bitsLeft)).countTrailingZeroBits_vartime()
g = g shr zeros
u = u shl zeros
v = v shl zeros
eta -= cast[SignedBaseType](zeros)
bitsLeft -= cast[SignedBaseType](zeros)
if bitsLeft == 0:
break
# Now process, the 1's.
if eta < 0:
eta = -eta
swapNeg(f, g)
swapNeg(u, q)
swapNeg(v, r)
# We process up to 6 1's at once
const mask6 = SecretWord((1 shl 6) - 1)
let limit = min(eta+1, bitsLeft)
let maskLimit = (MaxWord shr (WordBitWidth - limit)) and mask6
# Find the multiple of f to add to cancel the bottom min(limit, 6) bits of g
let w = (g * SecretWord NegInvMod256[int((f and mask6) shr 1)]) and maskLimit
# Next iteration will have at least 6 0's to process at once
g += f*w
q += u*w
r += v*w
t.u = SignedSecretWord u
t.v = SignedSecretWord v
t.q = SignedSecretWord q
t.r = SignedSecretWord r
return SignedSecretWord(eta)
func matVecMul_shr_k_partial(t: TransitionMatrix, f, g: var LimbsUnsaturated, len: int, k: static int) =
## Matrix-Vector multiplication with top part of f and g being zeros
matVecMul_shr_k_impl(t, f, g, LimbsUnsaturated.Excess, len, k)
func isZero_vartime(a: LimbsUnsaturated, limbsLeft: int): bool {.tags:[VarTime].} =
for i in 0 ..< limbsLeft:
if a[i].int != 0:
return false
return true
func discardUnusedLimb_vartime[N, E: static int](limbsLeft: var int, f, g: var LimbsUnsaturated[N, E]) {.tags:[VarTime].} =
## If f and g both don't use their last limb, it will propagate the sign down to the previous one
if limbsLeft == 1:
return
let fn = f[limbsLeft-1]
let gn = g[limbsLeft-1]
var mask = SignedSecretWord(0)
mask = mask or (fn xor fn.isNegMask()) # 0 if last limb has nothing left but its sign
mask = mask or (gn xor gn.isNegMask()) # 0 if last limb has nothing left but its sign
if cast[SignedBaseType](mask) == 0:
f[limbsLeft-2] = f[limbsLeft-2] or fn.lshl(WordBitWidth-E) # if only sign is left, the last limb is 11..11 if negative
g[limbsLeft-2] = g[limbsLeft-2] or gn.lshl(WordBitWidth-E) # or 00..00 if positive
limbsLeft -= 1
func invmodImpl_vartime[N, E: static int](
a: var LimbsUnsaturated[N, E],
F, M: LimbsUnsaturated[N, E],
invMod2powK: SecretWord,
k, bits: static int) {.tags:[VarTime].} =
## **Variable-time** Modular inversion using Bernstein-Yang algorithm
## r ≡ F.a⁻¹ (mod M)
# eta = -delta
var eta = cast[SignedSecretWord](-1)
var d{.noInit.}, e{.noInit.}: LimbsUnsaturated[N, E]
var f{.noInit.}, g{.noInit.}: LimbsUnsaturated[N, E]
d.setZero()
e = F
f = M
g = a
var limbsLeft = N
while true:
var t{.noInit.}: TransitionMatrix
# Compute transition matrix and next eta
eta = t.batchedDivsteps_vartime(eta, SecretWord f[0], SecretWord g[0], k)
# Apply the transition matrix
# [u v] [d]
# [q r]/2ᵏ.[e] mod M
t.matVecMul_shr_k_mod_M(d, e, k, M, invMod2powK)
# [u v] [f]
# [q r]/2ᵏ.[g]
t.matVecMul_shr_k_partial(f, g, limbsLeft, k)
if g.isZero_vartime(limbsLeft):
break
limbsLeft.discardUnusedLimb_vartime(f, g)
d.canonicalize(signMask = f[limbsLeft-1].isNegMask(), M)
a = d
func invmod_vartime*(
r: var Limbs, a: Limbs,
F, M: Limbs, bits: static int) {.tags:[VarTime].} =
## Compute the scaled modular inverse of ``a`` modulo M
## r ≡ F.a⁻¹ (mod M)
##
## M MUST be odd, M does not need to be prime.
## ``a`` MUST be less than M.
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
var m2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
var factor {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
m2.fromPackedRepr(M)
factor.fromPackedRepr(F)
let m0invK = SecretWord invMod2powK(BaseType M[0], k)
var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
a2.fromPackedRepr(a)
a2.invmodImpl_vartime(factor, m2, m0invK, k, bits)
r.fromUnsatRepr(a2)
func invmod_vartime*(
r: var Limbs, a: Limbs,
F, M: static Limbs, bits: static int) {.tags:[VarTime].} =
## Compute the scaled modular inverse of ``a`` modulo M
## r ≡ F.a⁻¹ (mod M) (compile-time factor and modulus overload)
##
## with F and M known at compile-time
##
## M MUST be odd, M does not need to be prime.
## ``a`` MUST be less than M.
const Excess = 2
const k = WordBitWidth - Excess
const NumUnsatWords = bits.ceilDiv_vartime(k)
# Convert values to unsaturated repr
const m2 = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(M)
const factor = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(F)
const m0invK = SecretWord invMod2powK(BaseType M[0], k)
var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
a2.fromPackedRepr(a)
a2.invmodImpl_vartime(factor, m2, m0invK, k, bits)
r.fromUnsatRepr(a2)