mirror of
https://github.com/logos-storage/constantine.git
synced 2026-01-02 13:13:07 +00:00
* introduce reserve threads to minimize latency and maximize throughput when awaiting a future * introduce a ceilDiv proc * threadpool: implement parallel-for loops * 10x perf improvement by not waking reserveBackoff on syncAll * bench overhead: new reserve system might introduce too much wakeup latency, 2x slower, for fine-grained parallelism * add parallelForStrided * Threadpool: Implement parallel reductions * refactor parallel loop codegen: introduce descriptor, parsing and codegen stages * parallel strided, test transpose bench * tight loop is faster when backoff is not inline * no POSIX stuff on windows, larger types for histogram bench * fix tests * max RSS overflow? * missed an undefined var * exit histogram on 32-bit * forgot to return early dor 32-bit
892 lines
30 KiB
Nim
892 lines
30 KiB
Nim
# Constantine
|
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
|
# Licensed and distributed under either of
|
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
|
|
|
import
|
|
../../platforms/abstractions,
|
|
./limbs, ./limbs_unsaturated
|
|
|
|
# No exceptions allowed
|
|
{.push raises: [].}
|
|
{.push checks: off.}
|
|
|
|
# ############################################################
|
|
#
|
|
# Primitives based on Bézout's identity
|
|
#
|
|
# ############################################################
|
|
#
|
|
# Bézout's identity is the linear Diophantine equation
|
|
# au + bv = c
|
|
#
|
|
# The solution c is gcd(a, b)
|
|
# if a and b are coprime, gcd(a, b) = 1
|
|
# au + bv = 1
|
|
#
|
|
# Hence modulo b we have
|
|
# au + bv ≡ 1 (mod b)
|
|
# au ≡ 1 (mod b)
|
|
# So u is the modular multiplicative inverse of a (mod b)
|
|
#
|
|
# As we can use the Extended Euclidean Algorithm to find
|
|
# the GCD and the Bézout coefficient, we can use it to find the
|
|
# modular multiplicaative inverse.
|
|
|
|
# ############################################################
|
|
#
|
|
# Modular inversion (mod 2ᵏ)
|
|
#
|
|
# ############################################################
|
|
|
|
func invModBitwidth(a: BaseType): BaseType =
|
|
# Modular inverse algorithm:
|
|
# Explanation p11 "Dumas iterations" based on Newton-Raphson:
|
|
# - Cetin Kaya Koc (2017), https://eprint.iacr.org/2017/411
|
|
# - Jean-Guillaume Dumas (2012), https://arxiv.org/pdf/1209.6626v2.pdf
|
|
# - Colin Plumb (1994), http://groups.google.com/groups?selm=1994Apr6.093116.27805%40mnemosyne.cs.du.edu
|
|
# Other sources:
|
|
# - https://crypto.stackexchange.com/questions/47493/how-to-determine-the-multiplicative-inverse-modulo-64-or-other-power-of-two
|
|
# - https://mumble.net/~campbell/2015/01/21/inverse-mod-power-of-two
|
|
# - http://marc-b-reynolds.github.io/math/2017/09/18/ModInverse.html
|
|
|
|
# We are in a special case
|
|
# where m = 2^WordBitWidth.
|
|
# For a and m to be coprimes, a must be odd.
|
|
#
|
|
# We have the following relation
|
|
# ax ≡ 1 (mod 2ᵏ) <=> ax(2 - ax) ≡ 1 (mod 2²ᵏ)
|
|
# which grows in O(log(log(a)))
|
|
debug: doAssert (a and 1) == 1, "a must be odd"
|
|
|
|
const k = log2_vartime(a.sizeof().uint32 * 8)
|
|
result = a # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
|
|
for _ in 0 ..< k: # at each iteration we get the inverse mod(2^2k)
|
|
result *= 2 - a * result # x' = x(2 - ax)
|
|
|
|
func invMod2powK(M0: BaseType, k: static BaseType): BaseType =
|
|
## Compute 1/M mod 2ᵏ
|
|
## from M[0]
|
|
# Algorithm: Cetin Kaya Koc (2017) p11, https://eprint.iacr.org/2017/411
|
|
# Once you have a modular inverse (mod 2ˢ) you can reduce
|
|
# (mod 2ᵏ) to have the modular inverse (mod 2ᵏ)
|
|
static: doAssert k <= WordBitWidth
|
|
const maskMod = (1 shl k)-1
|
|
M0.invModBitwidth() and maskMod
|
|
|
|
# ###############################################################
|
|
#
|
|
# Modular inversion
|
|
#
|
|
# ###############################################################
|
|
|
|
# Algorithm by Bernstein-Yang
|
|
# ------------------------------------------------------------
|
|
#
|
|
# - Original Bernstein-Yang paper, https://eprint.iacr.org/2019/266
|
|
# - Executable spec and description by Dettman-Wuille, https://github.com/bitcoin-core/secp256k1/blob/85b00a1/doc/safegcd_implementation.md
|
|
# - Formal bound verification by Wuille, https://github.com/sipa/safegcd-bounds
|
|
# - Formal verification by Hvass-Aranha-Spitters, https://eprint.iacr.org/2021/549
|
|
#
|
|
# We implement the half-delta divstep variant
|
|
|
|
type TransitionMatrix = object
|
|
## Bernstein-Yang Jumpdivstep transition matrix
|
|
## [u v]
|
|
## t = [q r]
|
|
## It it is scaled by 2ᵏ
|
|
u, v, q, r: SignedSecretWord
|
|
|
|
debug:
|
|
# Debugging helpers
|
|
|
|
func checkDeterminant(t: TransitionMatrix, u, v, q, r: SignedSecretWord, k, numIters: int): bool =
|
|
# The determinant of t must be a power of two. This guarantees that multiplication with t
|
|
# does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which
|
|
# will be divided out again).
|
|
# Each divstep's individual matrix has determinant 2⁻¹,
|
|
# the aggregate of numIters of them will have determinant 2ⁿᵘᵐᴵᵗᵉʳˢ. Multiplying with the initial
|
|
# 2ᵏ*identity (which has determinant 2²ᵏ) means the result has determinant 2²ᵏ⁻ⁿᵘᵐᴵᵗᵉʳˢ.
|
|
let
|
|
u = SecretWord u
|
|
v = SecretWord v
|
|
q = SecretWord q
|
|
r = SecretWord r
|
|
|
|
var a, b: array[2, SecretWord]
|
|
var e: array[2, SecretWord]
|
|
smul(a[1], a[0], u, r)
|
|
smul(b[1], b[0], v, q)
|
|
|
|
var borrow: Borrow
|
|
subB(borrow, a[0], a[0], b[0], Borrow(0))
|
|
subB(borrow, a[1], a[1], b[1], borrow)
|
|
|
|
let d = 2*k - numIters
|
|
b[0] = Zero; b[1] = Zero
|
|
b[d div WordBitWidth] = One shl (d mod WordBitWidth)
|
|
|
|
return bool(a == b)
|
|
|
|
func canonicalize(
|
|
a: var LimbsUnsaturated,
|
|
signMask: SignedSecretWord,
|
|
M: LimbsUnsaturated
|
|
) =
|
|
## Compute a = sign*a (mod M)
|
|
##
|
|
## with a in range (-2*M, M)
|
|
## result in range [0, M)
|
|
|
|
const
|
|
UnsatBitWidth = WordBitWidth - a.Excess
|
|
Max = SignedSecretWord(MaxWord shr a.Excess)
|
|
|
|
# Operate in registers
|
|
var z = a
|
|
|
|
# Add M if `z` is negative
|
|
# -> range (-M, M)
|
|
z.cadd(M, z.isNegMask())
|
|
# Negate if sign is negative
|
|
# -> range (-M, M)
|
|
z.cneg(signMask)
|
|
# Normalize words to range (-2^UnsatBitwidth, 2^UnsatBitwidth)
|
|
for i in 0 ..< z.words.len-1:
|
|
z[i+1] = z[i+1] + z[i].ashr(UnsatBitWidth)
|
|
z[i] = z[i] and Max
|
|
|
|
# Add M if `z` is negative
|
|
# -> range (0, M)
|
|
z.cadd(M, z.isNegMask())
|
|
# Normalize words to range (-2^UnsatBitwidth, 2^UnsatBitwidth)
|
|
for i in 0 ..< z.words.len-1:
|
|
z[i+1] = z[i+1] + z[i].ashr(UnsatBitWidth)
|
|
z[i] = z[i] and Max
|
|
|
|
a = z
|
|
|
|
proc partitionDivsteps(bits, wordBitWidth: int): tuple[totalIters, numChunks, chunkSize, cutoff: int] =
|
|
# Given the field modulus number of bits
|
|
# and the effective word size
|
|
# Returns:
|
|
# - the total number of iterations that guarantees GCD convergence
|
|
# - the number of chunks of divsteps to compute
|
|
# - the base number of divsteps per chunk
|
|
# - a cutoff chunk,
|
|
# before this chunk ID, the number of divsteps is "base number + 1"
|
|
# afterward it's "base number"
|
|
let totalIters =
|
|
if bits == 256:
|
|
# https://github.com/sipa/safegcd-bounds/tree/master/coq
|
|
# For 256-bit inputs, 590 divsteps are sufficient with hddivstep variant (half-delta divstep)
|
|
# for gcd(f, g) with 0 <= g <= f <= Modulus (inversion g == 1)
|
|
# The generic formula reports 591
|
|
590
|
|
else:
|
|
# https://github.com/sipa/safegcd-bounds/blob/master/genproofhd.md
|
|
# For any input, for gcd(f, g) with 0 <= g <= f <= Modulus with hddivstep variant (half-delta divstep)
|
|
# (inversion g == 1)
|
|
(45907*bits + 26313) div 19929
|
|
let numChunks = totalIters.ceilDiv_vartime(wordBitWidth)
|
|
let chunkSize = totalIters div numChunks
|
|
let cutoff = totalIters mod numChunks
|
|
return (totalIters, numChunks, chunkSize, cutoff)
|
|
|
|
func batchedDivsteps(
|
|
t: var TransitionMatrix,
|
|
hdelta: SignedSecretWord,
|
|
f0, g0: SignedSecretWord,
|
|
numIters: int,
|
|
k: static int
|
|
): SignedSecretWord =
|
|
## Bernstein-Yang half-delta (hdelta) batch of divsteps
|
|
##
|
|
## Output:
|
|
## - return hdelta for the next batch of divsteps
|
|
## - mutate t, the transition matrix to apply `numIters` divsteps at once
|
|
## t is scaled by 2ᵏ
|
|
##
|
|
## Input:
|
|
## - f0, bottom limb of f
|
|
## - g0, bottom limb of g
|
|
## - numIters, number of iterations requested in this batch of divsteps
|
|
## - k, the maximum batch size, transition matrix is scaled by 2ᵏ
|
|
var
|
|
u = SignedSecretWord(1 shl k)
|
|
v = SignedSecretWord(0)
|
|
q = SignedSecretWord(0)
|
|
r = SignedSecretWord(1 shl k)
|
|
f = f0
|
|
g = g0
|
|
|
|
hdelta = hdelta
|
|
|
|
for i in k-numIters ..< k:
|
|
debug:
|
|
func reportLoop() =
|
|
debugEcho " iterations: [", k-numIters, ", ", k, ")", " (", numIters, " iterations in total)"
|
|
debugEcho " i: ", i, ", hdelta: ", int(hdelta)
|
|
# debugEcho " f: 0b", BiggestInt(f).toBin(64), ", g: 0b", BiggestInt(g).toBin(64), " | f: ", int(f), ", g: ", int(g)
|
|
# debugEcho " u: 0b", BiggestInt(u).toBin(64), ", v: 0b", BiggestInt(v).toBin(64), " | u: ", int(u), ", v: ", int(v)
|
|
# debugEcho " q: 0b", BiggestInt(q).toBin(64), ", r: 0b", BiggestInt(r).toBin(64), " | q: ", int(q), ", r: ", int(r)
|
|
|
|
doAssert (BaseType(f) and 1) == 1, (reportLoop(); "f must be odd)")
|
|
doAssert bool(not(uint(u or v or q or r) and (if i == 0: high(uint) else: high(uint) shr (i - 1)))), (reportLoop(); "Min trailing zeros count decreases at each iteration")
|
|
doAssert bool(u.ashr(k-i)*f0 + v.ashr(k-i)*g0 == f.lshl(i)), (reportLoop(); "Applying the transition matrix to (f₀, g₀) returns current (f, g)")
|
|
doAssert bool(q.ashr(k-i)*f0 + r.ashr(k-i)*g0 == g.lshl(i)), (reportLoop(); "Applying the transition matrix to (f₀, g₀) returns current (f, g)")
|
|
|
|
# Conditional masks for (hdelta < 0) and g odd
|
|
let c1 = hdelta.isNegMask()
|
|
let c2 = g.isOddMask()
|
|
# x, y, z, conditional complement of f, u, v
|
|
let x = f xor c1
|
|
let y = u xor c1
|
|
let z = v xor c1
|
|
# conditional substraction from g, q, r
|
|
g.csub(x, c2)
|
|
q.csub(y, c2)
|
|
r.csub(z, c2)
|
|
# c3 = (hdelta >= 0) and g odd
|
|
let c3 = c2 and not c1
|
|
# hdelta = -hdelta or hdelta+1
|
|
hdelta = (hdelta xor c3) + SignedSecretWord(1)
|
|
# Conditional rollback substraction
|
|
f.cadd(g, c3)
|
|
u.cadd(q, c3)
|
|
v.cadd(r, c3)
|
|
# Shifts
|
|
g = g.lshr(1)
|
|
q = q.ashr(1)
|
|
r = r.ashr(1)
|
|
|
|
t.u = u
|
|
t.v = v
|
|
t.q = q
|
|
t.r = r
|
|
debug:
|
|
doAssert bool(u*f0 + v*g0 == f.lshl(k)), "Applying the final matrix to (f₀, g₀) gives the final (f, g)"
|
|
doAssert bool(q*f0 + r*g0 == g.lshl(k)), "Applying the final matrix to (f₀, g₀) gives the final (f, g)"
|
|
doAssert checkDeterminant(t, u, v, q, r, k, numIters)
|
|
|
|
return hdelta
|
|
|
|
func matVecMul_shr_k_mod_M[N, E: static int](
|
|
t: TransitionMatrix,
|
|
d, e: var LimbsUnsaturated[N, E],
|
|
k: static int,
|
|
M: LimbsUnsaturated[N, E],
|
|
invMod2powK: SecretWord
|
|
) =
|
|
## Compute
|
|
##
|
|
## [u v] [d]
|
|
## [q r]/2ᵏ.[e] mod M
|
|
##
|
|
## d, e will be in range (-2*modulus,modulus)
|
|
## and output limbs in (-2ᵏ, 2ᵏ)
|
|
|
|
static: doAssert k == WordBitWidth - E
|
|
const Max = SignedSecretWord(MaxWord shr E)
|
|
|
|
let
|
|
u = t.u
|
|
v = t.v
|
|
q = t.q
|
|
r = t.r
|
|
|
|
let sign_d = d.isNegMask()
|
|
let sign_e = e.isNegMask()
|
|
|
|
# Double-signed-word carries
|
|
var cd, ce: DSWord
|
|
|
|
# First iteration of [u v] [d]
|
|
# [q r].[e]
|
|
cd.ssumprodAccNoCarry(u, d[0], v, e[0])
|
|
ce.ssumprodAccNoCarry(q, d[0], r, e[0])
|
|
|
|
# Compute me and md, multiples of M
|
|
# such as the bottom k bits if d and e are 0
|
|
# This allows fusing division by 2ᵏ
|
|
# i.e. (mx * M) mod 2ᵏ = x mod 2ᵏ
|
|
var md, me = SignedSecretWord(0)
|
|
md.cadd(u, sign_d)
|
|
md.cadd(v, sign_e)
|
|
me.cadd(q, sign_d)
|
|
me.cadd(r, sign_e)
|
|
|
|
md = md - (SignedSecretWord(invMod2powK * SecretWord(cd.lo) + SecretWord(md)) and Max)
|
|
me = me - (SignedSecretWord(invMod2powK * SecretWord(ce.lo) + SecretWord(me)) and Max)
|
|
|
|
# First iteration of [u v] [d] [md]
|
|
# [q r].[e] + [me].M[0]
|
|
# k bottom bits are 0
|
|
cd.smulAccNoCarry(md, M[0])
|
|
ce.smulAccNoCarry(me, M[0])
|
|
cd.ashr(k)
|
|
ce.ashr(k)
|
|
|
|
for i in 1 ..< N:
|
|
cd.ssumprodAccNoCarry(u, d[i], v, e[i])
|
|
ce.ssumprodAccNoCarry(q, d[i], r, e[i])
|
|
cd.smulAccNoCarry(md, M[i])
|
|
ce.smulAccNoCarry(me, M[i])
|
|
d[i-1] = cd.lo and Max
|
|
e[i-1] = ce.lo and Max
|
|
cd.ashr(k)
|
|
ce.ashr(k)
|
|
|
|
d[N-1] = cd.lo
|
|
e[N-1] = ce.lo
|
|
|
|
template matVecMul_shr_k_impl(
|
|
t: TransitionMatrix,
|
|
f, g: var LimbsUnsaturated,
|
|
Excess: static int,
|
|
numLimbsLeft: int or static int,
|
|
k: static int
|
|
) =
|
|
## Compute
|
|
##
|
|
## [u v] [f]
|
|
## [q r].[g] / 2ᵏ
|
|
##
|
|
## Template so that it can be specialized
|
|
## when iteration number is fixed and compiler can unroll, in constant-time case
|
|
## or variable and the full buffer might not be used (vartime)
|
|
|
|
static: doAssert k == WordBitWidth - Excess
|
|
const Max = SignedSecretWord(MaxWord shr Excess)
|
|
|
|
let
|
|
u = t.u
|
|
v = t.v
|
|
q = t.q
|
|
r = t.r
|
|
|
|
# Double-signed-word carries
|
|
var cf, cg: DSWord
|
|
|
|
# First iteration of [u v] [f]
|
|
# [q r].[g]
|
|
cf.ssumprodAccNoCarry(u, f[0], v, g[0])
|
|
cg.ssumprodAccNoCarry(q, f[0], r, g[0])
|
|
# bottom k bits are zero by construction
|
|
debug:
|
|
doAssert BaseType(cf.lo and Max) == 0, "bottom k bits should be 0, cf.lo: " & $BaseType(cf.lo)
|
|
doAssert BaseType(cg.lo and Max) == 0, "bottom k bits should be 0, cg.lo: " & $BaseType(cg.lo)
|
|
|
|
cf.ashr(k)
|
|
cg.ashr(k)
|
|
|
|
for i in 1 ..< numLimbsLeft:
|
|
cf.ssumprodAccNoCarry(u, f[i], v, g[i])
|
|
cg.ssumprodAccNoCarry(q, f[i], r, g[i])
|
|
f[i-1] = cf.lo and Max
|
|
g[i-1] = cg.lo and Max
|
|
cf.ashr(k)
|
|
cg.ashr(k)
|
|
|
|
f[numLimbsLeft-1] = cf.lo
|
|
g[numLimbsLeft-1] = cg.lo
|
|
|
|
func matVecMul_shr_k[N, E: static int](t: TransitionMatrix, f, g: var LimbsUnsaturated[N, E], k: static int) =
|
|
matVecMul_shr_k_impl(t, f, g, E, N, k)
|
|
|
|
func invmodImpl[N, E](
|
|
a: var LimbsUnsaturated[N, E],
|
|
F, M: LimbsUnsaturated[N, E],
|
|
invMod2powK: SecretWord,
|
|
k, bits: static int) =
|
|
## Modular inversion using Bernstein-Yang algorithm
|
|
## r ≡ F.a⁻¹ (mod M)
|
|
|
|
# hdelta = delta-1/2, delta starts at 1/2 for the half-delta variant
|
|
var hdelta = SignedSecretWord(0)
|
|
var d{.noInit.}, e{.noInit.}: LimbsUnsaturated[N, E]
|
|
var f{.noInit.}, g{.noInit.}: LimbsUnsaturated[N, E]
|
|
|
|
d.setZero()
|
|
e = F
|
|
|
|
# g < f for partitioning / iteration count formula
|
|
f = M
|
|
g = a
|
|
const partition = partitionDivsteps(bits, k)
|
|
|
|
for i in 0 ..< partition.numChunks:
|
|
var t{.noInit.}: TransitionMatrix
|
|
let numIters = partition.chunkSize + int(i < partition.cutoff)
|
|
# Compute transition matrix and next hdelta
|
|
hdelta = t.batchedDivsteps(hdelta, f[0], g[0], numIters, k)
|
|
# Apply the transition matrix
|
|
# [u v] [d]
|
|
# [q r]/2ᵏ.[e] mod M
|
|
t.matVecMul_shr_k_mod_M(d, e, k, M, invMod2powK)
|
|
# [u v] [f]
|
|
# [q r]/2ᵏ.[g]
|
|
t.matVecMul_shr_k(f, g, k)
|
|
|
|
d.canonicalize(signMask = f.isNegMask(), M)
|
|
a = d
|
|
|
|
func invmod*(
|
|
r: var Limbs, a: Limbs,
|
|
F, M: Limbs, bits: static int) =
|
|
## Compute the scaled modular inverse of ``a`` modulo M
|
|
## r ≡ F.a⁻¹ (mod M)
|
|
##
|
|
## M MUST be odd, M does not need to be prime.
|
|
## ``a`` MUST be less than M.
|
|
const Excess = 2
|
|
const k = WordBitWidth - Excess
|
|
const NumUnsatWords = bits.ceilDiv_vartime(k)
|
|
|
|
# Convert values to unsaturated repr
|
|
var m2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
|
var factor {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
|
m2.fromPackedRepr(M)
|
|
factor.fromPackedRepr(F)
|
|
let m0invK = SecretWord invMod2powK(BaseType M[0], k)
|
|
|
|
var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
|
a2.fromPackedRepr(a)
|
|
a2.invmodImpl(factor, m2, m0invK, k, bits)
|
|
r.fromUnsatRepr(a2)
|
|
|
|
func invmod*(
|
|
r: var Limbs, a: Limbs,
|
|
F, M: static Limbs, bits: static int) =
|
|
## Compute the scaled modular inverse of ``a`` modulo M
|
|
## r ≡ F.a⁻¹ (mod M) (compile-time factor and modulus overload)
|
|
##
|
|
## with F and M known at compile-time
|
|
##
|
|
## M MUST be odd, M does not need to be prime.
|
|
## ``a`` MUST be less than M.
|
|
|
|
const Excess = 2
|
|
const k = WordBitWidth - Excess
|
|
const NumUnsatWords = bits.ceilDiv_vartime(k)
|
|
|
|
# Convert values to unsaturated repr
|
|
const m2 = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(M)
|
|
const factor = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(F)
|
|
const m0invK = SecretWord invMod2powK(BaseType M[0], k)
|
|
|
|
var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
|
a2.fromPackedRepr(a)
|
|
a2.invmodImpl(factor, m2, m0invK, k, bits)
|
|
r.fromUnsatRepr(a2)
|
|
|
|
# ############################################################
|
|
#
|
|
# Euler criterion, Legendre/Jacobi/Krönecker symbol
|
|
#
|
|
# ############################################################
|
|
#
|
|
# The Euler criterion, i.e. the quadratic residuosity test, for p an odd prime, is:
|
|
# a^((p-1)/2) ≡ 1 (mod p), iff a is a square
|
|
# ≡ -1 (mod p), iff a is quadratic non-residue
|
|
# ≡ 0 (mod p), iff a is 0
|
|
# derived from Fermat's Little Theorem
|
|
#
|
|
# The Legendre symbol is a function with p odd prime
|
|
# (a/p)ₗ ≡ 1 (mod p), iff a is a square
|
|
# ≡ -1 (mod p), iff a is quadratic non-residue
|
|
# ≡ 0 (mod p), iff a is 0
|
|
#
|
|
# The Jacobi symbol generalizes the Legendre symbol for any odd n:
|
|
# (a/n)ⱼ = ∏ᵢ (a/pᵢ)ₗ
|
|
# is the product of legendre symbol (a/pᵢ)ₗ with pᵢ the prime factors of n
|
|
#
|
|
# Those symbols can be computed either via exponentiation (Fermat's Little Theorem)
|
|
# or using slight modifications to the Extended Euclidean Algorithm for GCD.
|
|
|
|
func batchedDivstepsSymbol(
|
|
t: var TransitionMatrix,
|
|
hdelta: SignedSecretWord,
|
|
f0, g0: SignedSecretWord,
|
|
numIters: int,
|
|
k: static int
|
|
): tuple[hdelta, L: SignedSecretWord] =
|
|
## Bernstein-Yang half-delta (hdelta) batch of divsteps
|
|
## with Legendre symbol tracking
|
|
##
|
|
## Output:
|
|
## - return hdelta for the next batch of divsteps
|
|
## - Returns the intermediate Legendre symbol
|
|
## - mutate t, the transition matrix to apply `numIters` divsteps at once
|
|
## t is scaled by 2ᵏ
|
|
##
|
|
## Input:
|
|
## - f0, bottom limb of f
|
|
## - g0, bottom limb of g
|
|
## - numIters, number of iterations requested in this batch of divsteps
|
|
## - k, the maximum batch size, transition matrix is scaled by 2ᵏ
|
|
|
|
var
|
|
u = SignedSecretWord(1 shl (k-numIters))
|
|
v = SignedSecretWord(0)
|
|
q = SignedSecretWord(0)
|
|
r = SignedSecretWord(1 shl (k-numIters))
|
|
f = f0
|
|
g = g0
|
|
|
|
hdelta = hdelta
|
|
L = SignedSecretWord(0)
|
|
|
|
for i in k-numIters ..< k:
|
|
debug:
|
|
func reportLoop() =
|
|
debugEcho " iterations: [", k-numIters, ", ", k, ")", " (", numIters, " iterations in total)"
|
|
debugEcho " i: ", i, ", hdelta: ", int(hdelta)
|
|
# debugEcho " f: 0b", BiggestInt(f).toBin(64), ", g: 0b", BiggestInt(g).toBin(64), " | f: ", int(f), ", g: ", int(g)
|
|
# debugEcho " u: 0b", BiggestInt(u).toBin(64), ", v: 0b", BiggestInt(v).toBin(64), " | u: ", int(u), ", v: ", int(v)
|
|
# debugEcho " q: 0b", BiggestInt(q).toBin(64), ", r: 0b", BiggestInt(r).toBin(64), " | q: ", int(q), ", r: ", int(r)
|
|
|
|
doAssert (BaseType(f) and 1) == 1, (reportLoop(); "f must be odd)")
|
|
doAssert bool(u*f0 + v*g0 == f.lshl(i)), (reportLoop(); "Applying the transition matrix to (f₀, g₀) returns current (f, g)")
|
|
doAssert bool(q*f0 + r*g0 == g.lshl(i)), (reportLoop(); "Applying the transition matrix to (f₀, g₀) returns current (f, g)")
|
|
|
|
let fi = f
|
|
|
|
# Conditional masks for (hdelta < 0) and g odd
|
|
let c1 = hdelta.isNegMask()
|
|
let c2 = g.isOddMask()
|
|
# x, y, z, conditional negated complement of f, u, v
|
|
let x = (f xor c1) - c1
|
|
let y = (u xor c1) - c1
|
|
let z = (v xor c1) - c1
|
|
# conditional addition g, q, r
|
|
g.cadd(x, c2)
|
|
q.cadd(y, c2)
|
|
r.cadd(z, c2)
|
|
# c3 = (hdelta < 0) and g odd
|
|
let c3 = c2 and c1
|
|
# hdelta = -hdelta-2 or hdelta-1
|
|
hdelta = (hdelta xor c3) - SignedSecretWord(1)
|
|
# Conditionally rollback
|
|
f.cadd(g, c3)
|
|
u.cadd(q, c3)
|
|
v.cadd(r, c3)
|
|
# Shifts
|
|
g = g.lshr(1)
|
|
u = u.lshl(1)
|
|
v = v.lshl(1)
|
|
|
|
L = L + (((fi and f) xor f.lshr(1)) and SignedSecretWord(2))
|
|
L = L + (L.isOdd() xor v.isNeg())
|
|
L = L and SignedSecretWord(3)
|
|
|
|
t.u = u
|
|
t.v = v
|
|
t.q = q
|
|
t.r = r
|
|
debug:
|
|
doAssert bool(u*f0 + v*g0 == f.lshl(k)), "Applying the final matrix to (f₀, g₀) gives the final (f, g)"
|
|
doAssert bool(q*f0 + r*g0 == g.lshl(k)), "Applying the final matrix to (f₀, g₀) gives the final (f, g)"
|
|
doAssert checkDeterminant(t, u, v, q, r, k, numIters)
|
|
|
|
return (hdelta, L)
|
|
|
|
func legendreImpl[N, E](
|
|
a: var LimbsUnsaturated[N, E],
|
|
M: LimbsUnsaturated[N, E],
|
|
k, bits: static int): SecretWord =
|
|
## Legendre symbol / Quadratic Residuosity Test
|
|
## using Bernstein-Yang algorithm
|
|
|
|
# hdelta = delta-1/2, delta starts at 1/2 for the half-delta variant
|
|
var hdelta = SignedSecretWord(0)
|
|
var f{.noInit.}, g{.noInit.}: LimbsUnsaturated[N, E]
|
|
|
|
# g < f for partitioning / iteration count formula
|
|
f = M
|
|
g = a
|
|
const partition = partitionDivsteps(bits, k)
|
|
const UnsatBitWidth = WordBitWidth - a.Excess
|
|
|
|
var # Track and accumulate Legendre symbol transitions
|
|
accL = SignedSecretWord(0)
|
|
L = SignedSecretWord(0)
|
|
|
|
for i in 0 ..< partition.numChunks:
|
|
var t{.noInit.}: TransitionMatrix
|
|
let numIters = partition.chunkSize + int(i < partition.cutoff)
|
|
# Compute transition matrix and next hdelta
|
|
when f.words.len > 1:
|
|
(hdelta, L) = t.batchedDivstepsSymbol(
|
|
hdelta,
|
|
# the symbol computation needs to see the extra 2 next bits.
|
|
f[0] or f[1].lshl(UnsatBitWidth),
|
|
g[0] or g[1].lshl(UnsatBitWidth),
|
|
numIters, k)
|
|
else:
|
|
(hdelta, L) = t.batchedDivstepsSymbol(hdelta, f[0], g[0], numIters, k)
|
|
# [u v] [f]
|
|
# [q r]/2ᵏ.[g]
|
|
t.matVecMul_shr_k(f, g, k)
|
|
accL = (accL + L) and SignedSecretWord(3)
|
|
accL = (accL + ((accL.isOdd() xor f.isNeg()))) and SignedSecretWord(3)
|
|
|
|
accL = (accL + accL.isOdd()) and SignedSecretWord(3)
|
|
accL = SignedSecretWord(1)-accL
|
|
accL.csetZero(not f.isZeroMask())
|
|
return SecretWord(accL)
|
|
|
|
func legendre*(a, M: Limbs, bits: static int): SecretWord =
|
|
## Compute the Legendre symbol
|
|
##
|
|
## (a/p)ₗ ≡ a^((p-1)/2) ≡ 1 (mod p), iff a is a square
|
|
## ≡ -1 (mod p), iff a is quadratic non-residue
|
|
## ≡ 0 (mod p), iff a is 0
|
|
const Excess = 2
|
|
const k = WordBitWidth - Excess
|
|
const NumUnsatWords = bits.ceilDiv_vartime(k)
|
|
|
|
# Convert values to unsaturated repr
|
|
var m2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
|
m2.fromPackedRepr(M)
|
|
|
|
var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
|
a2.fromPackedRepr(a)
|
|
|
|
legendreImpl(a2, m2, k, bits)
|
|
|
|
func legendre*(a: Limbs, M: static Limbs, bits: static int): SecretWord =
|
|
## Compute the Legendre symbol (compile-time modulus overload)
|
|
##
|
|
## (a/p)ₗ ≡ a^((p-1)/2) ≡ 1 (mod p), iff a is a square
|
|
## ≡ -1 (mod p), iff a is quadratic non-residue
|
|
## ≡ 0 (mod p), iff a is 0
|
|
|
|
const Excess = 2
|
|
const k = WordBitWidth - Excess
|
|
const NumUnsatWords = bits.ceilDiv_vartime(k)
|
|
|
|
# Convert values to unsaturated repr
|
|
const m2 = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(M)
|
|
|
|
var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
|
a2.fromPackedRepr(a)
|
|
|
|
legendreImpl(a2, m2, k, bits)
|
|
|
|
|
|
# ############################################################
|
|
#
|
|
# Variable-time optimizations
|
|
#
|
|
# ############################################################
|
|
|
|
const NegInvMod256 = [
|
|
# Stores tab[i div 2] = -i⁻¹ (mod 256), with i odd
|
|
# See "invModBitwidth" on "Dumas iterations"
|
|
# ax ≡ 1 (mod 2ᵏ) <=> ax(2 - ax) ≡ 1 (mod 2^(2k))
|
|
# a⁻¹ (mod 256) = a(2-a²)
|
|
-1, -235, -141, -183, -57, -227, -133, -239,
|
|
-241, -91, -253, -167, -41, -83, -245, -223,
|
|
-225, -203, -109, -151, -25, -195, -101, -207,
|
|
-209, -59, -221, -135, -9, -51, -213, -191,
|
|
-193, -171, -77, -119, -249, -163, -69, -175,
|
|
-177, -27, -189, -103, -233, -19, -181, -159,
|
|
-161, -139, -45, -87, -217, -131, -37, -143,
|
|
-145, -251, -157, -71, -201, -243, -149, -127,
|
|
-129, -107, -13, -55, -185, -99, -5, -111,
|
|
-113, -219, -125, -39, -169, -211, -117, -95,
|
|
-97, -75, -237, -23, -153, -67, -229, -79,
|
|
-81, -187, -93, -7, -137, -179, -85, -63,
|
|
-65, -43, -205, -247, -121, -35, -197, -47,
|
|
-49, -155, -61, -231, -105, -147, -53, -31,
|
|
-33, -11, -173, -215, -89, -3, -165, -15,
|
|
-17, -123, -29, -199, -73, -115, -21, -255]
|
|
|
|
func batchedDivsteps_vartime(
|
|
t: var TransitionMatrix,
|
|
eta: SignedSecretWord,
|
|
f0, g0: SecretWord,
|
|
k: static int
|
|
): SignedSecretWord {.tags:[Vartime].} =
|
|
## Bernstein-Yang eta (-delta) batch of divsteps
|
|
## **Variable-Time**
|
|
##
|
|
## Output:
|
|
## - return eta for the next batch of divsteps
|
|
## - mutate t, the transition matrix to apply `numIters` divsteps at once
|
|
## t is scaled by 2ᵏ
|
|
##
|
|
## Input:
|
|
## - f0, bottom limb of f
|
|
## - g0, bottom limb of g
|
|
## - k, the maximum batch size, transition matrix is scaled by 2ᵏ
|
|
|
|
template swapNeg(a, b) =
|
|
var tmp = -a
|
|
a = b
|
|
b = tmp
|
|
|
|
var
|
|
u = One
|
|
v = Zero
|
|
q = Zero
|
|
r = One
|
|
f = f0
|
|
g = g0
|
|
|
|
eta = cast[SignedBaseType](eta)
|
|
bitsLeft = cast[SignedBaseType](k)
|
|
|
|
while true:
|
|
# Count zeros up to bitsLeft and process a batch of divsteps up to that number
|
|
let zeros = (g.BaseType or (1.BaseType shl bitsLeft)).countTrailingZeroBits_vartime()
|
|
g = g shr zeros
|
|
u = u shl zeros
|
|
v = v shl zeros
|
|
eta -= cast[SignedBaseType](zeros)
|
|
bitsLeft -= cast[SignedBaseType](zeros)
|
|
|
|
if bitsLeft == 0:
|
|
break
|
|
|
|
# Now process, the 1's.
|
|
if eta < 0:
|
|
eta = -eta
|
|
swapNeg(f, g)
|
|
swapNeg(u, q)
|
|
swapNeg(v, r)
|
|
|
|
# We process up to 6 1's at once
|
|
const mask6 = SecretWord((1 shl 6) - 1)
|
|
let limit = min(eta+1, bitsLeft)
|
|
let maskLimit = (MaxWord shr (WordBitWidth - limit)) and mask6
|
|
# Find the multiple of f to add to cancel the bottom min(limit, 6) bits of g
|
|
let w = (g * SecretWord NegInvMod256[int((f and mask6) shr 1)]) and maskLimit
|
|
|
|
# Next iteration will have at least 6 0's to process at once
|
|
g += f*w
|
|
q += u*w
|
|
r += v*w
|
|
|
|
t.u = SignedSecretWord u
|
|
t.v = SignedSecretWord v
|
|
t.q = SignedSecretWord q
|
|
t.r = SignedSecretWord r
|
|
return SignedSecretWord(eta)
|
|
|
|
func matVecMul_shr_k_partial(t: TransitionMatrix, f, g: var LimbsUnsaturated, len: int, k: static int) =
|
|
## Matrix-Vector multiplication with top part of f and g being zeros
|
|
matVecMul_shr_k_impl(t, f, g, LimbsUnsaturated.Excess, len, k)
|
|
|
|
func isZero_vartime(a: LimbsUnsaturated, limbsLeft: int): bool {.tags:[VarTime].} =
|
|
for i in 0 ..< limbsLeft:
|
|
if a[i].int != 0:
|
|
return false
|
|
return true
|
|
|
|
func discardUnusedLimb_vartime[N, E: static int](limbsLeft: var int, f, g: var LimbsUnsaturated[N, E]) {.tags:[VarTime].} =
|
|
## If f and g both don't use their last limb, it will propagate the sign down to the previous one
|
|
if limbsLeft == 1:
|
|
return
|
|
|
|
let fn = f[limbsLeft-1]
|
|
let gn = g[limbsLeft-1]
|
|
var mask = SignedSecretWord(0)
|
|
mask = mask or (fn xor fn.isNegMask()) # 0 if last limb has nothing left but its sign
|
|
mask = mask or (gn xor gn.isNegMask()) # 0 if last limb has nothing left but its sign
|
|
if cast[SignedBaseType](mask) == 0:
|
|
f[limbsLeft-2] = f[limbsLeft-2] or fn.lshl(WordBitWidth-E) # if only sign is left, the last limb is 11..11 if negative
|
|
g[limbsLeft-2] = g[limbsLeft-2] or gn.lshl(WordBitWidth-E) # or 00..00 if positive
|
|
limbsLeft -= 1
|
|
|
|
func invmodImpl_vartime[N, E: static int](
|
|
a: var LimbsUnsaturated[N, E],
|
|
F, M: LimbsUnsaturated[N, E],
|
|
invMod2powK: SecretWord,
|
|
k, bits: static int) {.tags:[VarTime].} =
|
|
## **Variable-time** Modular inversion using Bernstein-Yang algorithm
|
|
## r ≡ F.a⁻¹ (mod M)
|
|
|
|
# eta = -delta
|
|
var eta = cast[SignedSecretWord](-1)
|
|
var d{.noInit.}, e{.noInit.}: LimbsUnsaturated[N, E]
|
|
var f{.noInit.}, g{.noInit.}: LimbsUnsaturated[N, E]
|
|
|
|
d.setZero()
|
|
e = F
|
|
|
|
f = M
|
|
g = a
|
|
|
|
var limbsLeft = N
|
|
|
|
while true:
|
|
var t{.noInit.}: TransitionMatrix
|
|
# Compute transition matrix and next eta
|
|
eta = t.batchedDivsteps_vartime(eta, SecretWord f[0], SecretWord g[0], k)
|
|
# Apply the transition matrix
|
|
# [u v] [d]
|
|
# [q r]/2ᵏ.[e] mod M
|
|
t.matVecMul_shr_k_mod_M(d, e, k, M, invMod2powK)
|
|
# [u v] [f]
|
|
# [q r]/2ᵏ.[g]
|
|
t.matVecMul_shr_k_partial(f, g, limbsLeft, k)
|
|
if g.isZero_vartime(limbsLeft):
|
|
break
|
|
limbsLeft.discardUnusedLimb_vartime(f, g)
|
|
|
|
d.canonicalize(signMask = f[limbsLeft-1].isNegMask(), M)
|
|
a = d
|
|
|
|
func invmod_vartime*(
|
|
r: var Limbs, a: Limbs,
|
|
F, M: Limbs, bits: static int) {.tags:[VarTime].} =
|
|
## Compute the scaled modular inverse of ``a`` modulo M
|
|
## r ≡ F.a⁻¹ (mod M)
|
|
##
|
|
## M MUST be odd, M does not need to be prime.
|
|
## ``a`` MUST be less than M.
|
|
const Excess = 2
|
|
const k = WordBitWidth - Excess
|
|
const NumUnsatWords = bits.ceilDiv_vartime(k)
|
|
|
|
# Convert values to unsaturated repr
|
|
var m2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
|
var factor {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
|
m2.fromPackedRepr(M)
|
|
factor.fromPackedRepr(F)
|
|
let m0invK = SecretWord invMod2powK(BaseType M[0], k)
|
|
|
|
var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
|
a2.fromPackedRepr(a)
|
|
a2.invmodImpl_vartime(factor, m2, m0invK, k, bits)
|
|
r.fromUnsatRepr(a2)
|
|
|
|
func invmod_vartime*(
|
|
r: var Limbs, a: Limbs,
|
|
F, M: static Limbs, bits: static int) {.tags:[VarTime].} =
|
|
## Compute the scaled modular inverse of ``a`` modulo M
|
|
## r ≡ F.a⁻¹ (mod M) (compile-time factor and modulus overload)
|
|
##
|
|
## with F and M known at compile-time
|
|
##
|
|
## M MUST be odd, M does not need to be prime.
|
|
## ``a`` MUST be less than M.
|
|
|
|
const Excess = 2
|
|
const k = WordBitWidth - Excess
|
|
const NumUnsatWords = bits.ceilDiv_vartime(k)
|
|
|
|
# Convert values to unsaturated repr
|
|
const m2 = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(M)
|
|
const factor = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(F)
|
|
const m0invK = SecretWord invMod2powK(BaseType M[0], k)
|
|
|
|
var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
|
a2.fromPackedRepr(a)
|
|
a2.invmodImpl_vartime(factor, m2, m0invK, k, bits)
|
|
r.fromUnsatRepr(a2) |