diff --git a/constantine/arithmetic/assembly/limbs_asm_mul_mont_x86.nim b/constantine/arithmetic/assembly/limbs_asm_mul_mont_x86.nim index 18db837..479d723 100644 --- a/constantine/arithmetic/assembly/limbs_asm_mul_mont_x86.nim +++ b/constantine/arithmetic/assembly/limbs_asm_mul_mont_x86.nim @@ -38,7 +38,7 @@ static: doAssert UseASM_X86_64 macro mulMont_CIOS_sparebit_gen[N: static int]( r_PIR: var Limbs[N], a_PIR, b_PIR, M_PIR: Limbs[N], m0ninv_REG: BaseType, - skipReduction: static bool + skipFinalSub: static bool ): untyped = ## Generate an optimized Montgomery Multiplication kernel ## using the CIOS method @@ -175,7 +175,7 @@ macro mulMont_CIOS_sparebit_gen[N: static int]( ctx.mov rax, r # move r away from scratchspace that will be used for final substraction let r2 = rax.asArrayAddr(len = N) - if skipReduction: + if skipFinalSub: for i in 0 ..< N: ctx.mov r2[i], t[i] else: @@ -185,14 +185,14 @@ macro mulMont_CIOS_sparebit_gen[N: static int]( ) result.add ctx.generate() -func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) = +func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) = ## Constant-time Montgomery multiplication - ## If "skipReduction" is set + ## If "skipFinalSub" is set ## the result is in the range [0, 2M) ## otherwise the result is in the range [0, M) ## ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation - r.mulMont_CIOS_sparebit_gen(a, b, M, m0ninv, skipReduction) + r.mulMont_CIOS_sparebit_gen(a, b, M, m0ninv, skipFinalSub) # Montgomery Squaring # ------------------------------------------------------------ @@ -209,8 +209,8 @@ func squareMont_CIOS_asm*[N]( r: var Limbs[N], a, M: Limbs[N], m0ninv: BaseType, - hasSpareBit, skipReduction: static bool) = + hasSpareBit, skipFinalSub: static bool) = ## Constant-time modular squaring var r2x {.noInit.}: Limbs[2*N] r2x.square_asm_inline(a) - r.redcMont_asm_inline(r2x, M, m0ninv, hasSpareBit, skipReduction) + r.redcMont_asm_inline(r2x, M, m0ninv, hasSpareBit, skipFinalSub) diff --git a/constantine/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim b/constantine/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim index d46c064..8ba367a 100644 --- a/constantine/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim +++ b/constantine/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim @@ -179,7 +179,7 @@ proc partialRedx( macro mulMont_CIOS_sparebit_adx_gen[N: static int]( r_PIR: var Limbs[N], a_PIR, b_PIR, M_PIR: Limbs[N], m0ninv_REG: BaseType, - skipReduction: static bool): untyped = + skipFinalSub: static bool): untyped = ## Generate an optimized Montgomery Multiplication kernel ## using the CIOS method ## This requires the most significant word of the Modulus @@ -268,7 +268,7 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int]( lo, C ) - if skipReduction: + if skipFinalSub: for i in 0 ..< N: ctx.mov r[i], t[i] else: @@ -279,14 +279,14 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int]( result.add ctx.generate -func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) = +func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) = ## Constant-time Montgomery multiplication - ## If "skipReduction" is set + ## If "skipFinalSub" is set ## the result is in the range [0, 2M) ## otherwise the result is in the range [0, M) ## ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation - r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, skipReduction) + r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, skipFinalSub) # Montgomery Squaring # ------------------------------------------------------------ @@ -295,8 +295,8 @@ func squareMont_CIOS_asm_adx*[N]( r: var Limbs[N], a, M: Limbs[N], m0ninv: BaseType, - hasSpareBit, skipReduction: static bool) = + hasSpareBit, skipFinalSub: static bool) = ## Constant-time modular squaring var r2x {.noInit.}: Limbs[2*N] r2x.square_asm_adx_inline(a) - r.redcMont_asm_adx(r2x, M, m0ninv, hasSpareBit, skipReduction) + r.redcMont_asm_adx(r2x, M, m0ninv, hasSpareBit, skipFinalSub) diff --git a/constantine/arithmetic/assembly/limbs_asm_redc_mont_x86.nim b/constantine/arithmetic/assembly/limbs_asm_redc_mont_x86.nim index 403366a..fd7c5aa 100644 --- a/constantine/arithmetic/assembly/limbs_asm_redc_mont_x86.nim +++ b/constantine/arithmetic/assembly/limbs_asm_redc_mont_x86.nim @@ -34,7 +34,7 @@ macro redc2xMont_gen*[N: static int]( a_PIR: array[N*2, SecretWord], M_PIR: array[N, SecretWord], m0ninv_REG: BaseType, - hasSpareBit, skipReduction: static bool + hasSpareBit, skipFinalSub: static bool ) = # No register spilling handling @@ -153,7 +153,7 @@ macro redc2xMont_gen*[N: static int]( # v is invalidated from now on let t = repackRegisters(v, u[N], u[N+1]) - if hasSpareBit and skipReduction: + if hasSpareBit and skipFinalSub: for i in 0 ..< N: ctx.mov r[i], t[i] elif hasSpareBit: @@ -170,22 +170,22 @@ func redcMont_asm_inline*[N: static int]( M: array[N, SecretWord], m0ninv: BaseType, hasSpareBit: static bool, - skipReduction: static bool = false + skipFinalSub: static bool = false ) {.inline.} = ## Constant-time Montgomery reduction ## Inline-version - redc2xMont_gen(r, a, M, m0ninv, hasSpareBit, skipReduction) + redc2xMont_gen(r, a, M, m0ninv, hasSpareBit, skipFinalSub) func redcMont_asm*[N: static int]( r: var array[N, SecretWord], a: array[N*2, SecretWord], M: array[N, SecretWord], m0ninv: BaseType, - hasSpareBit, skipReduction: static bool + hasSpareBit, skipFinalSub: static bool ) = ## Constant-time Montgomery reduction static: doAssert UseASM_X86_64, "This requires x86-64." - redcMont_asm_inline(r, a, M, m0ninv, hasSpareBit, skipReduction) + redcMont_asm_inline(r, a, M, m0ninv, hasSpareBit, skipFinalSub) # Montgomery conversion # ---------------------------------------------------------- @@ -351,8 +351,8 @@ when isMainModule: var a_sqr{.noInit.}, na_sqr{.noInit.}: Limbs[2] var a_sqr_comba{.noInit.}, na_sqr_comba{.noInit.}: Limbs[2] - a_sqr.redcMont_asm(adbl_sqr, M, 1, hasSpareBit = false, skipReduction = false) - na_sqr.redcMont_asm(nadbl_sqr, M, 1, hasSpareBit = false, skipReduction = false) + a_sqr.redcMont_asm(adbl_sqr, M, 1, hasSpareBit = false, skipFinalSub = false) + na_sqr.redcMont_asm(nadbl_sqr, M, 1, hasSpareBit = false, skipFinalSub = false) a_sqr_comba.redc2xMont_Comba(adbl_sqr, M, 1) na_sqr_comba.redc2xMont_Comba(nadbl_sqr, M, 1) diff --git a/constantine/arithmetic/assembly/limbs_asm_redc_mont_x86_adx_bmi2.nim b/constantine/arithmetic/assembly/limbs_asm_redc_mont_x86_adx_bmi2.nim index c836710..28021be 100644 --- a/constantine/arithmetic/assembly/limbs_asm_redc_mont_x86_adx_bmi2.nim +++ b/constantine/arithmetic/assembly/limbs_asm_redc_mont_x86_adx_bmi2.nim @@ -38,7 +38,7 @@ macro redc2xMont_adx_gen[N: static int]( a_PIR: array[N*2, SecretWord], M_PIR: array[N, SecretWord], m0ninv_REG: BaseType, - hasSpareBit, skipReduction: static bool + hasSpareBit, skipFinalSub: static bool ) = # No register spilling handling @@ -131,7 +131,7 @@ macro redc2xMont_adx_gen[N: static int]( let t = repackRegisters(v, u[N]) - if hasSpareBit and skipReduction: + if hasSpareBit and skipFinalSub: for i in 0 ..< N: ctx.mov r[i], t[i] elif hasSpareBit: @@ -148,11 +148,11 @@ func redcMont_asm_adx_inline*[N: static int]( M: array[N, SecretWord], m0ninv: BaseType, hasSpareBit: static bool, - skipReduction: static bool = false + skipFinalSub: static bool = false ) {.inline.} = ## Constant-time Montgomery reduction ## Inline-version - redc2xMont_adx_gen(r, a, M, m0ninv, hasSpareBit, skipReduction) + redc2xMont_adx_gen(r, a, M, m0ninv, hasSpareBit, skipFinalSub) func redcMont_asm_adx*[N: static int]( r: var array[N, SecretWord], @@ -160,10 +160,10 @@ func redcMont_asm_adx*[N: static int]( M: array[N, SecretWord], m0ninv: BaseType, hasSpareBit: static bool, - skipReduction: static bool = false + skipFinalSub: static bool = false ) = ## Constant-time Montgomery reduction - redcMont_asm_adx_inline(r, a, M, m0ninv, hasSpareBit, skipReduction) + redcMont_asm_adx_inline(r, a, M, m0ninv, hasSpareBit, skipFinalSub) # Montgomery conversion diff --git a/constantine/arithmetic/bigints_montgomery.nim b/constantine/arithmetic/bigints_montgomery.nim index 550ae8f..39abc21 100644 --- a/constantine/arithmetic/bigints_montgomery.nim +++ b/constantine/arithmetic/bigints_montgomery.nim @@ -24,7 +24,7 @@ import # # ############################################################ -func getMont*(mres: var BigInt, a, N, r2modM: BigInt, m0ninv: static BaseType, spareBits: static int) = +func getMont*(mres: var BigInt, a, N, r2modM: BigInt, m0ninv: BaseType, spareBits: static int) = ## Convert a BigInt from its natural representation ## to the Montgomery residue form ## @@ -41,7 +41,7 @@ func getMont*(mres: var BigInt, a, N, r2modM: BigInt, m0ninv: static BaseType, s ## and R = (2^WordBitWidth)^W getMont(mres.limbs, a.limbs, N.limbs, r2modM.limbs, m0ninv, spareBits) -func fromMont*[mBits](r: var BigInt[mBits], a, M: BigInt[mBits], m0ninv: static BaseType, spareBits: static int) = +func fromMont*[mBits](r: var BigInt[mBits], a, M: BigInt[mBits], m0ninv: BaseType, spareBits: static int) = ## Convert a BigInt from its Montgomery residue form ## to the natural representation ## @@ -52,20 +52,20 @@ func fromMont*[mBits](r: var BigInt[mBits], a, M: BigInt[mBits], m0ninv: static fromMont(r.limbs, a.limbs, M.limbs, m0ninv, spareBits) func mulMont*(r: var BigInt, a, b, M: BigInt, negInvModWord: static BaseType, - spareBits: static int, skipReduction: static bool = false) = + spareBits: static int, skipFinalSub: static bool = false) = ## Compute r <- a*b (mod M) in the Montgomery domain ## ## This resets r to zero before processing. Use {.noInit.} ## to avoid duplicating with Nim zero-init policy - mulMont(r.limbs, a.limbs, b.limbs, M.limbs, negInvModWord, spareBits, skipReduction) + mulMont(r.limbs, a.limbs, b.limbs, M.limbs, negInvModWord, spareBits, skipFinalSub) func squareMont*(r: var BigInt, a, M: BigInt, negInvModWord: static BaseType, - spareBits: static int, skipReduction: static bool = false) = + spareBits: static int, skipFinalSub: static bool = false) = ## Compute r <- a^2 (mod M) in the Montgomery domain ## ## This resets r to zero before processing. Use {.noInit.} ## to avoid duplicating with Nim zero-init policy - squareMont(r.limbs, a.limbs, M.limbs, negInvModWord, spareBits, skipReduction) + squareMont(r.limbs, a.limbs, M.limbs, negInvModWord, spareBits, skipFinalSub) func powMont*[mBits: static int]( a: var BigInt[mBits], exponent: openarray[byte], diff --git a/constantine/arithmetic/finite_fields.nim b/constantine/arithmetic/finite_fields.nim index 3ffc5c9..e8e0d07 100644 --- a/constantine/arithmetic/finite_fields.nim +++ b/constantine/arithmetic/finite_fields.nim @@ -214,14 +214,14 @@ func double*(r: var FF, a: FF) {.meter.} = overflowed = overflowed or not(r.mres < FF.fieldMod()) discard csub(r.mres, FF.fieldMod(), overflowed) -func prod*(r: var FF, a, b: FF, skipReduction: static bool = false) {.meter.} = +func prod*(r: var FF, a, b: FF, skipFinalSub: static bool = false) {.meter.} = ## Store the product of ``a`` by ``b`` modulo p into ``r`` ## ``r`` is initialized / overwritten - r.mres.mulMont(a.mres, b.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits(), skipReduction) + r.mres.mulMont(a.mres, b.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits(), skipFinalSub) -func square*(r: var FF, a: FF, skipReduction: static bool = false) {.meter.} = +func square*(r: var FF, a: FF, skipFinalSub: static bool = false) {.meter.} = ## Squaring modulo p - r.mres.squareMont(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits(), skipReduction) + r.mres.squareMont(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits(), skipFinalSub) func neg*(r: var FF, a: FF) {.meter.} = ## Negate modulo p @@ -413,38 +413,23 @@ func `*=`*(a: var FF, b: FF) {.meter.} = ## Multiplication modulo p a.prod(a, b) -func square*(a: var FF, skipReduction: static bool = false) {.meter.} = +func square*(a: var FF, skipFinalSub: static bool = false) {.meter.} = ## Squaring modulo p - a.square(a, skipReduction) + a.square(a, skipFinalSub) -func square_repeated*(a: var FF, num: int, skipReduction: static bool = false) {.meter.} = +func square_repeated*(a: var FF, num: int, skipFinalSub: static bool = false) {.meter.} = ## Repeated squarings - # Except in Tonelli-Shanks, num is always known at compile-time - # and square repeated is inlined, so the compiler should optimize the branches away. - - # TODO: understand the conditions to avoid the final substraction - for _ in 0 ..< num: - a.square(skipReduction = false) + ## Assumes at least 1 squaring + for _ in 0 ..< num-1: + a.square(skipFinalSub = true) + a.square(skipFinalSub) -func square_repeated*(r: var FF, a: FF, num: int, skipReduction: static bool = false) {.meter.} = +func square_repeated*(r: var FF, a: FF, num: int, skipFinalSub: static bool = false) {.meter.} = ## Repeated squarings - - # TODO: understand the conditions to avoid the final substraction - r.square(a) - for _ in 1 ..< num: - r.square() - -func square_repeated_then_mul*(a: var FF, num: int, b: FF, skipReduction: static bool = false) {.meter.} = - ## Square `a`, `num` times and then multiply by b - ## Assumes at least 1 squaring - a.square_repeated(num, skipReduction = false) - a.prod(a, b, skipReduction = skipReduction) - -func square_repeated_then_mul*(r: var FF, a: FF, num: int, b: FF, skipReduction: static bool = false) {.meter.} = - ## Square `a`, `num` times and then multiply by b - ## Assumes at least 1 squaring - r.square_repeated(a, num, skipReduction = false) - r.prod(r, b, skipReduction = skipReduction) + r.square(a, skipFinalSub = true) + for _ in 1 ..< num-1: + r.square(skipFinalSub = true) + r.square(skipFinalSub) func `*=`*(a: var FF, b: static int) = ## Multiplication by a small integer known at compile-time @@ -550,3 +535,46 @@ template mulCheckSparse*(a: var Fp, b: Fp) = {.pop.} # inline {.pop.} # raises no exceptions + +# ############################################################ +# +# Field arithmetic ergonomic macros +# +# ############################################################ + +import std/macros + +macro addchain*(fn: untyped): untyped = + ## Modify all prod, `*=`, square, square_repeated calls + ## to skipFinalSub except the very last call. + ## This assumes straight-line code. + fn.expectKind(nnkFuncDef) + + result = fn + var body = newStmtList() + + for i, statement in fn[^1]: + statement.expectKind({nnkCommentStmt, nnkVarSection, nnkCall, nnkInfix}) + + var s = statement.copyNimTree() + if i + 1 != result[^1].len: + # Modify all but the last + if s.kind == nnkCall: + doAssert s[0].kind == nnkDotExpr, "Only method call syntax or infix syntax is supported in addition chains" + doAssert s[0][1].eqIdent"prod" or s[0][1].eqIdent"square" or s[0][1].eqIdent"square_repeated" + s.add newLit(true) + elif s.kind == nnkInfix: + doAssert s[0].eqIdent"*=" + # a *= b -> prod(a, a, b, true) + s = newCall( + bindSym"prod", + s[1], + s[1], + s[2], + newLit(true) + ) + + body.add s + + result[^1] = body + # echo result.toStrLit() \ No newline at end of file diff --git a/constantine/arithmetic/finite_fields_square_root.nim b/constantine/arithmetic/finite_fields_square_root.nim index c6c955f..5943a9a 100644 --- a/constantine/arithmetic/finite_fields_square_root.nim +++ b/constantine/arithmetic/finite_fields_square_root.nim @@ -194,13 +194,14 @@ func invsqrt_tonelli_shanks_pre( t.square(z) t *= a r = z - var b = t - var root = Fp.C.tonelliShanks(root_of_unity) + var b {.noInit.} = t + var root {.noInit.} = Fp.C.tonelliShanks(root_of_unity) var buf {.noInit.}: Fp for i in countdown(e, 2, 1): - b.square_repeated(i-2) + if i-2 >= 1: + b.square_repeated(i-2) let bNotOne = not b.isOne() buf.prod(r, root) diff --git a/constantine/arithmetic/limbs_montgomery.nim b/constantine/arithmetic/limbs_montgomery.nim index 92d39ee..df9e1db 100644 --- a/constantine/arithmetic/limbs_montgomery.nim +++ b/constantine/arithmetic/limbs_montgomery.nim @@ -57,15 +57,14 @@ func redc2xMont_CIOS[N: static int]( r: var array[N, SecretWord], a: array[N*2, SecretWord], M: array[N, SecretWord], - m0ninv: BaseType, skipReduction: static bool = false) = + m0ninv: BaseType, skipFinalSub: static bool = false) = ## Montgomery reduce a double-precision bigint modulo M ## ## This maps - ## - [0, 4p²) -> [0, 2p) with skipReduction + ## - [0, 4p²) -> [0, 2p) with skipFinalSub ## - [0, 4p²) -> [0, p) without ## - ## SkipReduction skips the final substraction step. - ## For skipReduction, M needs to have a spare bit in it's representation i.e. unused MSB. + ## skipFinalSub skips the final substraction step. # - Analyzing and Comparing Montgomery Multiplication Algorithms # Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr. # http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf @@ -119,7 +118,7 @@ func redc2xMont_CIOS[N: static int]( addC(carry, res[i], a[i+N], res[i], carry) # Final substraction - when not skipReduction: + when not skipFinalSub: discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M)) r = res @@ -127,15 +126,14 @@ func redc2xMont_Comba[N: static int]( r: var array[N, SecretWord], a: array[N*2, SecretWord], M: array[N, SecretWord], - m0ninv: BaseType, skipReduction: static bool = false) = + m0ninv: BaseType, skipFinalSub: static bool = false) = ## Montgomery reduce a double-precision bigint modulo M ## ## This maps - ## - [0, 4p²) -> [0, 2p) with skipReduction + ## - [0, 4p²) -> [0, 2p) with skipFinalSub ## - [0, 4p²) -> [0, p) without ## - ## SkipReduction skips the final substraction step. - ## For skipReduction, M needs to have a spare bit in it's representation i.e. unused MSB. + ## skipFinalSub skips the final substraction step. # We use Product Scanning / Comba multiplication var t, u, v = Zero var carry: Carry @@ -171,14 +169,14 @@ func redc2xMont_Comba[N: static int]( addC(carry, z[N-1], v, a[2*N-1], Carry(0)) # Final substraction - when not skipReduction: + when not skipFinalSub: discard z.csub(M, SecretBool(carry) or not(z < M)) r = z # Montgomery Multiplication # ------------------------------------------------------------ -func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) = +func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) = ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS) ## and no-carry optimization. ## This requires the most significant word of the Modulus @@ -186,10 +184,10 @@ func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipR ## https://hackmd.io/@gnark/modular_multiplication ## ## This maps - ## - [0, 2p) -> [0, 2p) with skipReduction + ## - [0, 2p) -> [0, 2p) with skipFinalSub ## - [0, 2p) -> [0, p) without ## - ## SkipReduction skips the final substraction step. + ## skipFinalSub skips the final substraction step. # We want all the computation to be kept in registers # hence we use a temporary `t`, hoping that the compiler does it. @@ -213,7 +211,7 @@ func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipR t[N-1] = C + A - when not skipReduction: + when not skipFinalSub: discard t.csub(M, not(t < M)) r = t @@ -265,15 +263,14 @@ func mulMont_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} = discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)ʷ r = t -func mulMont_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) = +func mulMont_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) = ## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS) ## ## This maps - ## - [0, 2p) -> [0, 2p) with skipReduction + ## - [0, 2p) -> [0, 2p) with skipFinalSub ## - [0, 2p) -> [0, p) without ## - ## SkipReduction skips the final substraction step. - ## For skipReduction, M needs to have a spare bit in it's representation i.e. unused MSB. + ## skipFinalSub skips the final substraction step. # - Architectural Enhancements for Montgomery # Multiplication on Embedded RISC Processors # Johann Großschädl and Guy-Armand Kamendje, 2003 @@ -306,7 +303,7 @@ func mulMont_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: u = t t = Zero - when not skipReduction: + when not skipFinalSub: discard z.csub(M, v.isNonZero() or not(z < M)) r = z @@ -386,39 +383,46 @@ func fromMont_CIOS(r: var Limbs, a, M: Limbs, m0ninv: BaseType) = # Exported API # ------------------------------------------------------------ +# Skipping reduction requires the modulus M <= R/4 +# On 64-bit R is the multiple of 2⁶⁴ immediately larger than M +# +# Montgomery Arithmetic from a Software Perspective +# Bos and Montgomery, 2017 +# https://eprint.iacr.org/2017/1057.pdf + # TODO upstream, using Limbs[N] breaks semcheck func redc2xMont*[N: static int]( r: var array[N, SecretWord], a: array[N*2, SecretWord], M: array[N, SecretWord], m0ninv: BaseType, - spareBits: static int, skipReduction: static bool = false) {.inline.} = + spareBits: static int, skipFinalSub: static bool = false) {.inline.} = ## Montgomery reduce a double-precision bigint modulo M - const skipReduction = skipReduction and spareBits >= 1 + const skipFinalSub = skipFinalSub and spareBits >= 2 when UseASM_X86_64 and r.len <= 6: # ADX implies BMI2 if ({.noSideEffect.}: hasAdx()): - redcMont_asm_adx(r, a, M, m0ninv, spareBits >= 1, skipReduction) + redcMont_asm_adx(r, a, M, m0ninv, spareBits >= 1, skipFinalSub) else: when r.len in {3..6}: - redcMont_asm(r, a, M, m0ninv, spareBits >= 1, skipReduction) + redcMont_asm(r, a, M, m0ninv, spareBits >= 1, skipFinalSub) else: - redc2xMont_CIOS(r, a, M, m0ninv, skipReduction) + redc2xMont_CIOS(r, a, M, m0ninv, skipFinalSub) # redc2xMont_Comba(r, a, M, m0ninv) elif UseASM_X86_64 and r.len in {3..6}: # TODO: Assembly faster than GCC but slower than Clang - redcMont_asm(r, a, M, m0ninv, spareBits >= 1, skipReduction) + redcMont_asm(r, a, M, m0ninv, spareBits >= 1, skipFinalSub) else: - redc2xMont_CIOS(r, a, M, m0ninv, skipReduction) - # redc2xMont_Comba(r, a, M, m0ninv, skipReduction) + redc2xMont_CIOS(r, a, M, m0ninv, skipFinalSub) + # redc2xMont_Comba(r, a, M, m0ninv, skipFinalSub) func mulMont*( r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, spareBits: static int, - skipReduction: static bool = false) {.inline.} = + skipFinalSub: static bool = false) {.inline.} = ## Compute r <- a*b (mod M) in the Montgomery domain ## `m0ninv` = -1/M (mod SecretWord). Our words are 2^32 or 2^64 ## @@ -438,36 +442,28 @@ func mulMont*( # i.e. c'R <- a'R b'R * R^-1 (mod M) in the natural domain # as in the Montgomery domain all numbers are scaled by R - # Many curve moduli are "Montgomery-friendly" which means that m0ninv is 1 - # This saves N basic type multiplication and potentially many register mov - # as well as unless using "mulx" instruction, x86 "mul" requires very specific registers. - # - # The implementation is visible from here, the compiler can make decision whether to: - # - specialize/duplicate code for m0ninv == 1 (especially if only 1 curve is needed) - # - keep it generic and optimize code size - - const skipReduction = skipReduction and spareBits >= 1 + const skipFinalSub = skipFinalSub and spareBits >= 2 when spareBits >= 1: when UseASM_X86_64 and a.len in {2 .. 6}: # TODO: handle spilling # ADX implies BMI2 if ({.noSideEffect.}: hasAdx()): - mulMont_CIOS_sparebit_asm_adx(r, a, b, M, m0ninv, skipReduction) + mulMont_CIOS_sparebit_asm_adx(r, a, b, M, m0ninv, skipFinalSub) else: - mulMont_CIOS_sparebit_asm(r, a, b, M, m0ninv, skipReduction) + mulMont_CIOS_sparebit_asm(r, a, b, M, m0ninv, skipFinalSub) else: - mulMont_CIOS_sparebit(r, a, b, M, m0ninv, skipReduction) + mulMont_CIOS_sparebit(r, a, b, M, m0ninv, skipFinalSub) else: - mulMont_FIPS(r, a, b, M, m0ninv, skipReduction) + mulMont_FIPS(r, a, b, M, m0ninv, skipFinalSub) func squareMont*[N](r: var Limbs[N], a, M: Limbs[N], m0ninv: BaseType, spareBits: static int, - skipReduction: static bool = false) {.inline.} = + skipFinalSub: static bool = false) {.inline.} = ## Compute r <- a^2 (mod M) in the Montgomery domain ## `m0ninv` = -1/M (mod SecretWord). Our words are 2^31 or 2^63 - const skipReduction = skipReduction and spareBits >= 1 + const skipFinalSub = skipFinalSub and spareBits >= 2 when UseASM_X86_64 and a.len in {4, 6}: # ADX implies BMI2 @@ -476,20 +472,20 @@ func squareMont*[N](r: var Limbs[N], a, M: Limbs[N], # which uses unfused squaring then Montgomery reduction # is slightly slower than fused Montgomery multiplication when spareBits >= 1: - mulMont_CIOS_sparebit_asm_adx(r, a, a, M, m0ninv, skipReduction) + mulMont_CIOS_sparebit_asm_adx(r, a, a, M, m0ninv, skipFinalSub) else: - squareMont_CIOS_asm_adx(r, a, M, m0ninv, spareBits >= 1, skipReduction) + squareMont_CIOS_asm_adx(r, a, M, m0ninv, spareBits >= 1, skipFinalSub) else: - squareMont_CIOS_asm(r, a, M, m0ninv, spareBits >= 1, skipReduction) + squareMont_CIOS_asm(r, a, M, m0ninv, spareBits >= 1, skipFinalSub) elif UseASM_X86_64: var r2x {.noInit.}: Limbs[2*N] r2x.square(a) - r.redc2xMont(r2x, M, m0ninv, spareBits, skipReduction) + r.redc2xMont(r2x, M, m0ninv, spareBits, skipFinalSub) else: - mulMont(r, a, a, M, m0ninv, spareBits, skipReduction) + mulMont(r, a, a, M, m0ninv, spareBits, skipFinalSub) func fromMont*(r: var Limbs, a, M: Limbs, - m0ninv: static BaseType, spareBits: static int) = + m0ninv: BaseType, spareBits: static int) = ## Transform a bigint ``a`` from it's Montgomery N-residue representation (mod N) ## to the regular natural representation (mod N) ## @@ -517,7 +513,7 @@ func fromMont*(r: var Limbs, a, M: Limbs, fromMont_CIOS(r, a, M, m0ninv) func getMont*(r: var Limbs, a, M, r2modM: Limbs, - m0ninv: static BaseType, spareBits: static int) = + m0ninv: BaseType, spareBits: static int) = ## Transform a bigint ``a`` from it's natural representation (mod N) ## to a the Montgomery n-residue representation ## @@ -580,7 +576,7 @@ func getWindowLen(bufLen: int): uint = func powMontPrologue( a: var Limbs, M, one: Limbs, - m0ninv: static BaseType, + m0ninv: BaseType, scratchspace: var openarray[Limbs], spareBits: static int ): uint = @@ -605,7 +601,7 @@ func powMontSquarings( a: var Limbs, exponent: openarray[byte], M: Limbs, - m0ninv: static BaseType, + m0ninv: BaseType, tmp: var Limbs, window: uint, acc, acc_len: var uint, @@ -654,7 +650,7 @@ func powMont*( a: var Limbs, exponent: openarray[byte], M, one: Limbs, - m0ninv: static BaseType, + m0ninv: BaseType, scratchspace: var openarray[Limbs], spareBits: static int ) = @@ -721,7 +717,7 @@ func powMontUnsafeExponent*( a: var Limbs, exponent: openarray[byte], M, one: Limbs, - m0ninv: static BaseType, + m0ninv: BaseType, scratchspace: var openarray[Limbs], spareBits: static int ) = diff --git a/constantine/curves/bls12_377_sqrt.nim b/constantine/curves/bls12_377_sqrt.nim index a68267a..3af0f4b 100644 --- a/constantine/curves/bls12_377_sqrt.nim +++ b/constantine/curves/bls12_377_sqrt.nim @@ -28,7 +28,7 @@ const func precompute_tonelli_shanks_addchain*( r: var Fp[BLS12_377], - a: Fp[BLS12_377]) = + a: Fp[BLS12_377]) {.addchain.} = ## Does a^BLS12_377_TonelliShanks_exponent ## via an addition-chain diff --git a/constantine/curves/bls12_381_sqrt.nim b/constantine/curves/bls12_381_sqrt.nim index 0269bf6..e1fda36 100644 --- a/constantine/curves/bls12_381_sqrt.nim +++ b/constantine/curves/bls12_381_sqrt.nim @@ -16,7 +16,7 @@ import # # ############################################################ -func invsqrt_addchain*(r: var Fp[BLS12_381], a: Fp[BLS12_381]) = +func invsqrt_addchain*(r: var Fp[BLS12_381], a: Fp[BLS12_381]) {.addchain.} = var x10 {.noInit.}: Fp[BLS12_381] x100 {.noInit.}: Fp[BLS12_381] diff --git a/constantine/curves/bn254_nogami_sqrt.nim b/constantine/curves/bn254_nogami_sqrt.nim index 4ae7a1e..3626e2d 100644 --- a/constantine/curves/bn254_nogami_sqrt.nim +++ b/constantine/curves/bn254_nogami_sqrt.nim @@ -16,7 +16,7 @@ import # # ############################################################ -func invsqrt_addchain*(r: var Fp[BN254_Nogami], a: Fp[BN254_Nogami]) = +func invsqrt_addchain*(r: var Fp[BN254_Nogami], a: Fp[BN254_Nogami]) {.addchain.} = var x10 {.noInit.}: Fp[BN254_Nogami] x11 {.noInit.}: Fp[BN254_Nogami] diff --git a/constantine/curves/bn254_snarks_sqrt.nim b/constantine/curves/bn254_snarks_sqrt.nim index f88021e..27bf4b3 100644 --- a/constantine/curves/bn254_snarks_sqrt.nim +++ b/constantine/curves/bn254_snarks_sqrt.nim @@ -16,7 +16,7 @@ import # # ############################################################ -func invsqrt_addchain*(r: var Fp[BN254_Snarks], a: Fp[BN254_Snarks]) = +func invsqrt_addchain*(r: var Fp[BN254_Snarks], a: Fp[BN254_Snarks]) {.addchain.} = var x10 {.noInit.}: Fp[BN254_Snarks] x11 {.noInit.}: Fp[BN254_Snarks] diff --git a/constantine/curves/bw6_761_sqrt.nim b/constantine/curves/bw6_761_sqrt.nim index b8e94a6..c5aeefd 100644 --- a/constantine/curves/bw6_761_sqrt.nim +++ b/constantine/curves/bw6_761_sqrt.nim @@ -16,7 +16,7 @@ import # # ############################################################ -func invsqrt_addchain*(r: var Fp[BW6_761], a: Fp[BW6_761]) = +func invsqrt_addchain*(r: var Fp[BW6_761], a: Fp[BW6_761]) {.addchain.} = var x10 {.noInit.}: Fp[BW6_761] x11 {.noInit.}: Fp[BW6_761] diff --git a/constantine/elliptic/ec_shortweierstrass_projective.nim b/constantine/elliptic/ec_shortweierstrass_projective.nim index 80c2bfc..5980fc4 100644 --- a/constantine/elliptic/ec_shortweierstrass_projective.nim +++ b/constantine/elliptic/ec_shortweierstrass_projective.nim @@ -220,7 +220,7 @@ func sum*[F; G: static Subgroup]( t4 *= SexticNonResidue x3.sum(P.x, P.z) # 14. X₃ <- X₁ + Z₁ y3.sum(Q.x, Q.z) # 15. Y₃ <- X₂ + Z₂ - x3 *= y3 # 16. X₃ <- X₃ Y₃ X₃ = (X₁Z₁)(X₂Z₂) + x3 *= y3 # 16. X₃ <- X₃ Y₃ X₃ = (X₁+Z₁)(X₂+Z₂) y3.sum(t0, t2) # 17. Y₃ <- t₀ + t₂ Y₃ = X₁ X₂ + Z₁ Z₂ y3.diff(x3, y3) # 18. Y₃ <- X₃ - Y₃ Y₃ = (X₁ + Z₁)(X₂ + Z₂) - (X₁ X₂ + Z₁ Z₂) = X₁Z₂ + X₂Z₁ when G == G2 and F.C.getSexticTwist() == D_Twist: diff --git a/docs/optimizations.md b/docs/optimizations.md index 5741a7f..5fb2ae2 100644 --- a/docs/optimizations.md +++ b/docs/optimizations.md @@ -84,7 +84,7 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n - [ ] NAF recoding - [ ] windowed-NAF recoding - [ ] SIMD vectorized select in window algorithm - - [ ] Montgomery Multiplication with no final substraction, + - [x] Montgomery Multiplication with no final substraction, - Bos and Montgomery, https://eprint.iacr.org/2017/1057.pdf - Colin D Walter, https://colinandmargaret.co.uk/Research/CDW_ELL_99.pdf - Hachez and Quisquater, https://link.springer.com/content/pdf/10.1007%2F3-540-44499-8_23.pdf