From aefd40f455e527e7cb11fdfd31c22c28d43e9498 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Sat, 20 Feb 2021 13:18:49 +0100 Subject: [PATCH] Square ADX (#160) * Add MULX/ADOX/ADCX assembly for squaring 4 limbs * Add squarings for 6 limbs * Use the new square assembly where relevant * Fix 32-bit register name and calling convention * typo * Disable MontRed ASM for 2 limbs or less --- .../assembly/limbs_asm_montmul_x86.nim | 37 +- .../limbs_asm_montmul_x86_adx_bmi2.nim | 37 +- .../assembly/limbs_asm_montred_x86.nim | 106 ++- .../limbs_asm_montred_x86_adx_bmi2.nim | 10 +- .../arithmetic/assembly/limbs_asm_mul_x86.nim | 4 +- .../assembly/limbs_asm_mul_x86_adx_bmi2.nim | 652 ++++++++++++++---- constantine/arithmetic/limbs_extmul.nim | 8 +- constantine/arithmetic/limbs_montgomery.nim | 432 +++++------- constantine/config/curves_prop_derived.nim | 2 +- .../primitives/macro_assembler_x86.nim | 26 +- tests/t_finite_fields_double_precision.nim | 5 +- .../t_finite_fields_double_precision.nim.cfg | 1 + tests/t_finite_fields_mulsquare.nim | 33 +- tests/t_finite_fields_mulsquare.nim.cfg | 1 + 14 files changed, 932 insertions(+), 422 deletions(-) create mode 100644 tests/t_finite_fields_double_precision.nim.cfg diff --git a/constantine/arithmetic/assembly/limbs_asm_montmul_x86.nim b/constantine/arithmetic/assembly/limbs_asm_montmul_x86.nim index 0af7fba..3a656e2 100644 --- a/constantine/arithmetic/assembly/limbs_asm_montmul_x86.nim +++ b/constantine/arithmetic/assembly/limbs_asm_montmul_x86.nim @@ -12,7 +12,8 @@ import # Internal ../../config/common, ../../primitives, - ./limbs_asm_montred_x86 + ./limbs_asm_montred_x86, + ./limbs_asm_mul_x86 # ############################################################ # @@ -176,3 +177,37 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_ func montMul_CIOS_nocarry_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) = ## Constant-time modular multiplication montMul_CIOS_nocarry_gen(r, a, b, M, m0ninv) + +# Montgomery Squaring +# ------------------------------------------------------------ + +func square_asm_inline[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) {.inline.} = + ## Multi-precision Squaring + ## Assumes r doesn't alias a + ## Extra indirection as the generator assumes that + ## arrays are pointers, which is true for parameters + ## but not for stack variables + sqr_gen(r, a) + +func montRed_asm_inline[N: static int]( + r: var array[N, SecretWord], + a: array[N*2, SecretWord], + M: array[N, SecretWord], + m0ninv: BaseType, + hasSpareBit: static bool + ) {.inline.} = + ## Constant-time Montgomery reduction + ## Extra indirection as the generator assumes that + ## arrays are pointers, which is true for parameters + ## but not for stack variables + montyRedc2x_gen(r, a, M, m0ninv, hasSpareBit) + +func montSquare_CIOS_asm*[N]( + r: var Limbs[N], + a, M: Limbs[N], + m0ninv: BaseType, + hasSpareBit: static bool) = + ## Constant-time modular squaring + var r2x {.noInit.}: Limbs[2*N] + r2x.square_asm_inline(a) + r.montRed_asm_inline(r2x, M, m0ninv, hasSpareBit) diff --git a/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim b/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim index c32dc90..be09592 100644 --- a/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim +++ b/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim @@ -12,7 +12,9 @@ import # Internal ../../config/common, ../../primitives, - ./limbs_asm_montred_x86 + ./limbs_asm_montred_x86, + ./limbs_asm_montred_x86_adx_bmi2, + ./limbs_asm_mul_x86_adx_bmi2 # ############################################################ # @@ -271,3 +273,36 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM, func montMul_CIOS_nocarry_asm_adx_bmi2*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) = ## Constant-time modular multiplication montMul_CIOS_nocarry_adx_bmi2_gen(r, a, b, M, m0ninv) + +# Montgomery Squaring +# ------------------------------------------------------------ + +func square_asm_adx_bmi2_inline[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) {.inline.} = + ## Multi-precision Squaring + ## Extra indirection as the generator assumes that + ## arrays are pointers, which is true for parameters + ## but not for stack variables. + sqrx_gen(r, a) + +func montRed_asm_adx_bmi2_inline[N: static int]( + r: var array[N, SecretWord], + a: array[N*2, SecretWord], + M: array[N, SecretWord], + m0ninv: BaseType, + hasSpareBit: static bool + ) {.inline.} = + ## Constant-time Montgomery reduction + ## Extra indirection as the generator assumes that + ## arrays are pointers, which is true for parameters + ## but not for stack variables. + montyRedc2x_adx_gen(r, a, M, m0ninv, hasSpareBit) + +func montSquare_CIOS_asm_adx_bmi2*[N]( + r: var Limbs[N], + a, M: Limbs[N], + m0ninv: BaseType, + hasSpareBit: static bool) = + ## Constant-time modular squaring + var r2x {.noInit.}: Limbs[2*N] + r2x.square_asm_adx_bmi2_inline(a) + r.montRed_asm_adx_bmi2_inline(r2x, M, m0ninv, hasSpareBit) diff --git a/constantine/arithmetic/assembly/limbs_asm_montred_x86.nim b/constantine/arithmetic/assembly/limbs_asm_montred_x86.nim index e8da3cc..e35b2ca 100644 --- a/constantine/arithmetic/assembly/limbs_asm_montred_x86.nim +++ b/constantine/arithmetic/assembly/limbs_asm_montred_x86.nim @@ -60,10 +60,13 @@ proc finalSubCanOverflow*( ## `overflowReg` should be a register that will be used ## to store the carry flag + ctx.comment "Final substraction (may carry)" + + # Mask: overflowed contains 0xFFFF or 0x0000 ctx.sbb overflowReg, overflowReg + # Now substract the modulus to test a < p let N = M.len - ctx.comment "Final substraction (may carry)" for i in 0 ..< N: ctx.mov scratch[i], a[i] if i == 0: @@ -71,6 +74,8 @@ proc finalSubCanOverflow*( else: ctx.sbb scratch[i], M[i] + # If it overflows here, it means that it was + # smaller than the modulus and we don't need `scratch` ctx.sbb overflowReg, 0 # If we borrowed it means that we were smaller than @@ -83,12 +88,12 @@ proc finalSubCanOverflow*( # Montgomery reduction # ------------------------------------------------------------ -macro montyRedc2x_gen[N: static int]( +macro montyRedc2x_gen*[N: static int]( r_MR: var array[N, SecretWord], a_MR: array[N*2, SecretWord], M_MR: array[N, SecretWord], m0ninv_MR: BaseType, - spareBits: static int + hasSpareBit: static bool ) = result = newStmtList() @@ -137,7 +142,8 @@ macro montyRedc2x_gen[N: static int]( # r -= M # No register spilling handling - doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs." + doAssert N > 2, "The Assembly-optimized montgomery reduction requires a minimum of 2 limbs." + doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs." for i in 0 ..< N: ctx.mov u[i], a[i] @@ -205,7 +211,7 @@ macro montyRedc2x_gen[N: static int]( let t = repackRegisters(v, u[N], u[N+1]) # v is invalidated - if spareBits >= 1: + if hasSpareBit: ctx.finalSubNoCarry(r, u, M, t) else: ctx.finalSubCanOverflow(r, u, M, t, rax) @@ -218,7 +224,93 @@ func montRed_asm*[N: static int]( a: array[N*2, SecretWord], M: array[N, SecretWord], m0ninv: BaseType, - spareBits: static int + hasSpareBit: static bool ) = ## Constant-time Montgomery reduction - montyRedc2x_gen(r, a, M, m0ninv, spareBits) + static: doAssert UseASM_X86_64, "This requires x86-64." + montyRedc2x_gen(r, a, M, m0ninv, hasSpareBit) + +# Sanity checks +# ---------------------------------------------------------- + +when isMainModule: + import + ../../config/[type_bigint, common], + ../../arithmetic/limbs + + type SW = SecretWord + + # TODO: Properly handle low number of limbs + + func montyRedc2x_Comba[N: static int]( + r: var array[N, SecretWord], + a: array[N*2, SecretWord], + M: array[N, SecretWord], + m0ninv: BaseType) = + ## Montgomery reduce a double-precision bigint modulo M + # We use Product Scanning / Comba multiplication + var t, u, v = Zero + var carry: Carry + var z: typeof(r) # zero-init, ensure on stack and removes in-place problems in tower fields + staticFor i, 0, N: + staticFor j, 0, i: + mulAcc(t, u, v, z[j], M[i-j]) + + addC(carry, v, v, a[i], Carry(0)) + addC(carry, u, u, Zero, carry) + addC(carry, t, t, Zero, carry) + + z[i] = v * SecretWord(m0ninv) + mulAcc(t, u, v, z[i], M[0]) + v = u + u = t + t = Zero + + staticFor i, N, 2*N-1: + staticFor j, i-N+1, N: + mulAcc(t, u, v, z[j], M[i-j]) + + addC(carry, v, v, a[i], Carry(0)) + addC(carry, u, u, Zero, carry) + addC(carry, t, t, Zero, carry) + + z[i-N] = v + + v = u + u = t + t = Zero + + addC(carry, z[N-1], v, a[2*N-1], Carry(0)) + + # Final substraction + discard z.csub(M, SecretBool(carry) or not(z < M)) + r = z + + + proc main2L() = + let M = [SW 0xFFFFFFFF_FFFFFFFF'u64, SW 0x7FFFFFFF_FFFFFFFF'u64] + + # a² + let adbl_sqr = [SW 0xFF677F6000000001'u64, SW 0xD79897153FA818FD'u64, SW 0x68BFF63DE35C5451'u64, SW 0x2D243FE4B480041F'u64] + # (-a)² + let nadbl_sqr = [SW 0xFECEFEC000000004'u64, SW 0xAE9896D43FA818FB'u64, SW 0x690C368DE35C5450'u64, SW 0x01A4400534800420'u64] + + var a_sqr{.noInit.}, na_sqr{.noInit.}: Limbs[2] + var a_sqr_comba{.noInit.}, na_sqr_comba{.noInit.}: Limbs[2] + + a_sqr.montRed_asm(adbl_sqr, M, 1, hasSpareBit = false) + na_sqr.montRed_asm(nadbl_sqr, M, 1, hasSpareBit = false) + a_sqr_comba.montyRedc2x_Comba(adbl_sqr, M, 1) + na_sqr_comba.montyRedc2x_Comba(nadbl_sqr, M, 1) + + debugecho "--------------------------------" + debugecho "after:" + debugecho " a_sqr: ", a_sqr.toString() + debugecho " na_sqr: ", na_sqr.toString() + debugecho " a_sqr_comba: ", a_sqr_comba.toString() + debugecho " na_sqr_comba: ", na_sqr_comba.toString() + + doAssert bool(a_sqr == na_sqr) + doAssert bool(a_sqr == a_sqr_comba) + + main2L() diff --git a/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim b/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim index bd4bf68..f2ac174 100644 --- a/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim +++ b/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim @@ -35,12 +35,12 @@ static: doAssert UseASM_X86_64 # Montgomery reduction # ------------------------------------------------------------ -macro montyRedc2x_gen[N: static int]( +macro montyRedc2x_adx_gen*[N: static int]( r_MR: var array[N, SecretWord], a_MR: array[N*2, SecretWord], M_MR: array[N, SecretWord], m0ninv_MR: BaseType, - spareBits: static int + hasSpareBit: static bool ) = result = newStmtList() @@ -132,7 +132,7 @@ macro montyRedc2x_gen[N: static int]( let t = repackRegisters(v, u[N]) - if spareBits >= 1: + if hasSpareBit: ctx.finalSubNoCarry(r, u, M, t) else: ctx.finalSubCanOverflow(r, u, M, t, hi) @@ -145,7 +145,7 @@ func montRed_asm_adx_bmi2*[N: static int]( a: array[N*2, SecretWord], M: array[N, SecretWord], m0ninv: BaseType, - spareBits: static int + hasSpareBit: static bool ) = ## Constant-time Montgomery reduction - montyRedc2x_gen(r, a, M, m0ninv, spareBits) + montyRedc2x_adx_gen(r, a, M, m0ninv, hasSpareBit) diff --git a/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim b/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim index 0863155..28d3def 100644 --- a/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim +++ b/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim @@ -133,7 +133,7 @@ func mul_asm*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], # Squaring # ----------------------------------------------------------------------------------------------- -macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) = +macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) = ## Comba squaring generator ## `a` and `r` can have a different number of limbs ## if `r`.limbs.len < a.limbs.len * 2 @@ -240,4 +240,4 @@ macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) = func square_asm*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) = ## Multi-precision Squaring ## Assumes r doesn't alias a - square_gen(r, a) + sqr_gen(r, a) diff --git a/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim b/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim index 7f91b27..3a7fde1 100644 --- a/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim +++ b/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim @@ -40,8 +40,10 @@ proc mulx_by_word( word0: Operand ) = ## Multiply the `a[0..= M: + # r -= M + # + # Important note: `a[i+n] += C` should propagate the carry + # to the higher limb if any, thank you "implementation detail" + # missing from paper. + + var a = a # Copy "t" for mutation and ensure on stack + var res: typeof(r) # Accumulator + staticFor i, 0, N: + var C = Zero + let m = a[i] * SecretWord(m0ninv) + staticFor j, 0, N: + muladd2(C, a[i+j], m, M[j], a[i+j], C) + res[i] = C + + # This does t[i+n] += C + # but in a separate carry chain, fused with the + # copy "r[i] = t[i+n]" + var carry = Carry(0) + staticFor i, 0, N: + addC(carry, res[i], a[i+N], res[i], carry) + + # Final substraction + discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M)) + r = res + +func montyRedc2x_Comba[N: static int]( + r: var array[N, SecretWord], + a: array[N*2, SecretWord], + M: array[N, SecretWord], + m0ninv: BaseType) = + ## Montgomery reduce a double-precision bigint modulo M + # We use Product Scanning / Comba multiplication + var t, u, v = Zero + var carry: Carry + var z: typeof(r) # zero-init, ensure on stack and removes in-place problems in tower fields + staticFor i, 0, N: + staticFor j, 0, i: + mulAcc(t, u, v, z[j], M[i-j]) + + addC(carry, v, v, a[i], Carry(0)) + addC(carry, u, u, Zero, carry) + addC(carry, t, t, Zero, carry) + + z[i] = v * SecretWord(m0ninv) + mulAcc(t, u, v, z[i], M[0]) + v = u + u = t + t = Zero + + staticFor i, N, 2*N-1: + staticFor j, i-N+1, N: + mulAcc(t, u, v, z[j], M[i-j]) + + addC(carry, v, v, a[i], Carry(0)) + addC(carry, u, u, Zero, carry) + addC(carry, t, t, Zero, carry) + + z[i-N] = v + + v = u + u = t + t = Zero + + addC(carry, z[N-1], v, a[2*N-1], Carry(0)) + + # Final substraction + discard z.csub(M, SecretBool(carry) or not(z < M)) + r = z + # Montgomery Multiplication # ------------------------------------------------------------ @@ -172,224 +280,66 @@ func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) = r = z # Montgomery Squaring -# ------------------------------------------------------------ - -func montySquare_CIOS_nocarry(r: var Limbs, a, M: Limbs, m0ninv: BaseType) {.used.}= - ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS) - ## and no-carry optimization. - ## This requires the most significant word of the Modulus - ## M[^1] < high(SecretWord) shr 2 (i.e. less than 0b00111...1111) - ## https://hackmd.io/@zkteam/modular_multiplication - - # TODO: Deactivated - # Off-by one on 32-bit on the least significant bit - # for Fp[BLS12-381] with inputs - # - -0x091F02EFA1C9B99C004329E94CD3C6B308164CBE02037333D78B6C10415286F7C51B5CD7F917F77B25667AB083314B1B - # - -0x0B7C8AFE5D43E9A973AF8649AD8C733B97D06A78CFACD214CBE9946663C3F682362E0605BC8318714305B249B505AFD9 - - # We want all the computation to be kept in registers - # hence we use a temporary `t`, hoping that the compiler does it. - var t: typeof(M) # zero-init - const N = t.len - staticFor i, 0, N: - # Squaring - var - A1: Carry - A0: SecretWord - # (A0, t[i]) <- a[i] * a[i] + t[i] - muladd1(A0, t[i], a[i], a[i], t[i]) - staticFor j, i+1, N: - # (A1, A0, t[j]) <- 2*a[j]*a[i] + t[j] + (A1, A0) - # 2*a[j]*a[i] can spill 1-bit on a 3rd word - mulDoubleAdd2(A1, A0, t[j], a[j], a[i], t[j], A1, A0) - - # Reduction - # m <- (t[0] * m0ninv) mod 2^w - # (C, _) <- m * M[0] + t[0] - let m = t[0] * SecretWord(m0ninv) - var C, lo: SecretWord - muladd1(C, lo, m, M[0], t[0]) - staticFor j, 1, N: - # (C, t[j-1]) <- m*M[j] + t[j] + C - muladd2(C, t[j-1], m, M[j], t[j], C) - - t[N-1] = C + A0 - - discard t.csub(M, not(t < M)) - r = t - -func montySquare_CIOS(r: var Limbs, a, M: Limbs, m0ninv: BaseType) {.used.}= - ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS) - ## - ## Architectural Support for Long Integer Modulo Arithmetic on Risc-Based Smart Cards - ## Johann Großschädl, 2003 - ## https://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=95950BAC26A728114431C0C7B425E022?doi=10.1.1.115.3276&rep=rep1&type=pdf - ## - ## Analyzing and Comparing Montgomery Multiplication Algorithms - ## Koc, Acar, Kaliski, 1996 - ## https://www.semanticscholar.org/paper/Analyzing-and-comparing-Montgomery-multiplication-Ko%C3%A7-Acar/5e3941ff482ec3ee41dc53c3298f0be085c69483 - - # TODO: Deactivated - # Off-by one on 32-bit on the least significant bit - # for Fp[2^127 - 1] with inputs - # - -0x75bfffefbfffffff7fd9dfd800000000 - # - -0x7ff7ffffffffffff1dfb7fafc0000000 - # Squaring the number and its opposite - # should give the same result, but those are off-by-one - - # We want all the computation to be kept in registers - # hence we use a temporary `t`, hoping that the compiler does it. - var t: typeof(M) # zero-init - const N = t.len - # Extra words to handle up to 2 carries t[N] and t[N+1] - var tNp1: SecretWord - var tN: SecretWord - - staticFor i, 0, N: - # Squaring - var A1 = Carry(0) - var A0: SecretWord - # (A0, t[i]) <- a[i] * a[i] + t[i] - muladd1(A0, t[i], a[i], a[i], t[i]) - staticFor j, i+1, N: - # (A1, A0, t[j]) <- 2*a[j]*a[i] + t[j] + (A1, A0) - # 2*a[j]*a[i] can spill 1-bit on a 3rd word - mulDoubleAdd2(A1, A0, t[j], a[j], a[i], t[j], A1, A0) - - var carryS: Carry - addC(carryS, tN, tN, A0, Carry(0)) - addC(carryS, tNp1, SecretWord(A1), Zero, carryS) - - # Reduction - # m <- (t[0] * m0ninv) mod 2^w - # (C, _) <- m * M[0] + t[0] - var C, lo: SecretWord - let m = t[0] * SecretWord(m0ninv) - muladd1(C, lo, m, M[0], t[0]) - staticFor j, 1, N: - # (C, t[j-1]) <- m*M[j] + t[j] + C - muladd2(C, t[j-1], m, M[j], t[j], C) - - # (C,t[N-1]) <- t[N] + C - # (_, t[N]) <- t[N+1] + C - var carryR: Carry - addC(carryR, t[N-1], tN, C, Carry(0)) - addC(carryR, tN, tNp1, Zero, carryR) - - discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)^w - r = t - -# Montgomery Reduction -# ------------------------------------------------------------ -func montyRedc2x_CIOS[N: static int]( - r: var array[N, SecretWord], - a: array[N*2, SecretWord], - M: array[N, SecretWord], - m0ninv: BaseType) = - ## Montgomery reduce a double-precision bigint modulo M - # - Analyzing and Comparing Montgomery Multiplication Algorithms - # Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr. - # http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf - # - # - Arithmetic of Finite Fields - # Chapter 5 of Guide to Pairing-Based Cryptography - # Jean Luc Beuchat, Luis J. Dominguez Perez, Sylvain Duquesne, Nadia El Mrabet, Laura Fuentes-Castañeda, Francisco Rodríguez-Henríquez, 2017 - # https://www.researchgate.net/publication/319538235_Arithmetic_of_Finite_Fields - # - # Algorithm - # Inputs: - # - N number of limbs - # - a[0 ..< 2N] (double-precision input to reduce) - # - M[0 ..< N] The field modulus (must be odd for Montgomery reduction) - # - m0ninv: Montgomery Reduction magic number = -1/M[0] - # Output: - # - r[0 ..< N], in the Montgomery domain - # Parameters: - # - w, the word width usually 64 on 64-bit platforms or 32 on 32-bit - # - # for i in 0 .. n-1: - # C <- 0 - # m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication) - # for j in 0 .. n-1: - # (C, S) <- a[i+j] + m * M[j] + C - # a[i+j] <- S - # a[i+n] += C - # for i in 0 .. n-1: - # r[i] = a[i+n] - # if r >= M: - # r -= M - # - # Important note: `a[i+n] += C` should propagate the carry - # to the higher limb if any, thank you "implementation detail" - # missing from paper. - - var a = a # Copy "t" for mutation and ensure on stack - var res: typeof(r) # Accumulator - staticFor i, 0, N: - var C = Zero - let m = a[i] * SecretWord(m0ninv) - staticFor j, 0, N: - muladd2(C, a[i+j], m, M[j], a[i+j], C) - res[i] = C - - # This does t[i+n] += C - # but in a separate carry chain, fused with the - # copy "r[i] = t[i+n]" - var carry = Carry(0) - staticFor i, 0, N: - addC(carry, res[i], a[i+N], res[i], carry) - - # Final substraction - discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M)) - r = res - -func montyRedc2x_Comba[N: static int]( - r: var array[N, SecretWord], - a: array[N*2, SecretWord], - M: array[N, SecretWord], - m0ninv: BaseType) = - ## Montgomery reduce a double-precision bigint modulo M - # We use Product Scanning / Comba multiplication - var t, u, v = Zero - var carry: Carry - var z: typeof(r) # zero-init, ensure on stack and removes in-place problems in tower fields - staticFor i, 0, N: - staticFor j, 0, i: - mulAcc(t, u, v, z[j], M[i-j]) - - addC(carry, v, v, a[i], Carry(0)) - addC(carry, u, u, Zero, carry) - addC(carry, t, t, Zero, carry) - - z[i] = v * SecretWord(m0ninv) - mulAcc(t, u, v, z[i], M[0]) - v = u - u = t - t = Zero - - staticFor i, N, 2*N-1: - staticFor j, i-N+1, N: - mulAcc(t, u, v, z[j], M[i-j]) - - addC(carry, v, v, a[i], Carry(0)) - addC(carry, u, u, Zero, carry) - addC(carry, t, t, Zero, carry) - - z[i-N] = v - - v = u - u = t - t = Zero - - addC(carry, z[N-1], v, a[2*N-1], Carry(0)) - - # Final substraction - discard z.csub(M, SecretBool(carry) or not(z < M)) - r = z +# -------------------------------------------------------------------------------------------------------------------- +# +# There are Montgomery squaring multiplications mentioned in the litterature +# - https://hackmd.io/@zkteam/modular_multiplication if M[^1] < high(SecretWord) shr 2 (i.e. less than 0b00111...1111) +# - Architectural Support for Long Integer Modulo Arithmetic on Risc-Based Smart Cards +# Johann Großschädl, 2003 +# https://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=95950BAC26A728114431C0C7B425E022?doi=10.1.1.115.3276&rep=rep1&type=pdf +# - Analyzing and Comparing Montgomery Multiplication Algorithms +# Koc, Acar, Kaliski, 1996 +# https://www.semanticscholar.org/paper/Analyzing-and-comparing-Montgomery-multiplication-Ko%C3%A7-Acar/5e3941ff482ec3ee41dc53c3298f0be085c69483 +# +# However fuzzing the implementation showed off-by-one on certain inputs especially in 32-bit mode +# +# for Fp[BLS12-381] on 32 bit with inputs +# - 0x091F02EFA1C9B99C004329E94CD3C6B308164CBE02037333D78B6C10415286F7C51B5CD7F917F77B25667AB083314B1B +# - 0x0B7C8AFE5D43E9A973AF8649AD8C733B97D06A78CFACD214CBE9946663C3F682362E0605BC8318714305B249B505AFD9 +# for Consensys/zkteam algorithm (off by one in least significant bit) +# +# for Fp[2^127 - 1] with inputs +# - -0x75bfffefbfffffff7fd9dfd800000000 +# - -0x7ff7ffffffffffff1dfb7fafc0000000 +# Squaring the number and its opposite +# should give the same result, but those are off-by-one +# with Großschädl algorithm +# +# I suspect either I did a twice the same mistake when translating 2 different algorithms +# or there is a carry propagation constraint that prevents interleaving squaring +# and Montgomery reduction in the following loops +# for i in 0 ..< N: +# for j in i+1 ..< N: # <-- squaring, notice that we start at i+1 but carries from below may still impact us. +# ... +# for j in 1 ..< N: # <- Montgomery reduce. # Exported API # ------------------------------------------------------------ +# TODO upstream, using Limbs[N] breaks semcheck +func montyRedc2x*[N: static int]( + r: var array[N, SecretWord], + a: array[N*2, SecretWord], + M: array[N, SecretWord], + m0ninv: BaseType, spareBits: static int) {.inline.} = + ## Montgomery reduce a double-precision bigint modulo M + when UseASM_X86_64 and r.len <= 6: + # ADX implies BMI2 + if ({.noSideEffect.}: hasAdx()): + montRed_asm_adx_bmi2(r, a, M, m0ninv, spareBits >= 1) + else: + when r.len in {3..6}: + montRed_asm(r, a, M, m0ninv, spareBits >= 1) + else: + montyRedc2x_CIOS(r, a, M, m0ninv) + # montyRedc2x_Comba(r, a, M, m0ninv) + elif UseASM_X86_64 and r.len in {3..6}: + # TODO: Assembly faster than GCC but slower than Clang + montRed_asm(r, a, M, m0ninv, spareBits >= 1) + else: + montyRedc2x_CIOS(r, a, M, m0ninv) + # montyRedc2x_Comba(r, a, M, m0ninv) + func montyMul*( r: var Limbs, a, b, M: Limbs, m0ninv: static BaseType, spareBits: static int) {.inline.} = @@ -431,53 +381,27 @@ func montyMul*( else: montyMul_FIPS(r, a, b, M, m0ninv) -func montySquare*(r: var Limbs, a, M: Limbs, +func montySquare*[N](r: var Limbs[N], a, M: Limbs[N], m0ninv: static BaseType, spareBits: static int) {.inline.} = ## Compute r <- a^2 (mod M) in the Montgomery domain ## `m0ninv` = -1/M (mod SecretWord). Our words are 2^31 or 2^63 - # TODO: needs optimization similar to multiplication - montyMul(r, a, a, M, m0ninv, spareBits) - - # when spareBits >= 2: - # # TODO: Deactivated - # # Off-by one on 32-bit on the least significant bit - # # for Fp[BLS12-381] with inputs - # # - -0x091F02EFA1C9B99C004329E94CD3C6B308164CBE02037333D78B6C10415286F7C51B5CD7F917F77B25667AB083314B1B - # # - -0x0B7C8AFE5D43E9A973AF8649AD8C733B97D06A78CFACD214CBE9946663C3F682362E0605BC8318714305B249B505AFD9 - # - # # montySquare_CIOS_nocarry(r, a, M, m0ninv) - # montyMul_CIOS_nocarry(r, a, a, M, m0ninv) - # else: - # # TODO: Deactivated - # # Off-by one on 32-bit for Fp[2^127 - 1] with inputs - # # - -0x75bfffefbfffffff7fd9dfd800000000 - # # - -0x7ff7ffffffffffff1dfb7fafc0000000 - # # Squaring the number and its opposite - # # should give the same result, but those are off-by-one - # - # # montySquare_CIOS(r, a, M, m0ninv) # TODO <--- Fix this - # montyMul_FIPS(r, a, a, M, m0ninv) - -# TODO upstream, using Limbs[N] breaks semcheck -func montyRedc2x*[N: static int]( - r: var array[N, SecretWord], - a: array[N*2, SecretWord], - M: array[N, SecretWord], - m0ninv: BaseType, spareBits: static int) {.inline.} = - ## Montgomery reduce a double-precision bigint modulo M - when UseASM_X86_64 and r.len <= 6: + when UseASM_X86_64 and a.len in {4, 6}: # ADX implies BMI2 if ({.noSideEffect.}: hasAdx()): - montRed_asm_adx_bmi2(r, a, M, m0ninv, spareBits) + # With ADX and spare bit, montSquare_CIOS_asm_adx_bmi2 + # which uses unfused squaring then Montgomery reduction + # is slightly slower than fused Montgomery multiplication + when spareBits >= 1: + montMul_CIOS_nocarry_asm_adx_bmi2(r, a, a, M, m0ninv) + else: + montSquare_CIOS_asm_adx_bmi2(r, a, M, m0ninv, spareBits >= 1) else: - montRed_asm(r, a, M, m0ninv, spareBits) - elif UseASM_X86_32 and r.len <= 6: - # TODO: Assembly faster than GCC but slower than Clang - montRed_asm(r, a, M, m0ninv, spareBits) + montSquare_CIOS_asm(r, a, M, m0ninv, spareBits >= 1) else: - montyRedc2x_CIOS(r, a, M, m0ninv) - # montyRedc2x_Comba(r, a, M, m0ninv) + var r2x {.noInit.}: Limbs[2*N] + r2x.square(a) + r.montyRedc2x(r2x, M, m0ninv, spareBits) func redc*(r: var Limbs, a, one, M: Limbs, m0ninv: static BaseType, spareBits: static int) = diff --git a/constantine/config/curves_prop_derived.nim b/constantine/config/curves_prop_derived.nim index 1bac65f..f5237a8 100644 --- a/constantine/config/curves_prop_derived.nim +++ b/constantine/config/curves_prop_derived.nim @@ -125,7 +125,7 @@ macro debugConsts(): untyped {.used.} = for i in 1 ..< E.len: let curve = E[i] let curveName = $curve - let modulus = bindSym(curveName & "_Fp_Modulus") + let modulus = bindSym(curveName & "_Modulus") let r2modp = bindSym(curveName & "_Fp_R2modP") let negInvModWord = bindSym(curveName & "_Fp_NegInvModWord") diff --git a/constantine/primitives/macro_assembler_x86.nim b/constantine/primitives/macro_assembler_x86.nim index 8ba1421..d2aefbd 100644 --- a/constantine/primitives/macro_assembler_x86.nim +++ b/constantine/primitives/macro_assembler_x86.nim @@ -42,9 +42,24 @@ type # Clobbered register ClobberedReg - Register* = enum - rbx, rdx, r8, rax, xmm0 +when sizeof(int) == 8 and not defined(Constantine32): + type + Register* = enum + rbx + rdx + r8 + rax + xmm0 +else: + type + Register* = enum + rbx = "ebx" + rdx = "edx" + r8 = "r8d" + rax = "eax" + xmm0 +type Constraint* = enum ## GCC extended assembly modifier Input = "" @@ -117,6 +132,9 @@ func len*(opArray: Operand): int = func rotateLeft*(opArray: var OperandArray) = opArray.buf.rotateLeft(1) +func rotateRight*(opArray: var OperandArray) = + opArray.buf.rotateLeft(-1) + proc `[]`*(opArray: OperandArray, index: int): Operand = opArray.buf[index] @@ -941,7 +959,7 @@ func mulx*(a: var Assembler_x86, dHi, dLo: Register, src0: Operand, src1: Regist a.regClobbers.incl dHi a.regClobbers.incl dLo -func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse|Register) = +func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse|Register, src: Operand|OperandReuse|Register) = ## Does: dst <- dst + src + carry ## and only sets the carry flag when dst is Operand: @@ -950,7 +968,7 @@ func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|Operand a.codeFragment("adcx", src, dst) a.areFlagsClobbered = true -func adox*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse|Register) = +func adox*(a: var Assembler_x86, dst: Operand|OperandReuse|Register, src: Operand|OperandReuse|Register) = ## Does: dst <- dst + src + overflow ## and only sets the overflow flag when dst is Operand: diff --git a/tests/t_finite_fields_double_precision.nim b/tests/t_finite_fields_double_precision.nim index 60eaf5b..4ad5790 100644 --- a/tests/t_finite_fields_double_precision.nim +++ b/tests/t_finite_fields_double_precision.nim @@ -109,7 +109,10 @@ template sqrTest(rng_gen: untyped): untyped = mulDbl.prod2x(a, a) sqrDbl.square2x(a) - doAssert bool(mulDbl == sqrDbl) + doAssert bool(mulDbl == sqrDbl), + "\nOriginal: " & a.mres.limbs.toString() & + "\n Mul: " & mulDbl.limbs2x.toString() & + "\n Sqr: " & sqrDbl.limbs2x.toString() addsubnegTest(random_unsafe) addsubnegTest(randomHighHammingWeight) diff --git a/tests/t_finite_fields_double_precision.nim.cfg b/tests/t_finite_fields_double_precision.nim.cfg new file mode 100644 index 0000000..dd68656 --- /dev/null +++ b/tests/t_finite_fields_double_precision.nim.cfg @@ -0,0 +1 @@ +-d:debugConstantine diff --git a/tests/t_finite_fields_mulsquare.nim b/tests/t_finite_fields_mulsquare.nim index 722cef4..6edd470 100644 --- a/tests/t_finite_fields_mulsquare.nim +++ b/tests/t_finite_fields_mulsquare.nim @@ -145,7 +145,6 @@ suite "Random Modular Squaring is consistent with Modular Multiplication" & " [" random_long01Seq(P224) test "Random squaring mod P-256 [FastSquaring = " & $(Fp[P256].getSpareBits() >= 2) & "]": - echo "Fp[P256].getSpareBits(): ", Fp[P256].getSpareBits() for _ in 0 ..< Iters: randomCurve(P256) for _ in 0 ..< Iters: @@ -173,8 +172,9 @@ suite "Modular squaring - bugs highlighted by property-based testing": a.square() na.square() - check: - bool(a == na) + doAssert bool(a == na), + "\n a² : " & a.mres.limbs.toString() & + "\n (-a)²: " & na.mres.limbs.toString() var a2{.noInit.}, na2{.noInit.}: Fp[Mersenne127] a2.fromHex"0x75bfffefbfffffff7fd9dfd800000000" @@ -183,10 +183,13 @@ suite "Modular squaring - bugs highlighted by property-based testing": a2 *= a2 na2 *= na2 - check: - bool(a2 == na2) - bool(a2 == a) - bool(a2 == na) + doAssert( + bool(a2 == na2) and + bool(a2 == a) and + bool(a2 == na), + "\n a*a: " & a2.mres.limbs.toString() & + "\n (-a)*(-a): " & na2.mres.limbs.toString() + ) test "a² == (-a)² on for Fp[2^127 - 1] - #62": var a{.noInit.}: Fp[Mersenne127] @@ -199,8 +202,9 @@ suite "Modular squaring - bugs highlighted by property-based testing": a.square() na.square() - check: - bool(a == na) + doAssert bool(a == na), + "\n a² : " & a.mres.limbs.toString() & + "\n (-a)²: " & na.mres.limbs.toString() var a2{.noInit.}, na2{.noInit.}: Fp[Mersenne127] a2.fromHex"0x7ff7ffffffffffff1dfb7fafc0000000" @@ -209,10 +213,13 @@ suite "Modular squaring - bugs highlighted by property-based testing": a2 *= a2 na2 *= na2 - check: - bool(a2 == na2) - bool(a2 == a) - bool(a2 == na) + doAssert( + bool(a2 == na2) and + bool(a2 == a) and + bool(a2 == na), + "\n a*a: " & a2.mres.limbs.toString() & + "\n (-a)*(-a): " & na2.mres.limbs.toString() + ) test "32-bit fast squaring on BLS12-381 - #42": # x = -(2^63 + 2^62 + 2^60 + 2^57 + 2^48 + 2^16) diff --git a/tests/t_finite_fields_mulsquare.nim.cfg b/tests/t_finite_fields_mulsquare.nim.cfg index 0922c18..92fac8a 100644 --- a/tests/t_finite_fields_mulsquare.nim.cfg +++ b/tests/t_finite_fields_mulsquare.nim.cfg @@ -1 +1,2 @@ -d:testingCurves +-d:debugConstantine