From 05bce529b49a21f28c2b2af1ab0a2cc93ccf5e32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Fri, 28 Feb 2020 22:46:20 +0100 Subject: [PATCH] 1st experiment at accelerating montgomery multiplication (665 lines of specialized duplicated ASM code for some reason, monomorphization is probably better than that) --- benchmarks/bls12_381_fp.nim | 66 ++++++++++++++++++++++++++ constantine/arithmetic/README.md | 4 ++ constantine/arithmetic/bigints_raw.nim | 26 +++++----- 3 files changed, 84 insertions(+), 12 deletions(-) create mode 100644 benchmarks/bls12_381_fp.nim diff --git a/benchmarks/bls12_381_fp.nim b/benchmarks/bls12_381_fp.nim new file mode 100644 index 0000000..17bc11f --- /dev/null +++ b/benchmarks/bls12_381_fp.nim @@ -0,0 +1,66 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +# ############################################################ +# +# Benchmark of modular exponentiation +# +# ############################################################ + +# 2 implementations are available +# - 1 is constant time +# - 1 exposes the exponent bits to: +# timing attack, +# memory access analysis, +# power analysis (i.e. oscilloscopes on embedded) +# It is suitable for public exponents for example +# to compute modular inversion via the Fermat method + +import + ../constantine/config/[common, curves], + ../constantine/arithmetic/[bigints_checked, finite_fields], + ../constantine/io/[io_bigints, io_fields], + random, std/monotimes, times, strformat + +const Iters = 1_000_000 + +randomize(1234) + +proc addBench() = + var r, x, y: Fp[BLS12_381] + # BN254 field modulus + x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47") + # BLS12-381 prime - 2 + y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9") + + let start = getMonotime() + for _ in 0 ..< Iters: + x += y + let stop = getMonotime() + + echo &"Time for {Iters} conditional additions (constant-time 381-bit): {inMilliseconds(stop-start)} ms" + echo &"Time for 1 conditional addition ==> {inNanoseconds((stop-start) div Iters)} ns" + +addBench() + +proc mulBench() = + var r, x, y: Fp[BLS12_381] + # BN254 field modulus + x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47") + # BLS12-381 prime - 2 + y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9") + + let start = getMonotime() + for _ in 0 ..< Iters: + r.prod(x, y) + let stop = getMonotime() + + echo &"Time for {Iters} multiplications (constant-time 381-bit): {inMilliseconds(stop-start)} ms" + echo &"Time for 1 multiplication ==> {inNanoseconds((stop-start) div Iters)} ns" + +mulBench() diff --git a/constantine/arithmetic/README.md b/constantine/arithmetic/README.md index dc01271..9ce12e1 100644 --- a/constantine/arithmetic/README.md +++ b/constantine/arithmetic/README.md @@ -6,6 +6,10 @@ This folder contains the implementation of ## References +- Analyzing and Comparing Montgomery Multiplication Algorithms + Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr. + http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf + - Montgomery Arithmetic from a Software Perspective\ Joppe W. Bos and Peter L. Montgomery, 2017\ https://eprint.iacr.org/2017/1057 diff --git a/constantine/arithmetic/bigints_raw.nim b/constantine/arithmetic/bigints_raw.nim index 9712e68..4916e6a 100644 --- a/constantine/arithmetic/bigints_raw.nim +++ b/constantine/arithmetic/bigints_raw.nim @@ -551,26 +551,28 @@ func montyMul*( r.setBitLength(bitSizeof(M)) setZero(r) - var r_hi = Zero # represents the high word that is used in intermediate computation before reduction mod M + var partials: array[14, DoubleWord] + for i in 0 ..< nLen: - let zi = (r[0] + wordMul(a[i], b[0])).wordMul(negInvModWord) - var carry = Zero + let zi = (Word(partials[0]) + wordMul(a[i], b[0])).wordMul(negInvModWord) for j in 0 ..< nLen: - let z = DoubleWord(r[j]) + unsafeExtPrecMul(a[i], b[j]) + - unsafeExtPrecMul(zi, M[j]) + DoubleWord(carry) - carry = Word(z shr WordBitSize) - if j != 0: # "division" by a physical word 2^32 or 2^64 - r[j-1] = Word(z).mask() + partials[j] += unsafeExtPrecMul(a[i], b[j]) + unsafeExtPrecMul(zi, M[j]) - r_hi += carry - r[^1] = r_hi.mask() - r_hi = r_hi shr WordBitSize + var carry = partials[0] shr WordBitSize + for j in 1 .. nlen: # we need 1 extra temporary at nlen + partials[j] += carry + carry = partials[j] shr WordBitSize + partials[j-1] = partials[j] and DoubleWord(MaxWord) + partials[nlen] = partials[nlen] shr WordBitSize + + for i in 0 ..< nLen: + r[i] = Word(partials[i]) # If the extra word is not zero or if r-M does not borrow (i.e. r > M) # Then substract M - discard r.csub(M, r_hi.isNonZero() or not r.csub(M, CtFalse)) + discard r.csub(M, CTBool[Word](partials[nLen].isNonZero()) or not r.csub(M, CtFalse)) func redc*(r: BigIntViewMut, a: BigIntViewAny, one, N: BigIntViewConst, negInvModWord: Word) {.inline.} = ## Transform a bigint ``a`` from it's Montgomery N-residue representation (mod N)