Implement fused multiply add modular multiplication for single limb "bigint". TODO fallback from assembly.

2018-12-02 16:32:38 +01:00 · 2018-12-02 16:32:38 +01:00 · 057ce0cbf9
parent 408bc9b6f3
commit 057ce0cbf9
3 changed files with 215 additions and 58 deletions
--- a/constantine/field_fp.nim
+++ b/constantine/field_fp.nim
@ -16,6 +16,8 @@
 import
  ./word_types, ./bigints
 from ./private/word_types_internal import unsafe_div2n1n
 type
  Fp*[P: static BigInt] = object
    ## P is a prime number
@ -64,63 +66,18 @@ func `+`*(a, b: Fp): Fp =
  ctl = ctl or not sub(result, Fp.P, False)
  sub(result, Fp.P, ctl)
-# ############################################################
+template scaleadd_impl(a: var Fp, c: Limb) =
-#
+  ## Scale-accumulate
-#                Montgomery domain primitives
+  ##
-#
+  ## With a word W = 2^LimbBitSize and a field Fp
-# ############################################################
+  ## Does a <- a * W + c (mod p)
-from bitops import fastLog2
+  when Fp.P.bits <= LimbBitSize:
-  # This will only be used at compile-time
+    # If the prime fits in a single limb
-  # so no constant-time worries (it is constant-time if using the De Bruijn multiplication)
+    var q: Limb
-func montyMagic*(M: static BigInt): static Limb =
+    # (hi, lo) = a * 2^63 + c
-  ## Returns the Montgomery domain magic number for the input modulus:
+    let hi = a[0] shr 1                            # 64 - 63 = 1
-  ##   -1/M[0] mod LimbSize
+    let lo = a[0] shl LimbBitSize or c             # Assumes most-significant bit in c is not set
-  ## M[0] is the least significant limb of M
+    unsafe_div2n1n(q, a[0], hi, lo, Fp.P.limbs[0]) # (hi, lo) mod P
-  ## M must be odd and greater than 2.
+    return
  # Test vectors: https://www.researchgate.net/publication/4107322_Montgomery_modular_multiplication_architecture_for_public_key_cryptosystems
  # on p354
  # Reference C impl: http://www.hackersdelight.org/hdcodetxt/mont64.c.txt
  # ######################################################################
  # Implementation of modular multiplication inverse
  # Assuming 2 positive integers a and m the modulo
  #
  # We are looking for z that solves `az ≡ 1 mod m`
  #
  # References:
  #   - Knuth, The Art of Computer Programming, Vol2 p342
  #   - Menezes, Handbook of Applied Cryptography (HAC), p610
  #     http://cacr.uwaterloo.ca/hac/about/chap14.pdf
  # Starting from the extended GCD formula (Bezout identity),
  # `ax + by = gcd(x,y)` with input x,y and outputs a, b, gcd
  # We assume a and m are coprimes, i.e. gcd is 1, otherwise no inverse
  # `ax + my = 1` <=> `ax + my ≡ 1 mod m` <=> `ax ≡ 1 mod m`
  # For Montgomery magic number, we are in a special case
  # where a = M and m = 2^LimbSize.
  # For a and m to be coprimes, a must be odd.
  # M being a power of 2 greatly simplifies computation:
  #  - https://crypto.stackexchange.com/questions/47493/how-to-determine-the-multiplicative-inverse-modulo-64-or-other-power-of-two
  #  - http://groups.google.com/groups?selm=1994Apr6.093116.27805%40mnemosyne.cs.du.edu
  #  - https://mumble.net/~campbell/2015/01/21/inverse-mod-power-of-two
  #  - https://eprint.iacr.org/2017/411
  # We have the following relation
  # ax ≡ 1 (mod 2^k) <=> ax(2 - ax) ≡ 1 (mod 2^(2k))
  #
  # To get  -1/M0 mod LimbSize
  # we can either negate the resulting x of `ax(2 - ax) ≡ 1 (mod 2^(2k))`
  # or do ax(2 + ax) ≡ 1 (mod 2^(2k))
  const
    M0 = M.limbs[0]
    k = fastLog2(LimbBitSize)
  result = M0                # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
  for _ in static(0 ..< k):
    result *= 2 + M * result # x' = x(2 + ax) (`+` to avoid negating at the end)
--- a/constantine/montgomery.nim
+++ b/constantine/montgomery.nim
@ -0,0 +1,73 @@
 # Constantine
 # Copyright (c) 2018 Status Research & Development GmbH
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 # ############################################################
 #
 #                Montgomery domain primitives
 #
 # ############################################################
 import
  ./word_types, ./bigints, ./field_fp
 from bitops import fastLog2
  # This will only be used at compile-time
  # so no constant-time worries (it is constant-time if using the De Bruijn multiplication)
 func montyMagic*(M: static BigInt): static Limb =
  ## Returns the Montgomery domain magic number for the input modulus:
  ##   -1/M[0] mod LimbSize
  ## M[0] is the least significant limb of M
  ## M must be odd and greater than 2.
  # Test vectors: https://www.researchgate.net/publication/4107322_Montgomery_modular_multiplication_architecture_for_public_key_cryptosystems
  # on p354
  # Reference C impl: http://www.hackersdelight.org/hdcodetxt/mont64.c.txt
  # ######################################################################
  # Implementation of modular multiplication inverse
  # Assuming 2 positive integers a and m the modulo
  #
  # We are looking for z that solves `az ≡ 1 mod m`
  #
  # References:
  #   - Knuth, The Art of Computer Programming, Vol2 p342
  #   - Menezes, Handbook of Applied Cryptography (HAC), p610
  #     http://cacr.uwaterloo.ca/hac/about/chap14.pdf
  # Starting from the extended GCD formula (Bezout identity),
  # `ax + by = gcd(x,y)` with input x,y and outputs a, b, gcd
  # We assume a and m are coprimes, i.e. gcd is 1, otherwise no inverse
  # `ax + my = 1` <=> `ax + my ≡ 1 mod m` <=> `ax ≡ 1 mod m`
  # For Montgomery magic number, we are in a special case
  # where a = M and m = 2^LimbSize.
  # For a and m to be coprimes, a must be odd.
  # M being a power of 2 greatly simplifies computation:
  #  - https://crypto.stackexchange.com/questions/47493/how-to-determine-the-multiplicative-inverse-modulo-64-or-other-power-of-two
  #  - http://groups.google.com/groups?selm=1994Apr6.093116.27805%40mnemosyne.cs.du.edu
  #  - https://mumble.net/~campbell/2015/01/21/inverse-mod-power-of-two
  #  - https://eprint.iacr.org/2017/411
  # We have the following relation
  # ax ≡ 1 (mod 2^k) <=> ax(2 - ax) ≡ 1 (mod 2^(2k))
  #
  # To get  -1/M0 mod LimbSize
  # we can either negate the resulting x of `ax(2 - ax) ≡ 1 (mod 2^(2k))`
  # or do ax(2 + ax) ≡ 1 (mod 2^(2k))
  const
    M0 = M.limbs[0]
    k = fastLog2(LimbBitSize)
  result = M0                # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
  for _ in static(0 ..< k):
    result *= 2 + M * result # x' = x(2 + ax) (`+` to avoid negating at the end)
 # func toMonty*[P: static BigInt](a: Fp[P], montyMagic: Limb): Montgomery[P] =
--- a/constantine/private/word_types_internal.nim
+++ b/constantine/private/word_types_internal.nim
@ -0,0 +1,127 @@
 # Constantine
 # Copyright (c) 2018 Status Research & Development GmbH
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 # ############################################################
 #
 #  Unsafe constant-time primitives with specific restrictions
 #
 # ############################################################
 import ../word_types
 func asm_x86_64_div2n1n(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
  ## Division uint128 by uint64
  ## Warning ⚠️ :
  ##   - if n_hi == d, quotient does not fit in an uint64
  ##   - if n_hi > d result is undefined
  # TODO !!! - Replace by constant-time, portable, non-assembly version
  # DIV r/m64
  # Divide RDX:RAX (n_hi:n_lo) by r/m64
  #
  # Inputs
  #   - numerator high word in RDX,
  #   - numerator low word in RAX,
  #   - divisor as rm parameter (register or memory at the compiler discretion)
  # Result
  #   - Quotient in RAX
  #   - Remainder in RDX
  asm """
    divq %[divisor]             // We name the register/memory divisor
    : "=a" (`*q`), "=d" (`*r`)  // Don't forget to dereference the var hidden pointer
    : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
    :  // no register clobbered besides explicitly used RAX and RDX
  """
 func unsafe_div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
  ## Division uint128 by uint64
  ## Warning ⚠️ :
  ##   - if n_hi == d, quotient does not fit in an uint64
  ##   - if n_hi > d result is undefined
  ##
  ## TODO, at the moment only x86_64 architecture are supported
  ##       as we use assembly.
  ##       Also we assume that the native integer division
  ##       provided by the PU is constant-time
  # Note, using C/Nim default `div` is inefficient
  # and complicated to make constant-time
  # See at the bottom.
  #
  # Furthermore compilers try to substitute division
  # with a fast path that may have branches. It might also
  # be the same at the hardware level.
  type T = uint64
  when not defined(amd64):
    {.error: "At the moment only x86_64 architecture is supported".}
  else:
    asm_x86_64_div2n1n(T(q), T(r), T(n_hi), T(n_lo), T(d))
 when isMainModule:
  var q, r: uint64
  # (1 shl 64) div 3
  let n_hi = 1'u64
  let n_lo = 0'u64
  let d = 3'u64
  asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
  doAssert q == 6148914691236517205'u64
  doAssert r == 1
 # ############################################################
 #
 #             Non-constant-time portable div2n1n
 #
 # ############################################################
 # implementation from Stint: https://github.com/status-im/nim-stint/blob/edb1ade37309390cc641cee07ab62e5459d9ca44/stint/private/uint_div.nim#L131
 # func div2n1n[T: SomeunsignedInt](q, r: var T, n_hi, n_lo, d: T) =
 #
 #   # assert countLeadingZeroBits(d) == 0, "Divisor was not normalized"
 #
 #   const
 #     size = bitsof(q)
 #     halfSize = size div 2
 #     halfMask = (1.T shl halfSize) - 1.T
 #
 #   template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q,r: T] =
 #
 #     var (q, r) = divmod(n_hi, d_hi)
 #     let m = q * d_lo
 #     var r = (r shl halfSize) or n_lo
 #
 #     # Fix the reminder, we're at most 2 iterations off
 #     if r < m:
 #       dec q
 #       r += d
 #       if r >= d and r < m:
 #         dec q
 #         r += d
 #     r -= m
 #     (q, r)
 #
 #   let
 #     d_hi = d shr halfSize
 #     d_lo = d and halfMask
 #     n_lohi = nlo shr halfSize
 #     n_lolo = nlo and halfMask
 #
 #   # First half of the quotient
 #   let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo)
 #
 #   # Second half
 #   let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo)
 #
 #   q = (q1 shl halfSize) or q2
 #   r = r2