Implement fused multiply add modular multiplication for single limb "bigint". TODO fallback from assembly.

This commit is contained in:
mratsim 2018-12-02 16:32:38 +01:00
parent 408bc9b6f3
commit 057ce0cbf9
3 changed files with 215 additions and 58 deletions

View File

@ -16,6 +16,8 @@
import
./word_types, ./bigints
from ./private/word_types_internal import unsafe_div2n1n
type
Fp*[P: static BigInt] = object
## P is a prime number
@ -64,63 +66,18 @@ func `+`*(a, b: Fp): Fp =
ctl = ctl or not sub(result, Fp.P, False)
sub(result, Fp.P, ctl)
# ############################################################
#
# Montgomery domain primitives
#
# ############################################################
template scaleadd_impl(a: var Fp, c: Limb) =
## Scale-accumulate
##
## With a word W = 2^LimbBitSize and a field Fp
## Does a <- a * W + c (mod p)
from bitops import fastLog2
# This will only be used at compile-time
# so no constant-time worries (it is constant-time if using the De Bruijn multiplication)
when Fp.P.bits <= LimbBitSize:
# If the prime fits in a single limb
var q: Limb
func montyMagic*(M: static BigInt): static Limb =
  ## Returns the Montgomery domain magic number for the input modulus:
  ##   -1/M[0] mod LimbSize
  ## M[0] is the least significant limb of M
  ## M must be odd and greater than 2.
  ##
  ## Both the modulus and the result are `static`, so this is
  ## evaluated at compile time.

  # Test vectors: https://www.researchgate.net/publication/4107322_Montgomery_modular_multiplication_architecture_for_public_key_cryptosystems
  #               on p354
  # Reference C impl: http://www.hackersdelight.org/hdcodetxt/mont64.c.txt

  # ######################################################################
  # Implementation of modular multiplication inverse
  # Assuming 2 positive integers a and m the modulo
  #
  # We are looking for z that solves `az ≡ 1 mod m`
  #
  # References:
  #   - Knuth, The Art of Computer Programming, Vol2 p342
  #   - Menezes, Handbook of Applied Cryptography (HAC), p610
  #     http://cacr.uwaterloo.ca/hac/about/chap14.pdf

  # Starting from the extended GCD formula (Bezout identity),
  # `ax + by = gcd(x,y)` with input x,y and outputs a, b, gcd
  # We assume a and m are coprimes, i.e. gcd is 1, otherwise no inverse
  # `ax + my = 1` <=> `ax + my ≡ 1 mod m` <=> `ax ≡ 1 mod m`

  # For Montgomery magic number, we are in a special case
  # where a = M and m = 2^LimbSize.
  # For a and m to be coprimes, a must be odd.

  # m being a power of 2 greatly simplifies computation:
  #   - https://crypto.stackexchange.com/questions/47493/how-to-determine-the-multiplicative-inverse-modulo-64-or-other-power-of-two
  #   - http://groups.google.com/groups?selm=1994Apr6.093116.27805%40mnemosyne.cs.du.edu
  #   - https://mumble.net/~campbell/2015/01/21/inverse-mod-power-of-two
  #   - https://eprint.iacr.org/2017/411

  # We have the following relation
  # ax ≡ 1 (mod 2^k) => ax(2 - ax) ≡ 1 (mod 2^(2k))
  #
  # To get -1/M0 mod LimbSize
  # we can either negate the resulting x of `ax(2 - ax) ≡ 1 (mod 2^(2k))`
  # or iterate x' = x(2 + ax), for which
  # ax ≡ -1 (mod 2^k) => ax(2 + ax) ≡ -1 (mod 2^(2k))

  const
    M0 = M.limbs[0]
    k = fastLog2(LimbBitSize)

  # NOTE(review): every pass doubles the number of correct low bits, so k
  # passes give 2^k bits. fastLog2 rounds down, so if LimbBitSize is not a
  # power of two then 2^k < LimbBitSize - confirm this is intended.

  # Start from an inverse of M0 modulo 2: M0 is odd, so it is its own inverse
  # (and 1 ≡ -1 mod 2, so it is also a valid "negative inverse" seed).
  result = M0
  for _ in static(0 ..< k):
    # x' = x(2 + ax) with a = M0 (`+` to avoid negating at the end).
    # Only the least significant limb takes part in the recurrence,
    # hence M0 and not the full BigInt M.
    result *= 2 + M0 * result
# (hi, lo) = a * 2^63 + c
let hi = a[0] shr 1 # 64 - 63 = 1
let lo = a[0] shl LimbBitSize or c # Assumes most-significant bit in c is not set
unsafe_div2n1n(q, a[0], hi, lo, Fp.P.limbs[0]) # (hi, lo) mod P
return

View File

@ -0,0 +1,73 @@
# Constantine
# Copyright (c) 2018 Status Research & Development GmbH
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# ############################################################
#
# Montgomery domain primitives
#
# ############################################################
import
./word_types, ./bigints, ./field_fp
from bitops import fastLog2
# This will only be used at compile-time
# so no constant-time worries (it is constant-time if using the De Bruijn multiplication)
func montyMagic*(M: static BigInt): static Limb =
  ## Returns the Montgomery domain magic number for the input modulus:
  ##   -1/M[0] mod LimbSize
  ## M[0] is the least significant limb of M
  ## M must be odd and greater than 2.
  ##
  ## Both the modulus and the result are `static`, so this is
  ## evaluated at compile time.

  # Test vectors: https://www.researchgate.net/publication/4107322_Montgomery_modular_multiplication_architecture_for_public_key_cryptosystems
  #               on p354
  # Reference C impl: http://www.hackersdelight.org/hdcodetxt/mont64.c.txt

  # ######################################################################
  # Implementation of modular multiplication inverse
  # Assuming 2 positive integers a and m the modulo
  #
  # We are looking for z that solves `az ≡ 1 mod m`
  #
  # References:
  #   - Knuth, The Art of Computer Programming, Vol2 p342
  #   - Menezes, Handbook of Applied Cryptography (HAC), p610
  #     http://cacr.uwaterloo.ca/hac/about/chap14.pdf

  # Starting from the extended GCD formula (Bezout identity),
  # `ax + by = gcd(x,y)` with input x,y and outputs a, b, gcd
  # We assume a and m are coprimes, i.e. gcd is 1, otherwise no inverse
  # `ax + my = 1` <=> `ax + my ≡ 1 mod m` <=> `ax ≡ 1 mod m`

  # For Montgomery magic number, we are in a special case
  # where a = M and m = 2^LimbSize.
  # For a and m to be coprimes, a must be odd.

  # m being a power of 2 greatly simplifies computation:
  #   - https://crypto.stackexchange.com/questions/47493/how-to-determine-the-multiplicative-inverse-modulo-64-or-other-power-of-two
  #   - http://groups.google.com/groups?selm=1994Apr6.093116.27805%40mnemosyne.cs.du.edu
  #   - https://mumble.net/~campbell/2015/01/21/inverse-mod-power-of-two
  #   - https://eprint.iacr.org/2017/411

  # We have the following relation
  # ax ≡ 1 (mod 2^k) => ax(2 - ax) ≡ 1 (mod 2^(2k))
  #
  # To get -1/M0 mod LimbSize
  # we can either negate the resulting x of `ax(2 - ax) ≡ 1 (mod 2^(2k))`
  # or iterate x' = x(2 + ax), for which
  # ax ≡ -1 (mod 2^k) => ax(2 + ax) ≡ -1 (mod 2^(2k))

  const
    M0 = M.limbs[0]
    k = fastLog2(LimbBitSize)

  # NOTE(review): every pass doubles the number of correct low bits, so k
  # passes give 2^k bits. fastLog2 rounds down, so if LimbBitSize is not a
  # power of two then 2^k < LimbBitSize - confirm this is intended.

  # Start from an inverse of M0 modulo 2: M0 is odd, so it is its own inverse
  # (and 1 ≡ -1 mod 2, so it is also a valid "negative inverse" seed).
  result = M0
  for _ in static(0 ..< k):
    # x' = x(2 + ax) with a = M0 (`+` to avoid negating at the end).
    # Only the least significant limb takes part in the recurrence,
    # hence M0 and not the full BigInt M.
    result *= 2 + M0 * result
# func toMonty*[P: static BigInt](a: Fp[P], montyMagic: Limb): Montgomery[P] =

View File

@ -0,0 +1,127 @@
# Constantine
# Copyright (c) 2018 Status Research & Development GmbH
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# ############################################################
#
# Unsafe constant-time primitives with specific restrictions
#
# ############################################################
import ../word_types
func asm_x86_64_div2n1n(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
  ## Division uint128 by uint64, done with the x86-64 `div` instruction.
  ##
  ## Inputs:
  ## - `n_hi`, `n_lo`: high and low 64-bit words of the 128-bit numerator
  ## - `d`: 64-bit divisor
  ## Outputs:
  ## - `q`: quotient
  ## - `r`: remainder
  ##
  ## Warning ⚠️ :
  ## - if n_hi == d, quotient does not fit in an uint64
  ## - if n_hi > d result is undefined
  ##
  ## NOTE(review): on x86 a quotient that does not fit is expected to raise
  ## a divide error rather than return a value - confirm that every caller
  ## guarantees n_hi < d.

  # TODO !!! - Replace by constant-time, portable, non-assembly version

  # DIV r/m64
  # Divide RDX:RAX (n_hi:n_lo) by r/m64
  #
  # Inputs
  #   - numerator high word in RDX,
  #   - numerator low word in RAX,
  #   - divisor as rm parameter (register or memory at the compiler discretion)
  # Result
  #   - Quotient in RAX
  #   - Remainder in RDX

  # Operand lists below follow the extended-asm layout:
  # instruction, then outputs, then inputs, then clobbers.
  # Backticks splice Nim identifiers into the generated code; `q` and `r`
  # are `var` parameters and therefore have to be dereferenced.
  asm """
    divq %[divisor] // We name the register/memory divisor
    : "=a" (`*q`), "=d" (`*r`) // Don't forget to dereference the var hidden pointer
    : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
    : // no register clobbered besides explicitly used RAX and RDX
  """
func unsafe_div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
  ## Divides the 128-bit value (n_hi, n_lo) by the 64-bit divisor `d`,
  ## writing the quotient to `q` and the remainder to `r`.
  ##
  ## Warning ⚠️ :
  ## - if n_hi == d, quotient does not fit in an uint64
  ## - if n_hi > d result is undefined
  ##
  ## TODO: at the moment only the x86_64 architecture is supported,
  ## as the implementation relies on assembly.
  ## We also assume that the native integer division
  ## provided by the CPU is constant-time.

  # Why not the C/Nim default `div`:
  # - it is inefficient and complicated to make constant-time
  #   (a portable, non-constant-time variant is kept commented out
  #   at the bottom of this file);
  # - compilers try to substitute division with a fast path that may
  #   have branches. It might also be the same at the hardware level.

  when defined(amd64):
    # Strip the Ct wrapper and forward to the raw assembly routine.
    asm_x86_64_div2n1n(uint64(q), uint64(r), uint64(n_hi), uint64(n_lo), uint64(d))
  else:
    {.error: "At the moment only x86_64 architecture is supported".}
when isMainModule:
  # Self-check when this module is run directly:
  # 2^64 = 3 * 0x5555_5555_5555_5555 + 1
  let
    numHi = 1'u64 # numerator = numHi * 2^64 + numLo = 2^64
    numLo = 0'u64
    divisor = 3'u64
  var quotient, remainder: uint64

  asm_x86_64_div2n1n(quotient, remainder, numHi, numLo, divisor)

  doAssert quotient == 0x5555_5555_5555_5555'u64
  doAssert remainder == 1
# ############################################################
#
# Non-constant-time portable div2n1n
#
# ############################################################
# implementation from Stint: https://github.com/status-im/nim-stint/blob/edb1ade37309390cc641cee07ab62e5459d9ca44/stint/private/uint_div.nim#L131
# func div2n1n[T: SomeunsignedInt](q, r: var T, n_hi, n_lo, d: T) =
#
# # assert countLeadingZeroBits(d) == 0, "Divisor was not normalized"
#
# const
# size = bitsof(q)
# halfSize = size div 2
# halfMask = (1.T shl halfSize) - 1.T
#
# template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q,r: T] =
#
# var (q, r) = divmod(n_hi, d_hi)
# let m = q * d_lo
# var r = (r shl halfSize) or n_lo
#
# # Fix the remainder, we're at most 2 iterations off
# if r < m:
# dec q
# r += d
# if r >= d and r < m:
# dec q
# r += d
# r -= m
# (q, r)
#
# let
# d_hi = d shr halfSize
# d_lo = d and halfMask
# n_lohi = nlo shr halfSize
# n_lolo = nlo and halfMask
#
# # First half of the quotient
# let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo)
#
# # Second half
# let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo)
#
# q = (q1 shl halfSize) or q2
# r = r2