Implement fused multiply add modular multiplication for single limb "bigint". TODO fallback from assembly.
This commit is contained in:
parent
408bc9b6f3
commit
057ce0cbf9
|
@ -16,6 +16,8 @@
|
|||
import
|
||||
./word_types, ./bigints
|
||||
|
||||
from ./private/word_types_internal import unsafe_div2n1n
|
||||
|
||||
type
|
||||
Fp*[P: static BigInt] = object
|
||||
## P is a prime number
|
||||
|
@ -64,63 +66,18 @@ func `+`*(a, b: Fp): Fp =
|
|||
ctl = ctl or not sub(result, Fp.P, False)
|
||||
sub(result, Fp.P, ctl)
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Montgomery domain primitives
|
||||
#
|
||||
# ############################################################
|
||||
template scaleadd_impl(a: var Fp, c: Limb) =
|
||||
## Scale-accumulate
|
||||
##
|
||||
## With a word W = 2^LimbBitSize and a field Fp
|
||||
## Does a <- a * W + c (mod p)
|
||||
|
||||
from bitops import fastLog2
|
||||
# This will only be used at compile-time
|
||||
# so no constant-time worries (it is constant-time if using the De Bruijn multiplication)
|
||||
when Fp.P.bits <= LimbBitSize:
|
||||
# If the prime fits in a single limb
|
||||
var q: Limb
|
||||
|
||||
func montyMagic*(M: static BigInt): static Limb =
|
||||
## Returns the Montgomery domain magic number for the input modulus:
|
||||
## -1/M[0] mod LimbSize
|
||||
## M[0] is the least significant limb of M
|
||||
## M must be odd and greater than 2.
|
||||
|
||||
# Test vectors: https://www.researchgate.net/publication/4107322_Montgomery_modular_multiplication_architecture_for_public_key_cryptosystems
|
||||
# on p354
|
||||
# Reference C impl: http://www.hackersdelight.org/hdcodetxt/mont64.c.txt
|
||||
|
||||
# ######################################################################
|
||||
# Implementation of modular multiplication inverse
|
||||
# Assuming 2 positive integers a and m the modulo
|
||||
#
|
||||
# We are looking for z that solves `az ≡ 1 mod m`
|
||||
#
|
||||
# References:
|
||||
# - Knuth, The Art of Computer Programming, Vol2 p342
|
||||
# - Menezes, Handbook of Applied Cryptography (HAC), p610
|
||||
# http://cacr.uwaterloo.ca/hac/about/chap14.pdf
|
||||
|
||||
# Starting from the extended GCD formula (Bezout identity),
|
||||
# `ax + by = gcd(x,y)` with input x,y and outputs a, b, gcd
|
||||
# We assume a and m are coprimes, i.e. gcd is 1, otherwise no inverse
|
||||
# `ax + my = 1` <=> `ax + my ≡ 1 mod m` <=> `ax ≡ 1 mod m`
|
||||
|
||||
# For Montgomery magic number, we are in a special case
|
||||
# where a = M and m = 2^LimbSize.
|
||||
# For a and m to be coprimes, a must be odd.
|
||||
|
||||
# M being a power of 2 greatly simplifies computation:
|
||||
# - https://crypto.stackexchange.com/questions/47493/how-to-determine-the-multiplicative-inverse-modulo-64-or-other-power-of-two
|
||||
# - http://groups.google.com/groups?selm=1994Apr6.093116.27805%40mnemosyne.cs.du.edu
|
||||
# - https://mumble.net/~campbell/2015/01/21/inverse-mod-power-of-two
|
||||
# - https://eprint.iacr.org/2017/411
|
||||
|
||||
# We have the following relation
|
||||
# ax ≡ 1 (mod 2^k) <=> ax(2 - ax) ≡ 1 (mod 2^(2k))
|
||||
#
|
||||
# To get -1/M0 mod LimbSize
|
||||
# we can either negate the resulting x of `ax(2 - ax) ≡ 1 (mod 2^(2k))`
|
||||
# or do ax(2 + ax) ≡ 1 (mod 2^(2k))
|
||||
|
||||
const
|
||||
M0 = M.limbs[0]
|
||||
k = fastLog2(LimbBitSize)
|
||||
|
||||
result = M0 # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
|
||||
for _ in static(0 ..< k):
|
||||
result *= 2 + M * result # x' = x(2 + ax) (`+` to avoid negating at the end)
|
||||
# (hi, lo) = a * 2^63 + c
|
||||
let hi = a[0] shr 1 # 64 - 63 = 1
|
||||
let lo = a[0] shl LimbBitSize or c # Assumes most-significant bit in c is not set
|
||||
unsafe_div2n1n(q, a[0], hi, lo, Fp.P.limbs[0]) # (hi, lo) mod P
|
||||
return
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
# Constantine
|
||||
# Copyright (c) 2018 Status Research & Development GmbH
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Montgomery domain primitives
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
import
|
||||
./word_types, ./bigints, ./field_fp
|
||||
|
||||
from bitops import fastLog2
|
||||
# This will only be used at compile-time
|
||||
# so no constant-time worries (it is constant-time if using the De Bruijn multiplication)
|
||||
|
||||
func montyMagic*(M: static BigInt): static Limb =
|
||||
## Returns the Montgomery domain magic number for the input modulus:
|
||||
## -1/M[0] mod LimbSize
|
||||
## M[0] is the least significant limb of M
|
||||
## M must be odd and greater than 2.
|
||||
|
||||
# Test vectors: https://www.researchgate.net/publication/4107322_Montgomery_modular_multiplication_architecture_for_public_key_cryptosystems
|
||||
# on p354
|
||||
# Reference C impl: http://www.hackersdelight.org/hdcodetxt/mont64.c.txt
|
||||
|
||||
# ######################################################################
|
||||
# Implementation of modular multiplication inverse
|
||||
# Assuming 2 positive integers a and m the modulo
|
||||
#
|
||||
# We are looking for z that solves `az ≡ 1 mod m`
|
||||
#
|
||||
# References:
|
||||
# - Knuth, The Art of Computer Programming, Vol2 p342
|
||||
# - Menezes, Handbook of Applied Cryptography (HAC), p610
|
||||
# http://cacr.uwaterloo.ca/hac/about/chap14.pdf
|
||||
|
||||
# Starting from the extended GCD formula (Bezout identity),
|
||||
# `ax + by = gcd(x,y)` with input x,y and outputs a, b, gcd
|
||||
# We assume a and m are coprimes, i.e. gcd is 1, otherwise no inverse
|
||||
# `ax + my = 1` <=> `ax + my ≡ 1 mod m` <=> `ax ≡ 1 mod m`
|
||||
|
||||
# For Montgomery magic number, we are in a special case
|
||||
# where a = M and m = 2^LimbSize.
|
||||
# For a and m to be coprimes, a must be odd.
|
||||
|
||||
# M being a power of 2 greatly simplifies computation:
|
||||
# - https://crypto.stackexchange.com/questions/47493/how-to-determine-the-multiplicative-inverse-modulo-64-or-other-power-of-two
|
||||
# - http://groups.google.com/groups?selm=1994Apr6.093116.27805%40mnemosyne.cs.du.edu
|
||||
# - https://mumble.net/~campbell/2015/01/21/inverse-mod-power-of-two
|
||||
# - https://eprint.iacr.org/2017/411
|
||||
|
||||
# We have the following relation
|
||||
# ax ≡ 1 (mod 2^k) <=> ax(2 - ax) ≡ 1 (mod 2^(2k))
|
||||
#
|
||||
# To get -1/M0 mod LimbSize
|
||||
# we can either negate the resulting x of `ax(2 - ax) ≡ 1 (mod 2^(2k))`
|
||||
# or do ax(2 + ax) ≡ 1 (mod 2^(2k))
|
||||
|
||||
const
|
||||
M0 = M.limbs[0]
|
||||
k = fastLog2(LimbBitSize)
|
||||
|
||||
result = M0 # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
|
||||
for _ in static(0 ..< k):
|
||||
result *= 2 + M * result # x' = x(2 + ax) (`+` to avoid negating at the end)
|
||||
|
||||
# func toMonty*[P: static BigInt](a: Fp[P], montyMagic: Limb): Montgomery[P] =
|
||||
|
|
@ -0,0 +1,127 @@
|
|||
# Constantine
|
||||
# Copyright (c) 2018 Status Research & Development GmbH
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Unsafe constant-time primitives with specific restrictions
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
import ../word_types
|
||||
|
||||
func asm_x86_64_div2n1n(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
|
||||
## Division uint128 by uint64
|
||||
## Warning ⚠️ :
|
||||
## - if n_hi == d, quotient does not fit in an uint64
|
||||
## - if n_hi > d result is undefined
|
||||
|
||||
# TODO !!! - Replace by constant-time, portable, non-assembly version
|
||||
|
||||
# DIV r/m64
|
||||
# Divide RDX:RAX (n_hi:n_lo) by r/m64
|
||||
#
|
||||
# Inputs
|
||||
# - numerator high word in RDX,
|
||||
# - numerator low word in RAX,
|
||||
# - divisor as rm parameter (register or memory at the compiler discretion)
|
||||
# Result
|
||||
# - Quotient in RAX
|
||||
# - Remainder in RDX
|
||||
asm """
|
||||
divq %[divisor] // We name the register/memory divisor
|
||||
: "=a" (`*q`), "=d" (`*r`) // Don't forget to dereference the var hidden pointer
|
||||
: "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
|
||||
: // no register clobbered besides explicitly used RAX and RDX
|
||||
"""
|
||||
|
||||
func unsafe_div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
|
||||
## Division uint128 by uint64
|
||||
## Warning ⚠️ :
|
||||
## - if n_hi == d, quotient does not fit in an uint64
|
||||
## - if n_hi > d result is undefined
|
||||
##
|
||||
## TODO, at the moment only x86_64 architecture are supported
|
||||
## as we use assembly.
|
||||
## Also we assume that the native integer division
|
||||
## provided by the PU is constant-time
|
||||
|
||||
# Note, using C/Nim default `div` is inefficient
|
||||
# and complicated to make constant-time
|
||||
# See at the bottom.
|
||||
#
|
||||
# Furthermore compilers try to substitute division
|
||||
# with a fast path that may have branches. It might also
|
||||
# be the same at the hardware level.
|
||||
|
||||
type T = uint64
|
||||
|
||||
when not defined(amd64):
|
||||
{.error: "At the moment only x86_64 architecture is supported".}
|
||||
else:
|
||||
asm_x86_64_div2n1n(T(q), T(r), T(n_hi), T(n_lo), T(d))
|
||||
|
||||
when isMainModule:
|
||||
|
||||
var q, r: uint64
|
||||
|
||||
# (1 shl 64) div 3
|
||||
let n_hi = 1'u64
|
||||
let n_lo = 0'u64
|
||||
let d = 3'u64
|
||||
|
||||
asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
|
||||
|
||||
doAssert q == 6148914691236517205'u64
|
||||
doAssert r == 1
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Non-constant-time portable div2n1n
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
# implementation from Stint: https://github.com/status-im/nim-stint/blob/edb1ade37309390cc641cee07ab62e5459d9ca44/stint/private/uint_div.nim#L131
|
||||
|
||||
# func div2n1n[T: SomeunsignedInt](q, r: var T, n_hi, n_lo, d: T) =
|
||||
#
|
||||
# # assert countLeadingZeroBits(d) == 0, "Divisor was not normalized"
|
||||
#
|
||||
# const
|
||||
# size = bitsof(q)
|
||||
# halfSize = size div 2
|
||||
# halfMask = (1.T shl halfSize) - 1.T
|
||||
#
|
||||
# template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q,r: T] =
|
||||
#
|
||||
# var (q, r) = divmod(n_hi, d_hi)
|
||||
# let m = q * d_lo
|
||||
# var r = (r shl halfSize) or n_lo
|
||||
#
|
||||
# # Fix the reminder, we're at most 2 iterations off
|
||||
# if r < m:
|
||||
# dec q
|
||||
# r += d
|
||||
# if r >= d and r < m:
|
||||
# dec q
|
||||
# r += d
|
||||
# r -= m
|
||||
# (q, r)
|
||||
#
|
||||
# let
|
||||
# d_hi = d shr halfSize
|
||||
# d_lo = d and halfMask
|
||||
# n_lohi = nlo shr halfSize
|
||||
# n_lolo = nlo and halfMask
|
||||
#
|
||||
# # First half of the quotient
|
||||
# let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo)
|
||||
#
|
||||
# # Second half
|
||||
# let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo)
|
||||
#
|
||||
# q = (q1 shl halfSize) or q2
|
||||
# r = r2
|
Loading…
Reference in New Issue