From 88d4a58a10673e61bfb8b4b13d3003117aa047ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Sat, 29 Feb 2020 02:10:55 +0100 Subject: [PATCH] First steps in using uint64 words --- constantine/arithmetic/bigints_raw.nim | 31 +- constantine/arithmetic/precomputed.nim | 2 +- constantine/config/common.nim | 5 +- constantine/primitives/constant_time.nim | 21 +- constantine/primitives/extended_precision.nim | 323 +++++------------- 5 files changed, 115 insertions(+), 267 deletions(-) diff --git a/constantine/arithmetic/bigints_raw.nim b/constantine/arithmetic/bigints_raw.nim index 4916e6a..90e9060 100644 --- a/constantine/arithmetic/bigints_raw.nim +++ b/constantine/arithmetic/bigints_raw.nim @@ -138,7 +138,7 @@ template bitSizeof(v: BigIntViewAny): uint32 = bind BigIntView distinctBase(type v)(v).bitLength -const divShiftor = log2(WordPhysBitSize) +const divShiftor = log2(uint32(WordPhysBitSize)) template numLimbs*(v: BigIntViewAny): int = ## Compute the number of limbs from ## the **internal** bitlength @@ -447,9 +447,7 @@ func shlAddMod(a: BigIntViewMut, c: Word, M: BigIntViewConst) = block: # q*p # q * p + carry (doubleword) carry from previous limb - let qp = unsafeExtPrecMul(q, M[i]) + carry.DoubleWord - carry = Word(qp shr WordBitSize) # New carry: high digit besides LSB - qp_lo = qp.Word.mask() # Normalize to u63 + unsafeFMA(carry, qp_lo, q, M[i], carry) block: # a*2^63 - q*p a[i] -= qp_lo @@ -551,28 +549,23 @@ func montyMul*( r.setBitLength(bitSizeof(M)) setZero(r) - var partials: array[14, DoubleWord] - + var r_hi = Zero # represents the high word that is used in intermediate computation before reduction mod M for i in 0 ..< nLen: - let zi = (Word(partials[0]) + wordMul(a[i], b[0])).wordMul(negInvModWord) + let zi = (r[0] + wordMul(a[i], b[0])).wordMul(negInvModWord) + var carry, z = Zero + unsafeFMA2(carry, z, a[i], b[0], zi, M[0], r[0], carry) - for j in 0 ..< nLen: - partials[j] += unsafeExtPrecMul(a[i], b[j]) + unsafeExtPrecMul(zi, M[j]) + for j in 1 ..< nLen: + unsafeFMA2(carry, r[j-1], a[i], b[j], zi, M[j], r[j], carry) - var carry = partials[0] shr WordBitSize - for j in 1 .. nlen: # we need 1 extra temporary at nlen - partials[j] += carry - carry = partials[j] shr WordBitSize - partials[j-1] = partials[j] and DoubleWord(MaxWord) - partials[nlen] = partials[nlen] shr WordBitSize - - for i in 0 ..< nLen: - r[i] = Word(partials[i]) + r_hi += carry + r[^1] = r_hi.mask() + r_hi = r_hi shr WordBitSize # If the extra word is not zero or if r-M does not borrow (i.e. r > M) # Then substract M - discard r.csub(M, CTBool[Word](partials[nLen].isNonZero()) or not r.csub(M, CtFalse)) + discard r.csub(M, r_hi.isNonZero() or not r.csub(M, CtFalse)) func redc*(r: BigIntViewMut, a: BigIntViewAny, one, N: BigIntViewConst, negInvModWord: Word) {.inline.} = ## Transform a bigint ``a`` from it's Montgomery N-residue representation (mod N) diff --git a/constantine/arithmetic/precomputed.nim b/constantine/arithmetic/precomputed.nim index 47f2537..4af0a75 100644 --- a/constantine/arithmetic/precomputed.nim +++ b/constantine/arithmetic/precomputed.nim @@ -126,7 +126,7 @@ func negInvModWord*(M: BigInt): BaseType = let M0 = BaseType(M.limbs[0]) - k = log2(WordPhysBitSize) + k = log2(uint32(WordPhysBitSize)) result = M0 # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse for _ in 0 ..< k: # at each iteration we get the inverse mod(2^2k) diff --git a/constantine/config/common.nim b/constantine/config/common.nim index e3a1ace..857beb6 100644 --- a/constantine/config/common.nim +++ b/constantine/config/common.nim @@ -14,12 +14,11 @@ import ../primitives/constant_time -type Word* = Ct[uint32] +type Word* = Ct[uint64] ## Logical BigInt word ## A logical BigInt word is of size physical MachineWord-1 -type DoubleWord* = Ct[uint64] -type BaseType* = uint32 +type BaseType* = uint64 ## Physical BigInt for conversion in "normal integers" const diff --git a/constantine/primitives/constant_time.nim b/constantine/primitives/constant_time.nim index dee207e..2cc2e22 100644 --- a/constantine/primitives/constant_time.nim +++ b/constantine/primitives/constant_time.nim @@ -11,7 +11,6 @@ # Constant-time primitives # # ############################################################ - type BaseUint* = SomeUnsignedInt or byte @@ -168,6 +167,26 @@ func log2*(x: uint32): uint32 = v = v or v shr 16 lookup[(v * 0x07C4ACDD'u32) shr 27] +func log2*(x: uint64): uint64 {.inline, noSideEffect.} = + ## Find the log base 2 of a 32-bit or less integer. + ## using De Bruijn multiplication + ## Works at compile-time, guaranteed constant-time. + ## Note: at runtime BitScanReverse or CountLeadingZero are more efficient + ## but log2 is never needed at runtime. + # https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn + const lookup: array[64, uint8] = [0'u8, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54, + 33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, 62, + 57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31, + 35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5, 63] + var v = x + v = v or v shr 1 # first round down to one less than a power of 2 + v = v or v shr 2 + v = v or v shr 4 + v = v or v shr 8 + v = v or v shr 16 + v = v or v shr 32 + lookup[(v * 0x03F6EAF2CD271461'u64) shr 58] + # ############################################################ # # Hardened Boolean primitives diff --git a/constantine/primitives/extended_precision.nim b/constantine/primitives/extended_precision.nim index d5fe742..e7984df 100644 --- a/constantine/primitives/extended_precision.nim +++ b/constantine/primitives/extended_precision.nim @@ -20,10 +20,6 @@ import ./constant_time # # ############################################################ -template unsafeExtPrecMul*(a, b: Ct[uint32]): Ct[uint64] = - ## Extended precision multiplication uint32 * uint32 --> uint64 - Ct[uint64](uint64(a) * uint64(b)) - func unsafeDiv2n1n*(q, r: var Ct[uint32], n_hi, n_lo, d: Ct[uint32]) {.inline.}= ## Division uint64 by uint32 ## Warning ⚠️ : @@ -35,261 +31,102 @@ func unsafeDiv2n1n*(q, r: var Ct[uint32], n_hi, n_lo, d: Ct[uint32]) {.inline.}= ## so that the most significant bit in d is set. # TODO !!! - Replace by constant-time, portable, non-assembly version # -> use uint128? Compiler might add unwanted branches + {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".} let dividend = (uint64(n_hi) shl 32) or uint64(n_lo) let divisor = uint64(d) q = (Ct[uint32])(dividend div divisor) r = (Ct[uint32])(dividend mod divisor) +template unsafeFMA*(hi, lo: var Ct[uint32], a, b, c: Ct[uint32]) = + ## Extended precision multiplication + addition + ## This is constant-time on most hardware except some specific one like Cortex M0 + ## (hi, lo) <- a*b + c + block: + # Note: since a and b use 31-bit, + # the result is 62-bit and carrying cannot overflow + let dblPrec = uint64(a) * uint64(b) + uint64(c) + hi = Ct[uint32](dblPrec shr 31) + lo = Ct[uint32](dblPrec) and Ct[uint32](1 shl 31 - 1) + +template unsafeFMA2*(hi, lo: var Ct[uint32], a1, b1, a2, b2, c1, c2: Ct[uint32]) = + ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2 + block: + # TODO: Can this overflow? + let dblPrec = uint64(a1) * uint64(b1) + + uint64(a2) * uint64(b2) + + uint64(c1) + + uint64(c2) + hi = Ct[uint32](dblPrec shr 31) + lo = Ct[uint32](dblPrec) and Ct[uint32](1 shl 31 - 1) + # ############################################################ # # 64-bit words # # ############################################################ -# func asm_x86_64_extMul(hi, lo: var uint64, a, b: uint64) {.inline.}= -# ## Extended precision multiplication uint64 * uint64 --> uint128 +when defined(gcc) or defined(clang) or defined(llvm_gcc): + type + uint128*{.importc: "unsigned __int128".} = object -# # TODO !!! - Replace by constant-time, portable, non-assembly version -# # -> use uint128? Compiler might add unwanted branches + func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".} -# # MUL r/m64 -# # Multiply RAX by r/m64 -# # -# # Inputs: -# # - RAX -# # - r/m -# # Outputs: -# # - High word in RDX -# # - Low word in RAX + # TODO !!! - Replace by constant-time, portable, non-assembly version + # -> use uint128? Compiler might add unwanted branches -# # Don't forget to dereference the var hidden pointer in hi/lo -# asm """ -# mulq %[operand] -# : "=d" (`*hi`), "=a" (`*lo`) -# : "a" (`a`), [operand] "rm" (`b`) -# : -# """ + # DIV r/m64 + # Divide RDX:RAX (n_hi:n_lo) by r/m64 + # + # Inputs + # - numerator high word in RDX, + # - numerator low word in RAX, + # - divisor as r/m parameter (register or memory at the compiler discretion) + # Result + # - Quotient in RAX + # - Remainder in RDX -# func unsafeExtPrecMul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.}= -# ## Extended precision multiplication uint64 * uint64 --> uint128 -# ## -# ## TODO, at the moment only x86_64 architecture are supported -# ## as we use assembly. -# ## Also we assume that the native integer division -# ## provided by the PU is constant-time + # 1. name the register/memory "divisor" + # 2. don't forget to dereference the var hidden pointer + # 3. - + # 4. no clobbered registers beside explectly used RAX and RDX + asm """ + divq %[divisor] + : "=a" (`*q`), "=d" (`*r`) + : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`) + : + """ -# # Note, using C/Nim default `*` is inefficient -# # and complicated to make constant-time -# # See at the bottom. + template unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) = + ## Extended precision multiplication + addition + ## This is constant-time on most hardware except some specific one like Cortex M0 + ## (hi, lo) <- a*b + c + block: + # Note: since a and b use 63-bit, + # the result is 126-bit and carrying cannot overflow + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].} -# type T = uint64 + {.emit:[hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].} + {.emit:[lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].} -# when not defined(amd64): -# {.error: "At the moment only x86_64 architecture is supported".} -# else: -# asm_x86_64_extMul(T(hi), T(lo), T(a), T(b)) + template unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) = + ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2 + block: + # TODO: Can this overflow? + var dblPrec: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1, + " + (unsigned __int128)", a2," * (unsigned __int128)", b2, + " + (unsigned __int128)", c1, + " + (unsigned __int128)", c2, ";"].} + {.emit:[hi, " = (NU64)", dblPrec," >> ", 63'u64, ";"].} + {.emit:[lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].} -# func asm_x86_64_div2n1n(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= -# ## Division uint128 by uint64 -# ## Warning ⚠️ : -# ## - if n_hi == d, quotient does not fit in an uint64 -# ## - if n_hi > d result is undefined - -# # TODO !!! - Replace by constant-time, portable, non-assembly version -# # -> use uint128? Compiler might add unwanted branches - -# # DIV r/m64 -# # Divide RDX:RAX (n_hi:n_lo) by r/m64 -# # -# # Inputs -# # - numerator high word in RDX, -# # - numerator low word in RAX, -# # - divisor as r/m parameter (register or memory at the compiler discretion) -# # Result -# # - Quotient in RAX -# # - Remainder in RDX - -# # 1. name the register/memory "divisor" -# # 2. don't forget to dereference the var hidden pointer -# # 3. - -# # 4. no clobbered registers beside explectly used RAX and RDX -# asm """ -# divq %[divisor] -# : "=a" (`*q`), "=d" (`*r`) -# : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`) -# : -# """ - -# func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}= -# ## Division uint128 by uint64 -# ## Warning ⚠️ : -# ## - if n_hi == d, quotient does not fit in an uint64 -# ## - if n_hi > d result is undefined -# ## -# ## To avoid issues, n_hi, n_lo, d should be normalized. -# ## i.e. shifted (== multiplied by the same power of 2) -# ## so that the most significant bit in d is set. -# ## -# ## TODO, at the moment only x86_64 architecture are supported -# ## as we use assembly. -# ## Also we assume that the native integer division -# ## provided by the PU is constant-time - -# # Note, using C/Nim default `div` is inefficient -# # and complicated to make constant-time -# # See at the bottom. -# # -# # Furthermore compilers may try to substitute division -# # with a fast path that may have branches. It might also -# # be the same at the hardware level. - -# # TODO !!! - Replace by constant-time, portable, non-assembly version -# # -> use uint128? Compiler might add unwanted branches - -# type T = uint64 - -# when not defined(amd64): -# {.error: "At the moment only x86_64 architecture is supported".} -# else: -# asm_x86_64_div2n1n(T(q), T(r), T(n_hi), T(n_lo), T(d)) - -when isMainModule: - block: # Multiplication - var hi, lo: uint64 - - asm_x86_64_extMul(hi, lo, 1 shl 32, 1 shl 33) # 2^65 - doAssert hi == 2 - doAssert lo == 0 - - block: # Division - var q, r: uint64 - - # (1 shl 64) div 3 - let n_hi = 1'u64 - let n_lo = 0'u64 - let d = 3'u64 - - asm_x86_64_div2n1n(q, r, n_hi, n_lo, d) - - doAssert q == 6148914691236517205'u64 - doAssert r == 1 - - # block: # TODO - support Quotient that doesn't fit in the result - # # The usual way with normalization by the bitSize difference - # # is fundamentally non constant-time - # # it is probable that division is not constant-time at the hardware level as well - # # as it throws sigfpe when the quotient doesn't fit in the result size - - # var q, r: uint64 - - # let n_hi = 1'u64 - # let n_lo = 0'u64 - # let d = 1'u64 - - # asm_x86_64_div2n1n(q, r, n_hi, n_lo, d) - - # echo "quotient: ", q - # echo "remainder: ", r - - # block: - # var q, r: uint64 - - # let n_hi = 4186590388502004879'u64 - # let n_lo = 17852795547484522084'u64 - # let d = 327340459940166448'u64 - - # asm_x86_64_div2n1n(q, r, n_hi, n_lo, d) - - # echo "quotient: ", q - # echo "remainder: ", r - -# ############################################################## -# -# Non-constant-time portable extended precision multiplication -# -# ############################################################## - -# implementation from Stint: https://github.com/status-im/nim-stint/blob/edb1ade37309390cc641cee07ab62e5459d9ca44/stint/private/uint_mul.nim#L91-L135 - -# template extPrecMulImpl(result: var UintImpl[uint64], op: untyped, u, v: uint64) = -# const -# p = 64 div 2 -# base: uint64 = 1 shl p -# -# var -# x0, x1, x2, x3: uint64 -# -# let -# ul = lo(u) -# uh = hi(u) -# vl = lo(v) -# vh = hi(v) -# -# x0 = ul * vl -# x1 = ul * vh -# x2 = uh * vl -# x3 = uh * vh -# -# x1 += hi(x0) # This can't carry -# x1 += x2 # but this can -# if x1 < x2: # if carry, add it to x3 -# x3 += base -# -# op(result.hi, x3 + hi(x1)) -# op(result.lo, (x1 shl p) or lo(x0)) -# -# func extPrecMul*(result: var UintImpl[uint64], u, v: uint64) = -# ## Extended precision multiplication -# extPrecMulImpl(result, `=`, u, v) -# -# func extPrecAddMul(result: var UintImpl[uint64], u, v: uint64) = -# ## Extended precision fused in-place addition & multiplication -# extPrecMulImpl(result, `+=`, u, v) - -# ############################################################ -# -# Non-constant-time portable div2n1n -# -# ############################################################ - -# implementation from Stint: https://github.com/status-im/nim-stint/blob/edb1ade37309390cc641cee07ab62e5459d9ca44/stint/private/uint_div.nim#L131 - -# func div2n1n[T: SomeunsignedInt](q, r: var T, n_hi, n_lo, d: T) = -# -# # assert countLeadingZeroBits(d) == 0, "Divisor was not normalized" -# -# const -# size = bitsof(q) -# halfSize = size div 2 -# halfMask = (1.T shl halfSize) - 1.T -# -# template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q,r: T] = -# -# var (q, r) = divmod(n_hi, d_hi) -# let m = q * d_lo -# var r = (r shl halfSize) or n_lo -# -# # Fix the reminder, we're at most 2 iterations off -# if r < m: -# dec q -# r += d -# if r >= d and r < m: -# dec q -# r += d -# r -= m -# (q, r) -# -# let -# d_hi = d shr halfSize -# d_lo = d and halfMask -# n_lohi = nlo shr halfSize -# n_lolo = nlo and halfMask -# -# # First half of the quotient -# let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo) -# -# # Second half -# let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo) -# -# q = (q1 shl halfSize) or q2 -# r = r2 +else: + {.error: "Compiler not implemented".} + # For VCC and ICC use add_carry_u64, _add_carryx_u64 intrinsics + # and _umul128