Improve initialization for montymul. (64-bit speed is 2.3x 32-bit)
This commit is contained in:
parent
feacf2b2ea
commit
0fab0c8a42
|
@ -553,10 +553,12 @@ func montyMul*(
|
||||||
for i in 0 ..< nLen:
|
for i in 0 ..< nLen:
|
||||||
|
|
||||||
let zi = (r[0] + wordMul(a[i], b[0])).wordMul(negInvModWord)
|
let zi = (r[0] + wordMul(a[i], b[0])).wordMul(negInvModWord)
|
||||||
var carry, z = Zero
|
var carry: Word
|
||||||
unsafeFMA2(carry, z, a[i], b[0], zi, M[0], r[0], carry)
|
# (carry, _) <- a[i] * b[0] + zi * M[0] + r[0]
|
||||||
|
unsafeFMA2_hi(carry, a[i], b[0], zi, M[0], r[0])
|
||||||
|
|
||||||
for j in 1 ..< nLen:
|
for j in 1 ..< nLen:
|
||||||
|
# (carry, r[j-1]) <- a[i] * b[j] + zi * M[j] + r[j] + carry
|
||||||
unsafeFMA2(carry, r[j-1], a[i], b[j], zi, M[j], r[j], carry)
|
unsafeFMA2(carry, r[j-1], a[i], b[j], zi, M[j], r[j], carry)
|
||||||
|
|
||||||
r_hi += carry
|
r_hi += carry
|
||||||
|
|
|
@ -14,16 +14,17 @@
|
||||||
|
|
||||||
import ../primitives/constant_time
|
import ../primitives/constant_time
|
||||||
|
|
||||||
type Word* = Ct[uint64]
|
type
|
||||||
## Logical BigInt word
|
BaseType* = uint64
|
||||||
## A logical BigInt word is of size physical MachineWord-1
|
## Physical BigInt for conversion in "normal integers"
|
||||||
|
Word* = Ct[BaseType]
|
||||||
type BaseType* = uint64
|
## Logical BigInt word
|
||||||
## Physical BigInt for conversion in "normal integers"
|
## A logical BigInt word is of size physical MachineWord-1
|
||||||
|
|
||||||
const
|
const
|
||||||
|
ExcessBits = 1
|
||||||
WordPhysBitSize* = sizeof(Word) * 8
|
WordPhysBitSize* = sizeof(Word) * 8
|
||||||
WordBitSize* = WordPhysBitSize - 1
|
WordBitSize* = WordPhysBitSize - ExcessBits
|
||||||
|
|
||||||
CtTrue* = ctrue(Word)
|
CtTrue* = ctrue(Word)
|
||||||
CtFalse* = cfalse(Word)
|
CtFalse* = cfalse(Word)
|
||||||
|
|
|
@ -59,6 +59,16 @@ template unsafeFMA2*(hi, lo: var Ct[uint32], a1, b1, a2, b2, c1, c2: Ct[uint32])
|
||||||
hi = Ct[uint32](dblPrec shr 31)
|
hi = Ct[uint32](dblPrec shr 31)
|
||||||
lo = Ct[uint32](dblPrec) and Ct[uint32](1 shl 31 - 1)
|
lo = Ct[uint32](dblPrec) and Ct[uint32](1 shl 31 - 1)
|
||||||
|
|
||||||
|
template unsafeFMA2_hi*(hi: var Ct[uint32], a1, b1, a2, b2, c1: Ct[uint32]) =
|
||||||
|
## Returns the high word of the sum of extended precision multiply-adds
|
||||||
|
## (hi, _) <- a1 * b1 + a2 * b2 + c1
|
||||||
|
block:
|
||||||
|
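# TODO: Can this overflow? (If every operand is a 31-bit word, each product is < 2^62 and the sum is < 2^63 + 2^31, which fits in uint64 — confirm callers never pass full 32-bit values.)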
|
||||||
|
let dblPrec = uint64(a1) * uint64(b1) +
|
||||||
|
uint64(a2) * uint64(b2) +
|
||||||
|
uint64(c1)
|
||||||
|
hi = Ct[uint32](dblPrec shr 31)
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
# 64-bit words
|
# 64-bit words
|
||||||
|
@ -126,6 +136,16 @@ when defined(gcc) or defined(clang) or defined(llvm_gcc):
|
||||||
{.emit:[hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
|
{.emit:[hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
|
||||||
{.emit:[lo, " = (NU64)", dblPrec," & ", (1'u64 shl 63 - 1), ";"].}
|
{.emit:[lo, " = (NU64)", dblPrec," & ", (1'u64 shl 63 - 1), ";"].}
|
||||||
|
|
||||||
|
template unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) =
|
||||||
|
## Returns the high word of the sum of extended precision multiply-adds
|
||||||
|
## (hi, _) <- a1 * b1 + a2 * b2 + c
|
||||||
|
block:
|
||||||
|
var dblPrec: uint128
|
||||||
|
{.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
|
||||||
|
" + (unsigned __int128)", a2," * (unsigned __int128)", b2,
|
||||||
|
" + (unsigned __int128)", c, ";"].}
|
||||||
|
{.emit:[hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
|
||||||
|
|
||||||
else:
|
else:
|
||||||
{.error: "Compiler not implemented".}
|
{.error: "Compiler not implemented".}
|
||||||
# For VCC and ICC use _addcarry_u64, _addcarryx_u64 intrinsics
|
# For VCC and ICC use _addcarry_u64, _addcarryx_u64 intrinsics
|
||||||
|
|
Loading…
Reference in New Issue