Mamy Ratsimbazafy eee0f4f0fc
Lattice decomposition fixes (#71)
* Sage: Lattice decomp script fixes from anonymous reviewer

* update recoding mini test and add recoding primitives

* Update the GLV recoding

* update comments on positive/negative recoding [skip ci]

* sprinkle some {.noInit.} where possible
2020-08-22 19:45:44 +02:00

386 lines
12 KiB
Nim

# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
../config/common,
../primitives
when UseASM_X86_32:
import ./limbs_asm_x86
when UseASM_X86_64:
import ./limbs_asm_mul_x86
import ./limbs_asm_mul_x86_adx_bmi2
# ############################################################
#
# Limbs raw representation and operations
#
# ############################################################
#
# This file holds the raw operations done on big ints
# The representation is optimized for:
# - constant-time (not leaking secret data via side-channel)
# - performance
# - generated code size, datatype size and stack usage
# in this order
#
# The "limbs" API limits code duplication
# due to generic/static monomorphization for bit-width
# that are represented with the same number of words.
#
# It also exposes the number of words to the compiler
# to allow aggressive unrolling and inlining for example
# of multi-precision addition which is so small (2 instructions per word)
# that inlining it improves both performance and code-size
# even for 2 curves (secp256k1 and BN254) that could share the code.
#
# The limb-endianness is little-endian, the least significant limb is at index 0.
# The word-endianness is native-endian.
# No exceptions allowed
{.push raises: [].}
# ############################################################
#
# Limbs Primitives
#
# ############################################################
{.push inline.}
# The following primitives are small enough on regular limb sizes
# (BN254 and secp256k1 -> 4 limbs, BLS12-381 -> 6 limbs)
# that inline both decreases the code size and increases speed
# as we avoid the parameter packing/unpacking ceremony at function entry/exit
# and unrolling overhead is minimal.
# Initialization
# ------------------------------------------------------------
func setZero*(a: var Limbs) =
  ## Overwrite every limb of ``a`` with zero.
  # One bulk clear starting at the first limb;
  # ``sizeof(a)`` spans the whole limb array.
  let byteCount = sizeof(a)
  zeroMem(addr a[0], byteCount)
func setOne*(a: var Limbs) =
  ## Overwrite ``a`` with the value 1.
  # The upper limbs (if any) are cleared in bulk, the lowest limb
  # receives 1. Both writes touch disjoint memory so their
  # relative order does not matter.
  when a.len > 1:
    zeroMem(addr a[1], sizeof(SecretWord) * (a.len - 1))
  a[0] = SecretWord(1)
# Copy
# ------------------------------------------------------------
func ccopy*(a: var Limbs, b: Limbs, ctl: SecretBool) =
  ## Constant-time conditional assignment.
  ##
  ## - ``ctl`` true:  ``b`` is copied into ``a``
  ## - ``ctl`` false: ``a`` is left untouched
  ##
  ## Time and memory accesses are the same whether a copy occurs or not
  when not UseASM_X86_32:
    # Portable path: conditional copy, one word at a time
    var i = 0
    while i < a.len:
      ccopy(ctl, a[i], b[i])
      inc i
  else:
    # x86 path: delegate to the assembly routine
    ccopy_asm(a, b, ctl)
func cswap*(a, b: var Limbs, ctl: CTBool) =
  ## Constant-time conditional swap:
  ## exchange ``a`` and ``b`` when ``ctl`` is true,
  ## leave both unchanged otherwise.
  ##
  ## Every limb of both operands is read and written
  ## whatever the value of ``ctl``
  ## (unless the compiler tries to be clever)
  # All bits set when ctl is true, all bits clear otherwise
  let selector = -SecretWord(ctl)
  for i in 0 ..< a.len:
    # delta is (a[i] xor b[i]) when swapping and 0 otherwise,
    # so xoring it into both sides either exchanges them or is a no-op
    let delta = selector and (a[i] xor b[i])
    a[i] = a[i] xor delta
    b[i] = b[i] xor delta
# Comparison
# ------------------------------------------------------------
func `==`*(a, b: Limbs): SecretBool =
  ## Constant-time equality test between two limb arrays.
  # OR together the bitwise difference of every limb pair:
  # the accumulator stays zero only when all limbs match.
  var mismatch = Zero
  var i = 0
  while i < a.len:
    mismatch = mismatch or (a[i] xor b[i])
    inc i
  mismatch.isZero()
func `<`*(a, b: Limbs): SecretBool =
  ## Constant-time ``a < b``.
  # Run the full subtraction a - b, discarding the difference words:
  # the borrow left after the last limb is the comparison result.
  var
    borrow: Borrow
    scratch: SecretWord
  for i in 0 ..< a.len:
    subB(borrow, scratch, a[i], b[i], borrow)
  SecretBool(borrow)
func `<=`*(a, b: Limbs): SecretBool =
  ## Constant-time ``a <= b``.
  # a <= b is the negation of b < a
  let bIsSmaller = b < a
  result = not bIsSmaller
func isZero*(a: Limbs): SecretBool =
  ## Returns true if every limb of ``a`` is zero.
  # OR all limbs together; any set bit makes the accumulator non-zero.
  var bits = Zero
  var i = 0
  while i < a.len:
    bits = bits or a[i]
    inc i
  bits.isZero()
func isOne*(a: Limbs): SecretBool =
  ## Returns true if ``a`` is equal to one.
  # The value is one when the lowest limb is exactly 1
  # and every limb above it is zero.
  # The upper limbs are OR-ed together so a single zero test covers them.
  var upper = Zero
  for i in 1 ..< a.len:
    upper = upper or a[i]
  result = (a[0] == SecretWord(1)) and upper.isZero()
func isOdd*(a: Limbs): SecretBool =
  ## Returns true if ``a`` is odd.
  # Parity is the least significant bit of the lowest limb
  let lowBit = a[0] and SecretWord(1)
  result = SecretBool(lowBit)
# Bit manipulation
# ------------------------------------------------------------
func shiftRight*(a: var Limbs, k: int) {.inline.} =
  ## In-place logical shift right of ``a`` by ``k`` bits.
  ##
  ## ``k`` MUST be in the range 0 ..< WordBitWidth,
  ## i.e. strictly less than the number of bits in a word (32 or 64).
  ## ``k`` is treated as a public value: it is used as a shift count
  ## and tested in a branch.
  # We don't reuse shr as this is an in-place operation
  # Do we need to return the shifted out part?
  #
  # Note: for speed, loading a[i] and a[i+1]
  #       instead of a[i-1] and a[i]
  #       is probably easier to parallelize for the compiler
  #       (antidependence WAR vs loop-carried dependence RAW)

  # checkWordShift(k)

  # With k == 0 the complementary shift below would be a shift by
  # WordBitWidth, i.e. by the full width of the word. Such a shift does not
  # reliably produce 0 (it is undefined behaviour in C), so the bits of
  # a[i+1] could leak into a[i]. Shifting by 0 is the identity: nothing to do.
  if k == 0:
    return

  # Number of bits each limb receives from its more significant neighbour
  let carryShift = WordBitWidth - k
  for i in 0 ..< a.len-1:
    a[i] = (a[i] shr k) or (a[i+1] shl carryShift)
  # The top limb has no neighbour to pull bits from
  a[a.len-1] = a[a.len-1] shr k
# Basic Arithmetic
# ------------------------------------------------------------
func add*(a: var Limbs, b: Limbs): Carry =
  ## In-place limbs addition: a <- a + b
  ## Returns the carry out of the last limb
  when not UseASM_X86_32:
    # Portable path: ripple the carry through every limb
    var carry = Carry(0)
    for i in 0 ..< a.len:
      addC(carry, a[i], a[i], b[i], carry)
    result = carry
  else:
    # x86 path: the assembly routine writes a + b into its first operand
    result = add_asm(a, a, b)
func add*(a: var Limbs, w: SecretWord): Carry =
  ## In-place addition of a single word: a <- a + w
  ## Returns the carry out of the last limb
  var carry = Carry(0)
  # The word only contributes to the lowest limb ...
  addC(carry, a[0], a[0], w, carry)
  # ... the remaining limbs just propagate the carry
  var i = 1
  while i < a.len:
    addC(carry, a[i], a[i], Zero, carry)
    inc i
  carry
func cadd*(a: var Limbs, b: Limbs, ctl: SecretBool): Carry =
  ## Constant-time conditional limbs addition.
  ## Returns the carry
  ##
  ## - ``ctl`` true:  a <- a + b
  ## - ``ctl`` false: a <- a
  ##
  ## The carry of a + b is always computed, whatever ``ctl`` is.
  ##
  ## Time and memory accesses are the same whether a copy occurs or not
  var
    carry = Carry(0)
    t: SecretWord
  for i in 0 ..< a.len:
    # Always compute the sum word, then conditionally commit it
    addC(carry, t, a[i], b[i], carry)
    ccopy(ctl, a[i], t)
  carry
func cadd*(a: var Limbs, w: SecretWord, ctl: SecretBool): Carry =
  ## Limbs conditional addition, add a number that fits in a word
  ## Returns the carry
  ##
  ## if ctl is true: a <- a + w
  ## if ctl is false: a <- a
  ## The carry is always computed whether ctl is true or false
  ##
  ## Time and memory accesses are the same whether a copy occurs or not
  # NOTE(review): the return type was previously spelled ``Borrow`` while
  # the body already stored ``Carry(0)`` in ``result``; this relies on both
  # names denoting compatible types - confirm against their declarations.
  result = Carry(0)
  var sum: SecretWord
  # The word only contributes to the lowest limb
  addC(result, sum, a[0], w, result)
  ctl.ccopy(a[0], sum)
  # Remaining limbs only propagate the carry
  for i in 1 ..< a.len:
    addC(result, sum, a[i], Zero, result)
    ctl.ccopy(a[i], sum)
func sum*(r: var Limbs, a, b: Limbs): Carry =
  ## Out-of-place limbs addition: r <- a + b
  ## `r` is initialized/overwritten
  ##
  ## Returns the carry
  when not UseASM_X86_32:
    # Portable path: ripple-carry addition over the limbs of `a`
    var carry = Carry(0)
    for i in 0 ..< a.len:
      addC(carry, r[i], a[i], b[i], carry)
    result = carry
  else:
    result = add_asm(r, a, b)
func sub*(a: var Limbs, b: Limbs): Borrow =
  ## In-place limbs subtraction: a <- a - b
  ## Returns the borrow out of the last limb
  when not UseASM_X86_32:
    # Portable path: ripple the borrow through every limb
    var borrow = Borrow(0)
    for i in 0 ..< a.len:
      subB(borrow, a[i], a[i], b[i], borrow)
    result = borrow
  else:
    # x86 path: the assembly routine writes a - b into its first operand
    result = sub_asm(a, a, b)
func sub*(a: var Limbs, w: SecretWord): Borrow =
  ## In-place subtraction of a single word: a <- a - w
  ## Returns the borrow out of the last limb
  var borrow = Borrow(0)
  # The word is only subtracted from the lowest limb ...
  subB(borrow, a[0], a[0], w, borrow)
  # ... the remaining limbs just propagate the borrow
  var i = 1
  while i < a.len:
    subB(borrow, a[i], a[i], Zero, borrow)
    inc i
  borrow
func csub*(a: var Limbs, b: Limbs, ctl: SecretBool): Borrow =
  ## Constant-time conditional limbs subtraction.
  ## Returns the borrow
  ##
  ## - ``ctl`` true:  a <- a - b
  ## - ``ctl`` false: a <- a
  ##
  ## The borrow of a - b is always computed, whatever ``ctl`` is.
  ##
  ## Time and memory accesses are the same whether a copy occurs or not
  var
    borrow = Borrow(0)
    t: SecretWord
  for i in 0 ..< a.len:
    # Always compute the difference word, then conditionally commit it
    subB(borrow, t, a[i], b[i], borrow)
    ccopy(ctl, a[i], t)
  borrow
func csub*(a: var Limbs, w: SecretWord, ctl: SecretBool): Borrow =
  ## Constant-time conditional subtraction of a single word.
  ## Returns the borrow
  ##
  ## ``a`` is only updated when ``ctl`` is true,
  ## the subtraction itself is always carried out.
  var
    borrow = Borrow(0)
    t: SecretWord
  # The word is only subtracted from the lowest limb
  subB(borrow, t, a[0], w, borrow)
  ccopy(ctl, a[0], t)
  # Remaining limbs only propagate the borrow
  var i = 1
  while i < a.len:
    subB(borrow, t, a[i], Zero, borrow)
    ccopy(ctl, a[i], t)
    inc i
  borrow
func diff*(r: var Limbs, a, b: Limbs): Borrow =
  ## Out-of-place limbs subtraction: r <- a - b
  ## `r` is initialized/overwritten
  ##
  ## Returns the borrow
  when not UseASM_X86_32:
    # Portable path: ripple-borrow subtraction over the limbs of `a`
    var borrow = Borrow(0)
    for i in 0 ..< a.len:
      subB(borrow, r[i], a[i], b[i], borrow)
    result = borrow
  else:
    result = sub_asm(r, a, b)
func cneg*(a: var Limbs, ctl: CTBool) =
  ## Constant-time conditional negation:
  ## ``a`` is negated when ``ctl`` is true and left unchanged otherwise.
  # In two's complement representation
  #   -x = not(x) + 1 = (x xor 0xFF...) + 1
  # while the identity can be written
  #    x = (x xor 0x00...) + 0
  #
  # Both cases share the shape "xor with a mask, then add a bit",
  # and the added bit may carry from limb to limb,
  # so the xor and the carry propagation are fused in a single pass.
  let
    bit = SecretWord(ctl) # 0x01 or 0x00
    flip = -bit           # 0xFF... or 0x00...
  var carry = bit
  for i in 0 ..< a.len:
    let flipped = a[i] xor flip
    let t = flipped + carry
    # The addition wrapped around iff the sum is below the added carry
    carry = SecretWord(t < carry)
    a[i] = t
{.pop.} # inline
# Multiplication
# ------------------------------------------------------------
func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
  ## Multi-precision multiplication
  ##   r <- a*b
  ##
  ## The three operands may each have a different number of limbs.
  ## When ``r`` has fewer limbs than ``a`` and ``b`` combined,
  ## the product is truncated, i.e. the result is
  ##   a * b (mod (2^WordBitwidth)^r.len)
  ##
  ## `r` must not alias ``a`` or ``b``
  when UseASM_X86_64:
    when aLen <= 6:
      # Pick the ADX+BMI2 routine when the CPU reports both features
      if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
        mul_asm_adx_bmi2(r, a, b)
      else:
        mul_asm(r, a, b)
    else:
      mul_asm(r, a, b)
  else:
    # Product Scanning / Comba multiplication:
    # each output column is accumulated in a 3-word accumulator
    # (accHi, accMid, accLo) before being written out.
    var
      accHi = SecretWord(0)
      accMid = SecretWord(0)
      accLo = SecretWord(0)

    staticFor col, 0, min(a.len+b.len, r.len):
      # Index range of the partial products a[x]*b[y] with x + y == col
      const bTop = min(b.len-1, col)
      const aBase = col - bTop
      staticFor step, 0, min(a.len - aBase, bTop+1):
        mulAcc(accHi, accMid, accLo, a[aBase+step], b[bTop-step])

      # Emit the finished column and shift the accumulator down one word
      r[col] = accLo
      accLo = accMid
      accMid = accHi
      accHi = SecretWord(0)

    # Limbs of r beyond the full product width are cleared
    when aLen+bLen < rLen:
      for i in aLen+bLen ..< rLen:
        r[i] = SecretWord(0)
func prod_high_words*[rLen, aLen, bLen](
       r: var Limbs[rLen],
       a: Limbs[aLen], b: Limbs[bLen],
       lowestWordIndex: static int) =
  ## Multi-precision multiplication keeping only the high words
  ##   r <- a*b >> (2^WordBitWidth)^lowestWordIndex
  ##
  ## The three operands may each have a different number of limbs.
  ## When ``r`` has fewer limbs than a.len + b.len - lowestWordIndex,
  ## the result is truncated, i.e. it is
  ##   a * b >> (2^WordBitWidth)^lowestWordIndex (mod (2^WordBitwidth)^r.len)
  #
  # Typical uses:
  #   - Barrett reduction
  #   - Approximating a multiplication by a fractional constant f(a) = K/C * a
  #     with K and C known at compile-time:
  #     pick M = (2^WordBitWidth)^w with M > C (a power of 2 bigger than C),
  #     precompute P = K*M/C at compile-time,
  #     and at runtime compute P*a/M <=> P*a >> (WordBitWidth*w)
  #     i.e. prod_high_words(result, P, a, w)

  # Product Scanning / Comba multiplication with a 3-word accumulator
  var accHi, accMid, accLo = SecretWord(0) # Will raise warning on empty iterations
  # Zero-initialized scratch result: lives on the stack
  # and avoids problems when the output is also an input
  var hiWords: Limbs[rLen]

  # Carries from the 2 columns just below ``lowestWordIndex`` can reach
  # the lowest kept word; earlier columns cannot since the accumulator
  # is 3 words wide. Scanning therefore starts 2 columns early.
  const firstColumn = max(0, lowestWordIndex - 2)

  staticFor col, firstColumn, min(a.len+b.len, r.len+lowestWordIndex):
    # Index range of the partial products a[x]*b[y] with x + y == col
    const bTop = min(b.len-1, col)
    const aBase = col - bTop

    staticFor step, 0, min(a.len - aBase, bTop+1):
      mulAcc(accHi, accMid, accLo, a[aBase+step], b[bTop-step])

    # Only columns at or above the cut-off are stored
    when col >= lowestWordIndex:
      hiWords[col-lowestWordIndex] = accLo
    # Shift the accumulator down one word for the next column
    accLo = accMid
    accMid = accHi
    accHi = SecretWord(0)

  r = hiWords
{.pop.} # raises no exceptions