Refactoring, optimize code-size: use type-erased views to avoid monomorphization of compute kernels

2020-02-10 18:16:34 +01:00 · 2020-02-10 18:16:34 +01:00 · b689223cf5
parent ade919b003
commit b689223cf5
11 changed files with 577 additions and 367 deletions
--- a/constantine/bigints.nim
+++ b/constantine/bigints.nim
@ -1,313 +0,0 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 # ############################################################
 #
 #                    BigInt representation
 #
 # ############################################################
 # To avoid carry issues we don't use the
 # most significant bit of each word.
 # i.e. for a uint64 base we only use 63-bit.
 # More info: https://github.com/status-im/nim-constantine/wiki/Constant-time-arithmetics#guidelines
 # Especially:
 #    - https://bearssl.org/bigint.html
 #    - https://cryptojedi.org/peter/data/pairing-20131122.pdf
 #    - http://docs.milagro.io/en/amcl/milagro-crypto-library-white-paper.html
 #
 # Note that this might also be beneficial in terms of performance.
 # Due to opcode latency, on Nehalem ADC is 6x times slower than ADD
 # if it has dependencies (i.e the ADC depends on a previous ADC result)
 # Control flow should only depends on the static maximum number of bits
 # This number is defined per Finite Field/Prime/Elliptic Curve
 #
 # For efficiency, our limbs will use a word size of 63-bit
 # Warning ⚠️ : This assumes that u64 + u64 and u64 * u64
 #              are constant-time even on 32-bit platforms
 #
 # We internally order the limbs in little-endian
 # So the least significant limb is limb[0]
 # This is independent from the base type endianness.
 import ./primitives
 from ./private/primitives_internal import unsafeDiv2n1n, unsafeExtendedPrecMul
 type Word* = Ct[uint32]
 type BaseType* = uint32 # Exported type for conversion in "normal integers"
 const WordBitSize* = sizeof(Word) * 8 - 1
  ## Number of usable bits per limb: one less than the physical size of `Word`
  ## (31-bit for the 32-bit `Word` declared above; a 64-bit word would give 63-bit limbs)
 const
  Zero* = Word(0)
  One* = Word(1)
  MaxWord* = (not Zero) shr 1
    ## This represents 0x7F_FF_FF_FF__FF_FF_FF_FF
    ## also 0b0111...1111
    ## This biggest representable number in our limbs.
    ## i.e. The most significant bit is never set at the end of each function
 func wordsRequired(bits: int): int {.compileTime.}=
  (bits + WordBitSize - 1) div WordBitSize
 # TODO: Currently the library instantiates primitives like "add"
 #       for each supported "bits" size. This will lead to code duplication
 #       if many sizes (for example for secp256k1, bn254 and BLS12-381)
 #       are required.
 #       It could be avoided by having the bitsize be a runtime field
 #       of the bigint. However the tradeoff would be:
 #       - overhead of this additional field
 #       - limbs have to be stored in an UncheckedArray instead of an array
 #         introducing memory management issues
 type
  BigInt*[bits: static int] = object
    ## Fixed-precision big integer
    ##
    ## "limbs" is an internal field that holds the internal representation
    ## of the big integer. This internal representation can be changed
    ## without notice and should not be used by external applications or libraries.
    # Constantine BigInt have a word-size chosen to minimize bigint memory usage
    # while allowing carry-less operations in a machine-efficient type like uint32
    # uint64 or uint128 if available.
    # In practice the word size is 63-bit.
    #
    # "Limb-endianess" is little-endian (least significant limb at BigInt.limbs[0])
    limbs*: array[bits.wordsRequired, Word]
 # No exceptions allowed
 # TODO: can we use compile-time "Natural" instead of "int" in that case?
 {.push raises: [].}
 # ############################################################
 #
 #                         Internal
 #
 # ############################################################
 func copyLimbs*[dstBits, srcBits](
        dst: var BigInt[dstBits], dstStart: static int,
        src: BigInt[srcBits], srcStart: static int,
        numLimbs: static int) {.inline.}=
  ## Copy `numLimbs` from src into dst
  ## If `dst` buffer is larger than `numLimbs` buffer
  ## the extra space will be zero-ed out
  ##
  ## Limbs ordering is little-endian. limb 0 is the least significant/
  ##
  ## This should work at both compile-time and runtime.
  ##
  ## `numLimbs` must be less or equal the limbs of the `dst` and `src` buffers
  ## This is checked at compile-time and has no runtime impact
  static:
    doAssert numLimbs >= 0, "`numLimbs` must be greater or equal zero"
    doAssert numLimbs + srcStart <= src.limbs.len,
      "The number of limbs to copy (" & $numLimbs &
      ") must be less or equal to the number of limbs in the `src` buffer (" &
      $src.limbs.len & " for " & $srcBits & " bits)"
    doAssert numLimbs + dstStart <= dst.limbs.len,
      "The number of limbs to copy (" & $numLimbs &
      ") must be less or equal to the number of limbs in the `dst` buffer (" &
      $dst.limbs.len & " for " & $dstBits & " bits)"
  # TODO: do we need a copyMem / memcpy specialization for runtime
  #       or use dst.limbs[0..<numLimbs] = src.toOpenarray(0, numLimbs - 1)
  for i in static(0 ..< numLimbs):
    dst.limbs[i+dstStart] = src.limbs[i+srcStart]
 func setZero*(a: var BigInt, start, stop: static int) {.inline.} =
  ## Set limbs to zero
  ## The [start, stop] range is inclusive
  ## If stop < start, a is unmodified
  static:
    doAssert start in 0 ..< a.limbs.len, $start & " not in 0 ..< " & $a.limbs.len & " (numLimbs)"
    doAssert stop  in 0 ..< a.limbs.len, $stop & " not in 0 ..< " & $a.limbs.len & " (numLimbs)"
  for i in static(start .. stop):
    a.limbs[i] = Zero
 # ############################################################
 #
 #                    BigInt primitives
 #
 # ############################################################
 # TODO: {.inline.} analysis
 func isZero*(a: BigInt): CTBool[Word] =
  ## Returns if a big int is equal to zero
  var accum: Word
  for i in static(0 ..< a.limbs.len):
    accum = accum or a.limbs[i]
  result = accum.isZero()
 func `==`*(a, b: BigInt): CTBool[Word] =
  ## Returns true if 2 big ints are equal
  var accum: Word
  for i in static(0 ..< a.limbs.len):
    accum = accum or (a.limbs[i] xor b.limbs[i])
  result = accum.isZero
 # The arithmetic primitives all accept a control input that indicates
 # if it is a placebo operation. It stills performs the
 # same memory accesses to be side-channel attack resistant.
 func add*[bits](a: var BigInt[bits], b: BigInt[bits], ctl: CTBool[Word]): CTBool[Word] =
  ## Constant-time big integer in-place optional addition
  ## The addition is only performed if ctl is "true"
  ## The result carry is always computed.
  for i in static(0 ..< a.limbs.len):
    let new_a = a.limbs[i] + b.limbs[i] + Word(result)
    result = new_a.isMsbSet()
    a.limbs[i] = ctl.mux(new_a and MaxWord, a.limbs[i])
 func sub*[bits](a: var BigInt[bits], b: BigInt[bits], ctl: CTBool[Word]): CTBool[Word] =
  ## Constant-time big integer in-place optional substraction
  ## The substraction is only performed if ctl is "true"
  ## The result carry is always computed.
  for i in static(0 ..< a.limbs.len):
    let new_a = a.limbs[i] - b.limbs[i] - Word(result)
    result = new_a.isMsbSet()
    a.limbs[i] = ctl.mux(new_a and MaxWord, a.limbs[i])
 # ############################################################
 #
 #                   Modular BigInt
 #
 # ############################################################
 # TODO: push boundsCheck off. They would be extremely costly.
 func shlAddMod[bits](a: var BigInt[bits], c: Word, M: BigInt[bits]) =
  ## Fused modular left-shift + add
  ## Shift input `a` by a word and add `c` modulo `M`
  ##
  ## With a word W = 2^WordBitSize and a modulus M
  ## Does a <- a * W + c (mod M)
  ##
  ## The modulus `M` **must** use all of its declared `bits` bits
  ## (the assertion below only approximates this: the top limb must be non-zero).
  ## `c` must not have its most-significant bit set.
  assert not M.limbs[^1].isZero.bool, "The modulus must use all declared bits"
  const len = a.limbs.len
  when bits <= WordBitSize:
    # If M fits in a single limb
    var q: Word
    # (hi, lo) = a * 2^WordBitSize + c, spread over a double physical word
    let hi = a.limbs[0] shr 1                        # physical word size - WordBitSize = 1
    let lo = (a.limbs[0] shl WordBitSize) or c       # Assumes most-significant bit in c is not set
    unsafeDiv2n1n(q, a.limbs[0], hi, lo, M.limbs[0]) # (hi, lo) mod M
    return
  else: # TODO replace moveMem with a proc that also works at compile-time
    ## Multiple limbs
    let hi = a.limbs[^1]                                               # Save the high word to detect carries
    # NOTE(review): `bits` is the announced bit length here; confirm that masking it
    # with WordBitSize yields the intended number of bits used in the top limb.
    const R = bits and WordBitSize                                     # R = bits mod (WordBitSize + 1)
    when R == 0:                                                       # Top limb boundary is word-aligned
      let a0 = a.limbs[^1]                                             #
      moveMem(a.limbs[1].addr, a.limbs[0].addr, (len-1) * Word.sizeof) # we can just shift words
      a.limbs[0] = c                                                   # and replace the first one by c
      let a1 = a.limbs[^1]
      let m0 = M.limbs[^1]
    else: # Need to deal with partial word shifts at the edge.
      let a0 = ((a.limbs[^1] shl (WordBitSize-R)) or (a.limbs[^2] shr R)) and MaxWord
      moveMem(a.limbs[1].addr, a.limbs[0].addr, (len-1) * Word.sizeof)
      a.limbs[0] = c
      let a1 = ((a.limbs[^1] shl (WordBitSize-R)) or (a.limbs[^2] shr R)) and MaxWord
      let m0 = ((M.limbs[^1] shl (WordBitSize-R)) or (M.limbs[^2] shr R)) and MaxWord
    # m0 has its high bit set. (a0, a1)/m0 fits in a limb.
    # Get a quotient q, at most we will be 2 iterations off
    # from the true quotient
    let
      a_hi = a0 shr 1                              # physical word size - WordBitSize = 1
      a_lo = (a0 shl WordBitSize) or a1
    var q, r: Word
    unsafeDiv2n1n(q, r, a_hi, a_lo, m0)            # Estimate quotient
    q = mux(                                       # If a0 == divisor
          a0 == m0, MaxWord,                       # Quotient == MaxWord (0b0111...1111)
          mux(
            q.isZero, Zero,                        # elif q == 0, true quotient = 0
            q - One                                # else instead of being off by 0, 1 or 2
          )                                        # we return q-1 to be off by -1, 0 or 1
        )
    # Now subtract a*W - q*M, limb by limb
    var carry = Zero
    var over_p = ctrue(Word)                       # Track if the result is not lower than the modulus
    for i in static(0 ..< M.limbs.len):
      var qp_lo: Word
      block: # q*M
        var qp_hi: Word
        unsafeExtendedPrecMul(qp_hi, qp_lo, q, M.limbs[i])  # q * M[i]
        qp_lo += carry                                      # Add carry from previous limb
        carry = qp_hi shl 1 + qp_lo.isMsbSet.Word           # New carry
        qp_lo = qp_lo and MaxWord                           # Normalize to WordBitSize bits
      block: # a*W - q*M
        a.limbs[i] -= qp_lo
        carry += Word(a.limbs[i].isMsbSet)                  # Adjust if borrow
        a.limbs[i] = a.limbs[i] and MaxWord                 # Normalize to WordBitSize bits
      over_p = mux(
                a.limbs[i] == M.limbs[i], over_p,
                a.limbs[i] > M.limbs[i]
              )
    # Fix quotient, the true quotient is either q-1, q or q+1
    #
    # if carry < hi or carry == hi and over_p we must do "a -= M"
    # if carry > hi (negative result) we must do "a += M"
    #
    # The comparison used to be `carry < hi`, which triggered the "add M back"
    # correction in the underestimated case instead of the negative one.
    let neg = carry > hi
    let tooBig = not neg and (over_p or (carry < hi))
    discard a.add(M, ctl = neg)
    discard a.sub(M, ctl = tooBig)
    return
 func reduce*[aBits, mBits](r: var BigInt[mBits], a: BigInt[aBits], M: BigInt[mBits]) =
  ## Reduce `a` modulo `M` and store the result in `r`
  ##
  ## The modulus `M` **must** use `mBits` bits.
  ##
  ## CT: Depends only on the length of the modulus `M`
  # Note: for all cryptographic intents and purposes the modulus is known at compile-time
  # but we don't want to inline it as it would increase codesize, better have Nim
  # pass a pointer+length to a fixed section of the BSS.
  assert not M.limbs[^1].isZero.bool, "The modulus must use all declared bits"
  when aBits < mBits:
    # if a uses less bits than the modulus,
    # it is guaranteed < modulus.
    # This relies on the precondition that the modulus uses all declared bits
    copyLimbs(r, 0, a, 0, a.limbs.len)
    when a.limbs.len < r.limbs.len:
      # `setZero` statically rejects a start index past the last limb,
      # so the tail is only cleared when `r` really has more limbs than `a`.
      r.setZero(a.limbs.len, r.limbs.len-1)
  else:
    # a length is at least equal to the modulus.
    # We copy the (modulus.limbs-1) most significant limbs of `a`:
    # their value is guaranteed lower than the modulus.
    # All remaining limbs, from index aOffset down to 0, are then
    # fed one by one through the modular shift-left-add.
    # (Copying from `aOffset` and iterating from `aOffset-1` would
    #  silently drop the most significant limb of `a`.)
    const aOffset = a.limbs.len - M.limbs.len
    copyLimbs(r, 0, a, aOffset+1, M.limbs.len - 1)
    r.limbs[^1] = Zero
    for i in countdown(aOffset, 0):
      r.shlAddMod(a.limbs[i], M)
--- a/constantine/bigints_public.nim
+++ b/constantine/bigints_public.nim
@ -0,0 +1,54 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ./bigints_raw,
  ./primitives
 # ############################################################
 #
 #                   BigInts Public API
 #
 # ############################################################
 # The "public" API, exported for finite field computations
 # enforced compile-time checking of BigInt bitsize
 #
 # The "raw" compute API, uses views to avoid code duplication due to generic/static monomorphization.
 # No exceptions allowed
 {.push raises: [].}
 {.push inline.}
 func isZero*(a: BigInt): CTBool[Word] =
  ## Returns true if the big int is equal to zero.
  ## Forwards to the type-erased `isZero` working on a view of `a`.
  isZero(a.view)
 func add*[bits](a: var BigInt[bits], b: BigInt[bits], ctl: CTBool[Word]): CTBool[Word] =
  ## Constant-time big integer in-place optional addition.
  ## `a` is only updated if ctl is "true";
  ## the carry is computed and returned in every case.
  ## Forwards to the type-erased `add` working on views of `a` and `b`.
  result = a.view.add(b.view, ctl)
 func sub*[bits](a: var BigInt[bits], b: BigInt[bits], ctl: CTBool[Word]): CTBool[Word] =
  ## Constant-time big integer in-place optional subtraction
  ## The subtraction is only performed if ctl is "true"
  ## The result borrow is always computed.
  sub(a.view, b.view, ctl)
 func reduce*[aBits, mBits](r: var BigInt[mBits], a: BigInt[aBits], M: BigInt[mBits]) =
  ## Reduce `a` modulo `M` and store the result in `r`
  ##
  ## The modulus `M` **must** use `mBits` bits (the bit at position mBits-1 must be set)
  ##
  ## CT: Depends only on the length of the modulus `M`
  # Note: for all cryptographic intents and purposes the modulus is known at compile-time
  # but we don't want to inline it as it would increase codesize, better have Nim
  # pass a pointer+length to a fixed section of the BSS.
  # The computation itself is done by the type-erased `reduce` on views of the operands.
  reduce(r.view, a.view, M.view)
--- a/constantine/bigints_raw.nim
+++ b/constantine/bigints_raw.nim
@ -0,0 +1,424 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 # ############################################################
 #
 #         BigInt Raw representation and operations
 #
 # ############################################################
 #
 # This file holds the raw operations done on big ints
 # The representation is optimized for:
 # - constant-time (not leaking secret data via side-channel)
 # - generated code size and datatype size
 # - performance
 # in this order
 # ############################################################
 # Design
 # To avoid carry issues we don't use the
 # most significant bit of each machine word.
 # i.e. for a uint64 base we only use 63-bit.
 # More info: https://github.com/status-im/nim-constantine/wiki/Constant-time-arithmetics#guidelines
 # Especially:
 #    - https://bearssl.org/bigint.html
 #    - https://cryptojedi.org/peter/data/pairing-20131122.pdf
 #    - http://docs.milagro.io/en/amcl/milagro-crypto-library-white-paper.html
 #
 # Note that this might also be beneficial in terms of performance.
 # Due to opcode latency, on Nehalem ADC is 6x times slower than ADD
 # if it has dependencies (i.e the ADC depends on a previous ADC result)
 #
 # Control flow should only depends on the static maximum number of bits
 # This number is defined per Finite Field/Prime/Elliptic Curve
 #
 # We internally order the limbs in little-endian
 # So the least significant limb is limb[0]
 # This is independent from the base type endianness.
 #
 # Constantine uses Nim generic integer to prevent mixing
 # BigInts of different bitlength at compile-time and
 # properly statically size the BigInt buffers.
 #
 # To avoid code-bloat due to monomorphization (i.e. duplicating code per announced bitlength)
 # actual computation is deferred to type-erased routines.
 import
  ./primitives, ./common,
  ./primitives_extprecision
 from sugar import distinctBase
 type Word* = Ct[uint32]
  ## Logical BigInt word
  ## A logical BigInt word is of size physical MachineWord-1
 type BaseType* = uint32
  ## Physical BigInt for conversion in "normal integers"
 const
  WordPhysBitSize = sizeof(Word) * 8
  WordBitSize* = WordPhysBitSize - 1
 const
  Zero* = Word(0)
  One* = Word(1)
  MaxWord* = (not Zero) shr 1
    ## All bits set except the most significant one, i.e. 0b0111...1111
    ## (0x7F_FF_FF_FF for the 32-bit `Word` declared above).
    ## This is the biggest representable number in our limbs,
    ## i.e. the most significant bit is never set at the end of each function
 func wordsRequired(bits: int): int {.compileTime.} =
  ## Number of limbs needed to store `bits` bits, where `bits` is
  ## the **announced** bit length: bits / WordBitSize, rounded up.
  let roundedUp = bits + (WordBitSize - 1)
  result = roundedUp div WordBitSize
 type
  BigInt*[bits: static int] = object
    ## Fixed-precision big integer
    ##
    ## - "bits" is the announced bit-length of the BigInt
    ##   This is public data, usually equal to the curve prime bitlength.
    ##
    ## - "bitLength" is the internal bitlength of the integer
    ##   This differs from the canonical bit-length as
    ##   Constantine word-size is smaller than a machine word.
    ##   This value should never be used as-is to prevent leaking secret data.
    ##   Computing this value requires constant-time operations.
    ##   Using this value requires converting it to the # of limbs in constant-time
    ##
    ## - "limbs" is an internal field that holds the internal representation
    ##   of the big integer. Least-significant limb first. Within limbs words are native-endian.
    ##
    ## This internal representation can be changed
    ## without notice and should not be used by external applications or libraries.
    bitLength: uint32
    limbs*: array[bits.wordsRequired, Word]
  BigIntView* = ptr object
    ## Type-erased fixed-precision big integer
    ##
    ## This type mirrors the BigInt type and is used
    ## for the low-level computation API
    ## This design
    ## - avoids code bloat due to generic monomorphization
    ##   otherwise each bigint routine would have an instantiation for
    ##   each static `bits` parameter.
    ## - while not forcing the caller to preallocate computation buffers
    ##   for the high-level API
    ##
    ## As with the BigInt type:
    ## - "bitLength" is the internal bitlength of the integer
    ##   This differs from the canonical bit-length as
    ##   Constantine word-size is smaller than a machine word.
    ##   This value should never be used as-is to prevent leaking secret data.
    ##   Computing this value requires constant-time operations.
    ##   Using this value requires converting it to the # of limbs in constant-time
    ##
    ## - "limbs" is an internal field that holds the internal representation
    ##   of the big integer. Least-significant limb first. Within limbs words are native-endian.
    ##
    ## This internal representation can be changed
    ## without notice and should not be used by external applications or libraries.
    ##
    ## Accesses should be done via BigIntViewConst / BigIntViewMut
    ## to have the compiler check for mutability
    bitLength: uint32
    limbs: UncheckedArray[Word]
  # "Indirection" to enforce pointer types deep immutability
  BigIntViewConst* = distinct BigIntView
    ## Immutable view into a BigInt
  BigIntViewMut* = distinct BigIntView
    ## Mutable view into a BigInt
  BigIntViewAny* = BigIntViewConst or BigIntViewMut
 # No exceptions allowed
 {.push raises: [].}
 # ############################################################
 #
 #                  Mutability safety
 #
 # ############################################################
 template view*(a: BigInt): BigIntViewConst =
  ## Returns a borrowed type-erased immutable view to a bigint
  BigIntViewConst(cast[BigIntView](a.unsafeAddr))
 template view*(a: var BigInt): BigIntViewMut =
  ## Returns a borrowed type-erased mutable view to a mutable bigint
  BigIntViewMut(cast[BigIntView](a.addr))
 template `[]`*(v: BigIntViewConst, limbIdx: int): Word =
  distinctBase(type v)(v).limbs[limbIdx]
 template `[]`*(v: BigIntViewMut, limbIdx: int): var Word =
  distinctBase(type v)(v).limbs[limbIdx]
 template `[]=`*(v: BigIntViewMut, limbIdx: int, val: Word) =
  distinctBase(type v)(v).limbs[limbIdx] = val
 template bitSizeof(v: BigIntViewAny): uint32 =
  distinctBase(type v)(v).bitLength
 # Shift amount used in place of a division by WordPhysBitSize
 # (assumes WordPhysBitSize is a power of two so that `log2` is exact - TODO confirm)
 const divShiftor = log2(WordPhysBitSize)
 template numLimbs*(v: BigIntViewAny): int =
  ## Compute the number of limbs from
  ## the **internal** bitlength:
  ## the bitlength divided by WordPhysBitSize, rounded up,
  ## with the division performed as a right shift by `divShiftor`.
  (bitSizeof(v).int + WordPhysBitSize - 1) shr divShiftor
 template setBitLength(v: BigIntViewMut, internalBitLength: uint32) =
  distinctBase(type v)(v).bitLength = internalBitLength
 # TODO: Check if repeated v.numLimbs calls are optimized away
 template `[]`*(v: BigIntViewConst, limbIdxFromEnd: BackwardsIndex): Word =
  distinctBase(type v)(v).limbs[v.numLimbs.int - int limbIdxFromEnd]
 template `[]`*(v: BigIntViewMut, limbIdxFromEnd: BackwardsIndex): var Word =
  distinctBase(type v)(v).limbs[v.numLimbs.int - int limbIdxFromEnd]
 template `[]=`*(v: BigIntViewMut, limbIdxFromEnd: BackwardsIndex, val: Word) =
  distinctBase(type v)(v).limbs[v.numLimbs.int - int limbIdxFromEnd] = val
 # ############################################################
 #
 #           Checks and debug/test only primitives
 #
 # ############################################################
 template checkMatchingBitlengths(a, b: distinct BigIntViewAny) =
  ## Check that bitlengths of bigints match
  ## This is only checked
  ## with "-d:debugConstantine" and when assertions are on.
  debug:
    assert distinctBase(type a)(a).bitLength ==
      distinctBase(type b)(b).bitLength, "Internal Error: operands bitlength do not match"
 template checkValidModulus(m: BigIntViewConst) =
  ## Check that the modulus is valid
  ## The check is approximate, it only checks that
  ## the most-significant words is non-zero instead of
  ## checking that the last announced bit is 1.
  ## This is only checked
  ## with "-d:debugConstantine" and when assertions are on.
  debug:
    assert not m[^1].isZero.bool, "Internal Error: the modulus must use all declared bits"
 debug:
  func `==`*(a, b: BigInt): CTBool[Word] =
    ## Returns true if 2 big ints are equal
    var accum: Word
    for i in static(0 ..< a.limbs.len):
      accum = accum or (a.limbs[i] xor b.limbs[i])
    result = accum.isZero
  func `$`*(a: BigIntViewAny): string =
    ## Debug rendering of a view: "[limb0, limb1, ...] (N bits)"
    ## Limbs are printed least-significant first, N is the internal bitlength.
    ## A view with zero limbs renders as "[] (0 bits)" instead of
    ## reading one element before the start of the limbs buffer.
    result = "["
    for i in 0 ..< a.numLimbs():
      if i != 0:
        result.add ", "
      result.add $a[i]
    result.add "] ("
    result.add $a.bitSizeof
    result.add " bits)"
 # ############################################################
 #
 #                    BigInt primitives
 #
 # ############################################################
 func setInternalBitLength*(a: var BigInt) {.inline.} =
  ## Derive the actual bitsize used internally of a BigInt
  ## from the announced BigInt bitsize
  ## and set the bitLength field of that BigInt
  ## to that computed value.
  # One extra bit is counted per complete group of WordBitSize announced bits;
  # the expression is wrapped in `static` so it is evaluated at compile time.
  # NOTE(review): presumably this accounts for the unused top bit of each physical
  # word, so that `numLimbs` can divide by WordPhysBitSize - confirm.
  a.bitLength = static(a.bits + a.bits div WordBitSize)
 func isZero*(a: BigIntViewAny): CTBool[Word] =
  ## Returns true if the viewed big int is equal to zero.
  ## All limbs are OR-ed together, there is no early exit.
  var orOfLimbs: Word
  let limbCount = a.numLimbs()
  for idx in 0 ..< limbCount:
    orOfLimbs = orOfLimbs or a[idx]
  orOfLimbs.isZero()
 # The arithmetic primitives all accept a control input that indicates
 # if it is a placebo operation. It still performs the
 # same memory accesses to be side-channel attack resistant.
 func add*(a: BigIntViewMut, b: BigIntViewAny, ctl: CTBool[Word]): CTBool[Word] =
  ## Constant-time in-place optional addition: a <- a + b when ctl is "true",
  ## `a` is rewritten with its own limbs otherwise.
  ## The carry out of the last limb is computed and returned in both cases.
  ##
  ## a and b MAY be the same buffer
  ## a and b MUST have the same announced bitlength (i.e. `bits` static parameters)
  checkMatchingBitlengths(a, b)
  var carry: CTBool[Word]
  for limb in 0 ..< a.numLimbs():
    # The sum may spill into the most-significant bit of the physical word:
    # that bit is the carry for the next limb.
    let limbSum = a[limb] + b[limb] + Word(carry)
    carry = limbSum.isMsbSet()
    a[limb] = ctl.mux(limbSum and MaxWord, a[limb])
  result = carry
 func sub*(a: BigIntViewMut, b: BigIntViewAny, ctl: CTBool[Word]): CTBool[Word] =
  ## Constant-time in-place optional subtraction: a <- a - b when ctl is "true",
  ## `a` is rewritten with its own limbs otherwise.
  ## The borrow out of the last limb is computed and returned in both cases.
  ##
  ## a and b MAY be the same buffer
  ## a and b MUST have the same announced bitlength (i.e. `bits` static parameters)
  checkMatchingBitlengths(a, b)
  var borrow: CTBool[Word]
  for limb in 0 ..< a.numLimbs():
    # A wrapped-around difference sets the most-significant bit of the physical word:
    # that bit is the borrow for the next limb.
    let limbDiff = a[limb] - b[limb] - Word(borrow)
    borrow = limbDiff.isMsbSet()
    a[limb] = ctl.mux(limbDiff and MaxWord, a[limb])
  result = borrow
 # ############################################################
 #
 #                   Modular BigInt
 #
 # ############################################################
 # TODO: push boundsCheck off. They would be extremely costly.
 func shlAddMod(a: BigIntViewMut, c: Word, M: BigIntViewConst) =
  ## Fused modular left-shift + add
  ## Shift input `a` by a word and add `c` modulo `M`
  ##
  ## With a word W = 2^WordBitSize and a modulus M
  ## Does a <- a * W + c (mod M)
  ##
  ## The most-significant announced bit of the modulus `M` MUST be set.
  ## `c` must not have its most-significant bit set.
  checkValidModulus(M)
  let aLen = a.numLimbs()
  let mBits = bitSizeof(M)
  if mBits <= WordBitSize:
    # If M fits in a single limb
    var q: Word
    # (hi, lo) = a * 2^WordBitSize + c, spread over a double physical word
    let hi = a[0] shr 1                   # physical word size - WordBitSize = 1
    let lo = (a[0] shl WordBitSize) or c  # Assumes most-significant bit in c is not set
    unsafeDiv2n1n(q, a[0], hi, lo, M[0])  # (hi, lo) mod M
    return
  else:
    ## Multiple limbs
    let hi = a[^1]                                          # Save the high word to detect carries
    let R = mBits and WordBitSize                           # R = mBits mod (WordBitSize + 1)
    var a0, a1, m0: Word
    if R == 0:                                              # If the top limb boundary is word-aligned
      a0 = a[^1]                                        #
      moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof) # we can just shift words
      a[0] = c                                              # and replace the first one by c
      a1 = a[^1]
      m0 = M[^1]
    else:                                                   # Else: need to deal with partial word shifts at the edge.
      a0 = ((a[^1] shl (WordBitSize-R)) or (a[^2] shr R)) and MaxWord
      moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof)
      a[0] = c
      a1 = ((a[^1] shl (WordBitSize-R)) or (a[^2] shr R)) and MaxWord
      m0 = ((M[^1] shl (WordBitSize-R)) or (M[^2] shr R)) and MaxWord
    # m0 has its high bit set. (a0, a1)/m0 fits in a limb.
    # Get a quotient q, at most we will be 2 iterations off
    # from the true quotient
    let
      a_hi = a0 shr 1                              # physical word size - WordBitSize = 1
      a_lo = (a0 shl WordBitSize) or a1
    var q, r: Word
    unsafeDiv2n1n(q, r, a_hi, a_lo, m0)            # Estimate quotient
    q = mux(                                       # If a0 == divisor
          a0 == m0, MaxWord,                       # Quotient == MaxWord (0b0111...1111)
          mux(
            q.isZero, Zero,                        # elif q == 0, true quotient = 0
            q - One                                # else instead of being off by 0, 1 or 2
          )                                        # we return q-1 to be off by -1, 0 or 1
        )
    # Now subtract a*W - q*M, limb by limb
    var carry = Zero
    var over_p = ctrue(Word)                       # Track if the result is not lower than the modulus
    for i in 0 ..< M.numLimbs():
      var qp_lo: Word
      block: # q*M
        var qp_hi: Word
        unsafeExtendedPrecMul(qp_hi, qp_lo, q, M[i]) # q * M[i]
        qp_lo += carry                               # Add carry from previous limb
        carry = qp_hi shl 1 + qp_lo.isMsbSet.Word    # New carry
        qp_lo = qp_lo and MaxWord                    # Normalize to WordBitSize bits
      block: # a*W - q*M
        a[i] -= qp_lo
        carry += Word(a[i].isMsbSet)                 # Adjust if borrow
        a[i] = a[i] and MaxWord                      # Normalize to WordBitSize bits
      over_p = mux(
                a[i] == M[i], over_p,
                a[i] > M[i]
              )
    # Fix quotient, the true quotient is either q-1, q or q+1
    #
    # if carry < hi or carry == hi and over_p we must do "a -= M"
    # if carry > hi (negative result) we must do "a += M"
    #
    # The comparison used to be `carry < hi`, which triggered the "add M back"
    # correction in the underestimated case instead of the negative one.
    let neg = carry > hi
    let tooBig = not neg and (over_p or (carry < hi))
    discard a.add(M, ctl = neg)
    discard a.sub(M, ctl = tooBig)
    return
 func reduce*(r: BigIntViewMut, a: BigIntViewAny, M: BigIntViewConst) =
  ## Reduce `a` modulo `M` and store the result in `r`
  ##
  ## The most-significant announced bit of the modulus `M` MUST be set.
  ## The result `r` buffer size MUST be at least the size of `M` buffer
  ## The internal bitlength of `r` is overwritten with the one of `M`.
  ##
  ## CT: Depends only on the bitlength of `a` and the modulus `M`
  # Note: for all cryptographic intents and purposes the modulus is known at compile-time
  # but we don't want to inline it as it would increase codesize, better have Nim
  # pass a pointer+length to a fixed section of the BSS.
  checkValidModulus(M)
  let aBits = bitSizeof(a)
  let mBits = bitSizeof(M)
  let aLen = a.numLimbs()
  r.setBitLength(bitSizeof(M))
  if aBits < mBits:
    # if a uses less bits than the modulus,
    # it is guaranteed < modulus.
    # This relies on the precondition that the modulus uses all declared bits
    copyMem(r[0].addr, a[0].unsafeAddr, aLen * sizeof(Word))
    for i in aLen ..< r.numLimbs():
      r[i] = Zero
  else:
    # a length is at least equal to the modulus.
    # We copy the (mLen-1) most significant limbs of `a`:
    # their value is guaranteed lower than the modulus.
    # All remaining limbs, from index aOffset down to 0, are then
    # fed one by one through the modular shift-left-add.
    # (Copying from `aOffset` and iterating from `aOffset-1` would
    #  silently drop the most significant limb of `a`.)
    let mLen = M.numLimbs()
    let aOffset = aLen - mLen
    # When mLen == 1 nothing is copied: the source address is one past
    # the last limb but it is never dereferenced.
    copyMem(r[0].addr, a[aOffset+1].unsafeAddr, (mLen-1) * sizeof(Word))
    r[^1] = Zero
    for i in countdown(aOffset, 0):
      r.shlAddMod(a[i], M)
--- a/constantine/common.nim
+++ b/constantine/common.nim
@ -0,0 +1,13 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 # Common configuration
 # Emit `body` only when the `debugConstantine` symbol is defined
 # (for instance with `-d:debugConstantine`); otherwise the `when`
 # has no other branch and nothing is emitted.
 template debug*(body: untyped): untyped =
  when defined(debugConstantine):
    body
--- a/constantine/private/curves_config_parser.nim
+++ b/constantine/private/curves_config_parser.nim
--- a/constantine/io.nim
+++ b/constantine/io.nim
@ -12,7 +12,7 @@
 import
  endians,
-  ./primitives, ./bigints
+  ./primitives, ./bigints_raw
 # ############################################################
 #
@ -23,14 +23,16 @@ import
 # TODO: tag/remove exceptions raised.
 func fromRawUintLE(
-        T: type BigInt,
+        dst: var BigInt,
-        src: openarray[byte]): T =
+        src: openarray[byte]) =
  ## Parse an unsigned integer from its canonical
  ## little-endian unsigned representation
  ## And store it into a BigInt of size bits
  ##
-  ## CT:
+  ## Constant-Time:
  ##   - no leaks
  ##
  ## Can work at compile-time
  var
    dst_idx = 0
@ -46,13 +48,33 @@ func fromRawUintLE(
    # if full, dump
    if acc_len >= WordBitSize:
-      result.limbs[dst_idx] = acc and MaxWord
+      dst.limbs[dst_idx] = acc and MaxWord
      inc dst_idx
      acc_len -= WordBitSize
      acc = src_byte shr (8 - acc_len)
-  if dst_idx < result.limbs.len:
+  if dst_idx < dst.limbs.len:
-    result.limbs[dst_idx] = acc
+    dst.limbs[dst_idx] = acc
 func fromRawUint*(
        dst: var BigInt,
        src: openarray[byte],
        srcEndianness: static Endianness) {.inline.}=
  ## Fill `dst` from `src`, the canonical unsigned representation
  ## (big-endian or little-endian) of an integer.
  ## The value is stored into a BigInt of size `bits`
  ##
  ## Constant-Time:
  ##   - no leaks
  ##
  ## Can work at compile-time to embed curve moduli
  ## from a canonical integer representation
  when srcEndianness != littleEndian:
    # Only the little-endian parser exists so far
    {.error: "Not implemented at the moment".}
  else:
    fromRawUintLE(dst, src)
  # Runs after the limbs have been written
  setInternalBitLength(dst)
 func fromRawUint*(
        T: type BigInt,
@ -62,20 +84,19 @@ func fromRawUint*(
  ## big-endian or little-endian unsigned representation
  ## And store it into a BigInt of size `bits`
  ##
-  ## CT:
+  ## Constant-Time:
  ##   - no leaks
-
+  ##
-  when srcEndianness == littleEndian:
+  ## Can work at compile-time to embed curve moduli
-    fromRawUintLE(T, src)
+  ## from a canonical integer representation
-  else:
+  result.fromRawUint(src, srcEndianness)
    {.error: "Not implemented at the moment".}
 func fromUint*(
        T: type BigInt,
        src: SomeUnsignedInt): T =
  ## Parse a regular unsigned integer
  ## and store it into a BigInt of size `bits`
  # `src` is reinterpreted as an array of `sizeof(src)` bytes and those
  # bytes are parsed using the CPU endianness (`cpuEndian`).
  # NOTE(review): the `dst: var BigInt` overload of fromRawUint only
  # implements littleEndian and raises a compile-time error otherwise, so
  # this presumably does not compile for big-endian targets - confirm.
-  fromRawUint(T, cast[array[sizeof(src), byte]](src), cpuEndian)
+  result.fromRawUint(cast[array[sizeof(src), byte]](src), cpuEndian)
 # ############################################################
 #
@ -128,7 +149,6 @@ func dumpRawUintLE(
      if tail >= sizeof(Word):
        # Unrolled copy
        # debugecho src.repr
        littleEndianXX(dst[dst_idx].addr, lo.unsafeAddr)
        dst_idx += sizeof(Word)
        tail -= sizeof(Word)
@ -293,6 +313,8 @@ func fromHex*(T: type BigInt, s: string): T =
  ##
  ## This API is intended for configuration and debugging purposes
  ## Do not pass secret or private data to it.
  ##
  ## Can work at compile-time to declare curve moduli from their hex strings
  # 1. Convert to canonical uint
  const canonLen = (T.bits + 8 - 1) div 8
@ -300,7 +322,7 @@ func fromHex*(T: type BigInt, s: string): T =
  hexToPaddedByteArray(s, bytes, littleEndian)
  # 2. Convert canonical uint to Big Int
-  result = T.fromRawUint(bytes, littleEndian)
+  result.fromRawUint(bytes, littleEndian)
 func dumpHex*(big: BigInt, order: static Endianness = bigEndian): string =
  ## Stringify an int to hex.
--- a/constantine/primitives.nim
+++ b/constantine/primitives.nim
@ -116,6 +116,21 @@ func isMsbSet*[T: Ct](x: T): CTBool[T] =
  const msb_pos = T.sizeof * 8 - 1
  result = (CTBool[T])(x shr msb_pos)
 func log2*(x: uint32): uint32 =
  ## Base-2 logarithm, rounded down, of an integer of 32 bits or less,
  ## computed with a De Bruijn multiplication and a table lookup.
  ## Works at compile-time, guaranteed constant-time.
  ## An input of 0 yields 0.
  # https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
  const deBruijnPos: array[32, uint8] = [0'u8, 9, 1, 10, 13, 21, 2, 29,
    11, 14, 16, 18, 22, 25, 3, 30,
    8, 12, 20, 28, 15, 17, 24, 7,
    19, 27, 23, 6, 26, 5, 4, 31]
  # Propagate the highest set bit into every lower position:
  # this rounds the value to one less than a power of 2
  var smeared = x
  for shift in [1, 2, 4, 8, 16]:
    smeared = smeared or (smeared shr shift)
  # The top 5 bits of the wrapping product select the table slot
  let slot = (smeared * 0x07C4ACDD'u32) shr 27
  result = uint32(deBruijnPos[slot])
 # ############################################################
 #
 #             Hardened Boolean primitives
--- a/constantine/private/primitives_internal.nim
+++ b/constantine/private/primitives_internal.nim
@ -12,7 +12,7 @@
 #
 # ############################################################
-import ../primitives
+import ./primitives
 func asm_x86_64_extMul(hi, lo: var uint64, a, b: uint64) {.inline.}=
  ## Extended precision multiplication uint64 * uint64 --> uint128
@ -161,34 +161,34 @@ when isMainModule:
    doAssert q == 6148914691236517205'u64
    doAssert r == 1
-  block: # TODO - support Quotient that doesn't fit in the result
+  # block: # TODO - support Quotient that doesn't fit in the result
-         # The usual way with normalization by the bitSize difference
+  #        # The usual way with normalization by the bitSize difference
-         # is fundamentally non constant-time
+  #        # is fundamentally non constant-time
-         # it is probable that division is not constant-time at the hardware level as well
+  #        # it is probable that division is not constant-time at the hardware level as well
-         # as it throws sigfpe when the quotient doesn't fit in the result size
+  #        # as it throws sigfpe when the quotient doesn't fit in the result size
-    var q, r: uint64
+  #   var q, r: uint64
-    let n_hi = 1'u64
+  #   let n_hi = 1'u64
-    let n_lo = 0'u64
+  #   let n_lo = 0'u64
-    let d = 1'u64
+  #   let d = 1'u64
-    asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
+  #   asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
-    echo "quotient: ", q
+  #   echo "quotient: ", q
-    echo "remainder: ", r
+  #   echo "remainder: ", r
-  block:
+  # block:
-    var q, r: uint64
+  #   var q, r: uint64
-    let n_hi = 4186590388502004879'u64
+  #   let n_hi = 4186590388502004879'u64
-    let n_lo = 17852795547484522084'u64
+  #   let n_lo = 17852795547484522084'u64
-    let d = 327340459940166448'u64
+  #   let d = 327340459940166448'u64
-    asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
+  #   asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
-    echo "quotient: ", q
+  #   echo "quotient: ", q
-    echo "remainder: ", r
+  #   echo "remainder: ", r
 # ##############################################################
 #
--- a/tests/test_bigints.nim
+++ b/tests/test_bigints.nim
@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import  unittest, random, strutils,
-        ../constantine/[io, bigints, primitives]
+        ../constantine/[io, bigints_public, bigints_raw, primitives]
 suite "isZero":
  test "isZero for zero":
@ -128,12 +128,12 @@ suite "Modular operations - small modulus":
  # Vectors taken from Stint - https://github.com/status-im/nim-stint
  test "100 mod 13":
    # Reduce a 32-bit operand by a small modulus.
    # 13 = 0b1101 occupies exactly 4 bits, so the modulus (and the result)
    # are declared as BigInt[4]: `reduce` documents that the announced
    # most-significant bit of the modulus must be set.
    let a = BigInt[32].fromUint(100'u32)
-    let m = BigInt[8].fromUint(13'u8)
+    let m = BigInt[4].fromUint(13'u8)
-    var r: BigInt[8]
+    var r: BigInt[4]
    r.reduce(a, m)
    check:
-      bool(r == BigInt[8].fromUint(100'u8 mod 13))
+      bool(r == BigInt[4].fromUint(100'u8 mod 13))
  test "2^64 mod 3":
    let a = BigInt[65].fromHex("0x1_00000000_00000000")
@ -160,29 +160,23 @@ suite "Modular operations - small modulus - Stint specific failures highlighted
    let v = 174261910798982'u64
    let a = BigInt[64].fromUint(u)
-    let m = BigInt[49].fromUint(v)
+    let m = BigInt[48].fromUint(v)
-    var r: BigInt[49]
+    var r: BigInt[48]
    r.reduce(a, m)
    # Copy the result in a conveniently sized buffer
    var rr: BigInt[49]
    copyLimbs(rr, 0, r, 0, r.limbs.len)
    check:
-      bool(rr == BigInt[49].fromUint(u mod v))
+      bool(r == BigInt[48].fromUint(u mod v))
  test "Modulo: 15080397990160655 mod 600432699691":
    # The expected value is computed with native uint64 arithmetic (u mod v).
    let u = 15080397990160655'u64
    let v = 600432699691'u64
    let a = BigInt[64].fromUint(u)
    # 2^39 < 600432699691 < 2^40: the modulus occupies exactly 40 bits,
    # so the declared size is tightened from 41 to 40 bits.
-    let m = BigInt[41].fromUint(v)
+    let m = BigInt[40].fromUint(v)
-    var r: BigInt[41]
+    var r: BigInt[40]
    r.reduce(a, m)
    # Copy the result in a conveniently sized buffer
    # NOTE(review): the updated check compares `r` directly, so `rr` looks
    # unused after this change - confirm whether this copy is still needed.
    var rr: BigInt[41]
    copyLimbs(rr, 0, r, 0, r.limbs.len)
    check:
-      bool(rr == BigInt[41].fromUint(u mod v))
+      bool(r == BigInt[40].fromUint(u mod v))
--- a/tests/test_bigints.nim.cfg
+++ b/tests/test_bigints.nim.cfg
@ -0,0 +1 @@
 -d:debugConstantine
--- a/tests/test_io.nim
+++ b/tests/test_io.nim
@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import  unittest, random,
-        ../constantine/[io, bigints]
+        ../constantine/[io, bigints_raw]
 randomize(0xDEADBEEF) # Random seed for reproducibility
 type T = BaseType