From b689223cf5a06a80e84dc8384ade136366aa6207 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= <mamy_github@numforge.co>
Date: Mon, 10 Feb 2020 18:16:34 +0100
Subject: [PATCH] Refactoring, optimize code-size: use type-erased views to
 avoid monomorphization of compute kernels

---
 constantine/bigints.nim                       | 313 -------------
 constantine/bigints_public.nim                |  54 +++
 constantine/bigints_raw.nim                   | 424 ++++++++++++++++++
 constantine/common.nim                        |  13 +
 .../{private => }/curves_config_parser.nim    |   0
 constantine/io.nim                            |  54 ++-
 constantine/primitives.nim                    |  15 +
 ...ternal.nim => primitives_extprecision.nim} |  42 +-
 tests/test_bigints.nim                        |  26 +-
 tests/test_bigints.nim.cfg                    |   1 +
 tests/test_io.nim                             |   2 +-
 11 files changed, 577 insertions(+), 367 deletions(-)
 delete mode 100644 constantine/bigints.nim
 create mode 100644 constantine/bigints_public.nim
 create mode 100644 constantine/bigints_raw.nim
 create mode 100644 constantine/common.nim
 rename constantine/{private => }/curves_config_parser.nim (100%)
 rename constantine/{private/primitives_internal.nim => primitives_extprecision.nim} (90%)
 create mode 100644 tests/test_bigints.nim.cfg

diff --git a/constantine/bigints.nim b/constantine/bigints.nim
deleted file mode 100644
index 322ef81..0000000
--- a/constantine/bigints.nim
+++ /dev/null
@@ -1,313 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-
-# ############################################################
-#
-#                    BigInt representation
-#
-# ############################################################
-
-# To avoid carry issues we don't use the
-# most significant bit of each word.
-# i.e. for a uint64 base we only use 63-bit.
-# More info: https://github.com/status-im/nim-constantine/wiki/Constant-time-arithmetics#guidelines
-# Especially:
-#    - https://bearssl.org/bigint.html
-#    - https://cryptojedi.org/peter/data/pairing-20131122.pdf
-#    - http://docs.milagro.io/en/amcl/milagro-crypto-library-white-paper.html
-#
-# Note that this might also be beneficial in terms of performance.
-# Due to opcode latency, on Nehalem ADC is 6x times slower than ADD
-# if it has dependencies (i.e the ADC depends on a previous ADC result)
-
-# Control flow should only depends on the static maximum number of bits
-# This number is defined per Finite Field/Prime/Elliptic Curve
-#
-# For efficiency, our limbs will use a word size of 63-bit
-# Warning ⚠️ : This assumes that u64 + u64 and u64 * u64
-#              are constant-time even on 32-bit platforms
-#
-# We internally order the limbs in little-endian
-# So the least significant limb is limb[0]
-# This is independent from the base type endianness.
-
-import ./primitives
-from ./private/primitives_internal import unsafeDiv2n1n, unsafeExtendedPrecMul
-
-type Word* = Ct[uint32]
-type BaseType* = uint32 # Exported type for conversion in "normal integers"
-
-const WordBitSize* = sizeof(Word) * 8 - 1
-  ## Limbs are 63-bit by default
-
-const
-  Zero* = Word(0)
-  One* = Word(1)
-  MaxWord* = (not Zero) shr 1
-    ## This represents 0x7F_FF_FF_FF__FF_FF_FF_FF
-    ## also 0b0111...1111
-    ## This biggest representable number in our limbs.
-    ## i.e. The most significant bit is never set at the end of each function
-
-func wordsRequired(bits: int): int {.compileTime.}=
-  (bits + WordBitSize - 1) div WordBitSize
-
-# TODO: Currently the library is instantiation primitives like "add"
-#       for each "bits" size supported. This will lead to duplication
-#       if many sizes (for example for scp256k1, bn254 and BLS12-381)
-#       are required.
-#       It could be avoided by having the bitsize be a runtime field
-#       of the bigint. However the tradeoff would be:
-#       - overhead of this additional field
-#       - limbs have to be stored in an UncheckedArray instead of an array
-#         introducing memory management issues
-
-type
-  BigInt*[bits: static int] = object
-    ## Fixed-precision big integer
-    ##
-    ## "limbs" is an internal field that holds the internal representation
-    ## of the big integer. This internal representation can be changed
-    ## without notice and should not be used by external applications or libraries.
-    # Constantine BigInt have a word-size chosen to minimize bigint memory usage
-    # while allowing carry-less operations in a machine-efficient type like uint32
-    # uint64 or uint128 if available.
-    # In practice the word size is 63-bit.
-    #
-    # "Limb-endianess" is little-endian (least significant limb at BigInt.limbs[0])
-    limbs*: array[bits.wordsRequired, Word]
-
-# No exceptions allowed
-# TODO: can we use compile-time "Natural" instead of "int" in that case?
-{.push raises: [].}
-
-# ############################################################
-#
-#                         Internal
-#
-# ############################################################
-
-func copyLimbs*[dstBits, srcBits](
-        dst: var BigInt[dstBits], dstStart: static int,
-        src: BigInt[srcBits], srcStart: static int,
-        numLimbs: static int) {.inline.}=
-  ## Copy `numLimbs` from src into dst
-  ## If `dst` buffer is larger than `numLimbs` buffer
-  ## the extra space will be zero-ed out
-  ##
-  ## Limbs ordering is little-endian. limb 0 is the least significant/
-  ##
-  ## This should work at both compile-time and runtime.
-  ##
-  ## `numLimbs` must be less or equal the limbs of the `dst` and `src` buffers
-  ## This is checked at compile-time and has no runtime impact
-
-  static:
-    doAssert numLimbs >= 0, "`numLimbs` must be greater or equal zero"
-
-    doAssert numLimbs + srcStart <= src.limbs.len,
-      "The number of limbs to copy (" & $numLimbs &
-      ") must be less or equal to the number of limbs in the `src` buffer (" &
-      $src.limbs.len & " for " & $srcBits & " bits)"
-
-    doAssert numLimbs + dstStart <= dst.limbs.len,
-      "The number of limbs to copy (" & $numLimbs &
-      ") must be less or equal to the number of limbs in the `dst` buffer (" &
-      $dst.limbs.len & " for " & $dstBits & " bits)"
-
-  # TODO: do we need a copyMem / memcpy specialization for runtime
-  #       or use dst.limbs[0..<numLimbs] = src.toOpenarray(0, numLimbs - 1)
-  for i in static(0 ..< numLimbs):
-    dst.limbs[i+dstStart] = src.limbs[i+srcStart]
-
-func setZero*(a: var BigInt, start, stop: static int) {.inline.} =
-  ## Set limbs to zero
-  ## The [start, stop] range is inclusive
-  ## If stop < start, a is unmodified
-  static:
-    doAssert start in 0 ..< a.limbs.len, $start & " not in 0 ..< " & $a.limbs.len & " (numLimbs)"
-    doAssert stop  in 0 ..< a.limbs.len, $stop & " not in 0 ..< " & $a.limbs.len & " (numLimbs)"
-
-  for i in static(start .. stop):
-    a.limbs[i] = Zero
-
-# ############################################################
-#
-#                    BigInt primitives
-#
-# ############################################################
-
-# TODO: {.inline.} analysis
-
-func isZero*(a: BigInt): CTBool[Word] =
-  ## Returns if a big int is equal to zero
-  var accum: Word
-  for i in static(0 ..< a.limbs.len):
-    accum = accum or a.limbs[i]
-  result = accum.isZero()
-
-func `==`*(a, b: BigInt): CTBool[Word] =
-  ## Returns true if 2 big ints are equal
-  var accum: Word
-  for i in static(0 ..< a.limbs.len):
-    accum = accum or (a.limbs[i] xor b.limbs[i])
-  result = accum.isZero
-
-# The arithmetic primitives all accept a control input that indicates
-# if it is a placebo operation. It stills performs the
-# same memory accesses to be side-channel attack resistant.
-
-func add*[bits](a: var BigInt[bits], b: BigInt[bits], ctl: CTBool[Word]): CTBool[Word] =
-  ## Constant-time big integer in-place optional addition
-  ## The addition is only performed if ctl is "true"
-  ## The result carry is always computed.
-  for i in static(0 ..< a.limbs.len):
-    let new_a = a.limbs[i] + b.limbs[i] + Word(result)
-    result = new_a.isMsbSet()
-    a.limbs[i] = ctl.mux(new_a and MaxWord, a.limbs[i])
-
-func sub*[bits](a: var BigInt[bits], b: BigInt[bits], ctl: CTBool[Word]): CTBool[Word] =
-  ## Constant-time big integer in-place optional substraction
-  ## The substraction is only performed if ctl is "true"
-  ## The result carry is always computed.
-  for i in static(0 ..< a.limbs.len):
-    let new_a = a.limbs[i] - b.limbs[i] - Word(result)
-    result = new_a.isMsbSet()
-    a.limbs[i] = ctl.mux(new_a and MaxWord, a.limbs[i])
-
-# ############################################################
-#
-#                   Modular BigInt
-#
-# ############################################################
-
-# TODO: push boundsCheck off. They would be extremely costly.
-
-func shlAddMod[bits](a: var BigInt[bits], c: Word, M: BigInt[bits]) =
-  ## Fused modular left-shift + add
-  ## Shift input `a` by a word and add `c` modulo `M`
-  ##
-  ## With a word W = 2^WordBitSize and a modulus M
-  ## Does a <- a * W + c (mod M)
-  ##
-  ## The modulus `M` **must** use `mBits` bits.
-  assert not M.limbs[^1].isZero.bool, "The modulus must use all declared bits"
-
-  const len = a.limbs.len
-
-  when bits <= WordBitSize:
-    # If M fits in a single limb
-    var q: Word
-
-    # (hi, lo) = a * 2^63 + c
-    let hi = a.limbs[0] shr 1                        # 64 - 63 = 1
-    let lo = (a.limbs[0] shl WordBitSize) or c       # Assumes most-significant bit in c is not set
-    unsafeDiv2n1n(q, a.limbs[0], hi, lo, M.limbs[0]) # (hi, lo) mod M
-    return
-
-  else: # TODO replace moveMem with a proc that also works at compile-time
-    ## Multiple limbs
-    let hi = a.limbs[^1]                                               # Save the high word to detect carries
-    const R = bits and WordBitSize                                     # R = bits mod 64
-
-    when R == 0:                                                       # If the number of bits is a multiple of 64
-      let a0 = a.limbs[^1]                                             #
-      moveMem(a.limbs[1].addr, a.limbs[0].addr, (len-1) * Word.sizeof) # we can just shift words
-      a.limbs[0] = c                                                   # and replace the first one by c
-      let a1 = a.limbs[^1]
-      let m0 = M.limbs[^1]
-    else: # Need to deal with partial word shifts at the edge.
-      let a0 = ((a.limbs[^1] shl (WordBitSize-R)) or (a.limbs[^2] shr R)) and MaxWord
-      moveMem(a.limbs[1].addr, a.limbs[0].addr, (len-1) * Word.sizeof)
-      a.limbs[0] = c
-      let a1 = ((a.limbs[^1] shl (WordBitSize-R)) or (a.limbs[^2] shr R)) and MaxWord
-      let m0 = ((M.limbs[^1] shl (WordBitSize-R)) or (M.limbs[^2] shr R)) and MaxWord
-
-    # m0 has its high bit set. (a0, a1)/p0 fits in a limb.
-    # Get a quotient q, at most we will be 2 iterations off
-    # from the true quotient
-
-    let
-      a_hi = a0 shr 1                              # 64 - 63 = 1
-      a_lo = (a0 shl WordBitSize) or a1
-    var q, r: Word
-    unsafeDiv2n1n(q, r, a_hi, a_lo, m0)            # Estimate quotient
-    q = mux(                                       # If n_hi == divisor
-          a0 == m0, MaxWord,                       # Quotient == MaxWord (0b0111...1111)
-          mux(
-            q.isZero, Zero,                        # elif q == 0, true quotient = 0
-            q - One                                # else instead of being of by 0, 1 or 2
-          )                                        # we returning q-1 to be off by -1, 0 or 1
-        )
-
-    # Now substract a*2^63 - q*p
-    var carry = Zero
-    var over_p = ctrue(Word)                       # Track if quotient greater than the modulus
-
-    for i in static(0 ..< M.limbs.len):
-      var qp_lo: Word
-
-      block: # q*p
-        var qp_hi: Word
-        unsafeExtendedPrecMul(qp_hi, qp_lo, q, M.limbs[i])  # q * p
-        qp_lo += carry                                      # Add carry from previous limb
-
-        carry = qp_hi shl 1 + qp_lo.isMsbSet.Word           # New carry
-        qp_lo = qp_lo and MaxWord                           # Normalize to u63
-
-      block: # a*2^63 - q*p
-        a.limbs[i] -= qp_lo
-        carry += Word(a.limbs[i].isMsbSet)                  # Adjust if borrow
-        a.limbs[i] = a.limbs[i] and MaxWord                 # Normalize to u63
-
-      over_p = mux(
-                a.limbs[i] == M.limbs[i], over_p,
-                a.limbs[i] > M.limbs[i]
-              )
-
-    # Fix quotient, the true quotient is either q-1, q or q+1
-    #
-    # if carry < q or carry == q and over_p we must do "a -= p"
-    # if carry > hi (negative result) we must do "a += p"
-
-    let neg = carry < hi
-    let tooBig = not neg and (over_p or (carry < hi))
-
-    discard a.add(M, ctl = neg)
-    discard a.sub(M, ctl = tooBig)
-    return
-
-func reduce*[aBits, mBits](r: var BigInt[mBits], a: BigInt[aBits], M: BigInt[mBits]) =
-  ## Reduce `a` modulo `M` and store the result in `r`
-  ##
-  ## The modulus `M` **must** use `mBits` bits.
-  ##
-  ## CT: Depends only on the length of the modulus `M`
-
-  # Note: for all cryptographic intents and purposes the modulus is known at compile-time
-  # but we don't want to inline it as it would increase codesize, better have Nim
-  # pass a pointer+length to a fixed session of the BSS.
-
-  assert not M.limbs[^1].isZero.bool, "The modulus must use all declared bits"
-
-  when aBits < mBits:
-    # if a uses less bits than the modulus,
-    # it is guaranteed < modulus.
-    # This relies on the precondition that the modulus uses all declared bits
-    copyLimbs(r, 0, a, 0, a.limbs.len)
-    r.setZero(a.limbs.len, r.limbs.len-1)
-  else:
-    # a length i at least equal to the modulus.
-    # we can copy modulus.limbs-1 words
-    # and modular shift-left-add the rest
-    const aOffset = a.limbs.len - M.limbs.len
-    copyLimbs(r, 0, a, aOffset, M.limbs.len - 1)
-    r.limbs[^1] = Zero
-    for i in countdown(aOffset-1, 0):
-      r.shlAddMod(a.limbs[i], M)
diff --git a/constantine/bigints_public.nim b/constantine/bigints_public.nim
new file mode 100644
index 0000000..3884945
--- /dev/null
+++ b/constantine/bigints_public.nim
@@ -0,0 +1,54 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ./bigints_raw,
+  ./primitives
+
+# ############################################################
+#
+#                   BigInts Public API
+#
+# ############################################################
+
+# The "public" API, exported for finite field computations,
+# enforces compile-time checking of BigInt bitsize.
+#
+# The "raw" compute API uses views to avoid code duplication due to generic/static monomorphization.
+
+# No exceptions allowed
+{.push raises: [].}
+{.push inline.}
+
+func isZero*(a: BigInt): CTBool[Word] =
+  ## Returns true if `a` is zero; the test is delegated to the type-erased view of `a`
+  isZero(a.view)
+
+func add*[bits](a: var BigInt[bits], b: BigInt[bits], ctl: CTBool[Word]): CTBool[Word] =
+  ## In-place conditional addition `a += b`, forwarded to the raw view kernel.
+  ## `a` is only updated when `ctl` is true;
+  ## the carry is computed and returned in both cases.
+  a.view.add(b.view, ctl)
+
+func sub*[bits](a: var BigInt[bits], b: BigInt[bits], ctl: CTBool[Word]): CTBool[Word] =
+  ## Constant-time big integer in-place optional subtraction
+  ## The subtraction is only performed if ctl is "true"
+  ## The result carry (borrow) is always computed.
+  sub(a.view, b.view, ctl)
+
+func reduce*[aBits, mBits](r: var BigInt[mBits], a: BigInt[aBits], M: BigInt[mBits]) =
+  ## Compute `a` modulo `M` and write the result to `r`
+  ##
+  ## Precondition: `M` **must** use all of its `mBits` bits
+  ## (the bit at position mBits-1 must be set)
+  ## CT: Depends only on the length of the modulus `M`
+
+  # Note: for all cryptographic intents and purposes the modulus is known at compile-time
+  # but we don't want to inline it as it would increase codesize, better have Nim
+  # pass a pointer+length to a fixed section of the BSS.
+  r.view.reduce(a.view, M.view)
diff --git a/constantine/bigints_raw.nim b/constantine/bigints_raw.nim
new file mode 100644
index 0000000..028f054
--- /dev/null
+++ b/constantine/bigints_raw.nim
@@ -0,0 +1,424 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#         BigInt Raw representation and operations
+#
+# ############################################################
+#
+# This file holds the raw operations done on big ints
+# The representation is optimized for:
+# - constant-time (not leaking secret data via side-channel)
+# - generated code size and datatype size
+# - performance
+# in this order
+
+# ############################################################
+# Design
+
+# To avoid carry issues we don't use the
+# most significant bit of each machine word.
+# i.e. for a uint64 base we only use 63-bit.
+# More info: https://github.com/status-im/nim-constantine/wiki/Constant-time-arithmetics#guidelines
+# Especially:
+#    - https://bearssl.org/bigint.html
+#    - https://cryptojedi.org/peter/data/pairing-20131122.pdf
+#    - http://docs.milagro.io/en/amcl/milagro-crypto-library-white-paper.html
+#
+# Note that this might also be beneficial in terms of performance.
+# Due to opcode latency, on Nehalem ADC is 6 times slower than ADD
+# if it has dependencies (i.e. the ADC depends on a previous ADC result)
+#
+# Control flow should only depend on the static maximum number of bits
+# This number is defined per Finite Field/Prime/Elliptic Curve
+#
+# We internally order the limbs in little-endian
+# So the least significant limb is limb[0]
+# This is independent from the base type endianness.
+#
+# Constantine uses Nim generic integer to prevent mixing
+# BigInts of different bitlength at compile-time and
+# properly statically size the BigInt buffers.
+#
+# To avoid code-bloat due to monomorphization (i.e. duplicating code per announced bitlength)
+# actual computation is deferred to type-erased routines.
+
+import
+  ./primitives, ./common,
+  ./primitives_extprecision
+from sugar import distinctBase
+
+type Word* = Ct[uint32]
+  ## Logical BigInt word
+  ## A logical BigInt word is of size physical MachineWord-1
+type BaseType* = uint32
+  ## Physical BigInt for conversion in "normal integers"
+
+const
+  WordPhysBitSize = sizeof(Word) * 8
+  WordBitSize* = WordPhysBitSize - 1
+
+const
+  Zero* = Word(0)
+  One* = Word(1)
+  MaxWord* = (not Zero) shr 1
+    ## All bits set except the most significant one, i.e. 0b0111...1111
+    ## (0x7F_FF_FF_FF when `Word` wraps a uint32, as currently declared).
+    ## This is the biggest representable number in our limbs,
+    ## i.e. the most significant bit is never set at the end of each function
+
+func wordsRequired(bits: int): int {.compileTime.} =
+  ## Number of limbs needed to store `bits` **announced** bits,
+  # i.e. `bits` divided by WordBitSize, rounded up
+  result = (bits + WordBitSize - 1) div WordBitSize
+
+type
+  BigInt*[bits: static int] = object
+    ## Fixed-precision big integer
+    ##
+    ## - "bits" is the announced bit-length of the BigInt
+    ##   This is public data, usually equal to the curve prime bitlength.
+    ##
+    ## - "bitLength" is the internal bitlength of the integer
+    ##   This differs from the canonical bit-length as
+    ##   Constantine word-size is smaller than a machine word.
+    ##   This value should never be used as-is to prevent leaking secret data.
+    ##   Computing this value requires constant-time operations.
+    ##   Using this value requires converting it to the # of limbs in constant-time
+    ##
+    ## - "limbs" is an internal field that holds the internal representation
+    ##   of the big integer. Least-significant limb first. Within limbs words are native-endian.
+    ##
+    ## This internal representation can be changed
+    ## without notice and should not be used by external applications or libraries.
+    bitLength: uint32
+    limbs*: array[bits.wordsRequired, Word]
+
+  BigIntView* = ptr object
+    ## Type-erased fixed-precision big integer
+    ##
+    ## This type mirrors the BigInt type and is used
+    ## for the low-level computation API
+    ## This design
+    ## - avoids code bloat due to generic monomorphization
+    ##   otherwise each bigint routines would have an instantiation for
+    ##   each static `bits` parameter.
+    ## - while not forcing the caller to preallocate computation buffers
+    ##   for the high-level API
+    ##
+    ## As with the BigInt type:
+    ## - "bitLength" is the internal bitlength of the integer
+    ##   This differs from the canonical bit-length as
+    ##   Constantine word-size is smaller than a machine word.
+    ##   This value should never be used as-is to prevent leaking secret data.
+    ##   Computing this value requires constant-time operations.
+    ##   Using this value requires converting it to the # of limbs in constant-time
+    ##
+    ## - "limbs" is an internal field that holds the internal representation
+    ##   of the big integer. Least-significant limb first. Within limbs words are native-endian.
+    ##
+    ## This internal representation can be changed
+    ## without notice and should not be used by external applications or libraries.
+    ##
+    ## Accesses should be done via BigIntViewConst / BigIntViewMut
+    ## to have the compiler check for mutability
+    bitLength: uint32            # same field order as BigInt: views are obtained by casting a BigInt address
+    limbs: UncheckedArray[Word]  # no static length here: the limb count is derived from bitLength (see numLimbs)
+
+  # "Indirection" to enforce pointer types deep immutability
+  BigIntViewConst* = distinct BigIntView
+    ## Immutable view into a BigInt
+  BigIntViewMut* = distinct BigIntView
+    ## Mutable view into a BigInt
+  BigIntViewAny* = BigIntViewConst or BigIntViewMut
+
+# No exceptions allowed
+{.push raises: [].}
+
+# ############################################################
+#
+#                  Mutability safety
+#
+# ############################################################
+
+template view*(a: BigInt): BigIntViewConst =
+  ## Borrow `a` as a type-erased, read-only view (the address of `a` is reinterpreted)
+  BigIntViewConst(cast[BigIntView](unsafeAddr(a)))
+
+template view*(a: var BigInt): BigIntViewMut =
+  ## Borrow mutable `a` as a type-erased, writable view (the address of `a` is reinterpreted)
+  BigIntViewMut(cast[BigIntView](addr(a)))
+
+template `[]`*(v: BigIntViewConst, limbIdx: int): Word = # Read limb `limbIdx` of an immutable view
+  distinctBase(type v)(v).limbs[limbIdx]                 # (UncheckedArray: the index is not bounds-checked)
+
+template `[]`*(v: BigIntViewMut, limbIdx: int): var Word = # Mutable access to limb `limbIdx`
+  distinctBase(type v)(v).limbs[limbIdx]
+
+template `[]=`*(v: BigIntViewMut, limbIdx: int, val: Word) = # Overwrite limb `limbIdx` with `val`
+  distinctBase(type v)(v).limbs[limbIdx] = val
+
+template bitSizeof(v: BigIntViewAny): uint32 = # Internal bitlength stored in the view (not the announced one)
+  distinctBase(type v)(v).bitLength
+
+const divShiftor = log2(WordPhysBitSize) # presumably log2 of the physical word size: `shr divShiftor` divides by it
+template numLimbs*(v: BigIntViewAny): int =
+  ## Compute the number of limbs from
+  ## the **internal** bitlength
+  (bitSizeof(v).int + WordPhysBitSize - 1) shr divShiftor
+
+template setBitLength(v: BigIntViewMut, internalBitLength: uint32) = # Overwrite the internal bitlength of the view
+  distinctBase(type v)(v).bitLength = internalBitLength
+
+# TODO: Check if repeated v.numLimbs calls are optimized away
+
+template `[]`*(v: BigIntViewConst, limbIdxFromEnd: BackwardsIndex): Word = # Read a limb counted from the end: v[^1] is limb numLimbs-1
+  distinctBase(type v)(v).limbs[v.numLimbs.int - int limbIdxFromEnd]
+
+template `[]`*(v: BigIntViewMut, limbIdxFromEnd: BackwardsIndex): var Word = # Mutable access to a limb counted from the end
+  distinctBase(type v)(v).limbs[v.numLimbs.int - int limbIdxFromEnd]
+
+template `[]=`*(v: BigIntViewMut, limbIdxFromEnd: BackwardsIndex, val: Word) = # Overwrite a limb counted from the end
+  distinctBase(type v)(v).limbs[v.numLimbs.int - int limbIdxFromEnd] = val
+
+# ############################################################
+#
+#           Checks and debug/test only primitives
+#
+# ############################################################
+
+template checkMatchingBitlengths(a, b: distinct BigIntViewAny) =
+  ## Check that the internal `bitLength` fields of 2 views match
+  ## This is only checked
+  ## with "-d:debugConstantine" and when assertions are on.
+  debug:
+    assert distinctBase(type a)(a).bitLength ==
+      distinctBase(type b)(b).bitLength, "Internal Error: operands bitlength do not match"
+
+template checkValidModulus(m: BigIntViewConst) =
+  ## Check that the modulus is valid
+  ## The check is approximate: it only checks that
+  ## the most-significant word is non-zero instead of
+  ## checking that the last announced bit is 1.
+  ## This is only checked
+  ## with "-d:debugConstantine" and when assertions are on.
+  debug:
+    assert not m[^1].isZero.bool, "Internal Error: the modulus must use all declared bits"
+
+debug:
+  func `==`*(a, b: BigInt): CTBool[Word] =
+    ## Returns true if 2 big ints are equal (limbs only; `bitLength` is not compared)
+    var accum: Word
+    for i in static(0 ..< a.limbs.len):
+      accum = accum or (a.limbs[i] xor b.limbs[i])
+    result = accum.isZero
+
+  func `$`*(a: BigIntViewAny): string =
+    ## Debug rendering "[limb0, limb1, ...] (N bits)", least-significant limb first.
+    ## A view without limbs renders as "[] (0 bits)" instead of reading limb -1.
+    result = "["
+    for i in 0 ..< a.numLimbs():
+      if i != 0: result.add ", "
+      result.add $a[i]
+    result.add "] ("
+    result.add $a.bitSizeof
+    result.add " bits)"
+
+# ############################################################
+#
+#                    BigInt primitives
+#
+# ############################################################
+
+func setInternalBitLength*(a: var BigInt) {.inline.} =
+  ## Set the `bitLength` field of `a` to the internal bitsize,
+  ## derived at compile-time from the announced bitsize `bits`:
+  ## `bits` plus one extra bit per complete WordBitSize-bit word,
+  ## i.e. bits + bits div WordBitSize.
+  a.bitLength = static(a.bits + a.bits div WordBitSize)
+
+func isZero*(a: BigIntViewAny): CTBool[Word] =
+  ## Returns true if the viewed big int is zero: every limb is OR-folded, then the fold is tested
+  var folded = Zero
+  for limbIdx in 0 ..< numLimbs(a):
+    folded = folded or a[limbIdx]
+  result = isZero(folded)
+
+# The arithmetic primitives all accept a control input that indicates
+# if it is a placebo operation. It still performs the
+# same memory accesses to be side-channel attack resistant.
+
+func add*(a: BigIntViewMut, b: BigIntViewAny, ctl: CTBool[Word]): CTBool[Word] =
+  ## In-place conditional addition: a <- a + b when `ctl` is true,
+  ## `a` keeps its value otherwise.
+  ## The returned carry is computed whatever the value of `ctl`.
+  ##
+  ## a and b MAY alias the same buffer
+  ## a and b MUST have the same announced bitlength (i.e. `bits` static parameters)
+  checkMatchingBitlengths(a, b)
+
+  for limbIdx in 0 ..< numLimbs(a):
+    let limbSum = a[limbIdx] + b[limbIdx] + Word(result)
+    result = isMsbSet(limbSum)
+    a[limbIdx] = mux(ctl, limbSum and MaxWord, a[limbIdx])
+
+func sub*(a: BigIntViewMut, b: BigIntViewAny, ctl: CTBool[Word]): CTBool[Word] =
+  ## In-place conditional subtraction: a <- a - b when `ctl` is true,
+  ## `a` keeps its value otherwise.
+  ## The returned carry (borrow) is computed whatever the value of `ctl`.
+  ##
+  ## a and b MAY alias the same buffer
+  ## a and b MUST have the same announced bitlength (i.e. `bits` static parameters)
+  checkMatchingBitlengths(a, b)
+
+  for limbIdx in 0 ..< numLimbs(a):
+    let limbDiff = a[limbIdx] - b[limbIdx] - Word(result)
+    result = isMsbSet(limbDiff)
+    a[limbIdx] = mux(ctl, limbDiff and MaxWord, a[limbIdx])
+
+# ############################################################
+#
+#                   Modular BigInt
+#
+# ############################################################
+
+# TODO: push boundsCheck off. They would be extremely costly.
+
+func shlAddMod(a: BigIntViewMut, c: Word, M: BigIntViewConst) =
+  ## Fused modular left-shift + add
+  ## Shift input `a` by a word and add `c` modulo `M`
+  ##
+  ## With a word W = 2^WordBitSize and a modulus M
+  ## Does a <- a * W + c (mod M)
+  ##
+  ## The announced most-significant bit of the modulus `M` MUST be set.
+  checkValidModulus(M)
+
+  let aLen = a.numLimbs()
+  let mBits = bitSizeof(M)
+
+  if mBits <= WordBitSize:
+    # If M fits in a single limb
+    var q: Word
+
+    # (hi, lo) = a * 2^WordBitSize + c
+    let hi = a[0] shr 1                   # WordPhysBitSize - WordBitSize = 1
+    let lo = (a[0] shl WordBitSize) or c  # Assumes most-significant bit in c is not set
+    unsafeDiv2n1n(q, a[0], hi, lo, M[0])  # (hi, lo) mod M
+    return
+
+  else:
+    ## Multiple limbs
+    let hi = a[^1]                                          # Save the high word to detect carries
+    let R = mBits and WordBitSize                           # R = mBits mod WordPhysBitSize
+
+    var a0, a1, m0: Word
+    if R == 0:                                              # If mBits is a multiple of WordPhysBitSize
+      a0 = a[^1]                                            #
+      moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof) # we can just shift words
+      a[0] = c                                              # and replace the first one by c
+      a1 = a[^1]
+      m0 = M[^1]
+    else:                                                   # Else: need to deal with partial word shifts at the edge.
+      a0 = ((a[^1] shl (WordBitSize-R)) or (a[^2] shr R)) and MaxWord
+      moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof)
+      a[0] = c
+      a1 = ((a[^1] shl (WordBitSize-R)) or (a[^2] shr R)) and MaxWord
+      m0 = ((M[^1] shl (WordBitSize-R)) or (M[^2] shr R)) and MaxWord
+
+    # m0 has its high bit set. (a0, a1)/m0 fits in a limb.
+    # Get a quotient q, at most we will be 2 iterations off
+    # from the true quotient
+
+    let
+      a_hi = a0 shr 1                              # WordPhysBitSize - WordBitSize = 1
+      a_lo = (a0 shl WordBitSize) or a1
+    var q, r: Word
+    unsafeDiv2n1n(q, r, a_hi, a_lo, m0)            # Estimate quotient
+    q = mux(                                       # If a0 == divisor
+          a0 == m0, MaxWord,                       # Quotient == MaxWord (0b0111...1111)
+          mux(
+            q.isZero, Zero,                        # elif q == 0, true quotient = 0
+            q - One                                # else instead of being off by 0, 1 or 2
+          )                                        # we return q-1 to be off by -1, 0 or 1
+        )
+
+    # Now subtract a*W - q*M
+    var carry = Zero
+    var over_p = ctrue(Word)                       # Track if the remainder is greater than or equal to the modulus
+
+    for i in 0 ..< M.numLimbs():
+      var qp_lo: Word
+
+      block: # q*M
+        var qp_hi: Word
+        unsafeExtendedPrecMul(qp_hi, qp_lo, q, M[i]) # q * M[i]
+        qp_lo += carry                               # Add carry from previous limb
+
+        carry = qp_hi shl 1 + qp_lo.isMsbSet.Word    # New carry
+        qp_lo = qp_lo and MaxWord                    # Normalize to WordBitSize bits
+
+      block: # a*W - q*M
+        a[i] -= qp_lo
+        carry += Word(a[i].isMsbSet)                 # Adjust if borrow
+        a[i] = a[i] and MaxWord                      # Normalize to WordBitSize bits
+
+      over_p = mux(
+                a[i] == M[i], over_p,
+                a[i] > M[i]
+              )
+
+    # Fix quotient, the true quotient is either q-1, q or q+1
+    #
+    # if carry < hi or carry == hi and over_p (q underestimated) we must do "a -= M"
+    # if carry > hi (q overestimated, negative result) we must do "a += M"
+
+    let neg = carry > hi
+    let tooBig = not neg and (over_p or (carry < hi))
+
+    discard a.add(M, ctl = neg)
+    discard a.sub(M, ctl = tooBig)
+    return
+
+func reduce*(r: BigIntViewMut, a: BigIntViewAny, M: BigIntViewConst) =
+  ## Reduce `a` modulo `M` and store the result in `r`
+  ##
+  ## The announced most-significant bit of the modulus `M` MUST be set.
+  ## The result `r` buffer size MUST be at least the size of `M` buffer
+  ##
+  ## CT: Depends only on the bitlength of `a` and the modulus `M`
+
+  # Note: for all cryptographic intents and purposes the modulus is known at compile-time
+  # but we don't want to inline it as it would increase codesize, better have Nim
+  # pass a pointer+length to a fixed section of the BSS.
+  checkValidModulus(M)
+
+  let aBits = bitSizeof(a)
+  let mBits = bitSizeof(M)
+  let aLen = a.numLimbs()
+
+  r.setBitLength(bitSizeof(M))
+
+  if aBits < mBits:
+    # if a uses less bits than the modulus,
+    # it is guaranteed < modulus.
+    # This relies on the precondition that the modulus uses all declared bits
+    copyMem(r[0].addr, a[0].unsafeAddr, aLen * sizeof(Word))
+    for i in aLen ..< r.numLimbs():
+      r[i] = Zero
+  else:
+    # a has at least as many limbs as the modulus:
+    # seed r with the (mLen-1) most-significant limbs of a (a value < M),
+    # then shift-left-add all the remaining limbs modulo M, from high to low.
+    let mLen = M.numLimbs()
+    let aOffset = aLen - mLen
+    copyMem(r[0].addr, a[aOffset+1].unsafeAddr, (mLen-1) * sizeof(Word))
+    r[^1] = Zero
+    for i in countdown(aOffset, 0):
+      r.shlAddMod(a[i], M)
diff --git a/constantine/common.nim b/constantine/common.nim
new file mode 100644
index 0000000..38be505
--- /dev/null
+++ b/constantine/common.nim
@@ -0,0 +1,13 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# Common configuration
+# `debug`: emit `body` only in builds compiled with -d:debugConstantine.
+template debug*(body: untyped): untyped =
+  when defined(debugConstantine):
+    body
diff --git a/constantine/private/curves_config_parser.nim b/constantine/curves_config_parser.nim
similarity index 100%
rename from constantine/private/curves_config_parser.nim
rename to constantine/curves_config_parser.nim
diff --git a/constantine/io.nim b/constantine/io.nim
index e010831..5e24e22 100644
--- a/constantine/io.nim
+++ b/constantine/io.nim
@@ -12,7 +12,7 @@
 
 import
   endians,
-  ./primitives, ./bigints
+  ./primitives, ./bigints_raw
 
 # ############################################################
 #
@@ -23,14 +23,16 @@ import
 # TODO: tag/remove exceptions raised.
 
 func fromRawUintLE(
-        T: type BigInt,
-        src: openarray[byte]): T =
+        dst: var BigInt,
+        src: openarray[byte]) =
   ## Parse an unsigned integer from its canonical
   ## little-endian unsigned representation
   ## And store it into a BigInt of size bits
   ##
-  ## CT:
+  ## Constant-Time:
   ##   - no leaks
+  ##
+  ## Can work at compile-time
 
   var
     dst_idx = 0
@@ -46,13 +48,33 @@ func fromRawUintLE(
 
     # if full, dump
     if acc_len >= WordBitSize:
-      result.limbs[dst_idx] = acc and MaxWord
+      dst.limbs[dst_idx] = acc and MaxWord
       inc dst_idx
       acc_len -= WordBitSize
       acc = src_byte shr (8 - acc_len)
 
-  if dst_idx < result.limbs.len:
-    result.limbs[dst_idx] = acc
+  if dst_idx < dst.limbs.len:
+    dst.limbs[dst_idx] = acc
+
+func fromRawUint*(
+        dst: var BigInt,
+        src: openarray[byte],
+        srcEndianness: static Endianness) {.inline.}=
+  ## Fill `dst` from the canonical unsigned byte representation `src`,
+  ## whose byte order is given by `srcEndianness`.
+  ## Only little-endian input is implemented; any other
+  ## byte order is rejected with a compile-time error.
+  ##
+  ## Constant-Time:
+  ##   - no leaks
+  ##
+  ## Can work at compile-time to embed curve moduli
+  ## from a canonical integer representation
+  when srcEndianness != littleEndian:
+    {.error: "Not implemented at the moment".}
+  else:
+    fromRawUintLE(dst, src)
+  setInternalBitLength(dst)
 
 func fromRawUint*(
         T: type BigInt,
@@ -62,20 +84,19 @@ func fromRawUint*(
   ## big-endian or little-endian unsigned representation
   ## And store it into a BigInt of size `bits`
   ##
-  ## CT:
+  ## Constant-Time:
   ##   - no leaks
-
-  when srcEndianness == littleEndian:
-    fromRawUintLE(T, src)
-  else:
-    {.error: "Not implemented at the moment".}
+  ##
+  ## Can work at compile-time to embed curve moduli
+  ## from a canonical integer representation
+  result.fromRawUint(src, srcEndianness)
 
 func fromUint*(
         T: type BigInt,
         src: SomeUnsignedInt): T =
   ## Parse a regular unsigned integer
   ## and store it into a BigInt of size `bits`
-  fromRawUint(T, cast[array[sizeof(src), byte]](src), cpuEndian)
+  result.fromRawUint(cast[array[sizeof(src), byte]](src), cpuEndian) # raw bytes of `src`, host byte order
 
 # ############################################################
 #
@@ -128,7 +149,6 @@ func dumpRawUintLE(
 
       if tail >= sizeof(Word):
         # Unrolled copy
-        # debugecho src.repr
         littleEndianXX(dst[dst_idx].addr, lo.unsafeAddr)
         dst_idx += sizeof(Word)
         tail -= sizeof(Word)
@@ -293,6 +313,8 @@ func fromHex*(T: type BigInt, s: string): T =
   ##
   ## This API is intended for configuration and debugging purposes
   ## Do not pass secret or private data to it.
+  ##
+  ## Can work at compile-time to declare curve moduli from their hex strings
 
   # 1. Convert to canonical uint
   const canonLen = (T.bits + 8 - 1) div 8
@@ -300,7 +322,7 @@ func fromHex*(T: type BigInt, s: string): T =
   hexToPaddedByteArray(s, bytes, littleEndian)
 
   # 2. Convert canonical uint to Big Int
-  result = T.fromRawUint(bytes, littleEndian)
+  result.fromRawUint(bytes, littleEndian)
 
 func dumpHex*(big: BigInt, order: static Endianness = bigEndian): string =
   ## Stringify an int to hex.
diff --git a/constantine/primitives.nim b/constantine/primitives.nim
index ed5f1aa..a1adfed 100644
--- a/constantine/primitives.nim
+++ b/constantine/primitives.nim
@@ -116,6 +116,21 @@ func isMsbSet*[T: Ct](x: T): CTBool[T] =
   const msb_pos = T.sizeof * 8 - 1
   result = (CTBool[T])(x shr msb_pos)
 
+func log2*(x: uint32): uint32 =
+  ## Integer base-2 logarithm of a value of at most 32 bits,
+  ## obtained with a De Bruijn multiplication and table lookup.
+  ## Works at compile-time, guaranteed constant-time.
+  # https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
+  const deBruijnPos: array[32, uint8] = [0'u8, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18,
+    22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31]
+  var smeared = x
+  # Smear the highest set bit downwards: one less than a power of 2
+  for shift in [1, 2, 4, 8, 16]:
+    smeared = smeared or (smeared shr shift)
+  # The top 5 bits of the wrapping product select the table slot
+  let slot = (smeared * 0x07C4ACDD'u32) shr 27
+  result = deBruijnPos[slot]
+
 # ############################################################
 #
 #             Hardened Boolean primitives
diff --git a/constantine/private/primitives_internal.nim b/constantine/primitives_extprecision.nim
similarity index 90%
rename from constantine/private/primitives_internal.nim
rename to constantine/primitives_extprecision.nim
index a4f4e8d..756ea0d 100644
--- a/constantine/private/primitives_internal.nim
+++ b/constantine/primitives_extprecision.nim
@@ -12,7 +12,7 @@
 #
 # ############################################################
 
-import ../primitives
+import ./primitives
 
 func asm_x86_64_extMul(hi, lo: var uint64, a, b: uint64) {.inline.}=
   ## Extended precision multiplication uint64 * uint64 --> uint128
@@ -161,34 +161,34 @@ when isMainModule:
     doAssert q == 6148914691236517205'u64
     doAssert r == 1
 
-  block: # TODO - support Quotient that doesn't fit in the result
-         # The usual way with normalization by the bitSize difference
-         # is fundamentally non constant-time
-         # it is probable that division is not constant-time at the hardware level as well
-         # as it throws sigfpe when the quotient doesn't fit in the result size
+  # block: # TODO - support Quotient that doesn't fit in the result
+  #        # The usual way with normalization by the bitSize difference
+  #        # is fundamentally non constant-time
+  #        # it is probable that division is not constant-time at the hardware level as well
+  #        # as it throws sigfpe when the quotient doesn't fit in the result size
 
-    var q, r: uint64
+  #   var q, r: uint64
 
-    let n_hi = 1'u64
-    let n_lo = 0'u64
-    let d = 1'u64
+  #   let n_hi = 1'u64
+  #   let n_lo = 0'u64
+  #   let d = 1'u64
 
-    asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
+  #   asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
 
-    echo "quotient: ", q
-    echo "remainder: ", r
+  #   echo "quotient: ", q
+  #   echo "remainder: ", r
 
-  block:
-    var q, r: uint64
+  # block:
+  #   var q, r: uint64
 
-    let n_hi = 4186590388502004879'u64
-    let n_lo = 17852795547484522084'u64
-    let d = 327340459940166448'u64
+  #   let n_hi = 4186590388502004879'u64
+  #   let n_lo = 17852795547484522084'u64
+  #   let d = 327340459940166448'u64
 
-    asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
+  #   asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
 
-    echo "quotient: ", q
-    echo "remainder: ", r
+  #   echo "quotient: ", q
+  #   echo "remainder: ", r
 
 # ##############################################################
 #
diff --git a/tests/test_bigints.nim b/tests/test_bigints.nim
index a471197..493c658 100644
--- a/tests/test_bigints.nim
+++ b/tests/test_bigints.nim
@@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
 import  unittest, random, strutils,
-        ../constantine/[io, bigints, primitives]
+        ../constantine/[io, bigints_public, bigints_raw, primitives]
 
 suite "isZero":
   test "isZero for zero":
@@ -128,12 +128,12 @@ suite "Modular operations - small modulus":
   # Vectors taken from Stint - https://github.com/status-im/nim-stint
   test "100 mod 13":
     let a = BigInt[32].fromUint(100'u32)
-    let m = BigInt[8].fromUint(13'u8)
+    let m = BigInt[4].fromUint(13'u8)
 
-    var r: BigInt[8]
+    var r: BigInt[4]
     r.reduce(a, m)
     check:
-      bool(r == BigInt[8].fromUint(100'u8 mod 13))
+      bool(r == BigInt[4].fromUint(100'u8 mod 13))
 
   test "2^64 mod 3":
     let a = BigInt[65].fromHex("0x1_00000000_00000000")
@@ -160,29 +160,23 @@ suite "Modular operations - small modulus - Stint specific failures highlighted
     let v = 174261910798982'u64
 
     let a = BigInt[64].fromUint(u)
-    let m = BigInt[49].fromUint(v)
+    let m = BigInt[48].fromUint(v)
 
-    var r: BigInt[49]
+    var r: BigInt[48]
     r.reduce(a, m)
-    # Copy the result in a conveniently sized buffer
-    var rr: BigInt[49]
-    copyLimbs(rr, 0, r, 0, r.limbs.len)
 
     check:
-      bool(rr == BigInt[49].fromUint(u mod v))
+      bool(r == BigInt[48].fromUint(u mod v))
 
   test "Modulo: 15080397990160655 mod 600432699691":
     let u = 15080397990160655'u64
     let v = 600432699691'u64
 
     let a = BigInt[64].fromUint(u)
-    let m = BigInt[41].fromUint(v)
+    let m = BigInt[40].fromUint(v)
 
-    var r: BigInt[41]
+    var r: BigInt[40]
     r.reduce(a, m)
-    # Copy the result in a conveniently sized buffer
-    var rr: BigInt[41]
-    copyLimbs(rr, 0, r, 0, r.limbs.len)
 
     check:
-      bool(rr == BigInt[41].fromUint(u mod v))
+      bool(r == BigInt[40].fromUint(u mod v))
diff --git a/tests/test_bigints.nim.cfg b/tests/test_bigints.nim.cfg
new file mode 100644
index 0000000..dd68656
--- /dev/null
+++ b/tests/test_bigints.nim.cfg
@@ -0,0 +1 @@
+-d:debugConstantine
diff --git a/tests/test_io.nim b/tests/test_io.nim
index b687422..14bdbb4 100644
--- a/tests/test_io.nim
+++ b/tests/test_io.nim
@@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
 import  unittest, random,
-        ../constantine/[io, bigints]
+        ../constantine/[io, bigints_raw]
 
 randomize(0xDEADBEEF) # Random seed for reproducibility
 type T = BaseType