# Constantine
# Copyright (c) 2018-2019    Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# ############################################################
#
#         BigInt Raw representation and operations
#
# ############################################################
#
# This file holds the raw operations done on big ints
# The representation is optimized for:
# - constant-time (not leaking secret data via side-channel)
# - generated code size, datatype size and stack usage
# - performance
# in this order

# ############################################################
# Design

# To avoid carry issues we don't use the
# most significant bit of each machine word.
# i.e. for a uint64 base we only use 63-bit.
# More info: https://github.com/status-im/nim-constantine/wiki/Constant-time-arithmetics#guidelines
# Especially:
#    - https://bearssl.org/bigint.html
#    - https://cryptojedi.org/peter/data/pairing-20131122.pdf
#    - http://docs.milagro.io/en/amcl/milagro-crypto-library-white-paper.html
#
# Note that this might also be beneficial in terms of performance.
# Due to opcode latency, on Nehalem ADC is 6x times slower than ADD
# if it has dependencies (i.e the ADC depends on a previous ADC result)
#
# Control flow should only depends on the static maximum number of bits
# This number is defined per Finite Field/Prime/Elliptic Curve
#
# We internally order the limbs in little-endian
# So the least significant limb is limb[0]
# This is independent from the base type endianness.
#
# Constantine uses Nim generic integer to prevent mixing
# BigInts of different bitlength at compile-time and
# properly statically size the BigInt buffers.
#
# To avoid code-bloat due to monomorphization (i.e. duplicating code per announced bitlength)
# actual computation is deferred to type-erased routines.

import
  ../primitives/constant_time,
  ../primitives/extended_precision,
  ../config/common
from sugar import distinctBase

# ############################################################
#
#                BigInts type-erased API
#
# ############################################################

# The "checked" API is exported as a building blocks
# with enforced compile-time checking of BigInt bitsize
# and memory ownership.
#
# The "raw" compute API uses views to avoid code duplication
# due to generic/static monomorphization.
#
# The "checked" API is a thin wrapper above the "raw" API to get the best of both world:
# - small code footprint
# - compiler enforced checks: types, bitsizes
# - compiler enforced memory: stack allocation and buffer ownership

type
  BigIntView* = ptr object
    ## Type-erased fixed-precision big integer
    ##
    ## This type mirrors the BigInt type and is used
    ## for the low-level computation API
    ## This design
    ## - avoids code bloat due to generic monomorphization
    ##   otherwise each bigint routines would have an instantiation for
    ##   each static `bits` parameter.
    ## - while not forcing the caller to preallocate computation buffers
    ##   for the high-level API and enforcing bitsizes
    ## - avoids runtime bound-checks on the view
    ##   for performance
    ##   and to ensure exception-free code
    ##   even when compiled in non "-d:danger" mode
    ##
    ## As with the BigInt type:
    ## - "bitLength" is the internal bitlength of the integer
    ##   This differs from the canonical bit-length as
    ##   Constantine word-size is smaller than a machine word.
    ##   This value should never be used as-is to prevent leaking secret data.
    ##   Computing this value requires constant-time operations.
    ##   Using this value requires converting it to the # of limbs in constant-time
    ##
    ## - "limbs" is an internal field that holds the internal representation
    ##   of the big integer. Least-significant limb first. Within limbs words are native-endian.
    ##
    ## This internal representation can be changed
    ## without notice and should not be used by external applications or libraries.
    ##
    ## Accesses should be done via BigIntViewConst / BigIntViewConst
    ## to have the compiler check for mutability
    bitLength: uint32
    limbs: UncheckedArray[Word]

  # "Indirection" to enforce pointer types deep immutability
  BigIntViewConst* = distinct BigIntView
    ## Immutable view into a BigInt
  BigIntViewMut* = distinct BigIntView
    ## Mutable view into a BigInt
  BigIntViewAny* = BigIntViewConst or BigIntViewMut

# No exceptions allowed
{.push raises: [].}

# ############################################################
#
#                 Deep Mutability safety
#
# ############################################################

template `[]`*(v: BigIntViewConst, limbIdx: int): Word =
  distinctBase(type v)(v).limbs[limbIdx]

template `[]`*(v: BigIntViewMut, limbIdx: int): var Word =
  distinctBase(type v)(v).limbs[limbIdx]

template `[]=`*(v: BigIntViewMut, limbIdx: int, val: Word) =
  distinctBase(type v)(v).limbs[limbIdx] = val

template bitSizeof(v: BigIntViewAny): uint32 =
  bind BigIntView
  distinctBase(type v)(v).bitLength

const divShiftor = log2(WordPhysBitSize)
template numLimbs*(v: BigIntViewAny): int =
  ## Compute the number of limbs from
  ## the **internal** bitlength
  (bitSizeof(v).int + WordPhysBitSize - 1) shr divShiftor

template setBitLength(v: BigIntViewMut, internalBitLength: uint32) =
  distinctBase(type v)(v).bitLength = internalBitLength

# TODO: Check if repeated v.numLimbs calls are optimized away

template `[]`*(v: BigIntViewConst, limbIdxFromEnd: BackwardsIndex): Word =
  distinctBase(type v)(v).limbs[numLimbs(v).int - int limbIdxFromEnd]

template `[]`*(v: BigIntViewMut, limbIdxFromEnd: BackwardsIndex): var Word =
  distinctBase(type v)(v).limbs[numLimbs(v).int - int limbIdxFromEnd]

template `[]=`*(v: BigIntViewMut, limbIdxFromEnd: BackwardsIndex, val: Word) =
  distinctBase(type v)(v).limbs[numLimbs(v).int - int limbIdxFromEnd] = val

# ############################################################
#
#           Checks and debug/test only primitives
#
# ############################################################

template checkMatchingBitlengths(a, b: distinct BigIntViewAny) =
  ## Check that bitlengths of bigints match
  ## This is only checked
  ## with "-d:debugConstantine" and when assertions are on.
  debug:
    assert distinctBase(type a)(a).bitLength ==
      distinctBase(type b)(b).bitLength, "Internal Error: operands bitlength do not match"

template checkValidModulus(m: BigIntViewConst) =
  ## Check that the modulus is valid
  ## The check is approximate, it only checks that
  ## the most-significant words is non-zero instead of
  ## checking that the last announced bit is 1.
  ## This is only checked
  ## with "-d:debugConstantine" and when assertions are on.
  debug:
    assert not isZero(m[^1]).bool, "Internal Error: the modulus must use all declared bits"

template checkOddModulus(m: BigIntViewConst) =
  ## CHeck that the modulus is odd
  ## and valid for use in the Montgomery n-residue representation
  debug:
    assert bool(BaseType(m[0]) and 1), "Internal Error: the modulus must be odd to use the Montgomery representation."

template checkWordShift(k: int) =
  ## Checks that the shift is less than the word bit size
  debug:
    assert k <= WordBitSize, "Internal Error: the shift must be less than the word bit size"

template checkPowScratchSpaceLen(len: int) =
  ## Checks that there is a minimum of scratchspace to hold the temporaries
  debug:
    assert len >= 2, "Internal Error: the scratchspace for powmod should be equal or greater than 2"

debug:
  func `$`*(a: BigIntViewAny): string =
    let len = a.numLimbs()
    result = "["
    for i in 0 ..< len - 1:
      result.add $a[i]
      result.add ", "
    result.add $a[len-1]
    result.add "] ("
    result.add $a.bitSizeof
    result.add " bits)"

# ############################################################
#
#                    BigInt primitives
#
# ############################################################

func isZero*(a: BigIntViewAny): CTBool[Word] =
  ## Returns true if a big int is equal to zero
  var accum: Word
  for i in 0 ..< a.numLimbs():
    accum = accum or a[i]
  result = accum.isZero()

func setZero*(a: BigIntViewMut) =
  ## Set a BigInt to 0
  ## It's bit size is unchanged
  zeroMem(a[0].unsafeAddr, a.numLimbs() * sizeof(Word))

func cmov*(a: BigIntViewMut, b: BigIntViewAny, ctl: CTBool[Word]) =
  ## Constant-time conditional copy
  ## If ctl is true: b is copied into a
  ## if ctl is false: b is not copied and a is untouched
  ## Time and memory accesses are the same whether a copy occurs or not
  checkMatchingBitlengths(a, b)
  for i in 0 ..< a.numLimbs():
    a[i] = ctl.mux(b[i], a[i])

# The arithmetic primitives all accept a control input that indicates
# if it is a placebo operation. It stills performs the
# same memory accesses to be side-channel attack resistant.

func add*(a: BigIntViewMut, b: BigIntViewAny, ctl: CTBool[Word]): CTBool[Word] =
  ## Constant-time big integer in-place optional addition
  ## The addition is only performed if ctl is "true"
  ## The result carry is always computed.
  ##
  ## a and b MAY be the same buffer
  ## a and b MUST have the same announced bitlength (i.e. `bits` static parameters)
  checkMatchingBitlengths(a, b)

  for i in 0 ..< a.numLimbs():
    let new_a = a[i] + b[i] + Word(result)
    result = new_a.isMsbSet()
    a[i] = ctl.mux(new_a.mask(), a[i])

func sub*(a: BigIntViewMut, b: BigIntViewAny, ctl: CTBool[Word]): CTBool[Word] =
  ## Constant-time big integer in-place optional substraction
  ## The substraction is only performed if ctl is "true"
  ## The result carry is always computed.
  ##
  ## a and b MAY be the same buffer
  ## a and b MUST have the same announced bitlength (i.e. `bits` static parameters)
  checkMatchingBitlengths(a, b)

  for i in 0 ..< a.numLimbs():
    let new_a = a[i] - b[i] - Word(result)
    result = new_a.isMsbSet()
    a[i] = ctl.mux(new_a.mask(), a[i])

func dec*(a: BigIntViewMut, w: Word): CTBool[Word] =
  ## Decrement a big int by a small word
  # returns the result carry

  a[0] -= w
  result = a[0].isMsbSet()
  a[0] = a[0].mask()
  for i in 1 ..< a.numLimbs():
    a[i] -= Word(result)
    result = a[i].isMsbSet()
    a[i] = a[i].mask()

func shiftRight*(a: BigIntViewMut, k: int) =
  ## Shift right by k.
  ##
  ## k MUST be less than the base word size (2^31 or 2^63)
  # We don't reuse shr for this in-place operation
  # Do we need to return the shifted out part?
  #
  # Note: for speed, loading a[i] and a[i+1]
  #       instead of a[i-1] and a[i]
  #       is probably easier to parallelize for the compiler
  #       (antidependence WAR vs loop-carried dependence RAW)
  checkWordShift(k)

  let len = a.numLimbs()
  for i in 0 ..< len-1:
    a[i] = (a[i] shr k) or mask(a[i+1] shl (WordBitSize - k))
  a[len-1] = a[len-1] shr k

# ############################################################
#
#                   Modular BigInt
#
# ############################################################

func shlAddMod(a: BigIntViewMut, c: Word, M: BigIntViewConst) =
  ## Fused modular left-shift + add
  ## Shift input `a` by a word and add `c` modulo `M`
  ##
  ## With a word W = 2^WordBitSize and a modulus M
  ## Does a <- a * W + c (mod M)
  ##
  ## The modulus `M` MUST announced most-significant bit must be set.
  checkValidModulus(M)

  let aLen = a.numLimbs()
  let mBits = bitSizeof(M)

  if mBits <= WordBitSize:
    # If M fits in a single limb
    var q: Word

    # (hi, lo) = a * 2^63 + c
    let hi = a[0] shr 1                   # 64 - 63 = 1
    let lo = (a[0] shl WordBitSize) or c  # Assumes most-significant bit in c is not set
    unsafeDiv2n1n(q, a[0], hi, lo, M[0])  # (hi, lo) mod M
    return

  else:
    ## Multiple limbs
    let hi = a[^1]                                          # Save the high word to detect carries
    let R = mBits and WordBitSize                       # R = mBits mod 64

    var a0, a1, m0: Word
    if R == 0:                                              # If the number of mBits is a multiple of 64
      a0 = a[^1]                                        #
      moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof) # we can just shift words
      a[0] = c                                              # and replace the first one by c
      a1 = a[^1]
      m0 = M[^1]
    else:                                                   # Else: need to deal with partial word shifts at the edge.
      a0 = mask((a[^1] shl (WordBitSize-R)) or (a[^2] shr R))
      moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof)
      a[0] = c
      a1 = mask((a[^1] shl (WordBitSize-R)) or (a[^2] shr R))
      m0 = mask((M[^1] shl (WordBitSize-R)) or (M[^2] shr R))

    # m0 has its high bit set. (a0, a1)/p0 fits in a limb.
    # Get a quotient q, at most we will be 2 iterations off
    # from the true quotient

    let
      a_hi = a0 shr 1                              # 64 - 63 = 1
      a_lo = (a0 shl WordBitSize) or a1
    var q, r: Word
    unsafeDiv2n1n(q, r, a_hi, a_lo, m0)            # Estimate quotient
    q = mux(                                       # If n_hi == divisor
          a0 == m0, MaxWord,                       # Quotient == MaxWord (0b0111...1111)
          mux(
            q.isZero, Zero,                        # elif q == 0, true quotient = 0
            q - One                                # else instead of being of by 0, 1 or 2
          )                                        # we returning q-1 to be off by -1, 0 or 1
        )

    # Now substract a*2^63 - q*p
    var carry = Zero
    var over_p = CtTrue                            # Track if quotient greater than the modulus

    for i in 0 ..< M.numLimbs():
      var qp_lo: Word

      block: # q*p
        # q * p + carry (doubleword) carry from previous limb
        let qp = unsafeExtPrecMul(q, M[i]) + carry.DoubleWord
        carry = Word(qp shr WordBitSize)           # New carry: high digit besides LSB
        qp_lo = qp.Word.mask()                     # Normalize to u63

      block: # a*2^63 - q*p
        a[i] -= qp_lo
        carry += Word(a[i].isMsbSet)               # Adjust if borrow
        a[i] = a[i].mask()                        # Normalize to u63

      over_p = mux(
                a[i] == M[i], over_p,
                a[i] > M[i]
              )

    # Fix quotient, the true quotient is either q-1, q or q+1
    #
    # if carry < q or carry == q and over_p we must do "a -= p"
    # if carry > hi (negative result) we must do "a += p"

    let neg = carry > hi
    let tooBig = not neg and (over_p or (carry < hi))

    discard a.add(M, ctl = neg)
    discard a.sub(M, ctl = tooBig)
    return

func reduce*(r: BigIntViewMut, a: BigIntViewAny, M: BigIntViewConst) =
  ## Reduce `a` modulo `M` and store the result in `r`
  ##
  ## The modulus `M` MUST announced most-significant bit must be set.
  ## The result `r` buffer size MUST be at least the size of `M` buffer
  ##
  ## CT: Depends only on the bitlength of `a` and the modulus `M`

  # Note: for all cryptographic intents and purposes the modulus is known at compile-time
  # but we don't want to inline it as it would increase codesize, better have Nim
  # pass a pointer+length to a fixed session of the BSS.
  checkValidModulus(M)

  let aBits = bitSizeof(a)
  let mBits = bitSizeof(M)
  let aLen = a.numLimbs()

  r.setBitLength(bitSizeof(M))

  if aBits < mBits:
    # if a uses less bits than the modulus,
    # it is guaranteed < modulus.
    # This relies on the precondition that the modulus uses all declared bits
    copyMem(r[0].addr, a[0].unsafeAddr, aLen * sizeof(Word))
    for i in aLen ..< r.numLimbs():
      r[i] = Zero
  else:
    # a length i at least equal to the modulus.
    # we can copy modulus.limbs-1 words
    # and modular shift-left-add the rest
    let mLen = M.numLimbs()
    let aOffset = aLen - mLen
    copyMem(r[0].addr, a[aOffset+1].unsafeAddr, (mLen-1) * sizeof(Word))
    r[^1] = Zero
    # Now shift-left the copied words while adding the new word modulo M
    for i in countdown(aOffset, 0):
      r.shlAddMod(a[i], M)

# ############################################################
#
#                 Montgomery Arithmetic
#
# ############################################################

template wordMul(a, b: Word): Word =
  mask(a * b)

func montyMul*(
       r: BigIntViewMut, a, b: distinct BigIntViewAny,
       M: BigIntViewConst, negInvModWord: Word) =
  ## Compute r <- a*b (mod M) in the Montgomery domain
  ## `negInvModWord` = -1/M (mod Word). Our words are 2^31 or 2^63
  ##
  ## This resets r to zero before processing. Use {.noInit.}
  ## to avoid duplicating with Nim zero-init policy
  ## The result `r` buffer size MUST be at least the size of `M` buffer
  ##
  ##
  ## Assuming 63-bit wors, the magic constant should be:
  ##
  ## - µ ≡ -1/M[0] (mod 2^63) for a general multiplication
  ##   This can be precomputed with `negInvModWord`
  ## - 1 for conversion from Montgomery to canonical representation
  ##   The library implements a faster `redc` primitive for that use-case
  ## - R^2 (mod M) for conversion from canonical to Montgomery representation
  ##
  # i.e. c'R <- a'R b'R * R^-1 (mod M) in the natural domain
  # as in the Montgomery domain all numbers are scaled by R

  checkValidModulus(M)
  checkOddModulus(M)
  checkMatchingBitlengths(a, M)
  checkMatchingBitlengths(b, M)

  let nLen = M.numLimbs()
  r.setBitLength(bitSizeof(M))
  setZero(r)

  var r_hi = Zero   # represents the high word that is used in intermediate computation before reduction mod M
  for i in 0 ..< nLen:

    let zi = (r[0] + wordMul(a[i], b[0])).wordMul(negInvModWord)
    var carry = Zero

    for j in 0 ..< nLen:
      let z = DoubleWord(r[j]) + unsafeExtPrecMul(a[i], b[j]) +
              unsafeExtPrecMul(zi, M[j]) + DoubleWord(carry)
      carry = Word(z shr WordBitSize)
      if j != 0:
        r[j-1] = Word(z).mask()

    r_hi += carry
    r[^1] = r_hi.mask()
    r_hi = r_hi shr WordBitSize

  # If the extra word is not zero or if r-M does not borrow (i.e. r > M)
  # Then substract M
  discard r.sub(M, r_hi.isNonZero() or not r.sub(M, CtFalse))

func redc*(r: BigIntViewMut, a: BigIntViewAny, one, N: BigIntViewConst, negInvModWord: Word) {.inline.} =
  ## Transform a bigint ``a`` from it's Montgomery N-residue representation (mod N)
  ## to the regular natural representation (mod N)
  ##
  ## with W = N.numLimbs()
  ## and R = (2^WordBitSize)^W
  ##
  ## Does "a * R^-1 (mod N)"
  ##
  ## This is called a Montgomery Reduction
  ## The Montgomery Magic Constant is µ = -1/N mod N
  ## is used internally and can be precomputed with negInvModWord(Curve)
  # References:
  #   - https://eprint.iacr.org/2017/1057.pdf (Montgomery)
  #     page: Radix-r interleaved multiplication algorithm
  #   - https://en.wikipedia.org/wiki/Montgomery_modular_multiplication#Montgomery_arithmetic_on_multiprecision_(variable-radix)_integers
  #   - http://langevin.univ-tln.fr/cours/MLC/extra/montgomery.pdf
  #     Montgomery original paper
  #
  checkValidModulus(N)
  checkOddModulus(N)
  checkMatchingBitlengths(a, N)

  # TODO: This is a Montgomery multiplication by 1 and can be specialized
  montyMul(r, a, one, N, negInvModWord)

func montyResidue*(
       r: BigIntViewMut, a: BigIntViewAny,
       N, r2modN: BigIntViewConst, negInvModWord: Word) {.inline.} =
  ## Transform a bigint ``a`` from it's natural representation (mod N)
  ## to a the Montgomery n-residue representation
  ##
  ## Montgomery-Multiplication - based
  ##
  ## with W = N.numLimbs()
  ## and R = (2^WordBitSize)^W
  ##
  ## Does "a * R (mod N)"
  ##
  ## `a`: The source BigInt in the natural representation. `a` in [0, N) range
  ## `N`: The field modulus. N must be odd.
  ## `r2modN`: 2^WordBitSize mod `N`. Can be precomputed with `r2mod` function
  ##
  ## Important: `r` is overwritten
  ## The result `r` buffer size MUST be at least the size of `M` buffer
  # Reference: https://eprint.iacr.org/2017/1057.pdf
  checkValidModulus(N)
  checkOddModulus(N)
  checkMatchingBitlengths(a, N)

  montyMul(r, a, r2ModN, N, negInvModWord)

# Montgomery Modular Exponentiation
# ------------------------------------------
# We use fixed-window based exponentiation
# that is constant-time: i.e. the number of multiplications
# does not depend on the number of set bits in the exponents
# those are always done and conditionally copied.
#
# TODO: analyze cost difference with naive exponentiation
#       with n being the number of words to represent a number in Fp
#       and k the window-size
#       - we always multiply even for unused multiplications
#       - conditional copy only save a small fraction of time
#         (multiplication O(n²), cmov O(n), doing nothing i.e. non constant-time O(n))
#       - Table lookup is O(kn) copy time since we need to access the whole table to
#         defeat cache attacks. Without windows, we don't have table lookups at all.
#
# The exponent MUST NOT be private data (until audited otherwise)
# - Power attack on RSA, https://www.di.ens.fr/~fouque/pub/ches06.pdf
# - Flush-and-reload on Sliding window exponentiation: https://tutcris.tut.fi/portal/files/8966761/p1639_pereida_garcia.pdf
# - Sliding right into disaster, https://eprint.iacr.org/2017/627.pdf
# - Fixed window leak: https://www.scirp.org/pdf/JCC_2019102810331929.pdf
# - Constructing sliding-windows leak, https://easychair.org/publications/open/fBNC
#
# For pairing curves, this is the case since exponentiation is only
# used for inversion via the Little Fermat theorem.
# For RSA, some exponentiations uses private exponents.
#
# Note:
# - Implementation closely follows Thomas Pornin's BearSSL
# - Apache Milagro Crypto has an alternative implementation
#   that is more straightforward however:
#   - the exponent hamming weight is used as loop bounds
#   - the base^k is stored at each index of a temp table of size k
#   - the base^k to use is indexed by the hamming weight
#     of the exponent, leaking this to cache attacks
#   - in contrast BearSSL touches the whole table to
#     hide the actual selection
#
# Directly using the Hamming weight would probably
# significantly improve pairing-friendly curves as
# they are chosen for their low Hamming-Weight (see BLS12-381 x factor)
# --> Expose an exponent-leaky powMod?
#     If so, create distinct type for leaked bits and BigInt
#     so that sensitive data use is compiler-checked

func getWindowLen(bufLen: int): uint =
  ## Compute the maximum window size that fits in the scratchspace buffer
  checkPowScratchSpaceLen(bufLen)
  result = 5
  while (1 shl result) + 1 > bufLen:
    dec result

func montyPowPrologue(
       a: BigIntViewMut, M, one: BigIntViewConst,
       negInvModWord: Word,
       scratchspace: openarray[BigIntViewMut]
     ): tuple[window: uint, bigIntSize: int] {.inline.}=
  # Due to the high number of parameters,
  # forcing this inline actually reduces the code size

  result.window = scratchspace.len.getWindowLen()
  result.bigIntSize = a.numLimbs() * sizeof(Word) + sizeof(BigIntView.bitLength)

  # Precompute window content, special case for window = 1
  # (i.e scratchspace has only space for 2 temporaries)
  # The content scratchspace[2+k] is set at a^k
  # with scratchspace[0] untouched
  if result.window == 1:
    copyMem(pointer scratchspace[1], pointer a, result.bigIntSize)
  else:
    copyMem(pointer scratchspace[2], pointer a, result.bigIntSize)
    for k in 2 ..< 1 shl result.window:
      scratchspace[k+1].montyMul(scratchspace[k], a, M, negInvModWord)

    scratchspace[1].setBitLength(bitSizeof(M))

  # Set a to one
  copyMem(pointer a, pointer one, result.bigIntSize)

func montyPowSquarings(
        a: BigIntViewMut,
        exponent: openarray[byte],
        M: BigIntViewConst,
        negInvModWord: Word,
        tmp: BigIntViewMut,
        window: uint,
        bigIntSize: int,
        acc, acc_len: var uint,
        e: var int,
      ): tuple[k, bits: uint] {.inline.}=
  ## Squaring step of exponentiation by squaring
  ## Get the next k bits in range [1, window)
  ## Square k times
  ## Returns the number of squarings done and the corresponding bits
  ##
  ## Updates iteration variables and accumulators
  # Due to the high number of parameters,
  # forcing this inline actually reduces the code size

  # Get the next bits
  var k = window
  if acc_len < window:
    if e < exponent.len:
      acc = (acc shl 8) or exponent[e].uint
      inc e
      acc_len += 8
    else: # Drained all exponent bits
      k = acc_len

  let bits = (acc shr (acc_len - k)) and ((1'u32 shl k) - 1)
  acc_len -= k

  # We have k bits and can do k squaring
  for i in 0 ..< k:
    tmp.montyMul(a, a, M, negInvModWord)
    copyMem(pointer a, pointer tmp, bigIntSize)

  return (k, bits)

func montyPow*(
       a: BigIntViewMut,
       exponent: openarray[byte],
       M, one: BigIntViewConst,
       negInvModWord: Word,
       scratchspace: openarray[BigIntViewMut]
      ) =
  ## Modular exponentiation r = a^exponent mod M
  ## in the Montgomery domain
  ##
  ## This uses fixed-window optimization if possible
  ##
  ## - On input ``a`` is the base, on ``output`` a = a^exponent (mod M)
  ##   ``a`` is in the Montgomery domain
  ## - ``exponent`` is the exponent in big-endian canonical format (octet-string)
  ##   Use ``exportRawUint`` for conversion
  ## - ``M`` is the modulus
  ## - ``one`` is 1 (mod M) in montgomery representation
  ## - ``negInvModWord`` is the montgomery magic constant "-1/M[0] mod 2^WordBitSize"
  ## - ``scratchspace`` with k the window bitsize of size up to 5
  ##   This is a buffer that can hold between 2^k + 1 big-ints
  ##   A window of of 1-bit (no window optimization) requires only 2 big-ints
  ##
  ## Note that the best window size require benchmarking and is a tradeoff between
  ## - performance
  ## - stack usage
  ## - precomputation
  ##
  ## For example BLS12-381 window size of 5 is 30% faster than no window,
  ## but windows of size 2, 3, 4 bring no performance benefit, only increased stack space.
  ## A window of size 5 requires (2^5 + 1)*(381 + 7)/8 = 33 * 48 bytes = 1584 bytes
  ## of scratchspace (on the stack).

  let (window, bigIntSize) = montyPowPrologue(a, M, one, negInvModWord, scratchspace)

  # We process bits with from most to least significant.
  # At each loop iteration with have acc_len bits in acc.
  # To maintain constant-time the number of iterations
  # or the number of operations or memory accesses should be the same
  # regardless of acc & acc_len
  var
    acc, acc_len: uint
    e = 0
  while acc_len > 0 or e < exponent.len:
    let (k, bits) = montyPowSquarings(
      a, exponent, M, negInvModWord,
      scratchspace[0], window, bigIntSize,
      acc, acc_len, e
    )

    # Window lookup: we set scratchspace[1] to the lookup value.
    # If the window length is 1, then it's already set.
    if window > 1:
      # otherwise we need a constant-time lookup
      # in particular we need the same memory accesses, we can't
      # just index the openarray with the bits to avoid cache attacks.
      for i in 1 ..< 1 shl k:
        let ctl = Word(i) == Word(bits)
        scratchspace[1].cmov(scratchspace[1+i], ctl)

    # Multiply with the looked-up value
    # we keep the product only if the exponent bits are not all zero
    scratchspace[0].montyMul(a, scratchspace[1], M, negInvModWord)
    a.cmov(scratchspace[0], Word(bits) != Zero)

func montyPowUnsafeExponent*(
       a: BigIntViewMut,
       exponent: openarray[byte],
       M, one: BigIntViewConst,
       negInvModWord: Word,
       scratchspace: openarray[BigIntViewMut]
      ) =
  ## Modular exponentiation r = a^exponent mod M
  ## in the Montgomery domain
  ##
  ## Warning ⚠️ :
  ## This is an optimization for public exponent
  ## Otherwise bits of the exponent can be retrieved with:
  ## - memory access analysis
  ## - power analysis
  ## - timing analysis

  # TODO: scratchspace[1] is unused when window > 1

  let (window, bigIntSize) = montyPowPrologue(
         a, M, one, negInvModWord, scratchspace)

  var
    acc, acc_len: uint
    e = 0
  while acc_len > 0 or e < exponent.len:
    let (k, bits) = montyPowSquarings(
      a, exponent, M, negInvModWord,
      scratchspace[0], window, bigIntSize,
      acc, acc_len, e
    )

    ## Warning ⚠️: Exposes the exponent bits
    if bits != 0:
      if window > 1:
        scratchspace[0].montyMul(a, scratchspace[1+bits], M, negInvModWord)
      else:
        # scratchspace[1] holds the original `a`
        scratchspace[0].montyMul(a, scratchspace[1], M, negInvModWord)
      copyMem(pointer a, pointer scratchspace[0], bigIntSize)