nim-stint/stint/private/primitives/extended_precision.nim

# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# ############################################################
#
#              Extended precision primitives
#
# ############################################################

import
  ../datatypes,
  ./addcarry_subborrow

# ############################################################
#
#                      32-bit words
#
# ############################################################

func div2n1n*(q, r: var uint32, n_hi, n_lo, d: uint32) {.inline.}=
  ## Division of a uint64 (n_hi, n_lo) by a uint32 divisor
  ## Warning ⚠️:
  ##   - if n_hi == d, the quotient does not fit in a uint32
  ##   - if n_hi > d, the result is undefined
  ##
  ## To avoid issues, n_hi, n_lo, d should be normalized,
  ## i.e. shifted (== multiplied by the same power of 2)
  ## so that the most significant bit of d is set.
  let dividend = (uint64(n_hi) shl 32) or uint64(n_lo)
  let divisor = uint64(d)
  q = uint32(dividend div divisor)
  r = uint32(dividend mod divisor)
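
# Minimal usage sketch: divide 0x1_00000000 by 0x80000000. The divisor
# already has its most significant bit set, so the inputs are normalized
# and n_hi (1) is strictly below d, as the warning above requires.
when isMainModule:
  block:
    var q, r: uint32
    div2n1n(q, r, 1'u32, 0'u32, 0x80000000'u32)
    doAssert q == 2 and r == 0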

func mul*(hi, lo: var uint32, a, b: uint32) {.inline.} =
  ## Extended precision multiplication
  ## (hi, lo) <- a*b
  let dblPrec = uint64(a) * uint64(b)
  lo = uint32(dblPrec)
  hi = uint32(dblPrec shr 32)
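
# Quick check of the squaring identity the notes below rely on:
# 0xFFFFFFFF² = 0xFFFFFFFE_00000001.
when isMainModule:
  block:
    var hi, lo: uint32
    mul(hi, lo, 0xFFFFFFFF'u32, 0xFFFFFFFF'u32)
    doAssert hi == 0xFFFFFFFE'u32 and lo == 1'u32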

func muladd1*(hi, lo: var uint32, a, b, c: uint32) {.inline.} =
  ## Extended precision multiplication + addition
  ## (hi, lo) <- a*b + c
  ##
  ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001)
  ## so adding any c cannot overflow
  let dblPrec = uint64(a) * uint64(b) + uint64(c)
  lo = uint32(dblPrec)
  hi = uint32(dblPrec shr 32)
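
# Worked check of the note above: 0xFFFFFFFF² = (hi: 0xFFFFFFFE, lo: 0x00000001),
# so even the maximal c = 0xFFFFFFFF cannot overflow the double-word result.
when isMainModule:
  block:
    var hi, lo: uint32
    muladd1(hi, lo, 0xFFFFFFFF'u32, 0xFFFFFFFF'u32, 0xFFFFFFFF'u32)
    doAssert hi == 0xFFFFFFFF'u32 and lo == 0'u32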

func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.}=
  ## Extended precision multiplication + addition + addition
  ## This is constant-time on most hardware, except on some specific
  ## platforms such as the Cortex-M0
  ## (hi, lo) <- a*b + c1 + c2
  ##
  ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001)
  ## so adding 0xFFFFFFFF leads to (hi: 0xFFFFFFFF, lo: 0x00000000)
  ## and we still have room to add another 0xFFFFFFFF without overflowing
  let dblPrec = uint64(a) * uint64(b) + uint64(c1) + uint64(c2)
  lo = uint32(dblPrec)
  hi = uint32(dblPrec shr 32)
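
# Worked check of the note above: after 0xFFFFFFFF² + 0xFFFFFFFF reaches
# (hi: 0xFFFFFFFF, lo: 0x00000000), a second 0xFFFFFFFF saturates the
# double-word result exactly, with no overflow.
when isMainModule:
  block:
    var hi, lo: uint32
    muladd2(hi, lo, 0xFFFFFFFF'u32, 0xFFFFFFFF'u32, 0xFFFFFFFF'u32, 0xFFFFFFFF'u32)
    doAssert hi == 0xFFFFFFFF'u32 and lo == 0xFFFFFFFF'u32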

# ############################################################
#
#                      64-bit words
#
# ############################################################

when sizeof(int) == 8 and not defined(Stint32):
  from ./compiletime_fallback import div2n1n_nim, mul_nim, muladd1_nim, muladd2_nim

  when defined(vcc):
    from ./extended_precision_x86_64_msvc import div2n1n_128, mul_128, muladd1_128, muladd2_128
  elif GCCCompatible:
    when X86:
      from ./extended_precision_x86_64_gcc import div2n1n_128
      from ./extended_precision_64bit_uint128 import mul_128, muladd1_128, muladd2_128
    else:
      from ./extended_precision_64bit_uint128 import div2n1n_128, mul_128, muladd1_128, muladd2_128

  func mul*(hi, lo: var uint64, u, v: uint64) {.inline.}=
    ## Extended precision multiplication
    ## (hi, lo) <- u * v
    when nimvm:
      mul_nim(hi, lo, u, v)
    else:
      mul_128(hi, lo, u, v)

  func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.}=
    ## Extended precision multiplication + addition
    ## (hi, lo) <- a*b + c
    ##
    ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001)
    ## so adding any c cannot overflow
    when nimvm:
      muladd1_nim(hi, lo, a, b, c)
    else:
      muladd1_128(hi, lo, a, b, c)

  func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
    ## Extended precision multiplication + addition + addition
    ## (hi, lo) <- a*b + c1 + c2
    ##
    ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001)
    ## so adding 0xFFFFFFFF_FFFFFFFF leads to (hi: 0xFFFFFFFF_FFFFFFFF, lo: 0x00000000_00000000)
    ## and we still have room to add another 0xFFFFFFFF_FFFFFFFF without overflowing
    when nimvm:
      muladd2_nim(hi, lo, a, b, c1, c2)
    else:
      muladd2_128(hi, lo, a, b, c1, c2)

  func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
    ## Division of a uint128 (n_hi, n_lo) by a uint64 divisor
    ## Warning ⚠️:
    ##   - if n_hi == d, the quotient does not fit in a uint64 and will trigger SIGFPE
    ##   - if n_hi > d, the result is undefined
    when nimvm:
      div2n1n_nim(q, r, n_hi, n_lo, d)
    else:
      div2n1n_128(q, r, n_hi, n_lo, d)
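
  # Minimal sanity sketch for the 64-bit primitives on a 64-bit target:
  # squaring the maximal uint64, then dividing 2^64 by 2^63 (d normalized,
  # n_hi < d, matching the warning above).
  when isMainModule:
    block:
      var hi, lo: uint64
      mul(hi, lo, 0xFFFFFFFF_FFFFFFFF'u64, 0xFFFFFFFF_FFFFFFFF'u64)
      doAssert hi == 0xFFFFFFFF_FFFFFFFE'u64 and lo == 1'u64

      var q, r: uint64
      div2n1n(q, r, 1'u64, 0'u64, 0x80000000_00000000'u64)
      doAssert q == 2 and r == 0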

# ############################################################
#
#                   Composite primitives
#
# ############################################################

func mulDoubleAdd2*[T: uint32|uint64](r2: var Carry, r1, r0: var T, a, b, c: T, dHi: Carry, dLo: T) {.inline.} =
  ## (r2, r1, r0) <- 2*a*b + c + (dHi, dLo)
  ## with r = (r2, r1, r0) a triple-word number
  ## and d = (dHi, dLo) a double-word number
  ## r2 and dHi are carries, either 0 or 1
  var carry: Carry

  # (r1, r0) <- a*b
  # Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001)
  mul(r1, r0, a, b)

  # (r2, r1, r0) <- 2*a*b
  # Doubling (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001) gives
  # (carry: 1, hi: 0xFFFFFFFF_FFFFFFFC, lo: 0x00000000_00000002)
  addC(carry, r0, r0, r0, Carry(0))
  addC(r2, r1, r1, r1, carry)

  # (r1, r0) <- (r1, r0) + c
  # Adding any single word cannot overflow into r2: for example, adding 2^64-1 gives
  # (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001)
  addC(carry, r0, r0, c, Carry(0))
  addC(carry, r1, r1, T(0), carry)

  # (r1, r0) <- (r1, r0) + (dHi, dLo) with dHi a carry (previous limb r2)
  # (dHi, dLo) is at most (dHi: 1, dLo: 0xFFFFFFFF_FFFFFFFF)
  # summing into (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001)
  # gives at most (carry: 1, hi: 0xFFFFFFFF_FFFFFFFF, lo: 0x00000000_00000000)
  addC(carry, r0, r0, dLo, Carry(0))
  addC(carry, r1, r1, T(dHi), carry)
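
# Small usage sketch with easily checked values:
# 2*a*b + c + (dHi, dLo) = 2*2*3 + 4 + 5 = 21, which fits in the low word.
when isMainModule:
  block:
    var r2: Carry
    var r1, r0: uint32
    mulDoubleAdd2(r2, r1, r0, 2'u32, 3'u32, 4'u32, Carry(0), 5'u32)
    doAssert r2 == Carry(0) and r1 == 0 and r0 == 21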

func mulAcc*[T: uint32|uint64](t, u, v: var T, a, b: T) {.inline.} =
  ## (t, u, v) <- (t, u, v) + a * b
  var UV: array[2, T]
  var carry: Carry
  mul(UV[1], UV[0], a, b)
  addC(carry, v, v, UV[0], Carry(0))
  addC(carry, u, u, UV[1], carry)
  t += T(carry)
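
# Usage sketch: accumulating the maximal uint32 square twice into (t, u, v),
# the multiply-accumulate pattern used when summing partial products.
when isMainModule:
  block:
    var t, u, v: uint32
    mulAcc(t, u, v, 0xFFFFFFFF'u32, 0xFFFFFFFF'u32)
    doAssert t == 0 and u == 0xFFFFFFFE'u32 and v == 1
    mulAcc(t, u, v, 0xFFFFFFFF'u32, 0xFFFFFFFF'u32)
    doAssert t == 1 and u == 0xFFFFFFFC'u32 and v == 2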