nim-stint/stint/private/primitives/extended_precision_x86_64_m...

# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
  ../datatypes,
  ./addcarry_subborrow

# ############################################################
#
#      Extended precision primitives for X86-64 on MSVC
#
# ############################################################

static:
  doAssert defined(vcc)
  doAssert sizeof(int) == 8
  doAssert X86

func udiv128(highDividend, lowDividend, divisor: Ct[uint64], remainder: var Ct[uint64]): Ct[uint64] {.importc:"_udiv128", header: "<intrin.h>", nodecl.}
  ## Division 128 by 64, Microsoft only, 64-bit only,
  ## returns quotient as return value remainder as var parameter
  ## Warning ⚠️ :
  ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
  ##   - if n_hi > d result is undefined

func umul128(a, b: Ct[uint64], hi: var Ct[uint64]): Ct[uint64] {.importc:"_umul128", header:"<intrin.h>", nodecl.}
  ## (hi, lo) <-- a * b
  ## Return value is the low word

func div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
    ## Division uint128 by uint64
    ## Warning ⚠️ :
    ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
    ##   - if n_hi > d result is undefined
    q = udiv128(n_hi, n_lo, d, r)

func mul_128*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} =
  ## Extended precision multiplication
  ## (hi, lo) <- a*b
  lo = umul128(a, b, hi)

func muladd1_128*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} =
  ## Extended precision multiplication + addition
  ## (hi, lo) <- a*b + c
  ##
  ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
  ##       so adding any c cannot overflow
  var carry: Carry
  lo = umul128(a, b, hi)
  addC(carry, lo, lo, c, Carry(0))
  addC(carry, hi, hi, 0, carry)

func muladd2_128*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}=
  ## Extended precision multiplication + addition + addition
  ## This is constant-time on most hardware except some specific one like Cortex M0
  ## (hi, lo) <- a*b + c1 + c2
  ##
  ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
  ##       so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
  ##       and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
  # For speed this could be implemented with parallel pipelined carry chains
  # via MULX + ADCX + ADOX
  var carry1, carry2: Carry

  lo = umul128(a, b, hi)
  # Carry chain 1
  addC(carry1, lo, lo, c1, Carry(0))
  addC(carry1, hi, hi, 0, carry1)
  # Carry chain 2
  addC(carry2, lo, lo, c2, Carry(0))
  addC(carry2, hi, hi, 0, carry2)
Implement types and uint128 primitives 2020-06-12 16:37:02 +00:00			`# Stint`
			`# Copyright 2018 Status Research & Development GmbH`
			`# Licensed under either of`
			`#`
			`# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)`
			`# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)`
			`#`
			`# at your option. This file may not be copied, modified, or distributed except according to those terms.`

			`import`
			`../datatypes,`
			`./addcarry_subborrow`

			`# ############################################################`
			`#`
			`# Extended precision primitives for X86-64 on MSVC`
			`#`
			`# ############################################################`

			`static:`
			`doAssert defined(vcc)`
			`doAssert sizeof(int) == 8`
			`doAssert X86`

			`func udiv128(highDividend, lowDividend, divisor: Ct[uint64], remainder: var Ct[uint64]): Ct[uint64] {.importc:"_udiv128", header: "<intrin.h>", nodecl.}`
			`## Division 128 by 64, Microsoft only, 64-bit only,`
			`## returns quotient as return value remainder as var parameter`
			`## Warning ⚠️ :`
			`## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE`
			`## - if n_hi > d result is undefined`

			`func umul128(a, b: Ct[uint64], hi: var Ct[uint64]): Ct[uint64] {.importc:"_umul128", header:"<intrin.h>", nodecl.}`
			`## (hi, lo) <-- a * b`
			`## Return value is the low word`

			`func div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=`
			`## Division uint128 by uint64`
			`## Warning ⚠️ :`
			`## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE`
			`## - if n_hi > d result is undefined`
			`q = udiv128(n_hi, n_lo, d, r)`

uint division - compile and pass the single limb tests 2022-01-22 00:42:54 +00:00			`func mul_128*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} =`
Implement types and uint128 primitives 2020-06-12 16:37:02 +00:00			`## Extended precision multiplication`
			`## (hi, lo) <- a*b`
			`lo = umul128(a, b, hi)`

uint division - compile and pass the single limb tests 2022-01-22 00:42:54 +00:00			`func muladd1_128*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} =`
Implement types and uint128 primitives 2020-06-12 16:37:02 +00:00			`## Extended precision multiplication + addition`
			`## (hi, lo) <- a*b + c`
			`##`
			`## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)`
			`## so adding any c cannot overflow`
			`var carry: Carry`
			`lo = umul128(a, b, hi)`
			`addC(carry, lo, lo, c, Carry(0))`
			`addC(carry, hi, hi, 0, carry)`

uint division - compile and pass the single limb tests 2022-01-22 00:42:54 +00:00			`func muladd2_128*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}=`
Implement types and uint128 primitives 2020-06-12 16:37:02 +00:00			`## Extended precision multiplication + addition + addition`
			`## This is constant-time on most hardware except some specific one like Cortex M0`
			`## (hi, lo) <- a*b + c1 + c2`
			`##`
			`## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)`
			`## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)`
			`## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing`
			`# For speed this could be implemented with parallel pipelined carry chains`
			`# via MULX + ADCX + ADOX`
			`var carry1, carry2: Carry`

			`lo = umul128(a, b, hi)`
			`# Carry chain 1`
			`addC(carry1, lo, lo, c1, Carry(0))`
			`addC(carry1, hi, hi, 0, carry1)`
			`# Carry chain 2`
			`addC(carry2, lo, lo, c2, Carry(0))`
			`addC(carry2, hi, hi, 0, carry2)`