Poly1305 Message Authentication Code (#186)

* Groundwork for Poly1305 MAC * Implement fast reduction for Poly1305 * don't import assembly files when compiling without assembly
2025-02-22 08:58:05 +00:00 · 2022-03-05 23:39:24 +01:00 · 2022-03-05 23:39:24 +01:00 · 742cecce08
commit 742cecce08
parent c2eb42b769
12 changed files with 537 additions and 73 deletions
--- a/benchmarks/bench_poly1305.nim
+++ b/benchmarks/bench_poly1305.nim
@ -0,0 +1,65 @@
+import
+  # Internals
+  ../constantine/mac/mac_poly1305,
+  # Helpers
+  ../helpers/prng_unsafe,
+  ./bench_blueprint,
+  # C API
+  system/ansi_c
+
+# Zero-argument overload that forwards to the one-argument `separator` with 69.
+# The one-argument `separator` is not defined in this file
+# (presumably it comes from bench_blueprint — TODO confirm).
+proc separator*() = separator(69)
+
+# --------------------------------------------------------------------
+
+proc report(op: string, bytes: int, startTime, stopTime: MonoTime, startClk, stopClk: int64, iters: int) =
+  ## Print one benchmark result line for `op`.
+  ##
+  ## - `ns` is the elapsed time between `startTime` and `stopTime`
+  ##   divided by `iters`, in nanoseconds.
+  ## - throughput is derived from `ns` as operations per second.
+  ## - When `SupportsGetTicks` is true, the average cycle count per iteration
+  ##   (from `startClk`/`stopClk`) and the cycles per input byte (`bytes`)
+  ##   are printed as well.
+  let ns = inNanoseconds((stopTime-startTime) div iters)
+  let throughput = 1e9 / float64(ns)
+  when SupportsGetTicks:
+    let cycles = (stopClk - startClk) div iters
+    let cyclePerByte = cycles.float64 / bytes.float64
+    echo &"{op:<30}     {throughput:>15.3f} ops/s    {ns:>9} ns/op    {cycles:>10} cycles    {cyclePerByte:>5.2f} cycles/byte"
+  else:
+    echo &"{op:<30}     {throughput:>15.3f} ops/s    {ns:>9} ns/op"
+
+template bench(op: string, bytes: int, iters: int, body: untyped): untyped =
+  # Hand `body` to `measure` for `iters` iterations, then print the result
+  # with `report`. `bytes` is the message size used for the cycles/byte figure.
+  # `startTime`, `stopTime`, `startClk` and `stopClk` are passed by name to `measure`
+  # (presumably `measure` declares and fills them — TODO confirm in bench_blueprint).
+  measure(iters, startTime, stopTime, startClk, stopClk, body)
+  report(op, bytes, startTime, stopTime, startClk, stopClk, iters)
+
+proc benchPoly1305_constantine[T](msg: openarray[T], msgComment: string, iters: int) =
+  ## Benchmark `poly1305.auth` on `msg` for `iters` iterations.
+  ## `msgComment` is appended to the label printed for this run
+  ## and `msg.len` is reported as the number of bytes processed.
+  var tag: array[16, byte]
+  # Fixed 32-byte key, so every run authenticates with the same key material.
+  let ikm = [
+      byte 0x85, 0xd6, 0xbe, 0x78, 0x57, 0x55, 0x6d, 0x33,
+           0x7f, 0x44, 0x52, 0xfe, 0x42, 0xd5, 0x06, 0xa8,
+           0x01, 0x03, 0x80, 0x8a, 0xfb, 0x0d, 0xb2, 0xfd,
+           0x4a, 0xbf, 0xf6, 0xaf, 0x41, 0x49, 0xf5, 0x1b
+    ]
+  bench("Poly1305 - Constantine - " & msgComment, msg.len, iters):
+    poly1305.auth(tag, msg, ikm)
+
+when isMainModule:
+  proc main() =
+    # Benchmark Poly1305 on random messages from 32 bytes up to 100 MB.
+    # The iteration count shrinks as the message grows.
+    # The "MB" labels are decimal: 1MB is 1_000_000 bytes.
+    # `rng` is not declared in this file (presumably provided by an imported module — TODO confirm).
+    block:
+      let msg32B = rng.random_byte_seq(32)
+      benchPoly1305_constantine(msg32B, "32B", 100)
+    block:
+      let msg64B = rng.random_byte_seq(64)
+      benchPoly1305_constantine(msg64B, "64B", 100)
+    block:
+      let msg128B = rng.random_byte_seq(128)
+      benchPoly1305_constantine(msg128B, "128B", 100)
+    block:
+      let msg576B = rng.random_byte_seq(576)
+      benchPoly1305_constantine(msg576B, "576B", 50)
+    block:
+      let msg8192B = rng.random_byte_seq(8192)
+      benchPoly1305_constantine(msg8192B, "8192B", 25)
+    block:
+      let msg1MB = rng.random_byte_seq(1_000_000)
+      benchPoly1305_constantine(msg1MB, "1MB", 16)
+    block:
+      let msg10MB = rng.random_byte_seq(10_000_000)
+      benchPoly1305_constantine(msg10MB, "10MB", 16)
+    block:
+      let msg100MB = rng.random_byte_seq(100_000_000)
+      benchPoly1305_constantine(msg100MB, "100MB", 3)
+  main()
--- a/benchmarks/bench_sha256.nim
+++ b/benchmarks/bench_sha256.nim
@ -69,17 +69,33 @@ proc benchSHA256_openssl[T](msg: openarray[T], msgComment: string, iters: int) =
 when isMainModule:
  proc main() =
    block:
-      let msg128B = rng.random_byte_seq(32)
-      benchSHA256_constantine(msg128B, "32B", 32)
-      benchSHA256_openssl(msg128B, "32B", 32)
+      let msg32B = rng.random_byte_seq(32)
+      benchSHA256_constantine(msg32B, "32B", 100)
+      benchSHA256_openssl(msg32B, "32B", 100)
+    block:
+      let msg64B = rng.random_byte_seq(64)
+      benchSHA256_constantine(msg64B, "64B", 100)
+      benchSHA256_openssl(msg64B, "64B", 100)
    block:
      let msg128B = rng.random_byte_seq(128)
-      benchSHA256_constantine(msg128B, "128B", 128)
-      benchSHA256_openssl(msg128B, "128B", 128)
+      benchSHA256_constantine(msg128B, "128B", 100)
+      benchSHA256_openssl(msg128B, "128B", 100)
    block:
-      let msg5MB = rng.random_byte_seq(5_000_000)
-      benchSHA256_constantine(msg5MB, "5MB", 16)
-      benchSHA256_openssl(msg5MB, "5MB", 16)
+      let msg576B = rng.random_byte_seq(576)
+      benchSHA256_constantine(msg576B, "576B", 50)
+      benchSHA256_openssl(msg576B, "576B", 50)
+    block:
+      let msg8192B = rng.random_byte_seq(8192)
+      benchSHA256_constantine(msg8192B, "8192B", 25)
+      benchSHA256_openssl(msg8192B, "8192B", 25)
+    block:
+      let msg1MB = rng.random_byte_seq(1_000_000)
+      benchSHA256_constantine(msg1MB, "1MB", 16)
+      benchSHA256_openssl(msg1MB, "1MB", 16)
+    block:
+      let msg10MB = rng.random_byte_seq(10_000_000)
+      benchSHA256_constantine(msg10MB, "10MB", 16)
+      benchSHA256_openssl(msg10MB, "10MB", 16)
    block:
      let msg100MB = rng.random_byte_seq(100_000_000)
      benchSHA256_constantine(msg100MB, "100MB", 3)
--- a/constantine.nimble
+++ b/constantine.nimble
@ -192,6 +192,10 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
  # ----------------------------------------------------------
  ("tests/t_cipher_chacha20.nim", false),

+  # Message Authentication Code
+  # ----------------------------------------------------------
+  ("tests/t_mac_poly1305.nim", false),
+
  # Protocols
  # ----------------------------------------------------------
  ("tests/t_ethereum_evm_precompiles.nim", false),
--- a/constantine/hashes.nim
+++ b/constantine/hashes.nim
@ -58,7 +58,7 @@ func hash*[DigestSize: static int, T: char|byte](
 func hash*[T: char|byte](
       HashKind: type CryptoHash,
       message: openarray[T],
-       clearmem = false): array[HashKind.sizeInBytes, byte] =
+       clearmem = false): array[HashKind.sizeInBytes, byte] {.noInit.} =
  ## Produce a digest from a message
  HashKind.hash(result, message, clearMem)

--- a/constantine/hashes/h_sha256.nim
+++ b/constantine/hashes/h_sha256.nim
@ -51,10 +51,6 @@ type
 {.push raises: [].}
 {.push checks: off.}

-func setZero[N](a: var array[N, SomeNumber]){.inline.} =
-  for i in 0 ..< a.len:
-    a[i] = 0
-
 template rotr(x, n: uint32): uint32 =
  ## Rotate right the bits
  # We always use it with constants in 0 ..< 32
@ -272,24 +268,6 @@ func dumpHash(
    digest.dumpRawInt(H[i], dstIdx, bigEndian)
    dstIdx += uint sizeof(uint32)

-func copy[N: static int, T: byte|char](
-       dst: var array[N, byte],
-       dStart: SomeInteger,
-       src: openArray[T],
-       sStart: SomeInteger,
-       len: SomeInteger
-     ) =
-  ## Copy dst[dStart ..< dStart+len] = src[sStart ..< sStart+len]
-  ## Unlike the standard library, this cannot throw
-  ## even a defect.
-  ## It also handles copy of char into byte arrays
-  debug:
-    doAssert 0 <= dStart and dStart+len <= dst.len.uint
-    doAssert 0 <= sStart and sStart+len <= src.len.uint
-
-  for i in 0 ..< len:
-    dst[dStart + i] = byte src[sStart + i]
-
 func hashBuffer(ctx: var Sha256Context) =
  discard ctx.H.hashMessageBlocks(ctx.buf)
  ctx.buf.setZero()
@ -445,4 +423,7 @@ func clear*(ctx: var Sha256Context) =
  ## For passwords and secret keys, you MUST NOT use raw SHA-256
  ## use a Key Derivation Function instead (KDF)
  # TODO: ensure compiler cannot optimize the code away
+  ctx.H.setZero()
  ctx.buf.setZero()
+  ctx.msgLen = 0
+  ctx.bufIdx = 0
--- a/constantine/mac/mac_poly1305.nim
+++ b/constantine/mac/mac_poly1305.nim
@ -0,0 +1,355 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../platforms/abstractions,
+  ../math/arithmetic/bigints,
+  ../math/arithmetic/[limbs, limbs_extmul],
+  ../math/io/io_bigints
+
+when UseASM_X86_64:
+  import ../math/arithmetic/assembly/limbs_asm_modular_x86
+
+# No exceptions allowed
+{.push raises: [].}
+
+# ############################################################
+#
+#               Poly1305 Message Authentication Code
+#
+# ############################################################
+
+# TODO: instead of using a saturated representation,
+#       since there are 62 extra bits unused in the last limb
+#       use an unsaturated representation and remove all carry dependency chains.
+#       Given the number of add with carries, this would significantly
+#       improve instruction level parallelism.
+#
+#       Also vectorizing the code requires removing carry chains anyway.
+
+const P1305 = BigInt[130].fromHex"0x3fffffffffffffffffffffffffffffffb"
+
+func partialReduce_1305[N1, N2: static int](r: var Limbs[N1], a: Limbs[N2]) =
+  ## The prime 2¹³⁰-5 has a special form 2ᵐ-c
+  ## called "Crandall prime" or Pseudo-Mersenne Prime
+  ## in the literature
+  ## which allows fast reduction from the fact that
+  ##        2ᵐ-c ≡  0     (mod p)
+  ##   <=>  2ᵐ   ≡  c     (mod p)   [1]
+  ##   <=> a2ᵐ+b ≡ ac + b (mod p)
+  ## 
+  ## This partially reduces the input in range [0, 2¹³⁰)
+  ##
+  ## Parameters:
+  ## - `r`: output limbs (N1 words: statically asserted to be 3 with 64-bit words, 5 otherwise)
+  ## - `a`: input limbs (N2 words: statically asserted to be 4 with 64-bit words, 8 otherwise)
+  #
+  # Assuming 64-bit words,
+  #   N1 = 3 words (192-bit necessary for 2¹³⁰-1)
+  #   N2 = 4 words (256-bit necessary for 2¹³¹.2¹²⁴)
+  # Assuming 32-bit words,
+  #   N1 = 5 words (160-bit necessary for 2¹³⁰-1)
+  #   N2 = 8 words (256-bit necessary for 2¹³¹.2¹²⁴)
+  # 
+  # from 64-bit, starting from [1]
+  #   2ᵐ      ≡  c     (mod p)
+  #   2¹³⁰    ≡  5     (mod p)
+  # 2¹³⁰.2⁶²  ≡  5.2⁶² (mod p)
+  #   2¹⁹²    ≡  5.2⁶² (mod p)
+  # 
+  # Hence if we call a the [2¹⁹², 2²⁶⁰) range
+  # and b the [0, 2¹⁹²) range
+  # we have
+  # a2¹⁹²+b ≡ a.5.2⁶² + b (mod p)
+  # 
+  # Then we can handle the highest word which has
+  # 62 bits that should be folded back as well
+  # 
+  # Similarly for 32-bit
+  #   2¹⁶⁰    ≡  5.2³⁰ (mod p)
+  # and we need to fold back the top 30 bits
+  # 
+  # But there is a twist. 5.2⁶² need 65-bit not 64
+  # and 5.2³⁰ need 33-bit not 32
+
+  when WordBitwidth == 64:
+    static:
+      doAssert N1 == 3
+      doAssert N2 == 4
+    
+    block:
+      # First pass, fold everything greater than 2¹⁹²-1
+      # a2¹⁹²+b ≡ a.5.2⁶² + b (mod p)
+      #   scale by 5.2⁶¹ first as 5.2⁶² does not fit in 64-bit words
+      const c = SecretWord 5
+      const cExcess = c shl 61
+
+      var carry: Carry
+      var hi, lo: SecretWord
+      # (hi, lo) <- a[3] * 5.2⁶¹, then r <- a[0..2] + (hi, lo)
+      mul(hi, lo, a[3], cExcess)
+      addC(carry, r[0], lo, a[0], Carry(0))
+      addC(carry, r[1], hi, a[1], carry)
+      addC(carry, r[2], Zero, a[2], carry)
+      #   finally double to scale by 5.2⁶²
+      #   i.e. add (hi, lo) a second time instead of multiplying by 5.2⁶²
+      addC(carry, r[0], lo, r[0], Carry(0))
+      addC(carry, r[1], hi, r[1], carry)
+      addC(carry, r[2], Zero, r[2], carry)
+      # NOTE(review): the carry out of r[2] from both addition chains is dropped
+      # when this block ends — confirm the 192-bit sum cannot overflow
+      # for the inputs produced by the caller.
+  else:
+    static:
+      doAssert N1 == 5
+      doAssert N2 == 8
+    
+    block:
+      # First pass, fold everything greater than 2¹⁶⁰-1
+      # a2¹⁶⁰+b ≡ a.5.2³⁰ + b (mod p)
+      #   scale by 5.2²⁹ first as 5.2³⁰ does not fit in 32-bit words
+      const c = SecretWord 5
+      const cExcess = c shl 29
+
+      # Start from the low N1 words of the input
+      staticFor i, 0, N1:
+        r[i] = a[i]
+      
+      # Fold words 5, 6 and 7 onto word positions 0, 1 and 2 respectively.
+      # presumably mulDoubleAcc accumulates 2*a*b into the 3-word window,
+      # which provides the missing factor 2 — TODO confirm against its definition
+      mulDoubleAcc(r[2], r[1], r[0], a[5], cExcess)
+      mulDoubleAcc(r[3], r[2], r[1], a[6], cExcess)
+      mulDoubleAcc(r[4], r[3], r[2], a[7], cExcess)
+
+  const bits = 130
+  # Number of bits of the top limb above bit 130
+  # (62 with 64-bit words, 30 with 32-bit words, per the notes above)
+  const excessBits = wordsRequired(bits)*WordBitWidth - bits
+
+  # Second pass, fold everything greater than 2¹³⁰-1
+  # into the lower bits
+  var carry, carry2: Carry
+  # hi <- bits at position 130 and above, then clear them from the top limb
+  var hi = r[N1-1] shr (WordBitWidth - excessBits)
+  r[N1-1] = r[N1-1] and (MaxWord shr excessBits)
+  
+  # hi *= 5, with overflow stored in carry2
+  let hi4 = hi shl 2                   # Cannot overflow as we have 2 spare bits
+  addC(carry2, hi, hi, hi4, Carry(0))  # Use the carry bit for storing a 63/31 bit result
+
+  # Process with actual fold
+  # r += (carry2, hi), propagating the carry through the remaining limbs
+  addC(carry, r[0], r[0], hi, Carry(0))
+  addC(carry, r[1], r[1], SecretWord(carry2), carry)
+  staticFor i, 2, N1:
+    addC(carry, r[i], r[i], Zero, carry)
+  
+func finalReduce_1305[N: static int](a: var Limbs[N]) =
+  ## Maps an input in redundant representation [0, 2¹³¹-10)
+  ## to the canonical representation in [0, 2¹³⁰-5)
+  ##
+  ## `a` is reduced in place.
+  # Algorithm:
+  # 1. subtract p = 2¹³⁰-5
+  # 2. if borrow, add back p.
+  when UseASM_X86_64 and a.len <= 6:
+    # Assembly path: modular subtraction with p passed as both subtrahend and modulus
+    submod_asm(a, a, P1305.limbs, P1305.limbs)
+  else:
+    # Portable path: subtract p, then conditionally add p back if the subtraction borrowed
+    let underflowed = SecretBool sub(a, P1305.limbs)
+    discard cadd(a, P1305.limbs, underflowed)
+
+# Size in bytes of one Poly1305 message block and of the context's tail buffer
+const BlockSize = 16
+
+# Incremental Poly1305 state, driven by `init`, `update`, `finish` and `clear`.
+type Poly1305_CTX = object
+  acc: BigInt[130+1] # After an unreduced sum, up to 131 bit may be used
+  r: BigInt[124]     # r is 124-bit after clamping
+  s: BigInt[128]     # Loaded from key bytes 16..31 in `init`, added to the accumulator in `finish`
+  buf: array[BlockSize, byte] # Holds a message tail that does not yet fill a block
+  msgLen: uint64     # Running total of the message bytes passed to `update`
+  bufIdx: uint8      # Number of bytes currently stored in `buf`
+
+# Exported alias, used as the type argument of `auth` (e.g. `poly1305.auth(...)`)
+type poly1305* = Poly1305_CTX
+
+func macMessageBlocks[T: byte|char](
+       acc: var BigInt[130+1],
+       r: BigInt[124],
+       message: openArray[T],
+       blockSize = BlockSize): uint =
+  ## Authenticate a message block by block
+  ## Poly1305 block size is 16 bytes.
+  ## Return the number of bytes processed.
+  ##
+  ## If authenticating one partial block,
+  ## set blocksize to the remaining bytes to process
+  ##
+  ## Notes:
+  ## - The number of blocks is always computed from the `BlockSize` constant;
+  ##   trailing bytes that do not fill a 16-byte block are ignored
+  ##   and 0 is returned if `message` is shorter than 16 bytes.
+  ## - `blockSize` only selects which bit is set above each block's bytes
+  ##   (bit `8*blockSize`), and it is applied to every block of `message`.
+
+  result = 0
+  let numBlocks = int(message.len.uint div BlockSize)
+  if numBlocks == 0:
+    return 0
+
+  var input {.noInit.}: BigInt[130+1]
+  # r is 124-bit after clamping
+  var t{.noInit.}: BigInt[130+1+124]
+
+  for curBlock in 0 ..< numBlocks:
+    # Load the 16 bytes of the current block as a little-endian integer
+    # range [0, 2¹²⁸-1)
+    when T is byte:
+      input.unmarshal(
+        message.toOpenArray(curBlock*BlockSize, curBlock*BlockSize + BlockSize - 1),
+        littleEndian
+      )
+    else:
+      input.unmarshal(
+        message.toOpenArrayByte(curBlock*BlockSize, curBlock*BlockSize + BlockSize - 1),
+        littleEndian
+      )
+    input.setBit(8*blockSize) # range [2¹²⁸, 2¹²⁸+2¹²⁸-1)
+    acc += input              # range [2¹²⁸, 2¹³⁰-1+2¹²⁸+2¹²⁸-1)
+    t.prod(acc, r)            # range [2²⁵⁶, (2¹²⁴-1)(2¹³⁰+2(2¹²⁸-1)))
+    
+    # acc <- t partially reduced modulo 2¹³⁰-5
+    acc.limbs.partialReduce_1305(t.limbs)
+
+  return BlockSize * numBlocks.uint
+
+func macBuffer(ctx: var Poly1305_CTX, blockSize: int) =
+  ## Authenticate the content of the internal buffer as one block,
+  ## then zero the buffer and reset its index.
+  ## `blockSize` is forwarded to `macMessageBlocks`:
+  ## callers in this file pass `BlockSize` for a full buffer
+  ## and the number of buffered bytes for a final partial block.
+  discard ctx.acc.macMessageBlocks(
+    ctx.r, ctx.buf, blockSize
+  )
+  ctx.buf.setZero()
+  ctx.bufIdx = 0
+
+# Public API
+# ----------------------------------------------------------------
+
+func init*(ctx: var Poly1305_CTX, nonReusedKey: array[32, byte]) =
+  ## Initialize Poly1305 MAC (Message Authentication Code) context.
+  ## nonReusedKey is an unique not-reused pre-shared key
+  ## between the parties that want to authenticate messages between each other
+  ##
+  ## The accumulator, buffer, message length and buffer index are reset,
+  ## so any previous state of `ctx` is discarded.
+  ctx.acc.setZero()
+  
+  # r <- key bytes 0..15 read as little-endian, masked limb by limb with `clamp`
+  const clamp = BigInt[128].fromHex"0x0ffffffc0ffffffc0ffffffc0fffffff"
+  ctx.r.unmarshal(nonReusedKey.toOpenArray(0, 15), littleEndian)
+  staticFor i, 0, ctx.r.limbs.len:
+    ctx.r.limbs[i] = ctx.r.limbs[i] and clamp.limbs[i]
+
+  # s <- key bytes 16..31 read as little-endian
+  ctx.s.unmarshal(nonReusedKey.toOpenArray(16, 31), littleEndian)
+  ctx.buf.setZero()
+  ctx.msgLen = 0
+  ctx.bufIdx = 0
+
+func update*[T: char|byte](ctx: var Poly1305_CTX, message: openArray[T]) =
+  ## Append a message to a Poly1305 authentication context.
+  ## for incremental Poly1305 computation
+  ##
+  ## Security note: the tail of your message might be stored
+  ## in an internal buffer.
+  ## if sensitive content is used, ensure that
+  ## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible.
+  ## Additionally ensure that the message(s) passed were stored
+  ## in memory considered secure for your threat model.
+  ##
+  ## An empty `message` leaves the context unchanged.
+
+  debug:
+    # Invariant: the buffer is never full on entry and its unused part is zeroed
+    doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len
+    for i in ctx.bufIdx ..< ctx.buf.len:
+      doAssert ctx.buf[i] == 0
+
+  if message.len == 0:
+    return
+
+  var # Message processing state machine
+    cur = 0'u                      # index of the next unread byte of `message`
+    bytesLeft = message.len.uint   # number of bytes of `message` not yet consumed
+  
+  ctx.msgLen += bytesLeft
+
+  if ctx.bufIdx != 0: # Previous partial update
+    let bufIdx = ctx.bufIdx.uint
+    let free = ctx.buf.sizeof().uint - bufIdx
+
+    if free > bytesLeft:
+      # Enough free space, store in buffer
+      # (strictly more space than data, so the buffer stays partially filled)
+      ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = bytesLeft)
+      ctx.bufIdx += bytesLeft.uint8
+      return
+    else:
+      # Fill the buffer and do one Poly1305 MAC
+      ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free)
+      ctx.macBuffer(blockSize = BlockSize)
+
+      # Update message state for further processing
+      cur = free
+      bytesLeft -= free
+  
+  # Process n blocks (16 bytes each)
+  # directly from `message`, without going through the buffer
+  let consumed = ctx.acc.macMessageBlocks(
+    ctx.r, 
+    message.toOpenArray(int cur, message.len-1),
+    blockSize = BlockSize
+  )
+  cur += consumed
+  bytesLeft -= consumed
+
+  if bytesLeft != 0:
+    # Store the tail in buffer
+    debug: # TODO: state machine formal verification - https://nim-lang.org/docs/drnim.html
+      doAssert ctx.bufIdx == 0
+      doAssert cur + bytesLeft == message.len.uint
+
+    ctx.buf.copy(dStart = 0'u, message, sStart = cur, len = bytesLeft)
+    ctx.bufIdx = uint8 bytesLeft
+
+func finish*(ctx: var Poly1305_CTX, tag: var array[16, byte]) =
+  ## Finalize a Poly1305 authentication
+  ## and output an authentication tag to the `tag` buffer
+  ##
+  ## Security note: this does not clear the internal context.
+  ## if sensitive content is used, use "ctx.clear()"
+  ## and also make sure that the message(s) passed were stored
+  ## in memory considered secure for your threat model.
+
+  debug:
+    # Invariant: the buffer is never full on entry and its unused part is zeroed
+    doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len
+    for i in ctx.bufIdx ..< ctx.buf.len:
+      doAssert ctx.buf[i] == 0
+
+  if ctx.bufIdx != 0:
+    # Authenticate the buffered tail as a final partial block;
+    # the number of buffered bytes is passed as the block size.
+    ctx.macBuffer(blockSize = ctx.bufIdx.int)
+
+  # Input is only partially reduced to [0, 2¹³⁰)
+  # Map it to [0, 2¹³⁰-5)
+  ctx.acc.limbs.finalReduce_1305()
+  
+  # Starting from now, we only care about the 128 least significant bits
+  var acc128{.noInit.}: BigInt[128]
+  acc128.copyTruncatedFrom(ctx.acc)
+  acc128 += ctx.s
+
+  # The tag is the 128-bit sum serialized as little-endian bytes
+  tag.marshal(acc128, littleEndian)
+
+  debug:
+    # Post-condition: the buffer is empty and fully zeroed
+    doAssert ctx.bufIdx == 0
+    for i in 0 ..< ctx.buf.len:
+      doAssert ctx.buf[i] == 0
+
+func clear*(ctx: var Poly1305_CTX) =
+  ## Clear the context internal buffers
+  ##
+  ## Every field is reset: the accumulator, `r` and `s` (both loaded from the key),
+  ## the tail buffer, the message length and the buffer index.
+  # TODO: ensure compiler cannot optimize the code away
+  ctx.acc.setZero()
+  ctx.r.setZero()
+  ctx.s.setZero()
+  ctx.buf.setZero()
+  ctx.msgLen = 0
+  ctx.bufIdx = 0
+
+func auth*[T: char|byte](
+       _: type poly1305,
+       tag: var array[16, byte],
+       message: openArray[T],
+       nonReusedKey: array[32, byte],
+       clearMem = false) =
+  ## Produce an authentication tag from a message
+  ## and a preshared unique non-reused secret key
+  ##
+  ## One-shot wrapper around `init`, `update` and `finish` on a local context.
+  ## The tag is written to `tag`.
+  ## If `clearMem` is true, the local context is cleared before returning.
+  
+  # noInit is safe here: `init` assigns every field of the context
+  var ctx {.noInit.}: poly1305
+  ctx.init(nonReusedKey)
+  ctx.update(message)
+  ctx.finish(tag)
+
+  if clearMem:
+    ctx.clear()
+
+func auth*[T: char|byte](
+       _: type poly1305,
+       message: openArray[T],
+       nonReusedKey: array[32, byte],
+       clearMem = false): array[16, byte]{.noInit.}=
+  ## Produce an authentication tag from a message
+  ## and a preshared unique non-reused secret key
+  ##
+  ## Same as the overload taking a `tag` output parameter,
+  ## but the 16-byte tag is returned instead.
+  poly1305.auth(result, message, nonReusedKey, clearMem)
--- a/constantine/math/arithmetic/bigints.nim
+++ b/constantine/math/arithmetic/bigints.nim
@ -344,6 +344,16 @@ func bit0*(a: BigInt): Ct[uint8] =
  ## Access the least significant bit
  ct(a.limbs[0] and One, uint8)

+func setBit*[bits: static int](a: var BigInt[bits], index: int) =
+  ## Set an individual bit of `a` to 1.
+  ## This has no effect if it is already 1
+  ##
+  ## `index` is the 0-based bit position, counted from the least significant bit.
+  ## No bounds check on `index` is done in this function.
+  # NOTE(review): `index` selects the limb that is accessed,
+  # so it is presumably expected to be a public (non-secret) value — confirm.
+  const SlotShift = log2_vartime(WordBitWidth.uint32)
+  const SelectMask = WordBitWidth - 1
+
+  # index shr SlotShift   -> which limb holds the bit
+  # index and SelectMask  -> position of the bit inside that limb
+  # (assumes WordBitWidth is a power of two)
+  let slot = a.limbs[index shr SlotShift].addr
+  let shifted = One shl (index and SelectMask)
+  slot[] = slot[] or shifted
+
 # Multiplication by small cosntants
 # ------------------------------------------------------------

--- a/constantine/platforms/abstractions.nim
+++ b/constantine/platforms/abstractions.nim
@ -56,14 +56,4 @@ const
 # We need to support register spills for large limbs
 const CttASM {.booldefine.} = true
 const UseASM_X86_32* = CttASM and X86 and GCC_Compatible
-const UseASM_X86_64* = WordBitWidth == 64 and UseASM_X86_32
-
-# ############################################################
-#
-#                  Instrumentation
-#
-# ############################################################
-
-template debug*(body: untyped): untyped =
-  when defined(debugConstantine):
-    body
+const UseASM_X86_64* = WordBitWidth == 64 and UseASM_X86_32
--- a/constantine/platforms/compilers/extended_precision.nim
+++ b/constantine/platforms/compilers/extended_precision.nim
@ -94,37 +94,6 @@ when sizeof(int) == 8:
 #
 # ############################################################

-func mulDoubleAdd2*[T: Ct[uint32]|Ct[uint64]](r2: var Carry, r1, r0: var T, a, b, c: T, dHi: Carry, dLo: T) {.inline.} =
-  ## (r2, r1, r0) <- 2*a*b + c + (dHi, dLo)
-  ## with r = (r2, r1, r0) a triple-word number
-  ## and d = (dHi, dLo) a double-word number
-  ## r2 and dHi are carries, either 0 or 1
-
-  var carry: Carry
-
-  # (r1, r0) <- a*b
-  # Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001)
-  mul(r1, r0, a, b)
-
-  # (r2, r1, r0) <- 2*a*b
-  # Then  (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001) * 2
-  #       (carry: 1, hi: 0xFFFFFFFF_FFFFFFFC, lo: 0x00000000_00000002)
-  addC(carry, r0, r0, r0, Carry(0))
-  addC(r2, r1, r1, r1, carry)
-
-  # (r1, r0) <- (r1, r0) + c
-  # Adding any uint64 cannot overflow into r2 for example Adding 2^64-1
-  #       (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001)
-  addC(carry, r0, r0, c, Carry(0))
-  addC(carry, r1, r1, T(0), carry)
-
-  # (r1, r0) <- (r1, r0) + (dHi, dLo) with dHi a carry (previous limb r2)
-  # (dHi, dLo) is at most (dhi: 1, dlo: 0xFFFFFFFF_FFFFFFFF)
-  # summing into (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001)
-  # result at most in (carry: 1, hi: 0xFFFFFFFF_FFFFFFFF, lo: 0x00000000_00000000)
-  addC(carry, r0, r0, dLo, Carry(0))
-  addC(carry, r1, r1, T(dHi), carry)
-
 func mulAcc*[T: Ct[uint32]|Ct[uint64]](t, u, v: var T, a, b: T) {.inline.} =
  ## (t, u, v) <- (t, u, v) + a * b
  var UV: array[2, T]
--- a/constantine/platforms/constant_time/ct_routines.nim
+++ b/constantine/platforms/constant_time/ct_routines.nim
@ -104,6 +104,11 @@ template `*`*[T: Ct](x, y: T): T =
  # but this is not always true, especially on ARMv7 and ARMv9
  fmap(x, `*`, y)

+template `*=`*[T: Ct](x, y: T) =
+  # In-place multiplication: x <- x * y, through `fmapAsgn`
+  # Warning ⚠️ : We assume that mul hardware multiplication is constant time
+  # but this is not always true, especially on ARMv7 and ARMv9
+  fmapAsgn(x, `*=`, y)
+
 # We don't implement div/mod as we can't assume the hardware implementation
 # is constant-time

--- a/constantine/platforms/primitives.nim
+++ b/constantine/platforms/primitives.nim
@ -33,3 +33,41 @@ export
 when X86 and GCC_Compatible:
  import isa/[cpuinfo_x86, macro_assembler_x86]
  export cpuinfo_x86, macro_assembler_x86
+
+# ############################################################
+#
+#                      Instrumentation
+#
+# ############################################################
+
+template debug*(body: untyped): untyped =
+  # Include `body` only when compiling with -d:debugConstantine,
+  # otherwise the template expands to nothing.
+  when defined(debugConstantine):
+    body
+
+# ############################################################
+#
+#                         Buffers
+#
+# ############################################################
+
+func setZero*[N](a: var array[N, SomeNumber]){.inline.} =
+  ## Set every element of the array `a` to 0, in place.
+  # NOTE(review): plain stores in a loop; nothing here prevents a compiler
+  # from eliding them when used to wipe secrets — see the TODOs at the call sites.
+  for i in 0 ..< a.len:
+    a[i] = 0
+
+func copy*[N: static int, T: byte|char](
+       dst: var array[N, byte],
+       dStart: SomeInteger,
+       src: openArray[T],
+       sStart: SomeInteger,
+       len: SomeInteger
+     ) {.inline.} =
+  ## Copy dst[dStart ..< dStart+len] = src[sStart ..< sStart+len]
+  ## Unlike the standard library, this cannot throw
+  ## even a defect.
+  ## It also handles copy of char into byte arrays
+  ##
+  ## Parameters:
+  ## - `dst`: destination byte array, written in place
+  ## - `dStart`: index of the first byte written in `dst`
+  ## - `src`: source bytes or chars
+  ## - `sStart`: index of the first element read from `src`
+  ## - `len`: number of elements to copy
+  ##
+  ## The ranges are only validated when compiling with -d:debugConstantine.
+  debug:
+    doAssert 0 <= dStart and dStart+len <= dst.len.uint, "dStart: " & $dStart & ", dStart+len: " & $(dStart+len) & ", dst.len: " & $dst.len
+    doAssert 0 <= sStart and sStart+len <= src.len.uint, "sStart: " & $sStart & ", sStart+len: " & $(sStart+len) & ", src.len: " & $src.len
+
+  # Element-wise copy, converting each source element to byte
+  for i in 0 ..< len:
+    dst[dStart + i] = byte src[sStart + i]
--- a/tests/t_mac_poly1305.nim
+++ b/tests/t_mac_poly1305.nim
@ -0,0 +1,31 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  std/unittest,
+  ../constantine/mac/mac_poly1305
+
+suite "[Message Authentication Code] Poly1305":
+  test "Test vector 1 - RFC8439":
+    # Known-answer test: authenticate a fixed message with a fixed 32-byte key
+    # and compare against the expected 16-byte tag.
+    # The vector is attributed to RFC 8439 by the test title.
+    let ikm = [
+      byte 0x85, 0xd6, 0xbe, 0x78, 0x57, 0x55, 0x6d, 0x33,
+           0x7f, 0x44, 0x52, 0xfe, 0x42, 0xd5, 0x06, 0xa8,
+           0x01, 0x03, 0x80, 0x8a, 0xfb, 0x0d, 0xb2, 0xfd,
+           0x4a, 0xbf, 0xf6, 0xaf, 0x41, 0x49, 0xf5, 0x1b
+    ]
+    # 34 characters: two full 16-byte blocks plus a 2-byte tail,
+    # so both the block path and the buffered partial-block path are exercised.
+    let message = "Cryptographic Forum Research Group"
+
+    let expectedTag = [
+      byte 0xa8, 0x06, 0x1d, 0xc1, 0x30, 0x51, 0x36, 0xc6,
+           0xc2, 0x2b, 0x8b, 0xaf, 0x0c, 0x01, 0x27, 0xa9
+    ]
+
+    var tag: array[16, byte]
+    poly1305.auth(tag, message, ikm)
+
+    doAssert tag == expectedTag