Sha256 refactor (#206)

* sha256: separate message scheduling and state updates to help implement specific use-cases like #205; also implement SSSE3 acceleration (2006, Intel Core 2 Duo) * sha256: simplify update flow, store less metadata in context * sha256: Fix reworked update function * Implement x86 hardware SHA acceleration * typo
2022-09-19 02:02:57 +02:00 · 2022-09-19 02:02:57 +02:00 · 351a3f6bd2
parent b901dd5878
commit 351a3f6bd2
10 changed files with 965 additions and 339 deletions
--- a/benchmarks/bench_sha256.nim
+++ b/benchmarks/bench_sha256.nim
@ -48,9 +48,9 @@ proc report(op: string, bytes: int, startTime, stopTime: MonoTime, startClk, sto
  when SupportsGetTicks:
    let cycles = (stopClk - startClk) div iters
    let cyclePerByte = cycles.float64 / bytes.float64
-    echo &"{op:<30}     {throughput:>15.3f} ops/s    {ns:>9} ns/op    {cycles:>10} cycles    {cyclePerByte:>5.2f} cycles/byte"
+    echo &"{op:<50}     {throughput:>15.3f} ops/s    {ns:>9} ns/op    {cycles:>10} cycles    {cyclePerByte:>5.2f} cycles/byte"
  else:
-    echo &"{op:<30}     {throughput:>15.3f} ops/s    {ns:>9} ns/op"
+    echo &"{op:<50}     {throughput:>15.3f} ops/s    {ns:>9} ns/op"
 template bench(op: string, bytes: int, iters: int, body: untyped): untyped =
  measure(iters, startTime, stopTime, startClk, stopClk, body)
@ -63,41 +63,23 @@ proc benchSHA256_constantine[T](msg: openarray[T], msgComment: string, iters: in
 proc benchSHA256_openssl[T](msg: openarray[T], msgComment: string, iters: int) =
  var digest: array[32, byte]
-  bench("SHA256 - OpenSSL - " & msgComment, msg.len, iters):
+  bench("SHA256 - OpenSSL     - " & msgComment, msg.len, iters):
    SHA256_OpenSSL(digest, msg)
 when isMainModule:
  proc main() =
-    block:
+    const sizes = [
-      let msg32B = rng.random_byte_seq(32)
+      32, 64, 128, 256,
-      benchSHA256_constantine(msg32B, "32B", 100)
+      1024, 4096, 16384, 65536,
-      benchSHA256_openssl(msg32B, "32B", 100)
+      1_000_000, 10_000_000
-    block:
+    ]
-      let msg64B = rng.random_byte_seq(64)
+
-      benchSHA256_constantine(msg64B, "64B", 100)
+    const target_cycles = 1_000_000_000'i64
-      benchSHA256_openssl(msg64B, "64B", 100)
+    const worst_cycles_per_bytes = 25'i64
-    block:
+    for s in sizes:
-      let msg128B = rng.random_byte_seq(128)
+      let msg = rng.random_byte_seq(s)
-      benchSHA256_constantine(msg128B, "128B", 100)
+      let iters = int(target_cycles div (s.int64 * worst_cycles_per_bytes))
-      benchSHA256_openssl(msg128B, "128B", 100)
+      benchSHA256_constantine(msg, $s & "B", iters)
-    block:
+      benchSHA256_openssl(msg, $s & "B", iters)
-      let msg576B = rng.random_byte_seq(576)
+
      benchSHA256_constantine(msg576B, "576B", 50)
      benchSHA256_openssl(msg576B, "576B", 50)
    block:
      let msg8192B = rng.random_byte_seq(8192)
      benchSHA256_constantine(msg8192B, "8192B", 25)
      benchSHA256_openssl(msg8192B, "8192B", 25)
    block:
      let msg1MB = rng.random_byte_seq(1_000_000)
      benchSHA256_constantine(msg1MB, "1MB", 16)
      benchSHA256_openssl(msg1MB, "1MB", 16)
    block:
      let msg10MB = rng.random_byte_seq(10_000_000)
      benchSHA256_constantine(msg10MB, "10MB", 16)
      benchSHA256_openssl(msg10MB, "10MB", 16)
    block:
      let msg100MB = rng.random_byte_seq(100_000_000)
      benchSHA256_constantine(msg100MB, "100MB", 3)
      benchSHA256_openssl(msg100MB, "100MB", 3)
  main()
--- a/constantine/hashes/h_sha256.nim
+++ b/constantine/hashes/h_sha256.nim
@ -7,8 +7,13 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
-  std/macros,
+  ../platforms/[abstractions, endians],
-  ../platforms/[abstractions, endians]
+  ./sha256/sha256_generic
 when UseASM_X86_32:
  import ./sha256/[
    sha256_x86_ssse3,
    sha256_x86_shaext]
 # SHA256, a hash function from the SHA2 family
 # --------------------------------------------------------------------------------
@ -16,11 +21,6 @@ import
 # References:
 # - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
 # - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634
 # - Intel optimization https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/sha-256-implementations-paper.pdf
 # - Parallelizing message schedules
 #   to accelerate the computations of hash functions
 #   Shay Gueron, Vlad Krasnov, 2012
 #   https://eprint.iacr.org/2012/067.pdf
 #
 # Vectors:
 # - https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf
@ -28,277 +28,74 @@ import
 # Types and constants
 # ----------------------------------------------------------------
 const
  DigestSize = 32
  BlockSize = 64
  HashSize = DigestSize div sizeof(uint32) # 8
 type
  Sha256Context* = object
    ## Align to 64 for cache line and SIMD friendliness
-    H{.align: 64}: array[HashSize, uint32]
+    s{.align: 64}: Sha256_state
    buf{.align: 64}: array[BlockSize, byte]
    msgLen: uint64
    bufIdx: uint8
  sha256* = Sha256Context
-# Internal
+# Internals
 # ----------------------------------------------------------------
 # TODO: vectorized implementations
 # No exceptions allowed in core cryptographic operations
 {.push raises: [].}
 {.push checks: off.}
-template rotr(x, n: uint32): uint32 =
+func hashMessageBlocks(
-  ## Rotate right the bits
+       s: var Sha256_state,
-  # We always use it with constants in 0 ..< 32
+       message: ptr UncheckedArray[byte],
-  # so undefined behaviour.
+       numBlocks: uint) =
-  (x shr n) or (x shl (32 - n))
+  when UseASM_X86_32:
-
+    if ({.noSideEffect.}: hasSha()):
-template ch(x, y, z: uint32): uint32 =
+      hashMessageBlocks_shaext(s, message, numBlocks)
-  ## "Choose" function of SHA256
+    elif ({.noSideEffect.}: hasSSSE3()):
-  ## Choose bit i from yi or zi depending on xi
+      hashMessageBlocks_ssse3(s, message, numBlocks)
  when false: # Spec FIPS 180-4
    (x and y) xor (not(x) and z)
  else:       # RFC4634
    ((x and (y xor z)) xor z)
 template maj(x, y, z: uint32): uint32 =
  ## "Majority" function of SHA256
  when false: # Spec FIPS 180-4
    (x and y) xor (x and z) xor (y and z)
  else:      # RFC4634
    (x and (y or z)) or (y and z)
 template S0(x: uint32): uint32 =
  # Σ₀
  rotr(x, 2) xor rotr(x, 13) xor rotr(x, 22)
 template S1(x: uint32): uint32 =
  # Σ₁
  rotr(x, 6) xor rotr(x, 11) xor rotr(x, 25)
 template s0(x: uint32): uint32 =
  # σ₀
  rotr(x, 7) xor rotr(x, 18) xor (x shr 3)
 template s1(x: uint32): uint32 =
  # σ₁
  rotr(x, 17) xor rotr(x, 19) xor (x shr 10)
 template u32BE(blob: array[4, byte]): uint32 =
  ## Interpret a data blob as a big-endian uint32
  ## This should lower to
  when nimvm:
    (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
  else:
    when cpuEndian == littleEndian:
      (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
    else:
-      cast[uint32](blob)
+      hashMessageBlocks_generic(s, message, numBlocks)
 template getU32at[T: byte|char](msg: openarray[T], pos: SomeInteger): uint32 =
  u32BE(cast[ptr array[4, byte]](msg[pos].unsafeAddr())[])
 func rotateRight[T](a: var openarray[T], k: int) =
  ## Rotate a seuqnce by k
  doAssert a.len > 0
  let k = k mod a.len
  for _ in 0 ..< k:
    let tmp = a[^1]
    for i in countdown(a.len-1, 1):
      a[i] = a[i-1]
    a[0] = tmp
 macro round(a, b, c, d, e, f, g, h: untyped, t: static int): untyped =
  ## Unrolled and allocation efficient SHA256 round
  var s = [a, b, c, d, e, f, g, h]
  s.rotateRight(t)
  let
    a = s[0]
    b = s[1]
    c = s[2]
    d = s[3]
    e = s[4]
    f = s[5]
    g = s[6]
    h = s[7]
  # W[t]
  let w = nnkBracketExpr.newTree(
    ident("W"), newLit(t mod 16)
  )
  result = newStmtList()
  if t < 16:
    # Reading message phase
    let msg = ident"message"
    let curBlock = ident"curBlock"
    result.add quote do:
      `w` = getU32at(`msg`, `curBlock`*64 + `t`*4)
  else:
-    # Mixing
+    hashMessageBlocks_generic(s, message, numBlocks)
    # Wt-2, Wt-7, Wt-15, Wt-16
    let Wtm2 = nnkBracketExpr.newTree(
      ident("W"), newLit((t-2) mod 16)
    )
    let Wtm7 = nnkBracketExpr.newTree(
      ident("W"), newLit((t-7) mod 16)
    )
    let Wtm15 = nnkBracketExpr.newTree(
      ident("W"), newLit((t-15) mod 16)
    )
    # w is Wt-16
    result.add quote do:
      `w` += s1(`Wtm2`) + `Wtm7` + s0(`Wtm15`)
  result.add quote do:
    let T1 = `h` + S1(`e`) + ch(`e`, `f`, `g`) + K256[`t`] + `w`
    let T2 = S0(`a`) + maj(`a`, `b`, `c`)
    `d` += T1
    `h` = T1 + T2
 func hashMessageBlocks[T: byte|char](
       H: var array[HashSize, uint32],
       message: openarray[T]): uint =
  ## Hash a message block by block
  ## Sha256 block size is 64 bytes hence
  ## a message will be process 64 by 64 bytes.
  ## FIPS.180-4 6.2.2. SHA-256 Hash Computation
  result = 0
  let numBlocks = message.len.uint div BlockSize
  if numBlocks == 0:
    return 0
  const K256 = [
    0x428a2f98'u32, 0x71374491'u32, 0xb5c0fbcf'u32, 0xe9b5dba5'u32, 0x3956c25b'u32, 0x59f111f1'u32, 0x923f82a4'u32, 0xab1c5ed5'u32,
    0xd807aa98'u32, 0x12835b01'u32, 0x243185be'u32, 0x550c7dc3'u32, 0x72be5d74'u32, 0x80deb1fe'u32, 0x9bdc06a7'u32, 0xc19bf174'u32,
    0xe49b69c1'u32, 0xefbe4786'u32, 0x0fc19dc6'u32, 0x240ca1cc'u32, 0x2de92c6f'u32, 0x4a7484aa'u32, 0x5cb0a9dc'u32, 0x76f988da'u32,
    0x983e5152'u32, 0xa831c66d'u32, 0xb00327c8'u32, 0xbf597fc7'u32, 0xc6e00bf3'u32, 0xd5a79147'u32, 0x06ca6351'u32, 0x14292967'u32,
    0x27b70a85'u32, 0x2e1b2138'u32, 0x4d2c6dfc'u32, 0x53380d13'u32, 0x650a7354'u32, 0x766a0abb'u32, 0x81c2c92e'u32, 0x92722c85'u32,
    0xa2bfe8a1'u32, 0xa81a664b'u32, 0xc24b8b70'u32, 0xc76c51a3'u32, 0xd192e819'u32, 0xd6990624'u32, 0xf40e3585'u32, 0x106aa070'u32,
    0x19a4c116'u32, 0x1e376c08'u32, 0x2748774c'u32, 0x34b0bcb5'u32, 0x391c0cb3'u32, 0x4ed8aa4a'u32, 0x5b9cca4f'u32, 0x682e6ff3'u32,
    0x748f82ee'u32, 0x78a5636f'u32, 0x84c87814'u32, 0x8cc70208'u32, 0x90befffa'u32, 0xa4506ceb'u32, 0xbef9a3f7'u32, 0xc67178f2'u32
  ]
  var
    a = H[0]
    b = H[1]
    c = H[2]
    d = H[3]
    e = H[4]
    f = H[5]
    g = H[6]
    h = H[7]
  for curBlock in 0 ..< numBlocks:
    # The first 16 bytes have different handling
    # from bytes 16..<64.
    # Using an array[64, uint32] will span it
    # across 8 cache lines impacting performance
    # Workspace with message schedule Wₜ
    var W{.noInit.}: array[16, uint32]
    when true:
      # Translation of the spec
      # This is faster than even OpenSSL for hashing just 32 bytes
      # for example for HMAC and HKDF.
      var t = 0'u32
      while t < 16: # Wₜ = Mⁱₜ
        W[t].parseFromBlob(message, result, bigEndian)
        let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t]
        let T2 = S0(a) + maj(a, b, c)
        h = g
        g = f
        f = e
        e = d + T1
        d = c
        c = b
        b = a
        a = T1+T2
        t += 1
      while t < 64:
        W[t mod 16] += s1(W[(t-2) mod 16]) +
                      W[(t-7) mod 16] +
                      s0(W[(t-15) mod 16])
        let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t mod 16]
        let T2 = S0(a) + maj(a, b, c)
        h = g
        g = f
        f = e
        e = d + T1
        d = c
        c = b
        b = a
        a = T1+T2
        t += 1
    else:
      # optimized version for large hashes
      # For hashing 32B, this is slower than the rough translation
      # of spec, unless compiled with -mssse3 (but no vector instructions are used :/)
      staticFor t, 0, 64:
        round(a, b, c, d, e, f, g, h, t)
      result += 64
    a += H[0]; H[0] = a
    b += H[1]; H[1] = b
    c += H[2]; H[2] = c
    d += H[3]; H[3] = d
    e += H[4]; H[4] = e
    f += H[5]; H[5] = f
    g += H[6]; H[6] = g
    h += H[7]; H[7] = h
 func dumpHash(
       digest: var array[DigestSize, byte],
-       H: array[HashSize, uint32]) =
+       s: Sha256_state) {.inline.} =
  ## Convert the internal hash into a message digest
  var dstIdx = 0'u
-  for i in 0 ..< H.len:
+  for i in 0 ..< s.H.len:
-    digest.dumpRawInt(H[i], dstIdx, bigEndian)
+    digest.dumpRawInt(s.H[i], dstIdx, bigEndian)
    dstIdx += uint sizeof(uint32)
-func hashBuffer(ctx: var Sha256Context) =
+func hashBuffer(ctx: var Sha256Context) {.inline.} =
-  discard ctx.H.hashMessageBlocks(ctx.buf)
+  ctx.s.hashMessageBlocks(ctx.buf.asUnchecked(), numBlocks = 1)
  ctx.buf.setZero()
  ctx.bufIdx = 0
 # Public API
 # ----------------------------------------------------------------
 template digestSize*(H: type sha256): int =
  ## Returns the output size in bytes
-  32
+  DigestSize
 template internalBlockSize*(H: type sha256): int =
  ## Returns the byte size of the hash function ingested blocks
-  64
+  BlockSize
 func init*(ctx: var Sha256Context) =
  ## Initialize or reinitialize a Sha256 context
  ctx.msgLen = 0
  ctx.buf.setZero()
  ctx.bufIdx = 0
-  ctx.H[0] = 0x6a09e667'u32
+  ctx.s.H[0] = 0x6a09e667'u32
-  ctx.H[1] = 0xbb67ae85'u32
+  ctx.s.H[1] = 0xbb67ae85'u32
-  ctx.H[2] = 0x3c6ef372'u32
+  ctx.s.H[2] = 0x3c6ef372'u32
-  ctx.H[3] = 0xa54ff53a'u32
+  ctx.s.H[3] = 0xa54ff53a'u32
-  ctx.H[4] = 0x510e527f'u32
+  ctx.s.H[4] = 0x510e527f'u32
-  ctx.H[5] = 0x9b05688c'u32
+  ctx.s.H[5] = 0x9b05688c'u32
-  ctx.H[6] = 0x1f83d9ab'u32
+  ctx.s.H[6] = 0x1f83d9ab'u32
-  ctx.H[7] = 0x5be0cd19'u32
+  ctx.s.H[7] = 0x5be0cd19'u32
 func initZeroPadded*(ctx: var Sha256Context) =
  ## Initialize a Sha256 context
@ -312,18 +109,17 @@ func initZeroPadded*(ctx: var Sha256Context) =
  ctx.msgLen = 64
  ctx.buf.setZero()
  ctx.bufIdx = 0
-  ctx.H[0] = 0xda5698be'u32
+  ctx.s.H[0] = 0xda5698be'u32
-  ctx.H[1] = 0x17b9b469'u32
+  ctx.s.H[1] = 0x17b9b469'u32
-  ctx.H[2] = 0x62335799'u32
+  ctx.s.H[2] = 0x62335799'u32
-  ctx.H[3] = 0x779fbeca'u32
+  ctx.s.H[3] = 0x779fbeca'u32
-  ctx.H[4] = 0x8ce5d491'u32
+  ctx.s.H[4] = 0x8ce5d491'u32
-  ctx.H[5] = 0xc0d26243'u32
+  ctx.s.H[5] = 0xc0d26243'u32
-  ctx.H[6] = 0xbafef9ea'u32
+  ctx.s.H[6] = 0xbafef9ea'u32
-  ctx.H[7] = 0x1837a9d8'u32
+  ctx.s.H[7] = 0x1837a9d8'u32
-func update*[T: char|byte](ctx: var Sha256Context, message: openarray[T]) =
+func update*(ctx: var Sha256Context, message: openarray[byte]) =
  ## Append a message to a SHA256 context
  ## for incremental SHA256 computation
  ##
@ -336,53 +132,48 @@ func update*[T: char|byte](ctx: var Sha256Context, message: openarray[T]) =
  ##
  ## For passwords and secret keys, you MUST NOT use raw SHA-256
  ## use a Key Derivation Function instead (KDF)
  # Message processing state machine
  var bufIdx = uint(ctx.msgLen mod BlockSize)
  var cur = 0'u
  var bytesLeft = message.len.uint
-  debug:
+  if bufIdx != 0 and bufIdx+bytesLeft >= BlockSize:
-    doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len
+    # Previous partial update, fill the buffer and do one sha256 hash
-    for i in ctx.bufIdx ..< ctx.buf.len:
+    let free = BlockSize - bufIdx
-      doAssert ctx.buf[i] == 0
+    ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free)
-
+    ctx.hashBuffer()
-  if message.len == 0:
+    bufIdx = 0
-    return
+    cur = free
-
+    bytesLeft -= free
-  var # Message processing state machine
+  
-    cur = 0'u
+  if bytesLeft >= BlockSize:
-    bytesLeft = message.len.uint
+    # Process n blocks (64 byte each)
-
+    let numBlocks = bytesLeft div BlockSize
-  ctx.msgLen += bytesLeft
+    ctx.s.hashMessageBlocks(message.asUnchecked +% cur, numBlocks)
-
+    cur += numBlocks * BlockSize
-  if ctx.bufIdx != 0: # Previous partial update
+    bytesLeft -= numBlocks * BlockSize
    let bufIdx = ctx.bufIdx.uint
    let free = ctx.buf.sizeof().uint - bufIdx
    if free > bytesLeft:
      # Enough free space, store in buffer
      ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = bytesLeft)
      ctx.bufIdx += bytesLeft.uint8
      return
    else:
      # Fill the buffer and do one sha256 hash
      ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free)
      ctx.hashBuffer()
      # Update message state for further processing
      cur = free
      bytesLeft -= free
  # Process n blocks (64 byte each)
  let consumed = ctx.H.hashMessageBlocks(
    message.toOpenArray(int cur, message.len-1))
  cur += consumed
  bytesLeft -= consumed
  if bytesLeft != 0:
    # Store the tail in buffer
-    debug: # TODO: state machine formal verification - https://nim-lang.org/docs/drnim.html
+    ctx.buf.copy(dStart = bufIdx, message, sStart = cur, len = bytesLeft)
      doAssert ctx.bufIdx == 0
      doAssert cur + bytesLeft == message.len.uint
-    ctx.buf.copy(dStart = 0'u, message, sStart = cur, len = bytesLeft)
+  ctx.msgLen += message.len.uint
-    ctx.bufIdx = uint8 bytesLeft
+
 func update*(ctx: var Sha256Context, message: openarray[char]) {.inline.} =
  ## Append a message to a SHA256 context
  ## for incremental SHA256 computation
  ##
  ## Security note: the tail of your message might be stored
  ## in an internal buffer.
  ## if sensitive content is used, ensure that
  ## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible.
  ## Additionally ensure that the message(s) passed were stored
  ## in memory considered secure for your threat model.
  ##
  ## For passwords and secret keys, you MUST NOT use raw SHA-256
  ## use a Key Derivation Function instead (KDF)
  ctx.update(message.toOpenArrayByte(message.low, message.high))
 func finish*(ctx: var Sha256Context, digest: var array[32, byte]) =
  ## Finalize a SHA256 computation and output the
@ -396,26 +187,23 @@ func finish*(ctx: var Sha256Context, digest: var array[32, byte]) =
  ## For passwords and secret keys, you MUST NOT use raw SHA-256
  ## use a Key Derivation Function instead (KDF)
-  debug:
+  let bufIdx = uint(ctx.msgLen mod BlockSize)
    doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len
    for i in ctx.bufIdx ..< ctx.buf.len:
      doAssert ctx.buf[i] == 0
  # Add '1' bit at the end of the message (+7 zero bits)
-  ctx.buf[ctx.bufIdx] = 0b1000_0000
+  ctx.buf[bufIdx] = 0b1000_0000
  # Add k bits so that msgLenBits + 1 + k ≡ 448 mod 512
  # Hence in bytes msgLen + 1 + K ≡ 56 mod 64
  const padZone = 56
-  if ctx.bufIdx >= padZone:
+  if bufIdx >= padZone:
    # We are in the 56..<64 mod 64 byte count
    # and need to rollover to 0
    ctx.hashBuffer()
  let lenInBits = ctx.msgLen.uint64 * 8
  ctx.buf.dumpRawInt(lenInBits, padZone, bigEndian)
-  discard ctx.H.hashMessageBlocks(ctx.buf)
+  ctx.s.hashMessageBlocks(ctx.buf.asUnchecked(), numBlocks = 1)
-  digest.dumpHash(ctx.H)
+  digest.dumpHash(ctx.s)
 func clear*(ctx: var Sha256Context) =
  ## Clear the context internal buffers
@ -423,7 +211,6 @@ func clear*(ctx: var Sha256Context) =
  ## For passwords and secret keys, you MUST NOT use raw SHA-256
  ## use a Key Derivation Function instead (KDF)
  # TODO: ensure compiler cannot optimize the code away
-  ctx.H.setZero()
+  ctx.s.H.setZero()
  ctx.buf.setZero()
  ctx.msgLen = 0
  ctx.bufIdx = 0
--- a/constantine/hashes/sha256/sha256_generic.nim
+++ b/constantine/hashes/sha256/sha256_generic.nim
@ -0,0 +1,185 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ../../platforms/primitives
 # SHA256, a hash function from the SHA2 family
 # --------------------------------------------------------------------------------
 #
 # References:
 # - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
 # - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634
 # Vectors:
 # - https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf
 # No exceptions allowed in core cryptographic operations
 {.push raises: [].}
 {.push checks: off.}
 # Types & Constants
 # ------------------------------------------------
 # The enforced alignment should help the compiler produce optimized code
 type Word* = uint32
 const
  DigestSize* = 32
  BlockSize* = 64
  HashSize* = DigestSize div sizeof(Word) # 8
 type Sha256_MessageSchedule* = object
  w*{.align: 64.}: array[BlockSize div sizeof(Word), Word]
 type Sha256_state* = object
  H*{.align: 64.}: array[HashSize, Word]
 const K256* = [
  0x428a2f98'u32, 0x71374491'u32, 0xb5c0fbcf'u32, 0xe9b5dba5'u32, 0x3956c25b'u32, 0x59f111f1'u32, 0x923f82a4'u32, 0xab1c5ed5'u32,
  0xd807aa98'u32, 0x12835b01'u32, 0x243185be'u32, 0x550c7dc3'u32, 0x72be5d74'u32, 0x80deb1fe'u32, 0x9bdc06a7'u32, 0xc19bf174'u32,
  0xe49b69c1'u32, 0xefbe4786'u32, 0x0fc19dc6'u32, 0x240ca1cc'u32, 0x2de92c6f'u32, 0x4a7484aa'u32, 0x5cb0a9dc'u32, 0x76f988da'u32,
  0x983e5152'u32, 0xa831c66d'u32, 0xb00327c8'u32, 0xbf597fc7'u32, 0xc6e00bf3'u32, 0xd5a79147'u32, 0x06ca6351'u32, 0x14292967'u32,
  0x27b70a85'u32, 0x2e1b2138'u32, 0x4d2c6dfc'u32, 0x53380d13'u32, 0x650a7354'u32, 0x766a0abb'u32, 0x81c2c92e'u32, 0x92722c85'u32,
  0xa2bfe8a1'u32, 0xa81a664b'u32, 0xc24b8b70'u32, 0xc76c51a3'u32, 0xd192e819'u32, 0xd6990624'u32, 0xf40e3585'u32, 0x106aa070'u32,
  0x19a4c116'u32, 0x1e376c08'u32, 0x2748774c'u32, 0x34b0bcb5'u32, 0x391c0cb3'u32, 0x4ed8aa4a'u32, 0x5b9cca4f'u32, 0x682e6ff3'u32,
  0x748f82ee'u32, 0x78a5636f'u32, 0x84c87814'u32, 0x8cc70208'u32, 0x90befffa'u32, 0xa4506ceb'u32, 0xbef9a3f7'u32, 0xc67178f2'u32
 ]
 # Primitives
 # ------------------------------------------------
 template rotr(x, n: uint32): uint32 =
  ## Rotate right the bits
  # We always use it with constants in 0 ..< 32
  # so no undefined behaviour.
  (x shr n) or (x shl (32 - n))
 template ch(x, y, z: uint32): uint32 =
  ## "Choose" function of SHA256
  ## Choose bit i from yi or zi depending on xi
  when false: # Spec FIPS 180-4
    (x and y) xor (not(x) and z)
  else:       # RFC4634
    ((x and (y xor z)) xor z)
 template maj(x, y, z: uint32): uint32 =
  ## "Majority" function of SHA256
  when false: # Spec FIPS 180-4
    (x and y) xor (x and z) xor (y and z)
  else:      # RFC4634
    (x and (y or z)) or (y and z)
 template S0(x: uint32): uint32 =
  # Σ₀
  rotr(x, 2) xor rotr(x, 13) xor rotr(x, 22)
 template S1(x: uint32): uint32 =
  # Σ₁
  rotr(x, 6) xor rotr(x, 11) xor rotr(x, 25)
 template s0(x: uint32): uint32 =
  # σ₀
  rotr(x, 7) xor rotr(x, 18) xor (x shr 3)
 template s1(x: uint32): uint32 =
  # σ₁
  rotr(x, 17) xor rotr(x, 19) xor (x shr 10)
 # Message schedule
 # ------------------------------------------------
 template u32BE(blob: array[4, byte]): uint32 =
  ## Interpret a data blob as a big-endian uint32
  when nimvm:
    (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
  else:
    when cpuEndian == littleEndian:
      (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
    else:
      cast[uint32](blob)
 template getU32at(msg: ptr UncheckedArray[byte], pos: SomeInteger): uint32 =
  u32BE(cast[ptr array[4, byte]](msg[pos].addr)[])
 # State updates
 # ------------------------------------------------
 template copy*(dst: var Sha256_state, src: Sha256_state) =
  ## State copy
  # Should compile with a specialized aligned copy.
  # No bounds check
  for i in 0 ..< HashSize:
    dst.H[i] = src.H[i]
 template accumulate*(dst: var Sha256_state, src: Sha256_state) =
  ## State accumulation
  # No bounds check
  for i in 0 ..< HashSize:
    dst.H[i] += src.H[i]
 template sha256_round*(s: var Sha256_state, wt, kt: Word) =
  template a: Word = s.H[0]
  template b: Word = s.H[1]
  template c: Word = s.H[2]
  template d: Word = s.H[3]
  template e: Word = s.H[4]
  template f: Word = s.H[5]
  template g: Word = s.H[6]
  template h: Word = s.H[7]
  let T1 = h + S1(e) + ch(e, f, g) + kt + wt
  let T2 = S0(a) + maj(a, b, c)
  d += T1
  h = T1 + T2
  s.H.rotateRight()
 # Hash Computation
 # ------------------------------------------------
 func sha256_rounds_0_15(
       s: var Sha256_state,
       ms: var Sha256_MessageSchedule,
       message: ptr UncheckedArray[byte]) {.inline.} =
  staticFor t, 0, 16:
    ms.w[t] = message.getU32at(t * sizeof(Word))
    sha256_round(s, ms.w[t], K256[t])
 func sha256_rounds_16_63(
       s: var Sha256_state,
       ms: var Sha256_MessageSchedule) {.inline.}  =
  staticFor t, 16, 64:
    ms.w[t and 15] += s1(ms.w[(t -  2) and 15])+
                         ms.w[(t -  7) and 15] +
                      s0(ms.w[(t - 15) and 15])
    sha256_round(s, ms.w[t and 15], K256[t])
 func hashMessageBlocks_generic*(
       H: var Sha256_state,
       message: ptr UncheckedArray[byte],
       numBlocks: uint) =
  ## Hash a message block by block
  ## Sha256 block size is 64 bytes hence
  ## a message will be process 64 by 64 bytes.
  ## FIPS.180-4 6.2.2. SHA-256 Hash Computation
  var msg = message
  var ms{.noInit.}: Sha256_MessageSchedule
  var s{.noInit.}: Sha256_state
  s.copy(H)
  for _ in 0 ..< numBlocks:
    sha256_rounds_0_15(s, ms, msg)
    msg +%= BlockSize
    sha256_rounds_16_63(s, ms)
    s.accumulate(H) # accumulate on register variables
    H.copy(s)
--- a/constantine/hashes/sha256/sha256_x86_shaext.nim
+++ b/constantine/hashes/sha256/sha256_x86_shaext.nim
@ -0,0 +1,130 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ../../platforms/isa/simd_x86,
  ../../platforms/primitives,
  ./sha256_generic
 {.localpassC:"-msse4.1 -msha".}
 # SHA256, a hash function from the SHA2 family
 # --------------------------------------------------------------------------------
 #
 # References:
 # - Intel SHA extensions whitepaper
 #   https://www.intel.com/content/dam/develop/external/us/en/documents/intel-sha-extensions-white-paper-402097.pdf
 # - Intel SHA extensions article
 #   https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
 # No exceptions allowed in core cryptographic operations
 {.push raises: [].}
 {.push checks: off.}
 # Primitives
 # ------------------------------------------------
 template setr_K(i: int): m128i =
  setr_u32x4(K256[4*i], K256[4*i+1], K256[4*i+2], K256[4*i+3])
 # Hash Computation
 # ------------------------------------------------
 func hashMessageBlocks_shaext*(
       H: var Sha256_state,
       message: ptr UncheckedArray[byte],
       numBlocks: uint)=
  ## Hash a message block by block
  ## Sha256 block size is 64 bytes hence
  ## a message will be process 64 by 64 bytes.
  var
    abef_save {.noInit.}: m128i
    cdgh_save {.noInit.}: m128i
    state0 {.noInit.}: m128i
    state1 {.noInit.}: m128i
    msgtmp {.noInit.}: array[4, m128i]
    msg {.noInit.}: m128i
    tmp {.noInit.}: m128i
    data = message
  let shuf_mask = set_u64x2(0x0c0d0e0f08090a0b, 0x0405060700010203)
  # The SHA state is stored in this order:
  #   D, C, B, A, H, G, F, E
  #
  # state0 contains ABEF, state1 contains CDGH
  tmp    = shuf_u32x4(loada_u128(H.H[0].addr), 0xB1) # CDAB
  state1 = shuf_u32x4(loada_u128(H.H[4].addr), 0x1B) # EFGH
  state0 = alignr_u128(tmp, state1, 8)               # ABEF
  state1 = blend_u16x8(state1, tmp, 0xF0)            # CDGH
  for _ in 0 ..< numBlocks:
    # Save current state
    abef_save = state0
    cdgh_save = state1
    # Rounds 0-3
    msgtmp[0] = shuf_u8x16(loadu_u128(data[0].addr), shuf_mask)
    msg       = add_u32x4(msgtmp[0], setr_K(0))
    state1    = sha256_2rounds(state1, state0, msg)
    msg       = shuf_u32x4(msg, 0x0E)
    state0    = sha256_2rounds(state0, state1, msg)
    # Rounds 4-7 and 8-11
    staticFor i, 1, 3:
      msgtmp[i]   = shuf_u8x16(loadu_u128(data[16*i].addr), shuf_mask)
      msg         = add_u32x4(msgtmp[i], setr_K(i))
      state1      = sha256_2rounds(state1, state0, msg)
      msg         = shuf_u32x4(msg, 0x0E)
      state0      = sha256_2rounds(state0, state1, msg)
      msgtmp[i-1] = sha256_msg1(msgtmp[i-1], msgtmp[i])
    # Rounds 12-59
    msgtmp[3] = shuf_u8x16(loadu_u128(data[16*3].addr), shuf_mask)
    staticFor i, 3, 15:
      let prev = (i-1) and 3 # mod 4, we rotate buffers
      let curr =  i    and 3
      let next = (i+1) and 3
      msg          = add_u32x4(msgtmp[curr], setr_K(i))
      state1       = sha256_2rounds(state1, state0, msg)
      tmp          = alignr_u128(msgtmp[curr], msgtmp[prev], 4)
      msgtmp[next] = add_u32x4(msgtmp[next], tmp)
      msgtmp[next] = sha256_msg2(msgtmp[next], msgtmp[curr])
      msg          = shuf_u32x4(msg, 0x0E)
      state0       = sha256_2rounds(state0, state1, msg)
      msgtmp[prev] = sha256_msg1(msgtmp[prev], msgtmp[curr])
    # Rounds 60-63
    msg    = add_u32x4(msgtmp[3], setr_K(15))
    state1 = sha256_2rounds(state1, state0, msg)
    msg    = shuf_u32x4(msg, 0x0E)
    state0 = sha256_2rounds(state0, state1, msg)
    # Accumulate
    state0 = add_u32x4(state0, abef_save)
    state1 = add_u32x4(state1, cdgh_save)
    data +%= BlockSize
  # The SHA state is stored in this order:
  #   D, C, B, A, H, G, F, E
  #
  # state0 contains ABEF, state1 contains CDGH
  tmp    = shuf_u32x4(state0, 0x1B)       # FEBA
  state1 = shuf_u32x4(state1, 0xB1)       # DCHG
  state0 = blend_u16x8(tmp, state1, 0xF0) # DCBA
  state1 = alignr_u128(state1, tmp, 8)    # HGFE
  storea_u128(H.H[0].addr, state0)
  storea_u128(H.H[4].addr, state1)
--- a/constantine/hashes/sha256/sha256_x86_ssse3.nim
+++ b/constantine/hashes/sha256/sha256_x86_ssse3.nim
@ -0,0 +1,211 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ../../platforms/isa/simd_x86,
  ../../platforms/primitives,
  ./sha256_generic
 {.localpassC:"-mssse3".}
 # SHA256, SSSE3 optimizations
 # --------------------------------------------------------------------------------
 #
 # References:
 # - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
 # - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634
 # - Fast SHA-256 Implementations on Intel® Architecture Processors
 #   https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/sha-256-implementations-paper.pdf
 # Following the intel whitepaper we split our code into:
 # We keep track of a 256-bit state vector corresponding
 # to {a, b, c, d, e, f, g, h} in specification
 #
 # Processing is done in 2 steps
 # - Message scheduler:
 #   Takes the input 16 DWORDs and
 #   computes 48 new DWORDs. Together with the original 16 DWORDs, these
 #   form a vector of 64 DWORDs that is the input to the second step.
 #   This can be vectorized.
 # - 64 SHA rounds:
 #   This code is scalar.
 # No exceptions allowed in core cryptographic operations
 {.push raises: [].}
 {.push checks: off.}
 # Vectorized message scheduler
 # ------------------------------------------------
 const VecNum = BlockSize div 16      # BlockSize / sizeof(m128i)
 const VecWords = 16 div sizeof(Word) # sizeof(m128i) / sizeof(Word) 
 func initMessageSchedule(
       msnext: var array[VecNum, m128i],
       ms: var Sha256_MessageSchedule,
       message: ptr UncheckedArray[byte]) {.inline.} =
  ## Initial state, from data
  ## - Precompute steps for the future message schedule `msnext`
  ## - compute the current message schedule `ms`
  let mask = setr_u32x4(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)
  let pK256 = K256.unsafeAddr()
  staticFor i, 0, VecNum:
    msnext[i] = loadu_u128(message[i * sizeof(m128i)].addr)
    msnext[i] = shuf_u8x16(msnext[i], mask)
    storea_u128(ms.w[VecWords*i].addr, add_u32x4(msnext[i], loadu_u128(pK256[VecWords*i].addr)))
 func updateMessageSchedule(
       W: var array[4, m128i],
       loMask, hiMask: m128i) {.inline.} =
  # Steady state
  # ------------
  # The message schedule workspace W[16:0]
  # is updated with
  #   W[t mod 16] += s0 + s1 + W[(t-7) mod 16]
  # with
  #   s0 = σ₀(W[(t-15) mod 16])
  #   s1 = σ₁(W[(t-2)  mod 16])
  # by denoting the right rotation >>>, and xor ⊕
  #   σ₀(x) = (x >>>  7) ⊕ (x >>> 18) ⊕ (x >>>  3)
  #   σ₁(x) = (x >>> 17) ⊕ (x >>> 19) ⊕ (x >>> 10)
  const rot0 = [int32  7, 18,  3]
  const rot1 = [int32 17, 19, 10]
  var v{.noInit.}: array[4, m128i]
  v[0] = alignr_u128(W[1], W[0], 4)
  v[3] = alignr_u128(W[3], W[2], 4)
  v[2] = shr_u32x4(v[0], rot0[0])
  W[0] = add_u32x4(W[0], v[3])
  v[3] = shr_u32x4(v[0], rot0[2])
  v[1] = shl_u32x4(v[0], 32-rot0[1])
  v[0] = xor_u128(v[3], v[2])
  v[3] = shuf_u32x4(W[3], 0xfa)
  v[2] = shr_u32x4(v[2], rot0[1] - rot0[0])
  v[0] = xor_u128(v[0], v[1])
  v[0] = xor_u128(v[0], v[2])
  v[1] = shl_u32x4(v[1], rot0[1] - rot0[0])
  v[2] = shr_u32x4(v[3], rot1[2])
  v[3] = shr_u64x2(v[3], rot1[0])
  W[0] = add_u32x4(W[0], xor_u128(v[0], v[1]))
  v[2] = xor_u128(v[2], v[3])
  v[3] = shr_u64x2(v[3], rot1[1] - rot1[0])
  v[2] = shuf_u8x16(xor_u128(v[2], v[3]), lo_mask)
  W[0] = add_u32x4(W[0], v[2])
  v[3] = shuf_u32x4(W[0], 0x50)
  v[2] = shr_u32x4(v[3], rot1[2])
  v[3] = shr_u64x2(v[3], rot1[0])
  v[2] = xor_u128(v[2], v[3])
  v[3] = shr_u64x2(v[3], rot1[1] - rot1[0])
  W[0] = add_u32x4(W[0], shuf_u8x16(xor_u128(v[2], v[3]), hi_mask)) 
  W.rotateLeft()
 # Hash Computation
 # ------------------------------------------------
 func sha256_rounds_0_47(
       s: var Sha256_state,
       ms: var Sha256_MessageSchedule,
       msnext: var array[VecNum, m128i]) {.inline.} =
  ## Process Sha256 rounds 0 to 47
  let loMask = setr_u32x4(0x03020100, 0x0b0a0908, -1, -1)
  let hiMask = setr_u32x4(-1, -1, 0x03020100, 0x0b0a0908)
  # The first items of K256 were processed in initMessageSchedule
  var k256_idx = 16
  # Rounds 0-15, 16-31, 32-47
  for r in 0 ..< 3:
    # Important unrolling for 2 reasons, see Intel paper
    # - State updates:
    #   In each round calculation six out of the eight state variables are shifted to the
    #   next state variable. Rather than do these using mov instructions, we rename
    #   the virtual registers (symbols) to effect this “shift”. Thus each round
    #   effectively rotates the set of state register names by one place. By doing 8 or
    #   16 rounds in the body of the loop, the names have rotated back to their
    #   starting values, so no register moves are needed before looping.
    #
    # - Message schedule:
    #   Similarly on the vector unit for the message scheduling, the 16 necessary
    #   scheduled DWORDs are stored in four XMM registers, as described earlier. For
    #   example, the initial data DWORDs are stored in order as {X0, X1, X2, X3}.
    #   When we compute four new scheduled DWORDs, we store them in X0
    #   (overwriting the “oldest” data DWORDs), so now the scheduled DWORDs are
    #   stored in order in {X1, X2, X3, X0}. Once again, we handle this by “rotating”
    #   the four names, where in this case the names rotate one place every four
    #   rounds (because we compute four scheduled DWORDs in each calculation).
    #   By having 16 rounds (four scheduling operations) in the body of the loop,
    #   these XMM register names rotate back to their initial value, and again no
    #   register moves are needed before looping.
    staticFor i, 0, VecNum:
      # We interleave computing the message scheduled at {t+4, t+5, t+6, t+7}
      # with SHA256 state update for {t, t+1, t+2, t+3}
      # As they are independent, hopefully the compiler reorders instructions
      # for maximum throughput.
      # Also it optimize away the moves and use register renaming to avoid rotations
      const pos = VecWords * i
      msnext.updateMessageSchedule(loMask, hiMask)
      let wnext = add_u32x4(msnext[3], loadu_u128(K256[k256_idx].unsafeAddr))
      # K256 was already included in the computation of wnext, hence kt = 0
      s.sha256_round(wt = ms.w[pos + 0], kt = 0)
      s.sha256_round(wt = ms.w[pos + 1], kt = 0)
      s.sha256_round(wt = ms.w[pos + 2], kt = 0)
      s.sha256_round(wt = ms.w[pos + 3], kt = 0)
      storea_u128(ms.w[pos].addr, wnext)
      k256_idx += VecWords
 func sha256_rounds_48_63(
       s: var Sha256_state,
       ms: var Sha256_MessageSchedule) {.inline.} =
  ## Process Sha256 rounds 48 to 63
  staticFor t, 48, 64:
    # Wt[i mod 16] and K256 was already integrated in the computation of wnext
    s.sha256_round(wt = ms.w[t and 15], kt = 0)
 func hashMessageBlocks_ssse3*(
       H: var Sha256_state,
       message: ptr UncheckedArray[byte],
       numBlocks: uint)=
  ## Hash a message block by block
  ## Sha256 block size is 64 bytes hence
  ## a message will be process 64 by 64 bytes.
  var msg = message
  var ms{.noInit.}: Sha256_MessageSchedule
  var msnext{.noInit.}: array[VecNum, m128i]
  var s{.noInit.}: Sha256_state
  s.copy(H)
  for _ in 0 ..< numBlocks:
    initMessageSchedule(msnext, ms, msg)
    msg +%= BlockSize
    sha256_rounds_0_47(s, ms, msnext)
    sha256_rounds_48_63(s, ms)
    s.accumulate(H) # accumulate on register variables
    H.copy(s)
--- a/constantine/math/config/type_bigint.nim
+++ b/constantine/math/config/type_bigint.nim
@ -36,7 +36,7 @@ debug:
    result[0] = '0'
    result[1] = 'x'
    var a = a
-    for j in countdown(L-1, 0):
+    for j in countdown(result.len-1, 2):
      result[j] = hexChars.secretLookup(a and SecretWord 0xF)
      a = a shr 4
@ -45,7 +45,7 @@ debug:
    result.add " " & toHex(a[0])
    for i in 1 ..< a.len:
      result.add ", " & toHex(a[i])
-    result.add "])"
+    result.add "]"
  func `$`*(a: BigInt): string =
    result = "BigInt["
--- a/constantine/platforms/isa/simd_x86.nim
+++ b/constantine/platforms/isa/simd_x86.nim
@ -0,0 +1,277 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 static: doAssert defined(i386) or defined(amd64)
 # SIMD throughput and latency:
 #   - https://software.intel.com/sites/landingpage/IntrinsicsGuide/
 #   - https://www.agner.org/optimize/instruction_tables.pdf
 # Reminder: x86 is little-endian, order is [low part, high part]
 # Documentation at https://software.intel.com/sites/landingpage/IntrinsicsGuide/
 when defined(vcc):
  {.pragma: x86_type, byCopy, header:"<intrin.h>".}
  {.pragma: x86, noDecl, header:"<intrin.h>".}
 else:
  {.pragma: x86_type, byCopy, header:"<x86intrin.h>".}
  {.pragma: x86, noDecl, header:"<x86intrin.h>".}
 type
  m128* {.importc: "__m128", x86_type.} = object
    raw: array[4, float32]
  m128d* {.importc: "__m128d", x86_type.} = object
    raw: array[2, float64]
  m128i* {.importc: "__m128i", x86_type.} = object
    raw: array[16, byte]
  m256* {.importc: "__m256", x86_type.} = object
    raw: array[8, float32]
  m256d* {.importc: "__m256d", x86_type.} = object
    raw: array[4, float64]
  m256i* {.importc: "__m256i", x86_type.} = object
    raw: array[32, byte]
  m512* {.importc: "__m512", x86_type.} = object
    raw: array[16, float32]
  m512d* {.importc: "__m512d", x86_type.} = object
    raw: array[8, float64]
  m512i* {.importc: "__m512i", x86_type.} = object
    raw: array[64, byte]
  mmask8* {.importc: "__mmask8", x86_type.} = uint8
  mmask16* {.importc: "__mmask16", x86_type.} = uint16
  mmask64* {.importc: "__mmask64", x86_type.} = uint64
 # ############################################################
 #
 #                    SSE2 - integer - packed
 #
 # ############################################################
 func mm_setzero_si128(): m128i {.importc: "_mm_setzero_si128", x86.}
 func mm_set1_epi8(a: int8 or uint8): m128i {.importc: "_mm_set1_epi8", x86.}
 func mm_set1_epi16(a: int16 or uint16): m128i {.importc: "_mm_set1_epi16", x86.}
 func mm_set1_epi32(a: int32 or uint32): m128i {.importc: "_mm_set1_epi32", x86.}
 func mm_set1_epi64x(a: int64 or uint64): m128i {.importc: "_mm_set1_epi64x", x86.}
 func mm_set_epi64x(e1, e0: int64 or uint64): m128i {.importc: "_mm_set_epi64x", x86.}
 func mm_load_si128(mem_addr: ptr m128i): m128i {.importc: "_mm_load_si128", x86.}
 func mm_loadu_si128(mem_addr: ptr m128i): m128i {.importc: "_mm_loadu_si128", x86.}
 func mm_store_si128(mem_addr: ptr m128i, a: m128i) {.importc: "_mm_store_si128", x86.}
 func mm_storeu_si128(mem_addr: ptr m128i, a: m128i) {.importc: "_mm_storeu_si128", x86.}
 func mm_set_epi32(e3, e2, e1, e0: int32 or uint32): m128i {.importc: "_mm_set_epi32", x86.}
  ## Initialize m128i with {e3, e2, e1, e0} (big endian order)
  ## in order [e0, e1, e2, e3]
 func mm_setr_epi32(e3, e2, e1, e0: int32 or uint32): m128i {.importc: "_mm_setr_epi32", x86.}
  ## Initialize m128i with {e3, e2, e1, e0} (big endian order)
  ## in order [e3, e2, e1, e0]
 func mm_xor_si128(a, b: m128i): m128i {.importc: "_mm_xor_si128", x86.}
 func mm_add_epi8(a, b: m128i): m128i {.importc: "_mm_add_epi8", x86.}
 func mm_add_epi16(a, b: m128i): m128i {.importc: "_mm_add_epi16", x86.}
 func mm_add_epi32(a, b: m128i): m128i {.importc: "_mm_add_epi32", x86.}
 func mm_add_epi64(a, b: m128i): m128i {.importc: "_mm_add_epi64", x86.}
 func mm_slli_epi64(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_slli_epi64", x86.}
  ## Shift 2xint64 left
 func mm_srli_epi64(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_srli_epi64", x86.}
  ## Shift 2xint64 right
 func mm_srli_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_srli_epi32", x86.}
  ## Shift 4xint32 left
 func mm_slli_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_slli_epi32", x86.}
  ## Shift 4xint32 right
 func mm_shuffle_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_shuffle_epi32", x86.}
  ## Shuffle 32-bit integers in a according to the control in imm8
  ## Formula is in big endian representation
  ## a = {a3, a2, a1, a0}
  ## dst = {d3, d2, d1, d0}
  ## imm8 = {bits[7:6], bits[5:4], bits[3:2], bits[1:0]}
  ## d0 will refer a[bits[1:0]]
  ## d1            a[bits[3:2]]
 # ############################################################
 #
 #                    SSSE3 - integer - packed
 #
 # ############################################################
 func mm_alignr_epi8(a, b: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_alignr_epi8", x86.}
  ## Concatenate 16-byte blocks in a and b into a 32-byte temporary result,
  ## shift the result right by imm8 bytes, and return the low 16 bytes
  ## Input:
  ##   a[127:0], b[127:0]
  ## Result:
  ##   tmp[255:128] = a
  ##   tmp[127:0]   = b
  ##   tmp[255:0]   = tmp[255:0] >> (imm8*8)
  ##   dst[127:0]   = tmp[127:0]
 func mm_shuffle_epi8(a, b: m128i): m128i {.importc: "_mm_shuffle_epi8", x86.}
  ## Shuffle 8-bit integers in a according to the control mask in b
  ## Formula is in big endian representation
  ## a =   {a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0}
  ## b =   {b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0}
  ## dst = {d15, d14, d13, d12, d11, d10, d9, d8, d7, d6, d5, d4, d3, d2, d1, d0}
  ##
  ## The control mask b0 ... b15 have the shape:
  ## bits z000uvwx
  ## if z is set, the corresponding d is set to zero.
  ## otherwise uvwx represents a binary number in 0..15,
  ##           the corresponding d will be set to a(uvwx)
  ## 
  ## for i in 0 ..< 16:
  ##  if bitand(b[i], 0x80) != 0:
  ##   dst[i] = 0
  ##  else:
  ##   dst[i] = a[bitand(b[i], 0x0F)]
 # ############################################################
 #
 #                    SSE4.1 - integer - packed
 #
 # ############################################################
 func mm_blend_epi16(a, b: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_blend_epi16", x86.}
  ## Blend packed 16-bit integers from a and b using control mask imm8,
  ## and store the results in dst.
  ## 
  ## FOR j := 0 to 7
  ## i := j*16
  ## IF imm8[j]
  ##   dst[i+15:i] := b[i+15:i]
  ## ELSE
  ##   dst[i+15:i] := a[i+15:i]
 # ############################################################
 #
 #                  AVX512F - integer - packed
 #
 # ############################################################
 func mm_ror_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_ror_epi32", x86.}
  ## Rotate 4xint32 right
 func mm_mask_add_epi32(src: m128i, mask: mmask8, a, b: m128i): m128i {.importc: "_mm_mask_add_epi32", x86.}
  ## Add packed 32-bit integers in a and b, and store the results in dst using writemask mask
  ## (elements are copied from src when the corresponding mask bit is not set).
  ## for j in 0 ..< 4:
  ##   let i = j*32
  ##   if k[j]:
  ##     dst[i+31:i] := a[i+31:i] + b[i+31:i]
  ##   else:
  ##     dst[i+31:i] := src[i+31:i]
 # ############################################################
 #
 #                  SHA extensions
 #
 # ############################################################
 func mm_sha256msg1_epu32(a, b: m128i): m128i {.importc: "_mm_sha256msg1_epu32", x86.}
  ## Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers)
  ## using previous message values from a and b, and store the result in dst.
  ## 
  ## W4 := b[31:0]
  ## W3 := a[127:96]
  ## W2 := a[95:64]
  ## W1 := a[63:32]
  ## W0 := a[31:0]
  ## dst[127:96] := W3 + sigma0(W4)
  ## dst[95:64] := W2 + sigma0(W3)
  ## dst[63:32] := W1 + sigma0(W2)
  ## dst[31:0] := W0 + sigma0(W1)
 func mm_sha256msg2_epu32(a, b: m128i): m128i {.importc: "_mm_sha256msg2_epu32", x86.}
  ## Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers)
  ## using previous message values from a and b, and store the result in dst.
  ## 
  ## W14 := b[95:64]
  ## W15 := b[127:96]
  ## W16 := a[31:0] + sigma1(W14)
  ## W17 := a[63:32] + sigma1(W15)
  ## W18 := a[95:64] + sigma1(W16)
  ## W19 := a[127:96] + sigma1(W17)
  ## dst[127:96] := W19
  ## dst[95:64] := W18
  ## dst[63:32] := W17
  ## dst[31:0] := W16
 func mm_sha256rnds2_epu32(cdgh, abef, k: m128i): m128i {.importc: "_mm_sha256rnds2_epu32", x86.}
  ## Perform 2 rounds of SHA256 operation using
  ##   an initial SHA256 state (C,D,G,H) from a,
  ##   an initial SHA256 state (A,B,E,F) from b,
  ##   and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers)
  ##     and the corresponding round constants from k,
  ## and store the updated SHA256 state (A,B,E,F) in dst.
  ## 
  ## A[0] := b[127:96]
  ## B[0] := b[95:64]
  ## C[0] := a[127:96]
  ## D[0] := a[95:64]
  ## E[0] := b[63:32]
  ## F[0] := b[31:0]
  ## G[0] := a[63:32]
  ## H[0] := a[31:0]
  ## W_K[0] := k[31:0]
  ## W_K[1] := k[63:32]
  ## FOR i := 0 to 1
  ##   A[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i])
  ##   B[i+1] := A[i]
  ##   C[i+1] := B[i]
  ##   D[i+1] := C[i]
  ##   E[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i]
  ##   F[i+1] := E[i]
  ##   G[i+1] := F[i]
  ##   H[i+1] := G[i]
  ## ENDFOR
  ## dst[127:96] := A[2]
  ## dst[95:64] := B[2]
  ## dst[63:32] := E[2]
  ## dst[31:0] := F[2]
 # Aliases
 # ------------------------------------------------
 template set_u64x2*(e1, e0: int64 or uint64): m128i =
  mm_set_epi64x(e1, e0)
 template setr_u32x4*(e3, e2, e1, e0: int32 or uint32): m128i =
  mm_setr_epi32(e3, e2, e1, e0)
 template loada_u128*(data: pointer): m128i =
  mm_load_si128(cast[ptr m128i](data))
 template loadu_u128*(data: pointer): m128i =
  mm_loadu_si128(cast[ptr m128i](data))
 template storea_u128*(mem_addr: pointer, a: m128i) =
  mm_store_si128(cast[ptr m128i](mem_addr), a)
 template xor_u128*(a, b: m128i): m128i =
  mm_xor_si128(a, b)
 template add_u32x4*(a, b: m128i): m128i =
  mm_add_epi32(a, b)
 template shl_u32x4*(a: m128i, imm8: int32 or uint32): m128i =
  mm_slli_epi32(a, imm8)
 template shr_u32x4*(a: m128i, imm8: int32 or uint32): m128i =
  mm_srli_epi32(a, imm8)
 template shr_u64x2*(a: m128i, imm8: int32 or uint32): m128i =
  mm_srli_epi64(a, imm8)
 template alignr_u128*(a, b: m128i, shiftRightByBytes: int32 or uint32): m128i =
  mm_alignr_epi8(a, b, shiftRightByBytes)
 template shuf_u8x16*(a: m128i, mask: m128i): m128i =
  mm_shuffle_epi8(a, mask)
 template shuf_u32x4*(a: m128i, mask: int32 or uint32): m128i =
  mm_shuffle_epi32(a, mask)
 template blend_u16x8*(a, b: m128i, mask: int32 or uint32): m128i =
  mm_blend_epi16(a, b, mask)
 template sha256_msg1*(a, b: m128i): m128i = 
  mm_sha256msg1_epu32(a, b)
 template sha256_msg2*(a, b: m128i): m128i = 
  mm_sha256msg2_epu32(a, b)
 template sha256_2rounds*(cdgh, abef, k: m128i): m128i =
  mm_sha256rnds2_epu32(cdgh, abef, k)
--- a/constantine/platforms/primitives.nim
+++ b/constantine/platforms/primitives.nim
@ -34,6 +34,10 @@ when X86 and GCC_Compatible:
  import isa/[cpuinfo_x86, macro_assembler_x86]
  export cpuinfo_x86, macro_assembler_x86
 # No exceptions allowed in core cryptographic operations
 {.push raises: [].}
 {.push checks: off.}
 # ############################################################
 #
 #                      Instrumentation
@ -71,4 +75,46 @@ func copy*[T: byte|char](
  {.push checks: off.} # No OverflowError or IndexError allowed
  for i in 0 ..< len:
-    dst[dStart + i] = byte src[sStart + i]
+    dst[dStart + i] = byte src[sStart + i]
 func rotateRight*[N: static int, T](a: var array[N, T]) {.inline.} =
  # Rotate right (Somehow we can't use a generic template here)
  # Inline
  # Hopefully we want the compiler to see that N rounds of rotation
  # can be optimized away with register renaming
  let tmp = a[a.len-1]
  staticForCountdown i, a.len-1, 1:
    a[i] = a[i-1]
  a[0] = tmp
 func rotateLeft*[N: static int, T](a: var array[N, T]) {.inline.} =
  # Rotate left (Somehow we can't use a generic template here)
  # Inline
  # Hopefully we want the compiler to see that N rounds of rotation
  # can be optimized away with register renaming
  let tmp = a[0]
  staticFor i, 0, a.len-1:
    a[i] = a[i+1]
  a[a.len-1] = tmp
 # ############################################################
 #
 #                    Pointer arithmetics
 #
 # ############################################################
 template asUnchecked*[T](a: openArray[T]): ptr UncheckedArray[T] =
  cast[ptr UncheckedArray[T]](a[0].unsafeAddr)
 # Warning for pointer arithmetics via inline C
 # be careful of not passing a `var ptr`
 # to a function as `var` are passed by hidden pointers in Nim and the wrong
 # pointer will be modified. Templates are fine.
 func `+%`*(p: ptr, offset: SomeInteger): type(p) {.inline, noInit.}=
  ## Pointer increment
  {.emit: [result, " = ", p, " + ", offset, ";"].}
 func `+%=`*(p: var ptr, offset: SomeInteger){.inline.}=
  ## Pointer increment
  p = p +% offset
--- a/helpers/static_for.nim
+++ b/helpers/static_for.nim
@ -35,6 +35,14 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped
      body.replaceNodes(idx, newLit i)
    )
 macro staticForCountdown*(idx: untyped{nkIdent}, start, stopIncl: static int, body: untyped): untyped =
  result = newStmtList()
  for i in countdown(start, stopIncl):
    result.add nnkBlockStmt.newTree(
      ident("unrolledIter_" & $idx & $i),
      body.replaceNodes(idx, newLit i)
    )
 {.experimental: "dynamicBindSym".}
 macro staticFor*(ident: untyped{nkIdent}, choices: typed, body: untyped): untyped =
--- a/tests/t_hash_sha256_vs_openssl.nim
+++ b/tests/t_hash_sha256_vs_openssl.nim
@ -34,7 +34,7 @@ else:
 # But the new API isn't expose on Linux :/
 # TODO: fix Windows
-when not defined(Windows):
+when not defined(windows):
  proc SHA256[T: byte|char](
        msg: openarray[T],
        digest: ptr array[32, byte] = nil