diff --git a/benchmarks/bench_sha256.nim b/benchmarks/bench_sha256.nim index e6a31d3..6f2cc98 100644 --- a/benchmarks/bench_sha256.nim +++ b/benchmarks/bench_sha256.nim @@ -48,9 +48,9 @@ proc report(op: string, bytes: int, startTime, stopTime: MonoTime, startClk, sto when SupportsGetTicks: let cycles = (stopClk - startClk) div iters let cyclePerByte = cycles.float64 / bytes.float64 - echo &"{op:<30} {throughput:>15.3f} ops/s {ns:>9} ns/op {cycles:>10} cycles {cyclePerByte:>5.2f} cycles/byte" + echo &"{op:<50} {throughput:>15.3f} ops/s {ns:>9} ns/op {cycles:>10} cycles {cyclePerByte:>5.2f} cycles/byte" else: - echo &"{op:<30} {throughput:>15.3f} ops/s {ns:>9} ns/op" + echo &"{op:<50} {throughput:>15.3f} ops/s {ns:>9} ns/op" template bench(op: string, bytes: int, iters: int, body: untyped): untyped = measure(iters, startTime, stopTime, startClk, stopClk, body) @@ -63,41 +63,23 @@ proc benchSHA256_constantine[T](msg: openarray[T], msgComment: string, iters: in proc benchSHA256_openssl[T](msg: openarray[T], msgComment: string, iters: int) = var digest: array[32, byte] - bench("SHA256 - OpenSSL - " & msgComment, msg.len, iters): + bench("SHA256 - OpenSSL - " & msgComment, msg.len, iters): SHA256_OpenSSL(digest, msg) when isMainModule: proc main() = - block: - let msg32B = rng.random_byte_seq(32) - benchSHA256_constantine(msg32B, "32B", 100) - benchSHA256_openssl(msg32B, "32B", 100) - block: - let msg64B = rng.random_byte_seq(64) - benchSHA256_constantine(msg64B, "64B", 100) - benchSHA256_openssl(msg64B, "64B", 100) - block: - let msg128B = rng.random_byte_seq(128) - benchSHA256_constantine(msg128B, "128B", 100) - benchSHA256_openssl(msg128B, "128B", 100) - block: - let msg576B = rng.random_byte_seq(576) - benchSHA256_constantine(msg576B, "576B", 50) - benchSHA256_openssl(msg576B, "576B", 50) - block: - let msg8192B = rng.random_byte_seq(8192) - benchSHA256_constantine(msg8192B, "8192B", 25) - benchSHA256_openssl(msg8192B, "8192B", 25) - block: - let msg1MB = rng.random_byte_seq(1_000_000) - benchSHA256_constantine(msg1MB, "1MB", 16) - benchSHA256_openssl(msg1MB, "1MB", 16) - block: - let msg10MB = rng.random_byte_seq(10_000_000) - benchSHA256_constantine(msg10MB, "10MB", 16) - benchSHA256_openssl(msg10MB, "10MB", 16) - block: - let msg100MB = rng.random_byte_seq(100_000_000) - benchSHA256_constantine(msg100MB, "100MB", 3) - benchSHA256_openssl(msg100MB, "100MB", 3) + const sizes = [ + 32, 64, 128, 256, + 1024, 4096, 16384, 65536, + 1_000_000, 10_000_000 + ] + + const target_cycles = 1_000_000_000'i64 + const worst_cycles_per_bytes = 25'i64 + for s in sizes: + let msg = rng.random_byte_seq(s) + let iters = int(target_cycles div (s.int64 * worst_cycles_per_bytes)) + benchSHA256_constantine(msg, $s & "B", iters) + benchSHA256_openssl(msg, $s & "B", iters) + main() diff --git a/constantine/hashes/h_sha256.nim b/constantine/hashes/h_sha256.nim index 231776b..38b5b0e 100644 --- a/constantine/hashes/h_sha256.nim +++ b/constantine/hashes/h_sha256.nim @@ -7,8 +7,13 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
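Side note on the benchmark change above: instead of a hand-picked iteration count per message size, the loop now budgets roughly one billion cycles per measurement, assuming a pessimistic 25 cycles/byte. A minimal standalone sketch of that sizing formula (same constants and a subset of the sizes used in bench_sha256.nim; output values are just the arithmetic spelled out):

    # Sketch: how the per-size iteration count is derived in the new benchmark loop
    const target_cycles = 1_000_000_000'i64
    const worst_cycles_per_bytes = 25'i64
    for s in [32, 1024, 1_000_000, 10_000_000]:
      let iters = int(target_cycles div (s.int64 * worst_cycles_per_bytes))
      echo s, " B: ", iters, " iterations"  # 32 B: 1250000 iterations, 10 MB: 4 iterations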
import - std/macros, - ../platforms/[abstractions, endians] + ../platforms/[abstractions, endians], + ./sha256/sha256_generic + +when UseASM_X86_32: + import ./sha256/[ + sha256_x86_ssse3, + sha256_x86_shaext] # SHA256, a hash function from the SHA2 family # -------------------------------------------------------------------------------- @@ -16,11 +21,6 @@ import # References: # - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf # - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634 -# - Intel optimization https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/sha-256-implementations-paper.pdf -# - Parallelizing message schedules -# to accelerate the computations of hash functions -# Shay Gueron, Vlad Krasnov, 2012 -# https://eprint.iacr.org/2012/067.pdf # # Vectors: # - https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf @@ -28,277 +28,74 @@ import # Types and constants # ---------------------------------------------------------------- -const - DigestSize = 32 - BlockSize = 64 - HashSize = DigestSize div sizeof(uint32) # 8 - type Sha256Context* = object ## Align to 64 for cache line and SIMD friendliness - H{.align: 64}: array[HashSize, uint32] + s{.align: 64}: Sha256_state buf{.align: 64}: array[BlockSize, byte] msgLen: uint64 - bufIdx: uint8 sha256* = Sha256Context -# Internal +# Internals # ---------------------------------------------------------------- -# TODO: vectorized implementations # No exceptions allowed in core cryptographic operations {.push raises: [].} {.push checks: off.} -template rotr(x, n: uint32): uint32 = - ## Rotate right the bits - # We always use it with constants in 0 ..< 32 - # so undefined behaviour. 
- (x shr n) or (x shl (32 - n)) - -template ch(x, y, z: uint32): uint32 = - ## "Choose" function of SHA256 - ## Choose bit i from yi or zi depending on xi - when false: # Spec FIPS 180-4 - (x and y) xor (not(x) and z) - else: # RFC4634 - ((x and (y xor z)) xor z) - -template maj(x, y, z: uint32): uint32 = - ## "Majority" function of SHA256 - when false: # Spec FIPS 180-4 - (x and y) xor (x and z) xor (y and z) - else: # RFC4634 - (x and (y or z)) or (y and z) - -template S0(x: uint32): uint32 = - # Σ₀ - rotr(x, 2) xor rotr(x, 13) xor rotr(x, 22) - -template S1(x: uint32): uint32 = - # Σ₁ - rotr(x, 6) xor rotr(x, 11) xor rotr(x, 25) - -template s0(x: uint32): uint32 = - # σ₀ - rotr(x, 7) xor rotr(x, 18) xor (x shr 3) - -template s1(x: uint32): uint32 = - # σ₁ - rotr(x, 17) xor rotr(x, 19) xor (x shr 10) - -template u32BE(blob: array[4, byte]): uint32 = - ## Interpret a data blob as a big-endian uint32 - ## This should lower to - when nimvm: - (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32 - else: - when cpuEndian == littleEndian: - (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32 +func hashMessageBlocks( + s: var Sha256_state, + message: ptr UncheckedArray[byte], + numBlocks: uint) = + when UseASM_X86_32: + if ({.noSideEffect.}: hasSha()): + hashMessageBlocks_shaext(s, message, numBlocks) + elif ({.noSideEffect.}: hasSSSE3()): + hashMessageBlocks_ssse3(s, message, numBlocks) else: - cast[uint32](blob) - -template getU32at[T: byte|char](msg: openarray[T], pos: SomeInteger): uint32 = - u32BE(cast[ptr array[4, byte]](msg[pos].unsafeAddr())[]) - -func rotateRight[T](a: var openarray[T], k: int) = - ## Rotate a seuqnce by k - doAssert a.len > 0 - let k = k mod a.len - - for _ in 0 ..< k: - let tmp = a[^1] - for i in countdown(a.len-1, 1): - a[i] = a[i-1] - a[0] = tmp - -macro round(a, b, c, d, e, f, g, h: untyped, t: static int): untyped = - ## Unrolled and allocation efficient SHA256 round - var s = [a, b, c, d, e, f, g, h] - s.rotateRight(t) - let - a = s[0] - b = s[1] - c = s[2] - d = s[3] - e = s[4] - f = s[5] - g = s[6] - h = s[7] - - # W[t] - let w = nnkBracketExpr.newTree( - ident("W"), newLit(t mod 16) - ) - - result = newStmtList() - - if t < 16: - # Reading message phase - let msg = ident"message" - let curBlock = ident"curBlock" - result.add quote do: - `w` = getU32at(`msg`, `curBlock`*64 + `t`*4) + hashMessageBlocks_generic(s, message, numBlocks) else: - # Mixing - - # Wt-2, Wt-7, Wt-15, Wt-16 - let Wtm2 = nnkBracketExpr.newTree( - ident("W"), newLit((t-2) mod 16) - ) - let Wtm7 = nnkBracketExpr.newTree( - ident("W"), newLit((t-7) mod 16) - ) - let Wtm15 = nnkBracketExpr.newTree( - ident("W"), newLit((t-15) mod 16) - ) - # w is Wt-16 - result.add quote do: - `w` += s1(`Wtm2`) + `Wtm7` + s0(`Wtm15`) - - result.add quote do: - let T1 = `h` + S1(`e`) + ch(`e`, `f`, `g`) + K256[`t`] + `w` - let T2 = S0(`a`) + maj(`a`, `b`, `c`) - `d` += T1 - `h` = T1 + T2 - -func hashMessageBlocks[T: byte|char]( - H: var array[HashSize, uint32], - message: openarray[T]): uint = - ## Hash a message block by block - ## Sha256 block size is 64 bytes hence - ## a message will be process 64 by 64 bytes. - ## FIPS.180-4 6.2.2. 
SHA-256 Hash Computation - - result = 0 - let numBlocks = message.len.uint div BlockSize - if numBlocks == 0: - return 0 - - const K256 = [ - 0x428a2f98'u32, 0x71374491'u32, 0xb5c0fbcf'u32, 0xe9b5dba5'u32, 0x3956c25b'u32, 0x59f111f1'u32, 0x923f82a4'u32, 0xab1c5ed5'u32, - 0xd807aa98'u32, 0x12835b01'u32, 0x243185be'u32, 0x550c7dc3'u32, 0x72be5d74'u32, 0x80deb1fe'u32, 0x9bdc06a7'u32, 0xc19bf174'u32, - 0xe49b69c1'u32, 0xefbe4786'u32, 0x0fc19dc6'u32, 0x240ca1cc'u32, 0x2de92c6f'u32, 0x4a7484aa'u32, 0x5cb0a9dc'u32, 0x76f988da'u32, - 0x983e5152'u32, 0xa831c66d'u32, 0xb00327c8'u32, 0xbf597fc7'u32, 0xc6e00bf3'u32, 0xd5a79147'u32, 0x06ca6351'u32, 0x14292967'u32, - 0x27b70a85'u32, 0x2e1b2138'u32, 0x4d2c6dfc'u32, 0x53380d13'u32, 0x650a7354'u32, 0x766a0abb'u32, 0x81c2c92e'u32, 0x92722c85'u32, - 0xa2bfe8a1'u32, 0xa81a664b'u32, 0xc24b8b70'u32, 0xc76c51a3'u32, 0xd192e819'u32, 0xd6990624'u32, 0xf40e3585'u32, 0x106aa070'u32, - 0x19a4c116'u32, 0x1e376c08'u32, 0x2748774c'u32, 0x34b0bcb5'u32, 0x391c0cb3'u32, 0x4ed8aa4a'u32, 0x5b9cca4f'u32, 0x682e6ff3'u32, - 0x748f82ee'u32, 0x78a5636f'u32, 0x84c87814'u32, 0x8cc70208'u32, 0x90befffa'u32, 0xa4506ceb'u32, 0xbef9a3f7'u32, 0xc67178f2'u32 - ] - - var - a = H[0] - b = H[1] - c = H[2] - d = H[3] - e = H[4] - f = H[5] - g = H[6] - h = H[7] - - for curBlock in 0 ..< numBlocks: - # The first 16 bytes have different handling - # from bytes 16..<64. - # Using an array[64, uint32] will span it - # across 8 cache lines impacting performance - - # Workspace with message schedule Wₜ - var W{.noInit.}: array[16, uint32] - - when true: - # Translation of the spec - # This is faster than even OpenSSL for hashing just 32 bytes - # for example for HMAC and HKDF. - var t = 0'u32 - while t < 16: # Wₜ = Mⁱₜ - W[t].parseFromBlob(message, result, bigEndian) - let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t] - let T2 = S0(a) + maj(a, b, c) - h = g - g = f - f = e - e = d + T1 - d = c - c = b - b = a - a = T1+T2 - - t += 1 - - while t < 64: - W[t mod 16] += s1(W[(t-2) mod 16]) + - W[(t-7) mod 16] + - s0(W[(t-15) mod 16]) - let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t mod 16] - let T2 = S0(a) + maj(a, b, c) - h = g - g = f - f = e - e = d + T1 - d = c - c = b - b = a - a = T1+T2 - - t += 1 - else: - # optimized version for large hashes - # For hashing 32B, this is slower than the rough translation - # of spec, unless compiled with -mssse3 (but no vector instructions are used :/) - staticFor t, 0, 64: - round(a, b, c, d, e, f, g, h, t) - result += 64 - - a += H[0]; H[0] = a - b += H[1]; H[1] = b - c += H[2]; H[2] = c - d += H[3]; H[3] = d - e += H[4]; H[4] = e - f += H[5]; H[5] = f - g += H[6]; H[6] = g - h += H[7]; H[7] = h + hashMessageBlocks_generic(s, message, numBlocks) func dumpHash( digest: var array[DigestSize, byte], - H: array[HashSize, uint32]) = + s: Sha256_state) {.inline.} = ## Convert the internal hash into a message digest var dstIdx = 0'u - for i in 0 ..< H.len: - digest.dumpRawInt(H[i], dstIdx, bigEndian) + for i in 0 ..< s.H.len: + digest.dumpRawInt(s.H[i], dstIdx, bigEndian) dstIdx += uint sizeof(uint32) -func hashBuffer(ctx: var Sha256Context) = - discard ctx.H.hashMessageBlocks(ctx.buf) +func hashBuffer(ctx: var Sha256Context) {.inline.} = + ctx.s.hashMessageBlocks(ctx.buf.asUnchecked(), numBlocks = 1) ctx.buf.setZero() - ctx.bufIdx = 0 # Public API # ---------------------------------------------------------------- template digestSize*(H: type sha256): int = ## Returns the output size in bytes - 32 + DigestSize template internalBlockSize*(H: type sha256): int = ## 
Returns the byte size of the hash function ingested blocks - 64 + BlockSize func init*(ctx: var Sha256Context) = ## Initialize or reinitialize a Sha256 context ctx.msgLen = 0 ctx.buf.setZero() - ctx.bufIdx = 0 - ctx.H[0] = 0x6a09e667'u32 - ctx.H[1] = 0xbb67ae85'u32 - ctx.H[2] = 0x3c6ef372'u32 - ctx.H[3] = 0xa54ff53a'u32 - ctx.H[4] = 0x510e527f'u32 - ctx.H[5] = 0x9b05688c'u32 - ctx.H[6] = 0x1f83d9ab'u32 - ctx.H[7] = 0x5be0cd19'u32 + ctx.s.H[0] = 0x6a09e667'u32 + ctx.s.H[1] = 0xbb67ae85'u32 + ctx.s.H[2] = 0x3c6ef372'u32 + ctx.s.H[3] = 0xa54ff53a'u32 + ctx.s.H[4] = 0x510e527f'u32 + ctx.s.H[5] = 0x9b05688c'u32 + ctx.s.H[6] = 0x1f83d9ab'u32 + ctx.s.H[7] = 0x5be0cd19'u32 func initZeroPadded*(ctx: var Sha256Context) = ## Initialize a Sha256 context @@ -312,18 +109,17 @@ func initZeroPadded*(ctx: var Sha256Context) = ctx.msgLen = 64 ctx.buf.setZero() - ctx.bufIdx = 0 - ctx.H[0] = 0xda5698be'u32 - ctx.H[1] = 0x17b9b469'u32 - ctx.H[2] = 0x62335799'u32 - ctx.H[3] = 0x779fbeca'u32 - ctx.H[4] = 0x8ce5d491'u32 - ctx.H[5] = 0xc0d26243'u32 - ctx.H[6] = 0xbafef9ea'u32 - ctx.H[7] = 0x1837a9d8'u32 + ctx.s.H[0] = 0xda5698be'u32 + ctx.s.H[1] = 0x17b9b469'u32 + ctx.s.H[2] = 0x62335799'u32 + ctx.s.H[3] = 0x779fbeca'u32 + ctx.s.H[4] = 0x8ce5d491'u32 + ctx.s.H[5] = 0xc0d26243'u32 + ctx.s.H[6] = 0xbafef9ea'u32 + ctx.s.H[7] = 0x1837a9d8'u32 -func update*[T: char|byte](ctx: var Sha256Context, message: openarray[T]) = +func update*(ctx: var Sha256Context, message: openarray[byte]) = ## Append a message to a SHA256 context ## for incremental SHA256 computation ## @@ -336,53 +132,48 @@ func update*[T: char|byte](ctx: var Sha256Context, message: openarray[T]) = ## ## For passwords and secret keys, you MUST NOT use raw SHA-256 ## use a Key Derivation Function instead (KDF) + + # Message processing state machine + var bufIdx = uint(ctx.msgLen mod BlockSize) + var cur = 0'u + var bytesLeft = message.len.uint - debug: - doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len - for i in ctx.bufIdx ..< ctx.buf.len: - doAssert ctx.buf[i] == 0 - - if message.len == 0: - return - - var # Message processing state machine - cur = 0'u - bytesLeft = message.len.uint - - ctx.msgLen += bytesLeft - - if ctx.bufIdx != 0: # Previous partial update - let bufIdx = ctx.bufIdx.uint - let free = ctx.buf.sizeof().uint - bufIdx - - if free > bytesLeft: - # Enough free space, store in buffer - ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = bytesLeft) - ctx.bufIdx += bytesLeft.uint8 - return - else: - # Fill the buffer and do one sha256 hash - ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free) - ctx.hashBuffer() - - # Update message state for further processing - cur = free - bytesLeft -= free - - # Process n blocks (64 byte each) - let consumed = ctx.H.hashMessageBlocks( - message.toOpenArray(int cur, message.len-1)) - cur += consumed - bytesLeft -= consumed + if bufIdx != 0 and bufIdx+bytesLeft >= BlockSize: + # Previous partial update, fill the buffer and do one sha256 hash + let free = BlockSize - bufIdx + ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free) + ctx.hashBuffer() + bufIdx = 0 + cur = free + bytesLeft -= free + + if bytesLeft >= BlockSize: + # Process n blocks (64 byte each) + let numBlocks = bytesLeft div BlockSize + ctx.s.hashMessageBlocks(message.asUnchecked +% cur, numBlocks) + cur += numBlocks * BlockSize + bytesLeft -= numBlocks * BlockSize if bytesLeft != 0: # Store the tail in buffer - debug: # TODO: state machine formal verification - https://nim-lang.org/docs/drnim.html - doAssert 
ctx.bufIdx == 0 - doAssert cur + bytesLeft == message.len.uint + ctx.buf.copy(dStart = bufIdx, message, sStart = cur, len = bytesLeft) - ctx.buf.copy(dStart = 0'u, message, sStart = cur, len = bytesLeft) - ctx.bufIdx = uint8 bytesLeft + ctx.msgLen += message.len.uint + +func update*(ctx: var Sha256Context, message: openarray[char]) {.inline.} = + ## Append a message to a SHA256 context + ## for incremental SHA256 computation + ## + ## Security note: the tail of your message might be stored + ## in an internal buffer. + ## if sensitive content is used, ensure that + ## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible. + ## Additionally ensure that the message(s) passed were stored + ## in memory considered secure for your threat model. + ## + ## For passwords and secret keys, you MUST NOT use raw SHA-256 + ## use a Key Derivation Function instead (KDF) + ctx.update(message.toOpenArrayByte(message.low, message.high)) func finish*(ctx: var Sha256Context, digest: var array[32, byte]) = ## Finalize a SHA256 computation and output the @@ -396,26 +187,23 @@ func finish*(ctx: var Sha256Context, digest: var array[32, byte]) = ## For passwords and secret keys, you MUST NOT use raw SHA-256 ## use a Key Derivation Function instead (KDF) - debug: - doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len - for i in ctx.bufIdx ..< ctx.buf.len: - doAssert ctx.buf[i] == 0 + let bufIdx = uint(ctx.msgLen mod BlockSize) # Add '1' bit at the end of the message (+7 zero bits) - ctx.buf[ctx.bufIdx] = 0b1000_0000 + ctx.buf[bufIdx] = 0b1000_0000 # Add k bits so that msgLenBits + 1 + k ≡ 448 mod 512 # Hence in bytes msgLen + 1 + K ≡ 56 mod 64 const padZone = 56 - if ctx.bufIdx >= padZone: + if bufIdx >= padZone: # We are in the 56..<64 mod 64 byte count # and need to rollover to 0 ctx.hashBuffer() let lenInBits = ctx.msgLen.uint64 * 8 ctx.buf.dumpRawInt(lenInBits, padZone, bigEndian) - discard ctx.H.hashMessageBlocks(ctx.buf) - digest.dumpHash(ctx.H) + ctx.s.hashMessageBlocks(ctx.buf.asUnchecked(), numBlocks = 1) + digest.dumpHash(ctx.s) func clear*(ctx: var Sha256Context) = ## Clear the context internal buffers @@ -423,7 +211,6 @@ func clear*(ctx: var Sha256Context) = ## For passwords and secret keys, you MUST NOT use raw SHA-256 ## use a Key Derivation Function instead (KDF) # TODO: ensure compiler cannot optimize the code away - ctx.H.setZero() + ctx.s.H.setZero() ctx.buf.setZero() ctx.msgLen = 0 - ctx.bufIdx = 0 \ No newline at end of file diff --git a/constantine/hashes/sha256/sha256_generic.nim b/constantine/hashes/sha256/sha256_generic.nim new file mode 100644 index 0000000..2e8dbd4 --- /dev/null +++ b/constantine/hashes/sha256/sha256_generic.nim @@ -0,0 +1,185 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
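The h_sha256.nim changes above keep the public incremental API intact while the compression core moves into sha256/. For reference, a minimal usage sketch of that interface (imports omitted; the data hashed here is just an illustrative string):

    # Usage sketch of the incremental API exercised above
    var ctx: Sha256Context
    var digest: array[32, byte]
    ctx.init()
    ctx.update("hello ")   # char overload, forwards to the byte overload
    ctx.update("world")
    ctx.finish(digest)
    ctx.clear()            # wipe internal state if the input was sensitive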
+ +import + ../../platforms/primitives + +# SHA256, a hash function from the SHA2 family +# -------------------------------------------------------------------------------- +# +# References: +# - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +# - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634 +# Vectors: +# - https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf + +# No exceptions allowed in core cryptographic operations +{.push raises: [].} +{.push checks: off.} + +# Types & Constants +# ------------------------------------------------ +# The enforced alignment should help the compiler produce optimized code + +type Word* = uint32 + +const + DigestSize* = 32 + BlockSize* = 64 + HashSize* = DigestSize div sizeof(Word) # 8 + +type Sha256_MessageSchedule* = object + w*{.align: 64.}: array[BlockSize div sizeof(Word), Word] + +type Sha256_state* = object + H*{.align: 64.}: array[HashSize, Word] + +const K256* = [ + 0x428a2f98'u32, 0x71374491'u32, 0xb5c0fbcf'u32, 0xe9b5dba5'u32, 0x3956c25b'u32, 0x59f111f1'u32, 0x923f82a4'u32, 0xab1c5ed5'u32, + 0xd807aa98'u32, 0x12835b01'u32, 0x243185be'u32, 0x550c7dc3'u32, 0x72be5d74'u32, 0x80deb1fe'u32, 0x9bdc06a7'u32, 0xc19bf174'u32, + 0xe49b69c1'u32, 0xefbe4786'u32, 0x0fc19dc6'u32, 0x240ca1cc'u32, 0x2de92c6f'u32, 0x4a7484aa'u32, 0x5cb0a9dc'u32, 0x76f988da'u32, + 0x983e5152'u32, 0xa831c66d'u32, 0xb00327c8'u32, 0xbf597fc7'u32, 0xc6e00bf3'u32, 0xd5a79147'u32, 0x06ca6351'u32, 0x14292967'u32, + 0x27b70a85'u32, 0x2e1b2138'u32, 0x4d2c6dfc'u32, 0x53380d13'u32, 0x650a7354'u32, 0x766a0abb'u32, 0x81c2c92e'u32, 0x92722c85'u32, + 0xa2bfe8a1'u32, 0xa81a664b'u32, 0xc24b8b70'u32, 0xc76c51a3'u32, 0xd192e819'u32, 0xd6990624'u32, 0xf40e3585'u32, 0x106aa070'u32, + 0x19a4c116'u32, 0x1e376c08'u32, 0x2748774c'u32, 0x34b0bcb5'u32, 0x391c0cb3'u32, 0x4ed8aa4a'u32, 0x5b9cca4f'u32, 0x682e6ff3'u32, + 0x748f82ee'u32, 0x78a5636f'u32, 0x84c87814'u32, 0x8cc70208'u32, 0x90befffa'u32, 0xa4506ceb'u32, 0xbef9a3f7'u32, 0xc67178f2'u32 +] + +# Primitives +# ------------------------------------------------ + +template rotr(x, n: uint32): uint32 = + ## Rotate right the bits + # We always use it with constants in 0 ..< 32 + # so no undefined behaviour. 
+ (x shr n) or (x shl (32 - n)) + +template ch(x, y, z: uint32): uint32 = + ## "Choose" function of SHA256 + ## Choose bit i from yi or zi depending on xi + when false: # Spec FIPS 180-4 + (x and y) xor (not(x) and z) + else: # RFC4634 + ((x and (y xor z)) xor z) + +template maj(x, y, z: uint32): uint32 = + ## "Majority" function of SHA256 + when false: # Spec FIPS 180-4 + (x and y) xor (x and z) xor (y and z) + else: # RFC4634 + (x and (y or z)) or (y and z) + +template S0(x: uint32): uint32 = + # Σ₀ + rotr(x, 2) xor rotr(x, 13) xor rotr(x, 22) + +template S1(x: uint32): uint32 = + # Σ₁ + rotr(x, 6) xor rotr(x, 11) xor rotr(x, 25) + +template s0(x: uint32): uint32 = + # σ₀ + rotr(x, 7) xor rotr(x, 18) xor (x shr 3) + +template s1(x: uint32): uint32 = + # σ₁ + rotr(x, 17) xor rotr(x, 19) xor (x shr 10) + +# Message schedule +# ------------------------------------------------ + +template u32BE(blob: array[4, byte]): uint32 = + ## Interpret a data blob as a big-endian uint32 + when nimvm: + (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32 + else: + when cpuEndian == littleEndian: + (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32 + else: + cast[uint32](blob) + +template getU32at(msg: ptr UncheckedArray[byte], pos: SomeInteger): uint32 = + u32BE(cast[ptr array[4, byte]](msg[pos].addr)[]) + +# State updates +# ------------------------------------------------ + +template copy*(dst: var Sha256_state, src: Sha256_state) = + ## State copy + # Should compile with a specialized aligned copy. + # No bounds check + for i in 0 ..< HashSize: + dst.H[i] = src.H[i] + +template accumulate*(dst: var Sha256_state, src: Sha256_state) = + ## State accumulation + # No bounds check + for i in 0 ..< HashSize: + dst.H[i] += src.H[i] + +template sha256_round*(s: var Sha256_state, wt, kt: Word) = + template a: Word = s.H[0] + template b: Word = s.H[1] + template c: Word = s.H[2] + template d: Word = s.H[3] + template e: Word = s.H[4] + template f: Word = s.H[5] + template g: Word = s.H[6] + template h: Word = s.H[7] + + let T1 = h + S1(e) + ch(e, f, g) + kt + wt + let T2 = S0(a) + maj(a, b, c) + d += T1 + h = T1 + T2 + + s.H.rotateRight() + +# Hash Computation +# ------------------------------------------------ + +func sha256_rounds_0_15( + s: var Sha256_state, + ms: var Sha256_MessageSchedule, + message: ptr UncheckedArray[byte]) {.inline.} = + staticFor t, 0, 16: + ms.w[t] = message.getU32at(t * sizeof(Word)) + sha256_round(s, ms.w[t], K256[t]) + +func sha256_rounds_16_63( + s: var Sha256_state, + ms: var Sha256_MessageSchedule) {.inline.} = + staticFor t, 16, 64: + ms.w[t and 15] += s1(ms.w[(t - 2) and 15])+ + ms.w[(t - 7) and 15] + + s0(ms.w[(t - 15) and 15]) + + sha256_round(s, ms.w[t and 15], K256[t]) + +func hashMessageBlocks_generic*( + H: var Sha256_state, + message: ptr UncheckedArray[byte], + numBlocks: uint) = + ## Hash a message block by block + ## Sha256 block size is 64 bytes hence + ## a message will be process 64 by 64 bytes. + ## FIPS.180-4 6.2.2. 
SHA-256 Hash Computation + + var msg = message + var ms{.noInit.}: Sha256_MessageSchedule + var s{.noInit.}: Sha256_state + + s.copy(H) + + for _ in 0 ..< numBlocks: + sha256_rounds_0_15(s, ms, msg) + msg +%= BlockSize + + sha256_rounds_16_63(s, ms) + + s.accumulate(H) # accumulate on register variables + H.copy(s) diff --git a/constantine/hashes/sha256/sha256_x86_shaext.nim b/constantine/hashes/sha256/sha256_x86_shaext.nim new file mode 100644 index 0000000..4d38117 --- /dev/null +++ b/constantine/hashes/sha256/sha256_x86_shaext.nim @@ -0,0 +1,130 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + ../../platforms/isa/simd_x86, + ../../platforms/primitives, + ./sha256_generic + +{.localpassC:"-msse4.1 -msha".} + +# SHA256, a hash function from the SHA2 family +# -------------------------------------------------------------------------------- +# +# References: +# - Intel SHA extensions whitepaper +# https://www.intel.com/content/dam/develop/external/us/en/documents/intel-sha-extensions-white-paper-402097.pdf +# - Intel SHA extensions article +# https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html + +# No exceptions allowed in core cryptographic operations +{.push raises: [].} +{.push checks: off.} + +# Primitives +# ------------------------------------------------ + +template setr_K(i: int): m128i = + setr_u32x4(K256[4*i], K256[4*i+1], K256[4*i+2], K256[4*i+3]) + +# Hash Computation +# ------------------------------------------------ + +func hashMessageBlocks_shaext*( + H: var Sha256_state, + message: ptr UncheckedArray[byte], + numBlocks: uint)= + ## Hash a message block by block + ## Sha256 block size is 64 bytes hence + ## a message will be process 64 by 64 bytes. 
+ + var + abef_save {.noInit.}: m128i + cdgh_save {.noInit.}: m128i + state0 {.noInit.}: m128i + state1 {.noInit.}: m128i + msgtmp {.noInit.}: array[4, m128i] + msg {.noInit.}: m128i + tmp {.noInit.}: m128i + + data = message + + let shuf_mask = set_u64x2(0x0c0d0e0f08090a0b, 0x0405060700010203) + + # The SHA state is stored in this order: + # D, C, B, A, H, G, F, E + # + # state0 contains ABEF, state1 contains CDGH + + tmp = shuf_u32x4(loada_u128(H.H[0].addr), 0xB1) # CDAB + state1 = shuf_u32x4(loada_u128(H.H[4].addr), 0x1B) # EFGH + state0 = alignr_u128(tmp, state1, 8) # ABEF + state1 = blend_u16x8(state1, tmp, 0xF0) # CDGH + + for _ in 0 ..< numBlocks: + # Save current state + abef_save = state0 + cdgh_save = state1 + + # Rounds 0-3 + msgtmp[0] = shuf_u8x16(loadu_u128(data[0].addr), shuf_mask) + msg = add_u32x4(msgtmp[0], setr_K(0)) + state1 = sha256_2rounds(state1, state0, msg) + msg = shuf_u32x4(msg, 0x0E) + state0 = sha256_2rounds(state0, state1, msg) + + # Rounds 4-7 and 8-11 + staticFor i, 1, 3: + msgtmp[i] = shuf_u8x16(loadu_u128(data[16*i].addr), shuf_mask) + msg = add_u32x4(msgtmp[i], setr_K(i)) + state1 = sha256_2rounds(state1, state0, msg) + msg = shuf_u32x4(msg, 0x0E) + state0 = sha256_2rounds(state0, state1, msg) + msgtmp[i-1] = sha256_msg1(msgtmp[i-1], msgtmp[i]) + + # Rounds 12-59 + msgtmp[3] = shuf_u8x16(loadu_u128(data[16*3].addr), shuf_mask) + + staticFor i, 3, 15: + let prev = (i-1) and 3 # mod 4, we rotate buffers + let curr = i and 3 + let next = (i+1) and 3 + + msg = add_u32x4(msgtmp[curr], setr_K(i)) + state1 = sha256_2rounds(state1, state0, msg) + tmp = alignr_u128(msgtmp[curr], msgtmp[prev], 4) + msgtmp[next] = add_u32x4(msgtmp[next], tmp) + msgtmp[next] = sha256_msg2(msgtmp[next], msgtmp[curr]) + msg = shuf_u32x4(msg, 0x0E) + state0 = sha256_2rounds(state0, state1, msg) + msgtmp[prev] = sha256_msg1(msgtmp[prev], msgtmp[curr]) + + # Rounds 60-63 + msg = add_u32x4(msgtmp[3], setr_K(15)) + state1 = sha256_2rounds(state1, state0, msg) + msg = shuf_u32x4(msg, 0x0E) + state0 = sha256_2rounds(state0, state1, msg) + + # Accumulate + state0 = add_u32x4(state0, abef_save) + state1 = add_u32x4(state1, cdgh_save) + + data +%= BlockSize + + # The SHA state is stored in this order: + # D, C, B, A, H, G, F, E + # + # state0 contains ABEF, state1 contains CDGH + + tmp = shuf_u32x4(state0, 0x1B) # FEBA + state1 = shuf_u32x4(state1, 0xB1) # DCHG + state0 = blend_u16x8(tmp, state1, 0xF0) # DCBA + state1 = alignr_u128(state1, tmp, 8) # HGFE + + storea_u128(H.H[0].addr, state0) + storea_u128(H.H[4].addr, state1) \ No newline at end of file diff --git a/constantine/hashes/sha256/sha256_x86_ssse3.nim b/constantine/hashes/sha256/sha256_x86_ssse3.nim new file mode 100644 index 0000000..a74b60c --- /dev/null +++ b/constantine/hashes/sha256/sha256_x86_ssse3.nim @@ -0,0 +1,211 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
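The SHA-extensions kernel above keeps the eight state words packed into two 128-bit registers, state0 = ABEF and state1 = CDGH, as required by sha256_2rounds. As an illustration only (a scalar reading of the shuf_u32x4/alignr_u128/blend_u16x8 sequence, not code used by the implementation; the helper name is hypothetical), the load-time packing amounts to:

    # Illustrative scalar equivalent of the ABEF/CDGH packing in hashMessageBlocks_shaext.
    # Lanes are listed low -> high; the input H is [a, b, c, d, e, f, g, h].
    func packShaextState(H: array[8, uint32]): tuple[abef, cdgh: array[4, uint32]] =
      result.abef = [H[5], H[4], H[1], H[0]]  # F, E, B, A  (A in the top lane)
      result.cdgh = [H[7], H[6], H[3], H[2]]  # H, G, D, C  (C in the top lane)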
+ +import + ../../platforms/isa/simd_x86, + ../../platforms/primitives, + ./sha256_generic + +{.localpassC:"-mssse3".} + +# SHA256, SSSE3 optimizations +# -------------------------------------------------------------------------------- +# +# References: +# - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +# - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634 +# - Fast SHA-256 Implementations on Intel® Architecture Processors +# https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/sha-256-implementations-paper.pdf + +# Following the intel whitepaper we split our code into: +# We keep track of a 256-bit state vector corresponding +# to {a, b, c, d, e, f, g, h} in specification +# +# Processing is done in 2 steps +# - Message scheduler: +# Takes the input 16 DWORDs and +# computes 48 new DWORDs. Together with the original 16 DWORDs, these +# form a vector of 64 DWORDs that is the input to the second step. +# This can be vectorized. +# - 64 SHA rounds: +# This code is scalar. + +# No exceptions allowed in core cryptographic operations +{.push raises: [].} +{.push checks: off.} + +# Vectorized message scheduler +# ------------------------------------------------ + +const VecNum = BlockSize div 16 # BlockSize / sizeof(m128i) +const VecWords = 16 div sizeof(Word) # sizeof(m128i) / sizeof(Word) + +func initMessageSchedule( + msnext: var array[VecNum, m128i], + ms: var Sha256_MessageSchedule, + message: ptr UncheckedArray[byte]) {.inline.} = + ## Initial state, from data + ## - Precompute steps for the future message schedule `msnext` + ## - compute the current message schedule `ms` + + let mask = setr_u32x4(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f) + let pK256 = K256.unsafeAddr() + + staticFor i, 0, VecNum: + msnext[i] = loadu_u128(message[i * sizeof(m128i)].addr) + msnext[i] = shuf_u8x16(msnext[i], mask) + storea_u128(ms.w[VecWords*i].addr, add_u32x4(msnext[i], loadu_u128(pK256[VecWords*i].addr))) + +func updateMessageSchedule( + W: var array[4, m128i], + loMask, hiMask: m128i) {.inline.} = + # Steady state + # ------------ + # The message schedule workspace W[16:0] + # is updated with + # W[t mod 16] += s0 + s1 + W[(t-7) mod 16] + # with + # s0 = σ₀(W[(t-15) mod 16]) + # s1 = σ₁(W[(t-2) mod 16]) + # by denoting the right rotation >>>, and xor ⊕ + # σ₀(x) = (x >>> 7) ⊕ (x >>> 18) ⊕ (x >>> 3) + # σ₁(x) = (x >>> 17) ⊕ (x >>> 19) ⊕ (x >>> 10) + + const rot0 = [int32 7, 18, 3] + const rot1 = [int32 17, 19, 10] + + var v{.noInit.}: array[4, m128i] + + v[0] = alignr_u128(W[1], W[0], 4) + v[3] = alignr_u128(W[3], W[2], 4) + v[2] = shr_u32x4(v[0], rot0[0]) + W[0] = add_u32x4(W[0], v[3]) + + v[3] = shr_u32x4(v[0], rot0[2]) + v[1] = shl_u32x4(v[0], 32-rot0[1]) + v[0] = xor_u128(v[3], v[2]) + + v[3] = shuf_u32x4(W[3], 0xfa) + v[2] = shr_u32x4(v[2], rot0[1] - rot0[0]) + v[0] = xor_u128(v[0], v[1]) + v[0] = xor_u128(v[0], v[2]) + + v[1] = shl_u32x4(v[1], rot0[1] - rot0[0]) + v[2] = shr_u32x4(v[3], rot1[2]) + v[3] = shr_u64x2(v[3], rot1[0]) + W[0] = add_u32x4(W[0], xor_u128(v[0], v[1])) + + v[2] = xor_u128(v[2], v[3]) + v[3] = shr_u64x2(v[3], rot1[1] - rot1[0]) + v[2] = shuf_u8x16(xor_u128(v[2], v[3]), lo_mask) + W[0] = add_u32x4(W[0], v[2]) + + v[3] = shuf_u32x4(W[0], 0x50) + v[2] = shr_u32x4(v[3], rot1[2]) + v[3] = shr_u64x2(v[3], rot1[0]) + v[2] = xor_u128(v[2], v[3]) + v[3] = shr_u64x2(v[3], rot1[1] - rot1[0]) + + W[0] = add_u32x4(W[0], shuf_u8x16(xor_u128(v[2], v[3]), hi_mask)) + + W.rotateLeft() + +# Hash Computation +# 
------------------------------------------------ + +func sha256_rounds_0_47( + s: var Sha256_state, + ms: var Sha256_MessageSchedule, + msnext: var array[VecNum, m128i]) {.inline.} = + ## Process Sha256 rounds 0 to 47 + + let loMask = setr_u32x4(0x03020100, 0x0b0a0908, -1, -1) + let hiMask = setr_u32x4(-1, -1, 0x03020100, 0x0b0a0908) + + # The first items of K256 were processed in initMessageSchedule + var k256_idx = 16 + + # Rounds 0-15, 16-31, 32-47 + for r in 0 ..< 3: + + # Important unrolling for 2 reasons, see Intel paper + # - State updates: + # In each round calculation six out of the eight state variables are shifted to the + # next state variable. Rather than do these using mov instructions, we rename + # the virtual registers (symbols) to effect this “shift”. Thus each round + # effectively rotates the set of state register names by one place. By doing 8 or + # 16 rounds in the body of the loop, the names have rotated back to their + # starting values, so no register moves are needed before looping. + # + # - Message schedule: + # Similarly on the vector unit for the message scheduling, the 16 necessary + # scheduled DWORDs are stored in four XMM registers, as described earlier. For + # example, the initial data DWORDs are stored in order as {X0, X1, X2, X3}. + # When we compute four new scheduled DWORDs, we store them in X0 + # (overwriting the “oldest” data DWORDs), so now the scheduled DWORDs are + # stored in order in {X1, X2, X3, X0}. Once again, we handle this by “rotating” + # the four names, where in this case the names rotate one place every four + # rounds (because we compute four scheduled DWORDs in each calculation). + # By having 16 rounds (four scheduling operations) in the body of the loop, + # these XMM register names rotate back to their initial value, and again no + # register moves are needed before looping. + + staticFor i, 0, VecNum: + # We interleave computing the message scheduled at {t+4, t+5, t+6, t+7} + # with SHA256 state update for {t, t+1, t+2, t+3} + + # As they are independent, hopefully the compiler reorders instructions + # for maximum throughput. + # Also it optimize away the moves and use register renaming to avoid rotations + const pos = VecWords * i + + msnext.updateMessageSchedule(loMask, hiMask) + let wnext = add_u32x4(msnext[3], loadu_u128(K256[k256_idx].unsafeAddr)) + + # K256 was already included in the computation of wnext, hence kt = 0 + s.sha256_round(wt = ms.w[pos + 0], kt = 0) + s.sha256_round(wt = ms.w[pos + 1], kt = 0) + s.sha256_round(wt = ms.w[pos + 2], kt = 0) + s.sha256_round(wt = ms.w[pos + 3], kt = 0) + + storea_u128(ms.w[pos].addr, wnext) + k256_idx += VecWords + + +func sha256_rounds_48_63( + s: var Sha256_state, + ms: var Sha256_MessageSchedule) {.inline.} = + ## Process Sha256 rounds 48 to 63 + staticFor t, 48, 64: + # Wt[i mod 16] and K256 was already integrated in the computation of wnext + s.sha256_round(wt = ms.w[t and 15], kt = 0) + +func hashMessageBlocks_ssse3*( + H: var Sha256_state, + message: ptr UncheckedArray[byte], + numBlocks: uint)= + ## Hash a message block by block + ## Sha256 block size is 64 bytes hence + ## a message will be process 64 by 64 bytes. 
+ + var msg = message + var ms{.noInit.}: Sha256_MessageSchedule + var msnext{.noInit.}: array[VecNum, m128i] + var s{.noInit.}: Sha256_state + + s.copy(H) + + for _ in 0 ..< numBlocks: + initMessageSchedule(msnext, ms, msg) + msg +%= BlockSize + + sha256_rounds_0_47(s, ms, msnext) + sha256_rounds_48_63(s, ms) + + s.accumulate(H) # accumulate on register variables + H.copy(s) diff --git a/constantine/math/config/type_bigint.nim b/constantine/math/config/type_bigint.nim index 33ed2b5..5ecd105 100644 --- a/constantine/math/config/type_bigint.nim +++ b/constantine/math/config/type_bigint.nim @@ -36,7 +36,7 @@ debug: result[0] = '0' result[1] = 'x' var a = a - for j in countdown(L-1, 0): + for j in countdown(result.len-1, 2): result[j] = hexChars.secretLookup(a and SecretWord 0xF) a = a shr 4 @@ -45,7 +45,7 @@ debug: result.add " " & toHex(a[0]) for i in 1 ..< a.len: result.add ", " & toHex(a[i]) - result.add "])" + result.add "]" func `$`*(a: BigInt): string = result = "BigInt[" diff --git a/constantine/platforms/isa/simd_x86.nim b/constantine/platforms/isa/simd_x86.nim new file mode 100644 index 0000000..487637d --- /dev/null +++ b/constantine/platforms/isa/simd_x86.nim @@ -0,0 +1,277 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +static: doAssert defined(i386) or defined(amd64) + +# SIMD throughput and latency: +# - https://software.intel.com/sites/landingpage/IntrinsicsGuide/ +# - https://www.agner.org/optimize/instruction_tables.pdf + +# Reminder: x86 is little-endian, order is [low part, high part] +# Documentation at https://software.intel.com/sites/landingpage/IntrinsicsGuide/ + +when defined(vcc): + {.pragma: x86_type, byCopy, header:"".} + {.pragma: x86, noDecl, header:"".} +else: + {.pragma: x86_type, byCopy, header:"".} + {.pragma: x86, noDecl, header:"".} + +type + m128* {.importc: "__m128", x86_type.} = object + raw: array[4, float32] + m128d* {.importc: "__m128d", x86_type.} = object + raw: array[2, float64] + m128i* {.importc: "__m128i", x86_type.} = object + raw: array[16, byte] + m256* {.importc: "__m256", x86_type.} = object + raw: array[8, float32] + m256d* {.importc: "__m256d", x86_type.} = object + raw: array[4, float64] + m256i* {.importc: "__m256i", x86_type.} = object + raw: array[32, byte] + m512* {.importc: "__m512", x86_type.} = object + raw: array[16, float32] + m512d* {.importc: "__m512d", x86_type.} = object + raw: array[8, float64] + m512i* {.importc: "__m512i", x86_type.} = object + raw: array[64, byte] + mmask8* {.importc: "__mmask8", x86_type.} = uint8 + mmask16* {.importc: "__mmask16", x86_type.} = uint16 + mmask64* {.importc: "__mmask64", x86_type.} = uint64 + +# ############################################################ +# +# SSE2 - integer - packed +# +# ############################################################ + +func mm_setzero_si128(): m128i {.importc: "_mm_setzero_si128", x86.} +func mm_set1_epi8(a: int8 or uint8): m128i {.importc: "_mm_set1_epi8", x86.} +func mm_set1_epi16(a: int16 or uint16): m128i {.importc: "_mm_set1_epi16", x86.} +func mm_set1_epi32(a: int32 or uint32): m128i {.importc: 
"_mm_set1_epi32", x86.} +func mm_set1_epi64x(a: int64 or uint64): m128i {.importc: "_mm_set1_epi64x", x86.} +func mm_set_epi64x(e1, e0: int64 or uint64): m128i {.importc: "_mm_set_epi64x", x86.} +func mm_load_si128(mem_addr: ptr m128i): m128i {.importc: "_mm_load_si128", x86.} +func mm_loadu_si128(mem_addr: ptr m128i): m128i {.importc: "_mm_loadu_si128", x86.} +func mm_store_si128(mem_addr: ptr m128i, a: m128i) {.importc: "_mm_store_si128", x86.} +func mm_storeu_si128(mem_addr: ptr m128i, a: m128i) {.importc: "_mm_storeu_si128", x86.} + +func mm_set_epi32(e3, e2, e1, e0: int32 or uint32): m128i {.importc: "_mm_set_epi32", x86.} + ## Initialize m128i with {e3, e2, e1, e0} (big endian order) + ## in order [e0, e1, e2, e3] +func mm_setr_epi32(e3, e2, e1, e0: int32 or uint32): m128i {.importc: "_mm_setr_epi32", x86.} + ## Initialize m128i with {e3, e2, e1, e0} (big endian order) + ## in order [e3, e2, e1, e0] + +func mm_xor_si128(a, b: m128i): m128i {.importc: "_mm_xor_si128", x86.} + +func mm_add_epi8(a, b: m128i): m128i {.importc: "_mm_add_epi8", x86.} +func mm_add_epi16(a, b: m128i): m128i {.importc: "_mm_add_epi16", x86.} +func mm_add_epi32(a, b: m128i): m128i {.importc: "_mm_add_epi32", x86.} +func mm_add_epi64(a, b: m128i): m128i {.importc: "_mm_add_epi64", x86.} + +func mm_slli_epi64(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_slli_epi64", x86.} + ## Shift 2xint64 left +func mm_srli_epi64(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_srli_epi64", x86.} + ## Shift 2xint64 right +func mm_srli_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_srli_epi32", x86.} + ## Shift 4xint32 left +func mm_slli_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_slli_epi32", x86.} + ## Shift 4xint32 right + +func mm_shuffle_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_shuffle_epi32", x86.} + ## Shuffle 32-bit integers in a according to the control in imm8 + ## Formula is in big endian representation + ## a = {a3, a2, a1, a0} + ## dst = {d3, d2, d1, d0} + ## imm8 = {bits[7:6], bits[5:4], bits[3:2], bits[1:0]} + ## d0 will refer a[bits[1:0]] + ## d1 a[bits[3:2]] + +# ############################################################ +# +# SSSE3 - integer - packed +# +# ############################################################ + +func mm_alignr_epi8(a, b: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_alignr_epi8", x86.} + ## Concatenate 16-byte blocks in a and b into a 32-byte temporary result, + ## shift the result right by imm8 bytes, and return the low 16 bytes + ## Input: + ## a[127:0], b[127:0] + ## Result: + ## tmp[255:128] = a + ## tmp[127:0] = b + ## tmp[255:0] = tmp[255:0] >> (imm8*8) + ## dst[127:0] = tmp[127:0] + +func mm_shuffle_epi8(a, b: m128i): m128i {.importc: "_mm_shuffle_epi8", x86.} + ## Shuffle 8-bit integers in a according to the control mask in b + ## Formula is in big endian representation + ## a = {a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0} + ## b = {b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0} + ## dst = {d15, d14, d13, d12, d11, d10, d9, d8, d7, d6, d5, d4, d3, d2, d1, d0} + ## + ## The control mask b0 ... b15 have the shape: + ## bits z000uvwx + ## if z is set, the corresponding d is set to zero. 
+ ## otherwise uvwx represents a binary number in 0..15, + ## the corresponding d will be set to a(uvwx) + ## + ## for i in 0 ..< 16: + ## if bitand(b[i], 0x80) != 0: + ## dst[i] = 0 + ## else: + ## dst[i] = a[bitand(b[i], 0x0F)] + +# ############################################################ +# +# SSE4.1 - integer - packed +# +# ############################################################ + +func mm_blend_epi16(a, b: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_blend_epi16", x86.} + ## Blend packed 16-bit integers from a and b using control mask imm8, + ## and store the results in dst. + ## + ## FOR j := 0 to 7 + ## i := j*16 + ## IF imm8[j] + ## dst[i+15:i] := b[i+15:i] + ## ELSE + ## dst[i+15:i] := a[i+15:i] + +# ############################################################ +# +# AVX512F - integer - packed +# +# ############################################################ + +func mm_ror_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_ror_epi32", x86.} + ## Rotate 4xint32 right + +func mm_mask_add_epi32(src: m128i, mask: mmask8, a, b: m128i): m128i {.importc: "_mm_mask_add_epi32", x86.} + ## Add packed 32-bit integers in a and b, and store the results in dst using writemask mask + ## (elements are copied from src when the corresponding mask bit is not set). + ## for j in 0 ..< 4: + ## let i = j*32 + ## if k[j]: + ## dst[i+31:i] := a[i+31:i] + b[i+31:i] + ## else: + ## dst[i+31:i] := src[i+31:i] + +# ############################################################ +# +# SHA extensions +# +# ############################################################ + +func mm_sha256msg1_epu32(a, b: m128i): m128i {.importc: "_mm_sha256msg1_epu32", x86.} + ## Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers) + ## using previous message values from a and b, and store the result in dst. + ## + ## W4 := b[31:0] + ## W3 := a[127:96] + ## W2 := a[95:64] + ## W1 := a[63:32] + ## W0 := a[31:0] + ## dst[127:96] := W3 + sigma0(W4) + ## dst[95:64] := W2 + sigma0(W3) + ## dst[63:32] := W1 + sigma0(W2) + ## dst[31:0] := W0 + sigma0(W1) + +func mm_sha256msg2_epu32(a, b: m128i): m128i {.importc: "_mm_sha256msg2_epu32", x86.} + ## Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers) + ## using previous message values from a and b, and store the result in dst. + ## + ## W14 := b[95:64] + ## W15 := b[127:96] + ## W16 := a[31:0] + sigma1(W14) + ## W17 := a[63:32] + sigma1(W15) + ## W18 := a[95:64] + sigma1(W16) + ## W19 := a[127:96] + sigma1(W17) + ## dst[127:96] := W19 + ## dst[95:64] := W18 + ## dst[63:32] := W17 + ## dst[31:0] := W16 + +func mm_sha256rnds2_epu32(cdgh, abef, k: m128i): m128i {.importc: "_mm_sha256rnds2_epu32", x86.} + ## Perform 2 rounds of SHA256 operation using + ## an initial SHA256 state (C,D,G,H) from a, + ## an initial SHA256 state (A,B,E,F) from b, + ## and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers) + ## and the corresponding round constants from k, + ## and store the updated SHA256 state (A,B,E,F) in dst. 
+ ## + ## A[0] := b[127:96] + ## B[0] := b[95:64] + ## C[0] := a[127:96] + ## D[0] := a[95:64] + ## E[0] := b[63:32] + ## F[0] := b[31:0] + ## G[0] := a[63:32] + ## H[0] := a[31:0] + ## W_K[0] := k[31:0] + ## W_K[1] := k[63:32] + ## FOR i := 0 to 1 + ## A[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i]) + ## B[i+1] := A[i] + ## C[i+1] := B[i] + ## D[i+1] := C[i] + ## E[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i] + ## F[i+1] := E[i] + ## G[i+1] := F[i] + ## H[i+1] := G[i] + ## ENDFOR + ## dst[127:96] := A[2] + ## dst[95:64] := B[2] + ## dst[63:32] := E[2] + ## dst[31:0] := F[2] + +# Aliases +# ------------------------------------------------ + +template set_u64x2*(e1, e0: int64 or uint64): m128i = + mm_set_epi64x(e1, e0) +template setr_u32x4*(e3, e2, e1, e0: int32 or uint32): m128i = + mm_setr_epi32(e3, e2, e1, e0) +template loada_u128*(data: pointer): m128i = + mm_load_si128(cast[ptr m128i](data)) +template loadu_u128*(data: pointer): m128i = + mm_loadu_si128(cast[ptr m128i](data)) +template storea_u128*(mem_addr: pointer, a: m128i) = + mm_store_si128(cast[ptr m128i](mem_addr), a) + +template xor_u128*(a, b: m128i): m128i = + mm_xor_si128(a, b) + +template add_u32x4*(a, b: m128i): m128i = + mm_add_epi32(a, b) +template shl_u32x4*(a: m128i, imm8: int32 or uint32): m128i = + mm_slli_epi32(a, imm8) +template shr_u32x4*(a: m128i, imm8: int32 or uint32): m128i = + mm_srli_epi32(a, imm8) +template shr_u64x2*(a: m128i, imm8: int32 or uint32): m128i = + mm_srli_epi64(a, imm8) + +template alignr_u128*(a, b: m128i, shiftRightByBytes: int32 or uint32): m128i = + mm_alignr_epi8(a, b, shiftRightByBytes) +template shuf_u8x16*(a: m128i, mask: m128i): m128i = + mm_shuffle_epi8(a, mask) +template shuf_u32x4*(a: m128i, mask: int32 or uint32): m128i = + mm_shuffle_epi32(a, mask) +template blend_u16x8*(a, b: m128i, mask: int32 or uint32): m128i = + mm_blend_epi16(a, b, mask) + +template sha256_msg1*(a, b: m128i): m128i = + mm_sha256msg1_epu32(a, b) +template sha256_msg2*(a, b: m128i): m128i = + mm_sha256msg2_epu32(a, b) +template sha256_2rounds*(cdgh, abef, k: m128i): m128i = + mm_sha256rnds2_epu32(cdgh, abef, k) \ No newline at end of file diff --git a/constantine/platforms/primitives.nim b/constantine/platforms/primitives.nim index b6ee9b3..b3460e5 100644 --- a/constantine/platforms/primitives.nim +++ b/constantine/platforms/primitives.nim @@ -34,6 +34,10 @@ when X86 and GCC_Compatible: import isa/[cpuinfo_x86, macro_assembler_x86] export cpuinfo_x86, macro_assembler_x86 +# No exceptions allowed in core cryptographic operations +{.push raises: [].} +{.push checks: off.} + # ############################################################ # # Instrumentation @@ -71,4 +75,46 @@ func copy*[T: byte|char]( {.push checks: off.} # No OverflowError or IndexError allowed for i in 0 ..< len: - dst[dStart + i] = byte src[sStart + i] \ No newline at end of file + dst[dStart + i] = byte src[sStart + i] + +func rotateRight*[N: static int, T](a: var array[N, T]) {.inline.} = + # Rotate right (Somehow we can't use a generic template here) + # Inline + # Hopefully we want the compiler to see that N rounds of rotation + # can be optimized away with register renaming + let tmp = a[a.len-1] + staticForCountdown i, a.len-1, 1: + a[i] = a[i-1] + a[0] = tmp + +func rotateLeft*[N: static int, T](a: var array[N, T]) {.inline.} = + # Rotate left (Somehow we can't use a generic template here) + # Inline + # Hopefully we want the compiler to see that N rounds of rotation + 
# can be optimized away with register renaming + let tmp = a[0] + staticFor i, 0, a.len-1: + a[i] = a[i+1] + a[a.len-1] = tmp + +# ############################################################ +# +# Pointer arithmetics +# +# ############################################################ + +template asUnchecked*[T](a: openArray[T]): ptr UncheckedArray[T] = + cast[ptr UncheckedArray[T]](a[0].unsafeAddr) + +# Warning for pointer arithmetics via inline C +# be careful of not passing a `var ptr` +# to a function as `var` are passed by hidden pointers in Nim and the wrong +# pointer will be modified. Templates are fine. + +func `+%`*(p: ptr, offset: SomeInteger): type(p) {.inline, noInit.}= + ## Pointer increment + {.emit: [result, " = ", p, " + ", offset, ";"].} + +func `+%=`*(p: var ptr, offset: SomeInteger){.inline.}= + ## Pointer increment + p = p +% offset \ No newline at end of file diff --git a/helpers/static_for.nim b/helpers/static_for.nim index ab988a0..4dca6e6 100644 --- a/helpers/static_for.nim +++ b/helpers/static_for.nim @@ -35,6 +35,14 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped body.replaceNodes(idx, newLit i) ) +macro staticForCountdown*(idx: untyped{nkIdent}, start, stopIncl: static int, body: untyped): untyped = + result = newStmtList() + for i in countdown(start, stopIncl): + result.add nnkBlockStmt.newTree( + ident("unrolledIter_" & $idx & $i), + body.replaceNodes(idx, newLit i) + ) + {.experimental: "dynamicBindSym".} macro staticFor*(ident: untyped{nkIdent}, choices: typed, body: untyped): untyped = diff --git a/tests/t_hash_sha256_vs_openssl.nim b/tests/t_hash_sha256_vs_openssl.nim index ce18520..463b0ac 100644 --- a/tests/t_hash_sha256_vs_openssl.nim +++ b/tests/t_hash_sha256_vs_openssl.nim @@ -34,7 +34,7 @@ else: # But the new API isn't expose on Linux :/ # TODO: fix Windows -when not defined(Windows): +when not defined(windows): proc SHA256[T: byte|char]( msg: openarray[T], digest: ptr array[32, byte] = nil