SHA256 Hash function

2020-12-15 19:18:36 +01:00 · 2020-12-15 19:18:36 +01:00 · e89429e822
parent c89c78d2d9
commit e89429e822
11 changed files with 817 additions and 376 deletions
--- a/benchmarks/bench_blueprint.nim
+++ b/benchmarks/bench_blueprint.nim
@ -0,0 +1,108 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#             Benchmark blueprint
+#
+# ############################################################
+
+import
+  # Internal
+  ../constantine/config/common,
+  # Helpers
+  ../helpers/[prng_unsafe, static_for],
+  ./platforms,
+  # Standard library
+  std/[monotimes, times, strformat, strutils, macros]
+
+export strformat, platforms, times, monotimes, macros
+
+# PRNG state shared by the benchmarks; exported so that modules importing
+# this blueprint can draw their random inputs from it.
+# It is seeded on every run from the low 32 bits of the Unix time, and the
+# seed is echoed at startup.
+var rng*: RngState
+let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+rng.seed(seed)
+echo "bench xoshiro512** seed: ", seed
+
+# warmup
+proc warmup*() =
+  ## Keep the CPU busy with throwaway integer arithmetic so that it reaches
+  ## its maximum performance state before any benchmark is measured.
+  ## Prints the elapsed CPU time together with the computed value.
+  let start = cpuTime()
+  var foo = 123
+  for i in 0 ..< 300_000_000:
+    # Accumulate, then reduce, in a single step.
+    foo = (foo + i*i mod 456) mod 789
+
+  # cpuTime relies on side effects, and `foo` is printed below, so the
+  # compiler has no opportunity to discard the loop.
+  let stop = cpuTime()
+  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
+
+warmup()
+
+when defined(gcc):
+  echo "\nCompiled with GCC"
+elif defined(clang):
+  echo "\nCompiled with Clang"
+elif defined(vcc):
+  echo "\nCompiled with MSVC"
+elif defined(icc):
+  echo "\nCompiled with ICC"
+else:
+  echo "\nCompiled with an unknown compiler"
+
+echo "Optimization level => "
+echo "  no optimization: ", not defined(release)
+echo "  release: ", defined(release)
+echo "  danger: ", defined(danger)
+echo "  inline assembly: ", UseASM_X86_64
+
+when (sizeof(int) == 4) or defined(Constantine32):
+  echo "⚠️ Warning: using Constantine with 32-bit limbs"
+else:
+  echo "Using Constantine with 64-bit limbs"
+
+when SupportsCPUName:
+  echo "Running on ", cpuName(), ""
+
+when SupportsGetTicks:
+  echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
+  echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"
+
+echo "\n=================================================================================================================\n"
+
+proc separator*(length: int) =
+  ## Print a horizontal rule made of `length` dash characters.
+  let rule = repeat("-", length)
+  echo rule
+
+proc notes*() =
+  ## Print the caveats shared by all benchmark reports: compiler
+  ## limitations on multiprecision arithmetic, the nimble tasks to bench a
+  ## specific compiler with or without the assembler, and known
+  ## measurement pitfalls.
+  echo "Notes:"
+  echo "  - Compilers:"
+  echo "    Compilers are severely limited on multiprecision arithmetic."
+  echo "    Constantine compile-time assembler is used by default (nimble bench_fp)."
+  echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
+  echo "    GCC also seems to have issues with large temporaries and register spilling."
+  echo "    This is somewhat alleviated by Constantine compile-time assembler."
+  echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
+  # The *_noasm tasks disable the assembler, so this line must say "without".
+  echo "    Bench on specific compiler without assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
+  echo "  - The simplest operations might be optimized away by the compiler."
+  echo "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"
+
+# Run `body` `iters` times and record when the loop started and stopped.
+#
+# `startTime`, `stopTime`, `startClk` and `stopClk` are identifiers supplied
+# by the caller; the template declares them with `let`, and the callers in
+# this commit read them right after the expansion to build their report.
+#   - startTime / stopTime: results of getMonotime() around the loop.
+#   - startClk / stopClk: results of getTicks() when SupportsGetTicks is
+#     true, otherwise both are set to -1'i64.
+template measure*(iters: int,
+               startTime, stopTime: untyped,
+               startClk, stopClk: untyped,
+               body: untyped): untyped =
+  let startTime = getMonotime()
+  when SupportsGetTicks:
+    let startClk = getTicks()
+  for _ in 0 ..< iters:
+    body
+  when SupportsGetTicks:
+    let stopClk = getTicks()
+  let stopTime = getMonotime()
+
+  # Declared after the timed section so the placeholders stay out of it.
+  when not SupportsGetTicks:
+    let startClk = -1'i64
+    let stopClk = -1'i64
--- a/benchmarks/bench_elliptic_template.nim
+++ b/benchmarks/bench_elliptic_template.nim
@ -21,85 +21,12 @@ import
  # Helpers
  ../helpers/[prng_unsafe, static_for],
  ./platforms,
-  # Standard library
-  std/[monotimes, times, strformat, strutils, macros],
+  ./bench_blueprint,
  # Reference unsafe scalar multiplication
  ../tests/support/ec_reference_scalar_mult

-var rng: RngState
-let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
-rng.seed(seed)
-echo "bench xoshiro512** seed: ", seed
-
-# warmup
-proc warmup*() =
-  # Warmup - make sure cpu is on max perf
-  let start = cpuTime()
-  var foo = 123
-  for i in 0 ..< 300_000_000:
-    foo += i*i mod 456
-    foo = foo mod 789
-
-  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
-  let stop = cpuTime()
-  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
-
-warmup()
-
-when defined(gcc):
-  echo "\nCompiled with GCC"
-elif defined(clang):
-  echo "\nCompiled with Clang"
-elif defined(vcc):
-  echo "\nCompiled with MSVC"
-elif defined(icc):
-  echo "\nCompiled with ICC"
-else:
-  echo "\nCompiled with an unknown compiler"
-
-echo "Optimization level => "
-echo "  no optimization: ", not defined(release)
-echo "  release: ", defined(release)
-echo "  danger: ", defined(danger)
-echo "  inline assembly: ", UseASM_X86_64
-
-when (sizeof(int) == 4) or defined(Constantine32):
-  echo "⚠️ Warning: using Constantine with 32-bit limbs"
-else:
-  echo "Using Constantine with 64-bit limbs"
-
-when SupportsCPUName:
-  echo "Running on ", cpuName(), ""
-
-when SupportsGetTicks:
-  echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
-  echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"
-
-echo "\n=================================================================================================================\n"
-
-proc separator*() =
-  echo "-".repeat(177)
-
-proc report(op, elliptic: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
-  let ns = inNanoseconds((stop-start) div iters)
-  let throughput = 1e9 / float64(ns)
-  when SupportsGetTicks:
-    echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
-  else:
-    echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
-
-proc notes*() =
-  echo "Notes:"
-  echo "  - Compilers:"
-  echo "    Compilers are severely limited on multiprecision arithmetic."
-  echo "    Constantine compile-time assembler is used by default (nimble bench_fp)."
-  echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
-  echo "    GCC also seems to have issues with large temporaries and register spilling."
-  echo "    This is somewhat alleviated by Constantine compile-time assembler."
-  echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
-  echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
-  echo "  - The simplest operations might be optimized away by the compiler."
-  echo "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"
+export notes
+proc separator*() = separator(177)

 macro fixEllipticDisplay(T: typedesc): untyped =
  # At compile-time, enums are integers and their display is buggy
@ -111,21 +38,17 @@ macro fixEllipticDisplay(T: typedesc): untyped =
  name.add "[" & fieldName & "[" & curveName & "]]"
  result = newLit name

+proc report(op, elliptic: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
+  let ns = inNanoseconds((stop-start) div iters)
+  let throughput = 1e9 / float64(ns)
+  when SupportsGetTicks:
+    echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
+  else:
+    echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
+
 template bench(op: string, T: typedesc, iters: int, body: untyped): untyped =
-  let start = getMonotime()
-  when SupportsGetTicks:
-    let startClk = getTicks()
-  for _ in 0 ..< iters:
-    body
-  when SupportsGetTicks:
-    let stopClk = getTicks()
-  let stop = getMonotime()
-
-  when not SupportsGetTicks:
-    let startClk = -1'i64
-    let stopClk = -1'i64
-
-  report(op, fixEllipticDisplay(T), start, stop, startClk, stopClk, iters)
+  measure(iters, startTime, stopTime, startClk, stopClk, body)
+  report(op, fixEllipticDisplay(T), startTime, stopTime, startClk, stopClk, iters)

 proc addBench*(T: typedesc, iters: int) =
  const G1_or_G2 = when T.F is Fp: "G1" else: "G2"
--- a/benchmarks/bench_fields_template.nim
+++ b/benchmarks/bench_fields_template.nim
@ -19,63 +19,10 @@ import
  ../constantine/towers,
  # Helpers
  ../helpers/[prng_unsafe, static_for],
-  ./platforms,
-  # Standard library
-  std/[monotimes, times, strformat, strutils, macros]
+  ./bench_blueprint

-var rng: RngState
-let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
-rng.seed(seed)
-echo "bench xoshiro512** seed: ", seed
-
-# warmup
-proc warmup*() =
-  # Warmup - make sure cpu is on max perf
-  let start = cpuTime()
-  var foo = 123
-  for i in 0 ..< 300_000_000:
-    foo += i*i mod 456
-    foo = foo mod 789
-
-  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
-  let stop = cpuTime()
-  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
-
-warmup()
-
-when defined(gcc):
-  echo "\nCompiled with GCC"
-elif defined(clang):
-  echo "\nCompiled with Clang"
-elif defined(vcc):
-  echo "\nCompiled with MSVC"
-elif defined(icc):
-  echo "\nCompiled with ICC"
-else:
-  echo "\nCompiled with an unknown compiler"
-
-echo "Optimization level => "
-echo "  no optimization: ", not defined(release)
-echo "  release: ", defined(release)
-echo "  danger: ", defined(danger)
-echo "  inline assembly: ", UseASM_X86_64
-
-when (sizeof(int) == 4) or defined(Constantine32):
-  echo "⚠️ Warning: using Constantine with 32-bit limbs"
-else:
-  echo "Using Constantine with 64-bit limbs"
-
-when SupportsCPUName:
-  echo "Running on ", cpuName(), ""
-
-when SupportsGetTicks:
-  echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
-  echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"
-
-echo "\n=================================================================================================================\n"
-
-proc separator*() =
-  echo "-".repeat(145)
+export notes
+proc separator*() = separator(145)

 proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  let ns = inNanoseconds((stop-start) div iters)
@ -85,19 +32,6 @@ proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64,
  else:
    echo &"{op:<50} {field:<18} {throughput:>15.3f} ops/s     {ns:>9} ns/op"

-proc notes*() =
-  echo "Notes:"
-  echo "  - Compilers:"
-  echo "    Compilers are severely limited on multiprecision arithmetic."
-  echo "    Constantine compile-time assembler is used by default (nimble bench_fp)."
-  echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
-  echo "    GCC also seems to have issues with large temporaries and register spilling."
-  echo "    This is somewhat alleviated by Constantine compile-time assembler."
-  echo "    Bench on specific compiler with assembler: \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"."
-  echo "    Bench on specific compiler with assembler: \"nimble bench_fp_gcc_noasm\" or \"nimble bench_fp_clang_noasm\"."
-  echo "  - The simplest operations might be optimized away by the compiler."
-  echo "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"
-
 macro fixFieldDisplay(T: typedesc): untyped =
  # At compile-time, enums are integers and their display is buggy
  # we get the Curve ID instead of the curve name.
@ -107,20 +41,8 @@ macro fixFieldDisplay(T: typedesc): untyped =
  result = newLit name

 template bench(op: string, T: typedesc, iters: int, body: untyped): untyped =
-  let start = getMonotime()
-  when SupportsGetTicks:
-    let startClk = getTicks()
-  for _ in 0 ..< iters:
-    body
-  when SupportsGetTicks:
-    let stopClk = getTicks()
-  let stop = getMonotime()
-
-  when not SupportsGetTicks:
-    let startClk = -1'i64
-    let stopClk = -1'i64
-
-  report(op, fixFieldDisplay(T), start, stop, startClk, stopClk, iters)
+  measure(iters, startTime, stopTime, startClk, stopClk, body)
+  report(op, fixFieldDisplay(T), startTime, stopTime, startClk, stopClk, iters)

 proc addBench*(T: typedesc, iters: int) =
  var x = rng.random_unsafe(T)
--- a/benchmarks/bench_pairing_template.nim
+++ b/benchmarks/bench_pairing_template.nim
@ -28,101 +28,23 @@ import
    pairing_bn
  ],
  # Helpers
-  ../helpers/[prng_unsafe, static_for],
-  ./platforms,
-  # Standard library
-  std/[monotimes, times, strformat, strutils, macros]
+  ../helpers/prng_unsafe,
+  ./bench_blueprint

-var rng: RngState
-let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
-rng.seed(seed)
-echo "bench xoshiro512** seed: ", seed
+export notes
+proc separator*() = separator(177)

-# warmup
-proc warmup*() =
-  # Warmup - make sure cpu is on max perf
-  let start = cpuTime()
-  var foo = 123
-  for i in 0 ..< 300_000_000:
-    foo += i*i mod 456
-    foo = foo mod 789
-
-  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
-  let stop = cpuTime()
-  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
-
-warmup()
-
-when defined(gcc):
-  echo "\nCompiled with GCC"
-elif defined(clang):
-  echo "\nCompiled with Clang"
-elif defined(vcc):
-  echo "\nCompiled with MSVC"
-elif defined(icc):
-  echo "\nCompiled with ICC"
-else:
-  echo "\nCompiled with an unknown compiler"
-
-echo "Optimization level => "
-echo "  no optimization: ", not defined(release)
-echo "  release: ", defined(release)
-echo "  danger: ", defined(danger)
-echo "  inline assembly: ", UseASM_X86_64
-
-when (sizeof(int) == 4) or defined(Constantine32):
-  echo "⚠️ Warning: using Constantine with 32-bit limbs"
-else:
-  echo "Using Constantine with 64-bit limbs"
-
-when SupportsCPUName:
-  echo "Running on ", cpuName(), ""
-
-when SupportsGetTicks:
-  echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
-  echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"
-
-echo "\n=================================================================================================================\n"
-
-proc separator*() =
-  echo "-".repeat(177)
-
-proc report(op, curve: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
-  let ns = inNanoseconds((stop-start) div iters)
+proc report(op, curve: string, startTime, stopTime: MonoTime, startClk, stopClk: int64, iters: int) =
+  let ns = inNanoseconds((stopTime-startTime) div iters)
  let throughput = 1e9 / float64(ns)
  when SupportsGetTicks:
    echo &"{op:<60} {curve:<15} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
  else:
    echo &"{op:<60} {curve:<15} {throughput:>15.3f} ops/s     {ns:>9} ns/op"

-proc notes*() =
-  echo "Notes:"
-  echo "  - Compilers:"
-  echo "    Compilers are severely limited on multiprecision arithmetic."
-  echo "    Constantine compile-time assembler is used by default (nimble bench_fp)."
-  echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
-  echo "    GCC also seems to have issues with large temporaries and register spilling."
-  echo "    This is somewhat alleviated by Constantine compile-time assembler."
-  echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
-  echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
-  echo "  - The simplest operations might be optimized away by the compiler."
-  echo "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"
-
 template bench(op: string, C: static Curve, iters: int, body: untyped): untyped =
-  let start = getMonotime()
-  when SupportsGetTicks:
-    let startClk = getTicks()
-  for _ in 0 ..< iters:
-    body
-  when SupportsGetTicks:
-    let stopClk = getTicks()
-  let stop = getMonotime()
-
-  when not SupportsGetTicks:
-    let startClk = -1'i64
-    let stopClk = -1'i64
-
-  report(op, $C, start, stop, startClk, stopClk, iters)
+  measure(iters, startTime, stopTime, startClk, stopClk, body)
+  report(op, $C, startTime, stopTime, startClk, stopClk, iters)

 func random_point*(rng: var RngState, EC: typedesc): EC {.noInit.} =
  result = rng.random_unsafe(EC)
--- a/benchmarks/bench_sha256.nim
+++ b/benchmarks/bench_sha256.nim
@ -0,0 +1,58 @@
+import
+  # Internals
+  ../constantine/hashes/h_sha256,
+  # Helpers
+  ../helpers/prng_unsafe,
+  ./bench_blueprint
+
+proc separator*() =
+  ## Print the 69-dash horizontal rule used by the SHA256 benchmark.
+  separator(69)
+
+# Binding to the C symbol `SHA256`, resolved at runtime from "libssl.so".
+# `digest` defaults to nil; the call returns a pointer to a 32-byte array.
+# NOTE(review): presumably relies on Nim passing an `openarray` to an importc
+# proc as a (pointer, length) pair, which would match OpenSSL's
+# `SHA256(const unsigned char *d, size_t n, unsigned char *md)` - confirm.
+# NOTE(review): the library name is Linux-specific - confirm the benchmark
+# is only expected to run there.
+proc SHA256[T: byte|char](
+       msg: openarray[T],
+       digest: ptr array[32, byte] = nil
+     ): ptr array[32, byte] {.cdecl, dynlib: "libssl.so", importc.}
+
+proc SHA256_OpenSSL[T: byte|char](
+       digest: var array[32, byte],
+       s: openarray[T]) =
+  ## Hash `s` through the imported `SHA256` binding, writing the result
+  ## into `digest`. The argument order (output first) is the one the
+  ## Constantine call in this benchmark uses.
+  let digestPtr = digest.addr
+  # The binding also returns the digest pointer; it is not needed here.
+  discard SHA256(s, digestPtr)
+
+proc report(op: string, bytes: int, startTime, stopTime: MonoTime, startClk, stopClk: int64, iters: int) =
+  ## Print one result line for the benchmark labelled `op`.
+  ##
+  ## The elapsed time between `startTime` and `stopTime` is divided by
+  ## `iters` to get the nanoseconds per operation and the throughput.
+  ## When SupportsGetTicks is true, the tick delta per iteration and the
+  ## ticks per input byte (`bytes`) are printed as well.
+  let elapsed = stopTime - startTime
+  let ns = inNanoseconds(elapsed div iters)
+  let throughput = 1e9 / float64(ns)
+  when not SupportsGetTicks:
+    echo &"{op:<30}     {throughput:>15.3f} ops/s    {ns:>9} ns/op"
+  else:
+    let cycles = (stopClk - startClk) div iters
+    let cyclePerByte = float64(cycles) / float64(bytes)
+    echo &"{op:<30}     {throughput:>15.3f} ops/s    {ns:>9} ns/op    {cycles:>10} cycles    {cyclePerByte:>5.2f} cycles/byte"
+
+# Time `body` over `iters` iterations with `measure`, then print one result
+# line with `report`. `bytes` is forwarded to `report`, which uses it for
+# the cycles/byte column.
+template bench(op: string, bytes: int, iters: int, body: untyped): untyped =
+  # The four identifiers passed here are declared by `measure`.
+  measure(iters, startTime, stopTime, startClk, stopClk, body)
+  report(op, bytes, startTime, stopTime, startClk, stopClk, iters)
+
+proc benchSHA256_constantine[T](msg: openarray[T], msgComment: string, iters: int) =
+  ## Benchmark Constantine's SHA256 on `msg`, hashing it `iters` times.
+  ## `msgComment` is appended to the label of the printed result line.
+  let label = "SHA256 - Constantine - " & msgComment
+  var output: array[32, byte]
+  bench(label, msg.len, iters):
+    sha256.hash(output, msg)
+
+proc benchSHA256_openssl[T](msg: openarray[T], msgComment: string, iters: int) =
+  ## Benchmark the OpenSSL-backed SHA256 on `msg`, hashing it `iters` times.
+  ## `msgComment` is appended to the label of the printed result line.
+  let label = "SHA256 - OpenSSL - " & msgComment
+  var output: array[32, byte]
+  bench(label, msg.len, iters):
+    SHA256_OpenSSL(output, msg)
+
+when isMainModule:
+  proc main() =
+    ## Benchmark Constantine against OpenSSL on random messages of
+    ## 128 bytes, 5 MB and 100 MB.
+    template runCase(size: int, label: string, iters: int) =
+      # One random message per case, hashed by both implementations.
+      block:
+        let msg = rng.random_byte_seq(size)
+        benchSHA256_constantine(msg, label, iters)
+        benchSHA256_openssl(msg, label, iters)
+
+    runCase(128, "128B", 128)
+    runCase(5_000_000, "5MB", 16)
+    runCase(100_000_000, "100MB", 3)
+  main()
--- a/constantine.nimble
+++ b/constantine.nimble
@ -129,17 +129,40 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
  ("tests/t_pairing_bn254_snarks_optate.nim", false),
  ("tests/t_pairing_bls12_377_optate.nim", false),
  ("tests/t_pairing_bls12_381_optate.nim", false),
+
+  # Hashing vs OpenSSL
+  ("tests/t_hash_sha256_vs_openssl.nim", true),
 ]

 # For temporary (hopefully) investigation that can only be reproduced in CI
 const useDebug = [
-  "tests/t_bigints.nim"
+  "tests/t_bigints.nim",
+  "tests/t_hash_sha256_vs_openssl.nim",
 ]

+# Tests that use sequences require the Nim GC, stack scanning and nil pointers passed to openarray.
+# In particular, the tests that use the JSON test vectors are not sanitized:
+# the sanitized builds use --gc:none, so tests that need the GC are listed here to be skipped.
+const skipSanitizers = [
+  "tests/t_ec_sage_bn254_nogami.nim",
+  "tests/t_ec_sage_bn254_snarks.nim",
+  "tests/t_ec_sage_bls12_377.nim",
+  "tests/t_ec_sage_bls12_381.nim",
+]
+
+const sanitizers =
+  " --passC:-fsanitize=undefined --passL:-fsanitize=undefined" &
+  " --passC:-fno-sanitize-recover" & # Enforce crash on undefined behaviour
+  " --gc:none" # The conservative stack scanning of Nim's default GC triggers the alignment UB and stack-buffer-overflow checks.
+  # " --passC:-fsanitize=address --passL:-fsanitize=address" & # Requires too much stack for the inline assembly
+

 # Helper functions
 # ----------------------------------------------------------------

+proc clearParallelBuild() =
+  ## Empty the command file at `buildParallel` with a shell redirection,
+  ## so that the next batch of test commands starts from a blank file.
+  let truncateCmd = "> " & buildParallel
+  exec truncateCmd
+
 proc test(flags, path: string, commandFile = false) =
  # commandFile should be a "file" but Nimscript doesn't support IO
  if not dirExists "build":
@ -153,6 +176,7 @@ proc test(flags, path: string, commandFile = false) =
  if existsEnv"CC":
    cc = " --cc:" & getEnv"CC"

+  var flags = flags & " --passC:-fstack-protector-all"
  let command = "nim " & lang & cc & " " & flags &
    " --verbosity:0 --outdir:build/testsuite -r --hints:off --warnings:off " &
    " --nimcache:nimcache/" & path & " " &
@ -164,7 +188,6 @@ proc test(flags, path: string, commandFile = false) =
    echo "=============================================================================================="
    exec command
  else:
-    # commandFile.writeLine command
    exec "echo \'" & command & "\' >> " & buildParallel

 proc runBench(benchName: string, compiler = "", useAsm = true) =
@ -181,24 +204,29 @@ proc runBench(benchName: string, compiler = "", useAsm = true) =
       " --nimcache:nimcache/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
       " -r --hints:off --warnings:off benchmarks/" & benchName & ".nim"

+proc runTests(requireGMP: bool, dumpCmdFile = false, test32bit = false, testASM = true) =
+  ## Build the compiler flags for every entry of `testDesc` and hand it to `test`.
+  ##
+  ## - `requireGMP`: when false, entries marked `useGMP` are skipped.
+  ## - `dumpCmdFile`: forwarded to `test` as its `commandFile` argument.
+  ## - `test32bit`: adds `-d:Constantine32`.
+  ## - `testASM`: when false, adds `-d:ConstantineASM=false`.
+  ##
+  ## Entries listed in `useDebug` get `-d:debugConstantine`; entries not
+  ## listed in `skipSanitizers` get the `sanitizers` flags.
+  for td in testDesc:
+    # GMP-backed tests only run when the caller asked for them.
+    if td.useGMP and not requireGMP:
+      continue
+
+    var flags = ""
+    if not testASM:
+      flags.add " -d:ConstantineASM=false"
+    if test32bit:
+      flags.add " -d:Constantine32"
+    if td.path in useDebug:
+      flags.add " -d:debugConstantine"
+    if td.path notin skipSanitizers:
+      flags.add sanitizers
+    test flags, td.path, dumpCmdFile
+
 # Tasks
 # ----------------------------------------------------------------

 task test, "Run all tests":
  # -d:testingCurves is configured in a *.nim.cfg for convenience
-
-  for td in testDesc:
-    if td.path in useDebug:
-      test "-d:debugConstantine", td.path
-    else:
-      test "", td.path
+  runTests(requireGMP = true)

  # if sizeof(int) == 8: # 32-bit tests on 64-bit arch
-  #   for td in testDesc:
-  #     if td.path in useDebug:
-  #       test "-d:Constantine32 -d:debugConstantine", td.path
-  #     else:
-  #       test "-d:Constantine32", td.path
+  #   runTests(requireGMP = true, test32bit = true)

  # Ensure benchmarks stay relevant. Ignore Windows 32-bit at the moment
  if not defined(windows) or not (existsEnv"UCPU" or getEnv"UCPU" == "i686"):
@ -213,23 +241,14 @@ task test, "Run all tests":
    runBench("bench_pairing_bls12_381")
    runBench("bench_pairing_bn254_nogami")
    runBench("bench_pairing_bn254_snarks")
+    runBench("bench_sha256")

 task test_no_gmp, "Run tests that don't require GMP":
  # -d:testingCurves is configured in a *.nim.cfg for convenience
-  for td in testDesc:
-    if not td.useGMP:
-      if td.path in useDebug:
-        test "-d:debugConstantine", td.path
-      else:
-        test "", td.path
+  runTests(requireGMP = false)

-  if sizeof(int) == 8: # 32-bit tests on 64-bit arch
-    for td in testDesc:
-      if not td.useGMP:
-        if td.path in useDebug:
-          test "-d:Constantine32 -d:debugConstantine", td.path
-        else:
-          test "-d:Constantine32", td.path
+  # if sizeof(int) == 8: # 32-bit tests on 64-bit arch
+  #   runTests(requireGMP = false, test32bit = true)

  # Ensure benchmarks stay relevant. Ignore Windows 32-bit at the moment
  if not defined(windows) or not (existsEnv"UCPU" or getEnv"UCPU" == "i686"):
@ -243,31 +262,17 @@ task test_no_gmp, "Run tests that don't require GMP":
    runBench("bench_pairing_bls12_381")
    runBench("bench_pairing_bn254_nogami")
    runBench("bench_pairing_bn254_snarks")
+    runBench("bench_sha256")

 task test_parallel, "Run all tests in parallel (via GNU parallel)":
  # -d:testingCurves is configured in a *.nim.cfg for convenience
-  let cmdFile = true # open(buildParallel, mode = fmWrite) # Nimscript doesn't support IO :/
-  exec "> " & buildParallel
-
-  for td in testDesc:
-    if td.path in useDebug:
-      test "-d:debugConstantine", td.path, cmdFile
-    else:
-      test "", td.path, cmdFile
-
-  # cmdFile.close()
-  # Execute everything in parallel with GNU parallel
+  clearParallelBuild()
+  runTests(requireGMP = true, dumpCmdFile = true)
  exec "parallel --keep-order --group < " & buildParallel

-  exec "> " & buildParallel
  if sizeof(int) == 8: # 32-bit tests on 64-bit arch
-    for td in testDesc:
-      if td.path in useDebug:
-        test "-d:Constantine32 -d:debugConstantine", td.path, cmdFile
-      else:
-        test "-d:Constantine32", td.path, cmdFile
-    # cmdFile.close()
-    # Execute everything in parallel with GNU parallel
+    clearParallelBuild()
+    runTests(requireGMP = true, dumpCmdFile = true, test32bit = true)
    exec "parallel --keep-order --group < " & buildParallel

  # Now run the benchmarks
@ -286,31 +291,18 @@ task test_parallel, "Run all tests in parallel (via GNU parallel)":
    runBench("bench_pairing_bls12_381")
    runBench("bench_pairing_bn254_nogami")
    runBench("bench_pairing_bn254_snarks")
+    runBench("bench_sha256")

 task test_parallel_no_assembler, "Run all tests (without macro assembler) in parallel (via GNU parallel)":
  # -d:testingCurves is configured in a *.nim.cfg for convenience
-  let cmdFile = true # open(buildParallel, mode = fmWrite) # Nimscript doesn't support IO :/
-  exec "> " & buildParallel
-
-  for td in testDesc:
-    if td.path in useDebug:
-      test "-d:debugConstantine -d:ConstantineASM=false", td.path, cmdFile
-    else:
-      test " -d:ConstantineASM=false", td.path, cmdFile
-
-  # cmdFile.close()
-  # Execute everything in parallel with GNU parallel
+  clearParallelBuild()
+  runTests(requireGMP = true, dumpCmdFile = true, testASM = false)
  exec "parallel --keep-order --group < " & buildParallel

  exec "> " & buildParallel
  if sizeof(int) == 8: # 32-bit tests on 64-bit arch
-    for td in testDesc:
-      if td.path in useDebug:
-        test "-d:Constantine32 -d:debugConstantine -d:ConstantineASM=false", td.path, cmdFile
-      else:
-        test "-d:Constantine32 -d:ConstantineASM=false", td.path, cmdFile
-    # cmdFile.close()
-    # Execute everything in parallel with GNU parallel
+    clearParallelBuild()
+    runTests(requireGMP = true, dumpCmdFile = true, test32bit = true, testASM = false)
    exec "parallel --keep-order --group < " & buildParallel

  # Now run the benchmarks
@ -329,33 +321,17 @@ task test_parallel_no_assembler, "Run all tests (without macro assembler) in par
    runBench("bench_pairing_bls12_381")
    runBench("bench_pairing_bn254_nogami")
    runBench("bench_pairing_bn254_snarks")
+    runBench("bench_sha256")

 task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)":
  # -d:testingCurves is configured in a *.nim.cfg for convenience
-  let cmdFile = true # open(buildParallel, mode = fmWrite) # Nimscript doesn't support IO :/
-  exec "> " & buildParallel
-
-  for td in testDesc:
-    if not td.useGMP:
-      if td.path in useDebug:
-        test "-d:debugConstantine", td.path, cmdFile
-      else:
-        test "", td.path, cmdFile
-
-  # cmdFile.close()
-  # Execute everything in parallel with GNU parallel
+  clearParallelBuild()
+  runTests(requireGMP = false, dumpCmdFile = true)
  exec "parallel --keep-order --group < " & buildParallel

-  exec "> " & buildParallel
  if sizeof(int) == 8: # 32-bit tests on 64-bit arch
-    for td in testDesc:
-      if not td.useGMP:
-        if td.path in useDebug:
-          test "-d:Constantine32 -d:debugConstantine", td.path, cmdFile
-        else:
-          test "-d:Constantine32", td.path, cmdFile
-    # cmdFile.close()
-    # Execute everything in parallel with GNU parallel
+    clearParallelBuild()
+    runTests(requireGMP = false, dumpCmdFile = true, test32bit = true)
    exec "parallel --keep-order --group < " & buildParallel

  # Now run the benchmarks
@ -374,33 +350,18 @@ task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)":
    runBench("bench_pairing_bls12_381")
    runBench("bench_pairing_bn254_nogami")
    runBench("bench_pairing_bn254_snarks")
+    runBench("bench_sha256")

 task test_parallel_no_gmp_no_assembler, "Run all tests in parallel (via GNU parallel)":
  # -d:testingCurves is configured in a *.nim.cfg for convenience
-  let cmdFile = true # open(buildParallel, mode = fmWrite) # Nimscript doesn't support IO :/
-  exec "> " & buildParallel
-
-  for td in testDesc:
-    if not td.useGMP:
-      if td.path in useDebug:
-        test "-d:debugConstantine -d:ConstantineASM=false", td.path, cmdFile
-      else:
-        test "-d:ConstantineASM=false", td.path, cmdFile
-
-  # cmdFile.close()
-  # Execute everything in parallel with GNU parallel
+  clearParallelBuild()
+  runTests(requireGMP = false, dumpCmdFile = true, testASM = false)
  exec "parallel --keep-order --group < " & buildParallel

  exec "> " & buildParallel
  if sizeof(int) == 8: # 32-bit tests on 64-bit arch
-    for td in testDesc:
-      if not td.useGMP:
-        if td.path in useDebug:
-          test "-d:Constantine32 -d:debugConstantine", td.path, cmdFile
-        else:
-          test "-d:Constantine32", td.path, cmdFile
-    # cmdFile.close()
-    # Execute everything in parallel with GNU parallel
+    clearParallelBuild()
+    runTests(requireGMP = false, dumpCmdFile = true, test32bit = true, testASM = false)
    exec "parallel --keep-order --group < " & buildParallel

  # Now run the benchmarks
@ -419,6 +380,7 @@ task test_parallel_no_gmp_no_assembler, "Run all tests in parallel (via GNU para
    runBench("bench_pairing_bls12_381")
    runBench("bench_pairing_bn254_nogami")
    runBench("bench_pairing_bn254_snarks")
+    runBench("bench_sha256")

 task bench_fp, "Run benchmark 𝔽p with your default compiler":
  runBench("bench_fp")
@ -599,3 +561,6 @@ task bench_pairing_bn254_snarks_gcc_noasm, "Run pairings benchmarks for BN254-Sn

 task bench_pairing_bn254_snarks_clang_noasm, "Run pairings benchmarks for BN254-Snarks - Clang no Assembly":
  runBench("bench_pairing_bn254_snarks", "clang", useAsm = false)
+
+task bench_sha256, "Run SHA256 benchmarks":
+  runBench("bench_sha256")
--- a/constantine/hashes/h_sha256.nim
+++ b/constantine/hashes/h_sha256.nim
@ -0,0 +1,348 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/common,
+  ../io/endians
+
+# SHA256, a hash function from the SHA2 family
+# --------------------------------------------------------------------------------
+#
+# References:
+# - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
+# - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634
+# - Intel optimization https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/sha-256-implementations-paper.pdf
+# - Parallelizing message schedules
+#   to accelerate the computations of hash functions
+#   Shay Gueron, Vlad Krasnov, 2012
+#   https://eprint.iacr.org/2012/067.pdf
+#
+# Vectors:
+# - https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf
+
+# Types and constants
+# ----------------------------------------------------------------
+
+const
+  BlockSize = 64  # bytes consumed per compression step
+  DigestSize = 32 # bytes in the final message digest
+  HashSize = DigestSize div sizeof(uint32) # 8 words of 32 bits
+
+type
+  Sha256Context* = object
+    ## State of an incremental SHA256 computation.
+    ## Align to 64 for cache line and SIMD friendliness
+    # Current hash value: set to the initial constants by `init`,
+    # then updated block after block by the compression function.
+    H{.align: 64}: array[HashSize, uint32]
+    # Pending bytes that do not yet form a complete block.
+    # Entries at index `bufIdx` and beyond are kept at zero
+    # (verified by the debug assertions in `update` and `finish`).
+    buf{.align: 64}: array[BlockSize, byte]
+    # Total number of message bytes passed to `update` so far
+    msgLen: uint64
+    # Number of pending bytes currently stored in `buf`
+    bufIdx: uint8
+
+  # Alias used as the `HashKind` selector of the one-shot `hash` functions
+  sha256* = Sha256Context
+
+# Internal
+# ----------------------------------------------------------------
+# TODO: vectorized implementations
+
+# No exceptions allowed in core cryptographic operations
+{.push raises: [].}
+{.push checks: off.}
+
+template rotr(x, n: uint32): uint32 =
+  ## Rotate the 32 bits of `x` to the right by `n` positions
+  # Only ever invoked with constant amounts strictly between 0 and 32,
+  # so neither shift reaches the word width: no undefined behaviour.
+  ((x shl (32 - n)) or (x shr n))
+
+template ch(x, y, z: uint32): uint32 =
+  ## "Choose" function of SHA256:
+  ## for every bit position, take the bit of `y` where `x` has a 1
+  ## and the bit of `z` where `x` has a 0.
+  when true:  # RFC4634 formulation, 3 bitwise operations
+    (z xor (x and (y xor z)))
+  else:       # Spec FIPS 180-4 formulation, 4 bitwise operations
+    ((x and y) xor (not(x) and z))
+
+template maj(x, y, z: uint32): uint32 =
+  ## "Majority" function of SHA256:
+  ## each output bit is the value held by at least two of the three inputs.
+  when true:  # RFC4634 formulation, 4 bitwise operations
+    ((y and z) or (x and (y or z)))
+  else:       # Spec FIPS 180-4 formulation, 5 bitwise operations
+    ((x and y) xor (x and z) xor (y and z))
+
+template S0(x: uint32): uint32 =
+  ## Σ₀: xor of `x` rotated right by 2, 13 and 22 bits
+  (rotr(x, 22) xor rotr(x, 13) xor rotr(x, 2))
+
+template S1(x: uint32): uint32 =
+  ## Σ₁: xor of `x` rotated right by 6, 11 and 25 bits
+  (rotr(x, 25) xor rotr(x, 11) xor rotr(x, 6))
+
+template s0(x: uint32): uint32 =
+  ## σ₀: xor of `x` rotated right by 7 and 18 bits, and `x` shifted right by 3 bits
+  ((x shr 3) xor rotr(x, 18) xor rotr(x, 7))
+
+template s1(x: uint32): uint32 =
+  ## σ₁: xor of `x` rotated right by 17 and 19 bits, and `x` shifted right by 10 bits
+  ((x shr 10) xor rotr(x, 19) xor rotr(x, 17))
+
+func setZero[N](a: var array[N, SomeNumber]){.inline.} =
+  ## Overwrite every element of `a` with zero
+  for item in a.mitems:
+    item = 0
+
+func hashMessageBlocks[T: byte|char](
+       H: var array[HashSize, uint32],
+       message: openarray[T]): uint =
+  ## Hash a message block by block, updating the hash value `H` in place.
+  ## Sha256 block size is 64 bytes hence
+  ## a message will be processed 64 by 64 bytes.
+  ## FIPS.180-4 6.2.2. SHA-256 Hash Computation
+  ##
+  ## Returns the number of bytes consumed, a multiple of the block size.
+  ## Trailing bytes that do not fill a complete block are not read:
+  ## the caller is responsible for buffering them.
+
+  result = 0
+  let numBlocks = message.len.uint div BlockSize
+  if numBlocks == 0:
+    return 0
+
+  # The 64 round constants Kₜ, one per round of the compression function
+  const K256 = [
+    0x428a2f98'u32, 0x71374491'u32, 0xb5c0fbcf'u32, 0xe9b5dba5'u32, 0x3956c25b'u32, 0x59f111f1'u32, 0x923f82a4'u32, 0xab1c5ed5'u32,
+    0xd807aa98'u32, 0x12835b01'u32, 0x243185be'u32, 0x550c7dc3'u32, 0x72be5d74'u32, 0x80deb1fe'u32, 0x9bdc06a7'u32, 0xc19bf174'u32,
+    0xe49b69c1'u32, 0xefbe4786'u32, 0x0fc19dc6'u32, 0x240ca1cc'u32, 0x2de92c6f'u32, 0x4a7484aa'u32, 0x5cb0a9dc'u32, 0x76f988da'u32,
+    0x983e5152'u32, 0xa831c66d'u32, 0xb00327c8'u32, 0xbf597fc7'u32, 0xc6e00bf3'u32, 0xd5a79147'u32, 0x06ca6351'u32, 0x14292967'u32,
+    0x27b70a85'u32, 0x2e1b2138'u32, 0x4d2c6dfc'u32, 0x53380d13'u32, 0x650a7354'u32, 0x766a0abb'u32, 0x81c2c92e'u32, 0x92722c85'u32,
+    0xa2bfe8a1'u32, 0xa81a664b'u32, 0xc24b8b70'u32, 0xc76c51a3'u32, 0xd192e819'u32, 0xd6990624'u32, 0xf40e3585'u32, 0x106aa070'u32,
+    0x19a4c116'u32, 0x1e376c08'u32, 0x2748774c'u32, 0x34b0bcb5'u32, 0x391c0cb3'u32, 0x4ed8aa4a'u32, 0x5b9cca4f'u32, 0x682e6ff3'u32,
+    0x748f82ee'u32, 0x78a5636f'u32, 0x84c87814'u32, 0x8cc70208'u32, 0x90befffa'u32, 0xa4506ceb'u32, 0xbef9a3f7'u32, 0xc67178f2'u32
+  ]
+
+  # Working variables, loaded once from the incoming hash value.
+  # After each block they are made equal to the updated hash value
+  # (see the end of the loop), so no reload is needed between blocks.
+  var
+    a = H[0]
+    b = H[1]
+    c = H[2]
+    d = H[3]
+    e = H[4]
+    f = H[5]
+    g = H[6]
+    h = H[7]
+
+  for _ in 0 ..< numBlocks:
+    # The first 16 rounds have different handling
+    # from rounds 16..<64: their schedule word is read
+    # directly from the message instead of being derived.
+    # Each round only needs schedule words from the previous 16 rounds,
+    # so a 16-word ring buffer indexed modulo 16 (64 bytes) is used
+    # instead of a full array[64, uint32] (256 bytes).
+
+    # Workspace with message schedule Wₜ
+    var W{.noInit.}: array[16, uint32]
+    var t = 0'u32
+    while t < 16: # Wₜ = Mⁱₜ
+      # Reads 4 bytes as a big-endian word and advances `result` (the read cursor)
+      W[t].parseFromBlob(message, result, bigEndian)
+      let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t]
+      let T2 = S0(a) + maj(a, b, c)
+      h = g
+      g = f
+      f = e
+      e = d + T1
+      d = c
+      c = b
+      b = a
+      a = T1+T2
+
+      t += 1
+
+    while t < 64:
+      # Slot `t mod 16` still holds Wₜ₋₁₆, so the in-place addition yields
+      # Wₜ = Wₜ₋₁₆ + σ₁(Wₜ₋₂) + Wₜ₋₇ + σ₀(Wₜ₋₁₅)
+      W[t mod 16] += s1(W[(t-2) mod 16]) +
+                     W[(t-7) mod 16] +
+                     s0(W[(t-15) mod 16])
+      let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t mod 16]
+      let T2 = S0(a) + maj(a, b, c)
+      h = g
+      g = f
+      f = e
+      e = d + T1
+      d = c
+      c = b
+      b = a
+      a = T1+T2
+
+      t += 1
+
+    # Add the block result into the hash value; each working variable
+    # ends up equal to its hash word, ready for the next block.
+    a += H[0]; H[0] = a
+    b += H[1]; H[1] = b
+    c += H[2]; H[2] = c
+    d += H[3]; H[3] = d
+    e += H[4]; H[4] = e
+    f += H[5]; H[5] = f
+    g += H[6]; H[6] = g
+    h += H[7]; H[7] = h
+
+func dumpHash(
+       digest: var array[DigestSize, byte],
+       H: array[HashSize, uint32]) =
+  ## Serialize the internal hash words into the message digest:
+  ## words are written in index order, each one in big-endian byte order.
+  const wordBytes = uint sizeof(uint32)
+  for i, word in H:
+    digest.dumpRawInt(word, uint(i) * wordBytes, bigEndian)
+
+func copy[N: static int, T: byte|char](
+       dst: var array[N, byte],
+       dStart: SomeInteger,
+       src: openArray[T],
+       sStart: SomeInteger,
+       len: SomeInteger
+     ) =
+  ## Element-wise copy: dst[dStart ..< dStart+len] = src[sStart ..< sStart+len]
+  ## Each source item is converted to `byte`, so a char source
+  ## can be copied into a byte array.
+  ## Unlike the standard library, this cannot throw
+  ## even a defect: ranges are only verified by the debug assertions below.
+  debug:
+    doAssert 0 <= dStart and dStart+len <= dst.len.uint
+    doAssert 0 <= sStart and sStart+len <= src.len.uint
+
+  var offset = typeof(len)(0)
+  while offset < len:
+    dst[dStart + offset] = byte src[sStart + offset]
+    offset += 1
+
+func hashBuffer(ctx: var Sha256Context) =
+  ## Hash the internal buffer as one block into the hash value,
+  ## then zero the buffer and rewind its write index.
+  discard hashMessageBlocks(ctx.H, ctx.buf)
+  ctx.bufIdx = 0
+  setZero(ctx.buf)
+
+# Public API
+# ----------------------------------------------------------------
+
+func init*(ctx: var Sha256Context) =
+  ## Initialize or reinitialize a Sha256 context:
+  ## the hash value is set to the SHA256 initial constants,
+  ## the pending buffer is emptied and the message length reset.
+
+  const InitialHash = [
+    0x6a09e667'u32, 0xbb67ae85'u32, 0x3c6ef372'u32, 0xa54ff53a'u32,
+    0x510e527f'u32, 0x9b05688c'u32, 0x1f83d9ab'u32, 0x5be0cd19'u32
+  ]
+
+  ctx.H = InitialHash
+  ctx.bufIdx = 0
+  ctx.buf.setZero()
+  ctx.msgLen = 0
+
+func update*[T: char|byte](ctx: var Sha256Context, message: openarray[T]) =
+  ## Feed `message` into a SHA256 context.
+  ## May be called repeatedly to hash a message piece by piece.
+  ##
+  ## Security note: bytes that do not fill a complete 64-byte block
+  ## stay in the context internal buffer until more data arrives.
+  ## If sensitive content is used, ensure that
+  ## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible.
+  ## Additionally ensure that the message(s) passed were stored
+  ## in memory considered secure for your threat model.
+  ##
+  ## For passwords and secret keys, you MUST NOT use raw SHA-256
+  ## use a Key Derivation Function instead (KDF)
+
+  debug:
+    doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len
+    for i in ctx.bufIdx ..< ctx.buf.len:
+      doAssert ctx.buf[i] == 0
+
+  let total = message.len.uint
+  var pos = 0'u # index of the first message byte not yet handled
+
+  ctx.msgLen += total
+
+  if ctx.bufIdx != 0:
+    # A previous call left a partial block behind: top it up first
+    let used = ctx.bufIdx.uint
+    let room = ctx.buf.sizeof().uint - used
+
+    if total < room:
+      # Still not a full block, just append and wait for more data
+      ctx.buf.copy(dStart = used, message, sStart = 0, len = total)
+      ctx.bufIdx += total.uint8
+      return
+
+    # Complete the buffered block and hash it
+    ctx.buf.copy(dStart = used, message, sStart = 0, len = room)
+    ctx.hashBuffer()
+    pos = room
+
+  # Hash every complete 64-byte block directly from the message
+  let consumed = ctx.H.hashMessageBlocks(
+    message.toOpenArray(int pos, message.len-1))
+  pos += consumed
+
+  let tail = total - pos
+  if tail != 0:
+    # Keep the leftover bytes for the next `update` or `finish`
+    debug: # TODO: state machine formal verification - https://nim-lang.org/docs/drnim.html
+      doAssert ctx.bufIdx == 0
+      doAssert pos + tail == message.len.uint
+
+    ctx.buf.copy(dStart = 0'u, message, sStart = pos, len = tail)
+    ctx.bufIdx = uint8 tail
+
+func finish*(ctx: var Sha256Context, digest: var array[32, byte]) =
+  ## Apply the SHA256 padding, hash the last block(s)
+  ## and write the resulting message digest into `digest`.
+  ##
+  ## Security note: this does not clear the internal buffer.
+  ## if sensitive content is used, use "ctx.clear()"
+  ## and also make sure that the message(s) passed were stored
+  ## in memory considered secure for your threat model.
+  ##
+  ## For passwords and secret keys, you MUST NOT use raw SHA-256
+  ## use a Key Derivation Function instead (KDF)
+
+  debug:
+    doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len
+    for i in ctx.bufIdx ..< ctx.buf.len:
+      doAssert ctx.buf[i] == 0
+
+  # The last 8 bytes of the final block hold the message length in bits:
+  # padding zeros are added until msgLen + 1 + K ≡ 56 mod 64 (in bytes)
+  const lengthOffset = 56
+
+  # Padding starts with a single '1' bit (followed by 7 zero bits)
+  ctx.buf[ctx.bufIdx] = 0x80
+
+  if ctx.bufIdx >= lengthOffset:
+    # No room left for the length in this block:
+    # hash it and continue in a fresh zeroed block
+    ctx.hashBuffer()
+
+  ctx.buf.dumpRawInt(ctx.msgLen.uint64 * 8, lengthOffset, bigEndian)
+  discard hashMessageBlocks(ctx.H, ctx.buf)
+  dumpHash(digest, ctx.H)
+
+func clear*(ctx: var Sha256Context) =
+  ## Zero the context internal message buffer.
+  ## The hash value, message length and buffer index are left untouched.
+  ## Security note:
+  ## For passwords and secret keys, you MUST NOT use raw SHA-256
+  ## use a Key Derivation Function instead (KDF)
+  # TODO: ensure compiler cannot optimize the code away
+  setZero(ctx.buf)
+
+func hash*[T: char|byte](
+       HashKind: type sha256,
+       digest: var array[32, byte],
+       message: openarray[T],
+       clearMem = false) =
+  ## One-shot SHA256: write the digest of `message` into `digest`.
+  ## When `clearMem` is true the temporary context buffer
+  ## is zeroed before returning.
+  var state {.noInit.}: HashKind
+  init(state)
+  update(state, message)
+  finish(state, digest)
+
+  if clearMem:
+    clear(state)
+
+func hash*[T: char|byte](
+       HashKind: type sha256,
+       message: openarray[T],
+       clearMem = false): array[32, byte] =
+  ## One-shot SHA256: return the digest of `message`.
+  ## When `clearMem` is true the temporary context buffer
+  ## is zeroed before returning.
+  # The parameter is spelled `clearMem` like in the sibling overload and
+  # like its use below; Nim treats `clearmem` and `clearMem` as the same
+  # identifier, so named-argument callers are unaffected.
+  HashKind.hash(result, message, clearMem)
--- a/constantine/io/endians.nim
+++ b/constantine/io/endians.nim
@ -0,0 +1,78 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import ../config/common
+
+# perf critical we don't want bound checks here
+# So no checks, and we avoid signed ints to ensure no exceptions.
+# TODO: Nim formal verification: https://nim-lang.org/docs/drnim.html
+{.push checks:off, raises: [].}
+
+template toByte*(x: SomeUnsignedInt): byte =
+  ## Convert an unsigned integer to a byte.
+  ## At compile-time, conversion to bytes checks the range,
+  ## so the value is first masked down to its low 8 bits.
+  ## At runtime a plain conversion is used: we want to ensure
+  ## this is done at the register level in a single "mov byte" instruction
+  when nimvm:
+    byte(x and 0xFF)
+  else:
+    byte(x)
+
+func parseFromBlob*[T: byte|char](
+           dst: var SomeUnsignedInt,
+           src: openArray[T],
+           cursor: var uint, endian: static Endianness) {.inline.} =
+  ## Deserialize an unsigned integer from a raw binary blob.
+  ## `sizeof(dst)` bytes are read from `src` starting at index `cursor`,
+  ## combined according to the `endian` byte order and stored in `dst`.
+  ## `cursor` is then advanced by the number of bytes read,
+  ## so successive calls walk the blob from low to high indices.
+  debug:
+    doAssert 0 <= cursor and cursor < src.len.uint
+    doAssert cursor + sizeof(dst).uint <= src.len.uint,
+      "cursor (" & $cursor & ") + sizeof(dst) (" & $sizeof(dst) &
+      ") <= src.len (" & $src.len & ")"
+
+  type Word = typeof(dst)
+  const numBytes = sizeof(dst)
+
+  var value: Word = 0
+  when endian == bigEndian:
+    # First byte read is the most significant one
+    for i in 0'u ..< numBytes:
+      value = value or (Word(src[cursor+i]) shl ((numBytes - 1 - i) * 8))
+  else:
+    # First byte read is the least significant one
+    for i in 0'u ..< numBytes:
+      value = value or (Word(src[cursor+i]) shl (i * 8))
+  dst = value
+  cursor.inc(numBytes)
+
+func dumpRawInt*[T: byte|char](
+           dst: var openArray[T],
+           src: SomeUnsignedInt,
+           cursor: uint, endian: static Endianness) {.inline.} =
+  ## Dump an integer into raw binary form.
+  ## `sizeof(src)` bytes are written to `dst` starting at index `cursor`,
+  ## laid out according to the `endian` byte order.
+  ##
+  ## `cursor` is passed by value: it is NOT advanced by this function,
+  ## the caller keeps track of the write position.
+  debug:
+    doAssert 0 <= cursor and cursor < dst.len.uint
+    doAssert cursor + sizeof(src).uint <= dst.len.uint,
+      "cursor (" & $cursor & ") + sizeof(src) (" & $sizeof(src) &
+      ") <= dst.len (" & $dst.len & ")"
+
+  const L = uint sizeof(src)
+
+  # The explicit conversion to `T` is a no-op for byte destinations
+  # and is what allows the char instantiation to compile.
+  when endian == littleEndian:
+    # Least significant byte goes to the lowest index
+    for i in 0'u ..< L:
+      dst[cursor+i] = T(toByte(src shr (i * 8)))
+  else:
+    # Most significant byte goes to the lowest index
+    for i in 0'u ..< L:
+      dst[cursor+i] = T(toByte(src shr ((L-i-1) * 8)))
--- a/constantine/io/io_bigints.nim
+++ b/constantine/io/io_bigints.nim
@ -12,7 +12,8 @@

 import
  ../primitives/constant_time,
-  ../config/[common, type_bigint]
+  ../config/[common, type_bigint],
+  ./endians

 # ############################################################
 #
@ -152,24 +153,17 @@ func fromUint*(
 #
 # ############################################################

-template toByte(x: SomeUnsignedInt): byte =
-  ## At compile-time, conversion to bytes checks the range
-  ## we want to ensure this is done at the register level
-  ## at runtime in a single "mov byte" instruction
-  when nimvm:
-    byte(x and 0xFF)
-  else:
-    byte(x)
-
 template blobFrom(dst: var openArray[byte], src: SomeUnsignedInt, startIdx: int, endian: static Endianness) =
  ## Write an integer into a raw binary blob
  ## Swapping endianness if needed
+  ## startidx is the first written array item if littleEndian is requested
+  ## or the last if bigEndian is requested
  when endian == cpuEndian:
    for i in 0 ..< sizeof(src):
-      dst[startIdx+i] = toByte((src shr (i * 8)))
+      dst[startIdx+i] = toByte(src shr (i * 8))
  else:
    for i in 0 ..< sizeof(src):
-      dst[startIdx+sizeof(src)-1-i] = toByte((src shr (i * 8)))
+      dst[startIdx+sizeof(src)-1-i] = toByte(src shr (i * 8))

 func exportRawUintLE(
        dst: var openarray[byte],
--- a/helpers/prng_unsafe.nim
+++ b/helpers/prng_unsafe.nim
@ -364,6 +364,14 @@ func random_long01Seq_with_randZ*(rng: var RngState, T: typedesc[ECP_ShortW_Proj
  ## Skewed towards long bitstrings of 0 or 1
  rng.random_long01Seq_with_randZ(result)

+# Byte sequences
+# ------------------------------------------------------------
+
+func random_byte_seq*(rng: var RngState, length: int): seq[byte] =
+  ## Create a sequence of `length` bytes,
+  ## each obtained from one `rng.next()` draw converted to `byte`.
+  result = newSeq[byte](length)
+  for i in 0 ..< length:
+    result[i] = byte rng.next()
+
 # Sanity checks
 # ------------------------------------------------------------

--- a/tests/t_hash_sha256_vs_openssl.nim
+++ b/tests/t_hash_sha256_vs_openssl.nim
@ -0,0 +1,115 @@
+import
+  # Internals
+  ../constantine/hashes/h_sha256,
+  # Helpers
+  ../helpers/prng_unsafe,
+  # Third-party
+  stew/byteutils
+
+# Binding to the one-shot `SHA256` C function, loaded dynamically from "libssl.so".
+# `digest` is the optional output buffer (nil by default).
+# NOTE(review): the C function takes a data pointer and a length as separate
+# arguments; this declaration presumably relies on the C backend passing an
+# `openarray` parameter as such a (pointer, length) pair - confirm before
+# changing backend or compiler version.
+proc SHA256[T: byte|char](
+       msg: openarray[T],
+       digest: ptr array[32, byte] = nil
+     ): ptr array[32, byte] {.cdecl, dynlib: "libssl.so", importc.}
+
+proc SHA256_OpenSSL[T: byte|char](
+       digest: var array[32, byte],
+       s: openarray[T]) =
+  ## Reference implementation wrapper with the same argument order
+  ## as `sha256.hash`: the digest of `s` is written into `digest`.
+  ## The pointer returned by the C function is ignored.
+  discard SHA256(s, addr digest)
+
+echo "\n------------------------------------------------------\n"
+
+const
+  SmallSizeIters = 128 # iterations of each randomized test below 1024 bytes
+  LargeSizeIters = 10  # iterations of the randomized multi-megabyte test
+
+proc sanityABC =
+  ## Known-answer test: the digest of "abc" must match the hard-coded value
+  let expected = hexToByteArray[32](
+    "BA7816BF8F01CFEA414140DE5DAE2223" &
+    "B00361A396177A9CB410FF61F20015AD")
+  let input = "abc"
+
+  var digest: array[32, byte]
+  sha256.hash(digest, input)
+
+  doAssert digest == expected
+
+proc sanityABC2 =
+  ## Known-answer test on a 56-byte input, a length for which
+  ## the padding does not fit in the same block as the message
+  let expected = hexToByteArray[32](
+    "248D6A61D20638B8E5C026930C3E6039" &
+    "A33CE45964FF2167F6ECEDD419DB06C1")
+  let input = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
+
+  var digest: array[32, byte]
+  sha256.hash(digest, input)
+
+  doAssert digest == expected
+
+proc innerTest(rng: var RngState, sizeRange: Slice[int]) =
+  ## Differential test: hash a random message whose length is drawn
+  ## from `sizeRange` with both implementations and compare the digests.
+  let msgLen = rng.random_unsafe(sizeRange)
+  let input = rng.random_byte_seq(msgLen)
+
+  var ours: array[32, byte]
+  var reference: array[32, byte]
+
+  sha256.hash(ours, input)
+  SHA256_OpenSSL(reference, input)
+
+  doAssert ours == reference
+
+proc chunkTest(rng: var RngState, sizeRange: Slice[int]) =
+  ## Check that incremental hashing matches one-pass hashing:
+  ## a random message whose length is drawn from `sizeRange`
+  ## is hashed at once, then again through `update` calls
+  ## on randomly sized consecutive chunks.
+  let size = rng.random_unsafe(sizeRange)
+  let msg = rng.random_byte_seq(size)
+
+  var bufOnePass: array[32, byte]
+  sha256.hash(bufOnePass, msg)
+
+  var bufChunked: array[32, byte]
+  let maxChunk = max(2, sizeRange.b div 10) # Consume up to 10% at once
+
+  var ctx: Sha256Context
+  ctx.init()
+  var cur = 0
+  while cur < size:
+    # A draw of 0 is possible and exercises `update` on an empty chunk;
+    # the chunk is clamped so it never runs past the end of the message.
+    let chunkSize = min(rng.random_unsafe(0 ..< maxChunk), size - cur)
+    ctx.update(msg.toOpenArray(cur, cur + chunkSize - 1))
+    cur += chunkSize
+
+  ctx.finish(bufChunked)
+
+  doAssert bufOnePass == bufChunked
+
+proc main() =
+  ## Run the known-answer tests, then the randomized tests
+  ## (one-pass vs OpenSSL, and chunked vs one-pass) with a fixed seed.
+  echo "SHA256 - sanity checks"
+  sanityABC()
+  sanityABC2()
+
+  echo "SHA256 - Starting differential testing vs OpenSSL"
+
+  var rng: RngState
+  rng.seed(0xFACADE)
+
+  echo "SHA256 - 0 <= size < 64 - exhaustive"
+  for i in 0 ..< 64:
+    rng.innerTest(i .. i)
+
+  echo "SHA256 - 0 <= size < 64 - exhaustive chunked"
+  for i in 0 ..< 64:
+    rng.chunkTest(i .. i)
+
+  # The labels below state the size range actually drawn from (0 ..< 1024)
+  echo "SHA256 - 0 <= size < 1024B"
+  for _ in 0 ..< SmallSizeIters:
+    rng.innerTest(0 ..< 1024)
+
+  echo "SHA256 - 0 <= size < 1024B - chunked"
+  for _ in 0 ..< SmallSizeIters:
+    rng.chunkTest(0 ..< 1024)
+
+  echo "SHA256 - 1MB <= size < 50MB"
+  for _ in 0 ..< LargeSizeIters:
+    rng.innerTest(1_000_000 ..< 50_000_000)
+
+  echo "SHA256 - Differential testing vs OpenSSL - SUCCESS"
+
+main()