SHA256 Hash function

2020-12-15 19:18:36 +01:00 · 2020-12-15 19:18:36 +01:00 · e89429e822
parent c89c78d2d9
commit e89429e822
11 changed files with 817 additions and 376 deletions
--- a/benchmarks/bench_blueprint.nim
+++ b/benchmarks/bench_blueprint.nim
@ -0,0 +1,108 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 # ############################################################
 #
 #             Benchmark blueprint
 #
 # ############################################################
 import
  # Internal
  ../constantine/config/common,
  # Helpers
  ../helpers/[prng_unsafe, static_for],
  ./platforms,
  # Standard library
  std/[monotimes, times, strformat, strutils, macros]
 export strformat, platforms, times, monotimes, macros
 # Module-level PRNG state. It is exported, so modules importing this
 # blueprint can draw their random inputs from it.
 var rng*: RngState
 # Seed from the wall clock, keeping only the low 32 bits of the Unix timestamp.
 let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
 rng.seed(seed)
 # Print the seed used for this run.
 echo "bench xoshiro512** seed: ", seed
 # warmup
 proc warmup*() =
  # Spin on throwaway integer arithmetic before any measurement is taken,
  # to make sure the CPU is running at max performance.
  let start = cpuTime()
  var foo = 123
  for i in countup(0, 300_000_000 - 1):
    # Accumulate, then reduce, in a single expression.
    foo = (foo + i*i mod 456) mod 789
  # The accumulator is echoed below, so the loop has an observable result
  # and the compiler should not discard it.
  let stop = cpuTime()
  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
 # Run the warmup once, when this module is initialised.
 warmup()
 # Report the C compiler backend, resolved at compile time.
 when defined(gcc):
  echo "\nCompiled with GCC"
 elif defined(clang):
  echo "\nCompiled with Clang"
 elif defined(vcc):
  echo "\nCompiled with MSVC"
 elif defined(icc):
  echo "\nCompiled with ICC"
 else:
  echo "\nCompiled with an unknown compiler"
 # Report the build switches that affect performance.
 echo "Optimization level => "
 echo "  no optimization: ", not defined(release)
 echo "  release: ", defined(release)
 echo "  danger: ", defined(danger)
 echo "  inline assembly: ", UseASM_X86_64
 # Limb width: 32-bit when `int` is 4 bytes or when Constantine32 is defined.
 when (sizeof(int) == 4) or defined(Constantine32):
  echo "⚠️ Warning: using Constantine with 32-bit limbs"
 else:
  echo "Using Constantine with 64-bit limbs"
 # The CPU name and the cycle-count caveat are only printed when the
 # corresponding Supports* compile-time constant is true.
 when SupportsCPUName:
  echo "Running on ", cpuName(), ""
 when SupportsGetTicks:
  echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
  echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"
 echo "\n=================================================================================================================\n"
 proc separator*(length: int) =
  # Print a horizontal rule made of `length` dashes.
  let rule = repeat("-", length)
  echo rule
 proc notes*() =
  # Print the caveats shared by the benchmark reports.
  # The "_noasm" nimble tasks run the benchmarks with the assembler
  # disabled, so their line is labelled "without assembler".
  echo "Notes:"
  echo "  - Compilers:"
  echo "    Compilers are severely limited on multiprecision arithmetic."
  echo "    Constantine compile-time assembler is used by default (nimble bench_fp)."
  echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
  echo "    GCC also seems to have issues with large temporaries and register spilling."
  echo "    This is somewhat alleviated by Constantine compile-time assembler."
  echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
  echo "    Bench on specific compiler without assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
  echo "  - The simplest operations might be optimized away by the compiler."
  echo "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"
 template measure*(iters: int,
               startTime, stopTime: untyped,
               startClk, stopClk: untyped,
               body: untyped): untyped =
  # Run `body` `iters` times and record the monotonic time and, when
  # SupportsGetTicks is true, the tick counter before and after the loop.
  #
  # `startTime`, `stopTime`, `startClk` and `stopClk` are identifiers chosen
  # by the caller: the template declares them with `let` in the expansion,
  # so the code following the call can read them.
  let startTime = getMonotime()
  when SupportsGetTicks:
    let startClk = getTicks()
  for _ in 0 ..< iters:
    body
  when SupportsGetTicks:
    let stopClk = getTicks()
  let stopTime = getMonotime()
  # Without a tick counter both names are still declared (as -1), so callers
  # can pass them on unconditionally.
  when not SupportsGetTicks:
    let startClk = -1'i64
    let stopClk = -1'i64
--- a/benchmarks/bench_elliptic_template.nim
+++ b/benchmarks/bench_elliptic_template.nim
@ -21,85 +21,12 @@ import
  # Helpers
  ../helpers/[prng_unsafe, static_for],
  ./platforms,
-  # Standard library
+  ./bench_blueprint,
  std/[monotimes, times, strformat, strutils, macros],
  # Reference unsafe scalar multiplication
  ../tests/support/ec_reference_scalar_mult
-var rng: RngState
+export notes
-let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+proc separator*() = separator(177)
 rng.seed(seed)
 echo "bench xoshiro512** seed: ", seed
 # warmup
 proc warmup*() =
  # Warmup - make sure cpu is on max perf
  let start = cpuTime()
  var foo = 123
  for i in 0 ..< 300_000_000:
    foo += i*i mod 456
    foo = foo mod 789
  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
  let stop = cpuTime()
  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
 warmup()
 when defined(gcc):
  echo "\nCompiled with GCC"
 elif defined(clang):
  echo "\nCompiled with Clang"
 elif defined(vcc):
  echo "\nCompiled with MSVC"
 elif defined(icc):
  echo "\nCompiled with ICC"
 else:
  echo "\nCompiled with an unknown compiler"
 echo "Optimization level => "
 echo "  no optimization: ", not defined(release)
 echo "  release: ", defined(release)
 echo "  danger: ", defined(danger)
 echo "  inline assembly: ", UseASM_X86_64
 when (sizeof(int) == 4) or defined(Constantine32):
  echo "⚠️ Warning: using Constantine with 32-bit limbs"
 else:
  echo "Using Constantine with 64-bit limbs"
 when SupportsCPUName:
  echo "Running on ", cpuName(), ""
 when SupportsGetTicks:
  echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
  echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"
 echo "\n=================================================================================================================\n"
 proc separator*() =
  echo "-".repeat(177)
 proc report(op, elliptic: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  let ns = inNanoseconds((stop-start) div iters)
  let throughput = 1e9 / float64(ns)
  when SupportsGetTicks:
    echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
  else:
    echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
 proc notes*() =
  echo "Notes:"
  echo "  - Compilers:"
  echo "    Compilers are severely limited on multiprecision arithmetic."
  echo "    Constantine compile-time assembler is used by default (nimble bench_fp)."
  echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
  echo "    GCC also seems to have issues with large temporaries and register spilling."
  echo "    This is somewhat alleviated by Constantine compile-time assembler."
  echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
  echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
  echo "  - The simplest operations might be optimized away by the compiler."
  echo "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"
 macro fixEllipticDisplay(T: typedesc): untyped =
  # At compile-time, enums are integers and their display is buggy
@ -111,21 +38,17 @@ macro fixEllipticDisplay(T: typedesc): untyped =
  name.add "[" & fieldName & "[" & curveName & "]]"
  result = newLit name
 proc report(op, elliptic: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  let ns = inNanoseconds((stop-start) div iters)
  let throughput = 1e9 / float64(ns)
  when SupportsGetTicks:
    echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
  else:
    echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
 template bench(op: string, T: typedesc, iters: int, body: untyped): untyped =
-  let start = getMonotime()
+  measure(iters, startTime, stopTime, startClk, stopClk, body)
-  when SupportsGetTicks:
+  report(op, fixEllipticDisplay(T), startTime, stopTime, startClk, stopClk, iters)
    let startClk = getTicks()
  for _ in 0 ..< iters:
    body
  when SupportsGetTicks:
    let stopClk = getTicks()
  let stop = getMonotime()
  when not SupportsGetTicks:
    let startClk = -1'i64
    let stopClk = -1'i64
  report(op, fixEllipticDisplay(T), start, stop, startClk, stopClk, iters)
 proc addBench*(T: typedesc, iters: int) =
  const G1_or_G2 = when T.F is Fp: "G1" else: "G2"
--- a/benchmarks/bench_fields_template.nim
+++ b/benchmarks/bench_fields_template.nim
@ -19,63 +19,10 @@ import
  ../constantine/towers,
  # Helpers
  ../helpers/[prng_unsafe, static_for],
-  ./platforms,
+  ./bench_blueprint
  # Standard library
  std/[monotimes, times, strformat, strutils, macros]
-var rng: RngState
+export notes
-let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+proc separator*() = separator(145)
 rng.seed(seed)
 echo "bench xoshiro512** seed: ", seed
 # warmup
 proc warmup*() =
  # Warmup - make sure cpu is on max perf
  let start = cpuTime()
  var foo = 123
  for i in 0 ..< 300_000_000:
    foo += i*i mod 456
    foo = foo mod 789
  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
  let stop = cpuTime()
  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
 warmup()
 when defined(gcc):
  echo "\nCompiled with GCC"
 elif defined(clang):
  echo "\nCompiled with Clang"
 elif defined(vcc):
  echo "\nCompiled with MSVC"
 elif defined(icc):
  echo "\nCompiled with ICC"
 else:
  echo "\nCompiled with an unknown compiler"
 echo "Optimization level => "
 echo "  no optimization: ", not defined(release)
 echo "  release: ", defined(release)
 echo "  danger: ", defined(danger)
 echo "  inline assembly: ", UseASM_X86_64
 when (sizeof(int) == 4) or defined(Constantine32):
  echo "⚠️ Warning: using Constantine with 32-bit limbs"
 else:
  echo "Using Constantine with 64-bit limbs"
 when SupportsCPUName:
  echo "Running on ", cpuName(), ""
 when SupportsGetTicks:
  echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
  echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"
 echo "\n=================================================================================================================\n"
 proc separator*() =
  echo "-".repeat(145)
 proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  let ns = inNanoseconds((stop-start) div iters)
@ -85,19 +32,6 @@ proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64,
  else:
    echo &"{op:<50} {field:<18} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
 proc notes*() =
  echo "Notes:"
  echo "  - Compilers:"
  echo "    Compilers are severely limited on multiprecision arithmetic."
  echo "    Constantine compile-time assembler is used by default (nimble bench_fp)."
  echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
  echo "    GCC also seems to have issues with large temporaries and register spilling."
  echo "    This is somewhat alleviated by Constantine compile-time assembler."
  echo "    Bench on specific compiler with assembler: \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"."
  echo "    Bench on specific compiler with assembler: \"nimble bench_fp_gcc_noasm\" or \"nimble bench_fp_clang_noasm\"."
  echo "  - The simplest operations might be optimized away by the compiler."
  echo "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"
 macro fixFieldDisplay(T: typedesc): untyped =
  # At compile-time, enums are integers and their display is buggy
  # we get the Curve ID instead of the curve name.
@ -107,20 +41,8 @@ macro fixFieldDisplay(T: typedesc): untyped =
  result = newLit name
 template bench(op: string, T: typedesc, iters: int, body: untyped): untyped =
-  let start = getMonotime()
+  measure(iters, startTime, stopTime, startClk, stopClk, body)
-  when SupportsGetTicks:
+  report(op, fixFieldDisplay(T), startTime, stopTime, startClk, stopClk, iters)
    let startClk = getTicks()
  for _ in 0 ..< iters:
    body
  when SupportsGetTicks:
    let stopClk = getTicks()
  let stop = getMonotime()
  when not SupportsGetTicks:
    let startClk = -1'i64
    let stopClk = -1'i64
  report(op, fixFieldDisplay(T), start, stop, startClk, stopClk, iters)
 proc addBench*(T: typedesc, iters: int) =
  var x = rng.random_unsafe(T)
--- a/benchmarks/bench_pairing_template.nim
+++ b/benchmarks/bench_pairing_template.nim
@ -28,101 +28,23 @@ import
    pairing_bn
  ],
  # Helpers
-  ../helpers/[prng_unsafe, static_for],
+  ../helpers/prng_unsafe,
-  ./platforms,
+  ./bench_blueprint
  # Standard library
  std/[monotimes, times, strformat, strutils, macros]
-var rng: RngState
+export notes
-let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+proc separator*() = separator(177)
 rng.seed(seed)
 echo "bench xoshiro512** seed: ", seed
-# warmup
+proc report(op, curve: string, startTime, stopTime: MonoTime, startClk, stopClk: int64, iters: int) =
-proc warmup*() =
+  let ns = inNanoseconds((stopTime-startTime) div iters)
  # Warmup - make sure cpu is on max perf
  let start = cpuTime()
  var foo = 123
  for i in 0 ..< 300_000_000:
    foo += i*i mod 456
    foo = foo mod 789
  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
  let stop = cpuTime()
  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
 warmup()
 when defined(gcc):
  echo "\nCompiled with GCC"
 elif defined(clang):
  echo "\nCompiled with Clang"
 elif defined(vcc):
  echo "\nCompiled with MSVC"
 elif defined(icc):
  echo "\nCompiled with ICC"
 else:
  echo "\nCompiled with an unknown compiler"
 echo "Optimization level => "
 echo "  no optimization: ", not defined(release)
 echo "  release: ", defined(release)
 echo "  danger: ", defined(danger)
 echo "  inline assembly: ", UseASM_X86_64
 when (sizeof(int) == 4) or defined(Constantine32):
  echo "⚠️ Warning: using Constantine with 32-bit limbs"
 else:
  echo "Using Constantine with 64-bit limbs"
 when SupportsCPUName:
  echo "Running on ", cpuName(), ""
 when SupportsGetTicks:
  echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
  echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"
 echo "\n=================================================================================================================\n"
 proc separator*() =
  echo "-".repeat(177)
 proc report(op, curve: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  let ns = inNanoseconds((stop-start) div iters)
  let throughput = 1e9 / float64(ns)
  when SupportsGetTicks:
    echo &"{op:<60} {curve:<15} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
  else:
    echo &"{op:<60} {curve:<15} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
 proc notes*() =
  echo "Notes:"
  echo "  - Compilers:"
  echo "    Compilers are severely limited on multiprecision arithmetic."
  echo "    Constantine compile-time assembler is used by default (nimble bench_fp)."
  echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
  echo "    GCC also seems to have issues with large temporaries and register spilling."
  echo "    This is somewhat alleviated by Constantine compile-time assembler."
  echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
  echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
  echo "  - The simplest operations might be optimized away by the compiler."
  echo "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"
 template bench(op: string, C: static Curve, iters: int, body: untyped): untyped =
-  let start = getMonotime()
+  measure(iters, startTime, stopTime, startClk, stopClk, body)
-  when SupportsGetTicks:
+  report(op, $C, startTime, stopTime, startClk, stopClk, iters)
    let startClk = getTicks()
  for _ in 0 ..< iters:
    body
  when SupportsGetTicks:
    let stopClk = getTicks()
  let stop = getMonotime()
  when not SupportsGetTicks:
    let startClk = -1'i64
    let stopClk = -1'i64
  report(op, $C, start, stop, startClk, stopClk, iters)
 func random_point*(rng: var RngState, EC: typedesc): EC {.noInit.} =
  result = rng.random_unsafe(EC)
--- a/benchmarks/bench_sha256.nim
+++ b/benchmarks/bench_sha256.nim
@ -0,0 +1,58 @@
 import
  # Internals
  ../constantine/hashes/h_sha256,
  # Helpers
  ../helpers/prng_unsafe,
  ./bench_blueprint
 proc separator*() =
  # Print a 69-dash rule through the length-taking overload.
  separator(69)
 # Binding to the C function `SHA256`, loaded at runtime from "libssl.so".
 # `digest` defaults to nil; the call returns a pointer to a 32-byte array.
 # NOTE(review): this relies on Nim lowering `openarray[T]` to a
 # (pointer, length) argument pair to match the C prototype — confirm.
 proc SHA256[T: byte|char](
       msg: openarray[T],
       digest: ptr array[32, byte] = nil
     ): ptr array[32, byte] {.cdecl, dynlib: "libssl.so", importc.}
 proc SHA256_OpenSSL[T: byte|char](
       digest: var array[32, byte],
       s: openarray[T]) =
  # Hash `s` through the OpenSSL binding, writing the result into `digest`.
  # The pointer returned by the C call is not needed and is dropped.
  discard SHA256(msg = s, digest = addr(digest))
 proc report(op: string, bytes: int, startTime, stopTime: MonoTime, startClk, stopClk: int64, iters: int) =
  # Print one result line for `op`: operations per second and nanoseconds
  # per operation, plus cycles and cycles per byte when SupportsGetTicks
  # is true. All figures are averages over `iters` iterations.
  let elapsed = stopTime - startTime
  let ns = (elapsed div iters).inNanoseconds
  let throughput = 1e9 / ns.float64
  when SupportsGetTicks:
    let cycles = (stopClk - startClk) div iters
    let cyclePerByte = float64(cycles) / float64(bytes)
    echo &"{op:<30}     {throughput:>15.3f} ops/s    {ns:>9} ns/op    {cycles:>10} cycles    {cyclePerByte:>5.2f} cycles/byte"
  else:
    echo &"{op:<30}     {throughput:>15.3f} ops/s    {ns:>9} ns/op"
 template bench(op: string, bytes: int, iters: int, body: untyped): untyped =
  # Time `iters` runs of `body` with `measure`, then print one result line
  # with `report`. The four timing identifiers are declared by `measure`
  # in the expansion and consumed by `report` right after.
  measure(iters, startTime, stopTime, startClk, stopClk, body)
  report(op, bytes, startTime, stopTime, startClk, stopClk, iters)
 proc benchSHA256_constantine[T](msg: openarray[T], msgComment: string, iters: int) =
  # Benchmark Constantine's SHA256 over `msg` for `iters` iterations.
  # `msgComment` is appended to the label printed in the report line.
  let label = "SHA256 - Constantine - " & msgComment
  let numBytes = msg.len
  var digest: array[32, byte]
  bench(label, numBytes, iters):
    sha256.hash(digest, msg)
 proc benchSHA256_openssl[T](msg: openarray[T], msgComment: string, iters: int) =
  # Benchmark the OpenSSL SHA256 binding over `msg` for `iters` iterations.
  # `msgComment` is appended to the label printed in the report line.
  let label = "SHA256 - OpenSSL - " & msgComment
  let numBytes = msg.len
  var digest: array[32, byte]
  bench(label, numBytes, iters):
    SHA256_OpenSSL(digest, msg)
 when isMainModule:
  proc main() =
    # One entry per benchmarked message: size in bytes, label, iterations.
    # For each entry a random message is drawn from the shared `rng`, then
    # hashed by Constantine first and OpenSSL second.
    const cases = [
      (size: 128, label: "128B", iters: 128),
      (size: 5_000_000, label: "5MB", iters: 16),
      (size: 100_000_000, label: "100MB", iters: 3)
    ]
    for c in cases:
      let msg = rng.random_byte_seq(c.size)
      benchSHA256_constantine(msg, c.label, c.iters)
      benchSHA256_openssl(msg, c.label, c.iters)
  main()
--- a/constantine.nimble
+++ b/constantine.nimble
@ -129,17 +129,40 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
  ("tests/t_pairing_bn254_snarks_optate.nim", false),
  ("tests/t_pairing_bls12_377_optate.nim", false),
  ("tests/t_pairing_bls12_381_optate.nim", false),
  # Hashing vs OpenSSL
  ("tests/t_hash_sha256_vs_openssl.nim", true),
 ]
 # For temporary (hopefully) investigation that can only be reproduced in CI
 const useDebug = [
-  "tests/t_bigints.nim"
+  "tests/t_bigints.nim",
  "tests/t_hash_sha256_vs_openssl.nim",
 ]
 # Tests that use sequences require the Nim GC, stack scanning and nil pointers passed to openarray.
 # In particular, the tests that use the JSON test vectors are not sanitized.
 # We do use gc:none to help.
 const skipSanitizers = [
  "tests/t_ec_sage_bn254_nogami.nim",
  "tests/t_ec_sage_bn254_snarks.nim",
  "tests/t_ec_sage_bls12_377.nim",
  "tests/t_ec_sage_bls12_381.nim",
 ]
 const sanitizers =
  " --passC:-fsanitize=undefined --passL:-fsanitize=undefined" &
  " --passC:-fno-sanitize-recover" & # Enforce crash on undefined behaviour
  " --gc:none" # The conservative stack scanning of Nim default GC triggers, alignment UB and stack-buffer-overflow check.
  # " --passC:-fsanitize=address --passL:-fsanitize=address" & # Requires too much stack for the inline assembly
 # Helper functions
 # ----------------------------------------------------------------
 proc clearParallelBuild() =
  # Empty the file named by `buildParallel` through a shell redirection.
  let truncateCmd = "> " & buildParallel
  exec truncateCmd
 proc test(flags, path: string, commandFile = false) =
  # commandFile should be a "file" but Nimscript doesn't support IO
  if not dirExists "build":
@ -153,6 +176,7 @@ proc test(flags, path: string, commandFile = false) =
  if existsEnv"CC":
    cc = " --cc:" & getEnv"CC"
  var flags = flags & " --passC:-fstack-protector-all"
  let command = "nim " & lang & cc & " " & flags &
    " --verbosity:0 --outdir:build/testsuite -r --hints:off --warnings:off " &
    " --nimcache:nimcache/" & path & " " &
@ -164,7 +188,6 @@ proc test(flags, path: string, commandFile = false) =
    echo "=============================================================================================="
    exec command
  else:
    # commandFile.writeLine command
    exec "echo \'" & command & "\' >> " & buildParallel
 proc runBench(benchName: string, compiler = "", useAsm = true) =
@ -181,24 +204,29 @@ proc runBench(benchName: string, compiler = "", useAsm = true) =
       " --nimcache:nimcache/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
       " -r --hints:off --warnings:off benchmarks/" & benchName & ".nim"
 proc runTests(requireGMP: bool, dumpCmdFile = false, test32bit = false, testASM = true) =
  # Build the compiler flags for every entry of `testDesc` and hand each one
  # to `test`. Entries that need GMP are skipped unless `requireGMP` is set.
  # `dumpCmdFile` is forwarded to `test` as its third argument.
  for td in testDesc:
    if td.useGMP and not requireGMP:
      continue
    var flags = ""
    if not testASM:
      flags.add " -d:ConstantineASM=false"
    if test32bit:
      flags.add " -d:Constantine32"
    if useDebug.contains(td.path):
      flags.add " -d:debugConstantine"
    if not skipSanitizers.contains(td.path):
      flags.add sanitizers
    test(flags, td.path, dumpCmdFile)
 # Tasks
 # ----------------------------------------------------------------
 task test, "Run all tests":
  # -d:testingCurves is configured in a *.nim.cfg for convenience
-
+  runTests(requireGMP = true)
  for td in testDesc:
    if td.path in useDebug:
      test "-d:debugConstantine", td.path
    else:
      test "", td.path
  # if sizeof(int) == 8: # 32-bit tests on 64-bit arch
-  #   for td in testDesc:
+  #   runTests(requireGMP = true, test32bit = true)
  #     if td.path in useDebug:
  #       test "-d:Constantine32 -d:debugConstantine", td.path
  #     else:
  #       test "-d:Constantine32", td.path
  # Ensure benchmarks stay relevant. Ignore Windows 32-bit at the moment
  if not defined(windows) or not (existsEnv"UCPU" or getEnv"UCPU" == "i686"):
@ -213,23 +241,14 @@ task test, "Run all tests":
    runBench("bench_pairing_bls12_381")
    runBench("bench_pairing_bn254_nogami")
    runBench("bench_pairing_bn254_snarks")
    runBench("bench_sha256")
 task test_no_gmp, "Run tests that don't require GMP":
  # -d:testingCurves is configured in a *.nim.cfg for convenience
-  for td in testDesc:
+  runTests(requireGMP = false)
    if not td.useGMP:
      if td.path in useDebug:
        test "-d:debugConstantine", td.path
      else:
        test "", td.path
-  if sizeof(int) == 8: # 32-bit tests on 64-bit arch
+  # if sizeof(int) == 8: # 32-bit tests on 64-bit arch
-    for td in testDesc:
+  #   runTests(requireGMP = false, test32bit = true)
      if not td.useGMP:
        if td.path in useDebug:
          test "-d:Constantine32 -d:debugConstantine", td.path
        else:
          test "-d:Constantine32", td.path
  # Ensure benchmarks stay relevant. Ignore Windows 32-bit at the moment
  if not defined(windows) or not (existsEnv"UCPU" or getEnv"UCPU" == "i686"):
@ -243,31 +262,17 @@ task test_no_gmp, "Run tests that don't require GMP":
    runBench("bench_pairing_bls12_381")
    runBench("bench_pairing_bn254_nogami")
    runBench("bench_pairing_bn254_snarks")
    runBench("bench_sha256")
 task test_parallel, "Run all tests in parallel (via GNU parallel)":
  # -d:testingCurves is configured in a *.nim.cfg for convenience
-  let cmdFile = true # open(buildParallel, mode = fmWrite) # Nimscript doesn't support IO :/
+  clearParallelBuild()
-  exec "> " & buildParallel
+  runTests(requireGMP = true, dumpCmdFile = true)
  for td in testDesc:
    if td.path in useDebug:
      test "-d:debugConstantine", td.path, cmdFile
    else:
      test "", td.path, cmdFile
  # cmdFile.close()
  # Execute everything in parallel with GNU parallel
  exec "parallel --keep-order --group < " & buildParallel
  exec "> " & buildParallel
  if sizeof(int) == 8: # 32-bit tests on 64-bit arch
-    for td in testDesc:
+    clearParallelBuild()
-      if td.path in useDebug:
+    runTests(requireGMP = true, dumpCmdFile = true, test32bit = true)
        test "-d:Constantine32 -d:debugConstantine", td.path, cmdFile
      else:
        test "-d:Constantine32", td.path, cmdFile
    # cmdFile.close()
    # Execute everything in parallel with GNU parallel
    exec "parallel --keep-order --group < " & buildParallel
  # Now run the benchmarks
@ -286,31 +291,18 @@ task test_parallel, "Run all tests in parallel (via GNU parallel)":
    runBench("bench_pairing_bls12_381")
    runBench("bench_pairing_bn254_nogami")
    runBench("bench_pairing_bn254_snarks")
    runBench("bench_sha256")
 task test_parallel_no_assembler, "Run all tests (without macro assembler) in parallel (via GNU parallel)":
  # -d:testingCurves is configured in a *.nim.cfg for convenience
-  let cmdFile = true # open(buildParallel, mode = fmWrite) # Nimscript doesn't support IO :/
+  clearParallelBuild()
-  exec "> " & buildParallel
+  runTests(requireGMP = true, dumpCmdFile = true, testASM = false)
  for td in testDesc:
    if td.path in useDebug:
      test "-d:debugConstantine -d:ConstantineASM=false", td.path, cmdFile
    else:
      test " -d:ConstantineASM=false", td.path, cmdFile
  # cmdFile.close()
  # Execute everything in parallel with GNU parallel
  exec "parallel --keep-order --group < " & buildParallel
  exec "> " & buildParallel
  if sizeof(int) == 8: # 32-bit tests on 64-bit arch
-    for td in testDesc:
+    clearParallelBuild()
-      if td.path in useDebug:
+    runTests(requireGMP = true, dumpCmdFile = true, test32bit = true, testASM = false)
        test "-d:Constantine32 -d:debugConstantine -d:ConstantineASM=false", td.path, cmdFile
      else:
        test "-d:Constantine32 -d:ConstantineASM=false", td.path, cmdFile
    # cmdFile.close()
    # Execute everything in parallel with GNU parallel
    exec "parallel --keep-order --group < " & buildParallel
  # Now run the benchmarks
@ -329,33 +321,17 @@ task test_parallel_no_assembler, "Run all tests (without macro assembler) in par
    runBench("bench_pairing_bls12_381")
    runBench("bench_pairing_bn254_nogami")
    runBench("bench_pairing_bn254_snarks")
    runBench("bench_sha256")
 task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)":
  # -d:testingCurves is configured in a *.nim.cfg for convenience
-  let cmdFile = true # open(buildParallel, mode = fmWrite) # Nimscript doesn't support IO :/
+  clearParallelBuild()
-  exec "> " & buildParallel
+  runTests(requireGMP = false, dumpCmdFile = true)
  for td in testDesc:
    if not td.useGMP:
      if td.path in useDebug:
        test "-d:debugConstantine", td.path, cmdFile
      else:
        test "", td.path, cmdFile
  # cmdFile.close()
  # Execute everything in parallel with GNU parallel
  exec "parallel --keep-order --group < " & buildParallel
  exec "> " & buildParallel
  if sizeof(int) == 8: # 32-bit tests on 64-bit arch
-    for td in testDesc:
+    clearParallelBuild()
-      if not td.useGMP:
+    runTests(requireGMP = false, dumpCmdFile = true, test32bit = true)
        if td.path in useDebug:
          test "-d:Constantine32 -d:debugConstantine", td.path, cmdFile
        else:
          test "-d:Constantine32", td.path, cmdFile
    # cmdFile.close()
    # Execute everything in parallel with GNU parallel
    exec "parallel --keep-order --group < " & buildParallel
  # Now run the benchmarks
@ -374,33 +350,18 @@ task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)":
    runBench("bench_pairing_bls12_381")
    runBench("bench_pairing_bn254_nogami")
    runBench("bench_pairing_bn254_snarks")
    runBench("bench_sha256")
 task test_parallel_no_gmp_no_assembler, "Run all tests in parallel (via GNU parallel)":
  # -d:testingCurves is configured in a *.nim.cfg for convenience
-  let cmdFile = true # open(buildParallel, mode = fmWrite) # Nimscript doesn't support IO :/
+  clearParallelBuild()
-  exec "> " & buildParallel
+  runTests(requireGMP = false, dumpCmdFile = true, testASM = false)
  for td in testDesc:
    if not td.useGMP:
      if td.path in useDebug:
        test "-d:debugConstantine -d:ConstantineASM=false", td.path, cmdFile
      else:
        test "-d:ConstantineASM=false", td.path, cmdFile
  # cmdFile.close()
  # Execute everything in parallel with GNU parallel
  exec "parallel --keep-order --group < " & buildParallel
  exec "> " & buildParallel
  if sizeof(int) == 8: # 32-bit tests on 64-bit arch
-    for td in testDesc:
+    clearParallelBuild()
-      if not td.useGMP:
+    runTests(requireGMP = false, dumpCmdFile = true, test32bit = true, testASM = false)
        if td.path in useDebug:
          test "-d:Constantine32 -d:debugConstantine", td.path, cmdFile
        else:
          test "-d:Constantine32", td.path, cmdFile
    # cmdFile.close()
    # Execute everything in parallel with GNU parallel
    exec "parallel --keep-order --group < " & buildParallel
  # Now run the benchmarks
@ -419,6 +380,7 @@ task test_parallel_no_gmp_no_assembler, "Run all tests in parallel (via GNU para
    runBench("bench_pairing_bls12_381")
    runBench("bench_pairing_bn254_nogami")
    runBench("bench_pairing_bn254_snarks")
    runBench("bench_sha256")
 task bench_fp, "Run benchmark 𝔽p with your default compiler":
  runBench("bench_fp")
@ -599,3 +561,6 @@ task bench_pairing_bn254_snarks_gcc_noasm, "Run pairings benchmarks for BN254-Sn
 task bench_pairing_bn254_snarks_clang_noasm, "Run pairings benchmarks for BN254-Snarks - Clang no Assembly":
  runBench("bench_pairing_bn254_snarks", "clang", useAsm = false)
 task bench_sha256, "Run SHA256 benchmarks":
  runBench("bench_sha256")
--- a/constantine/hashes/h_sha256.nim
+++ b/constantine/hashes/h_sha256.nim
@ -0,0 +1,348 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ../config/common,
  ../io/endians
 # SHA256, a hash function from the SHA2 family
 # --------------------------------------------------------------------------------
 #
 # References:
 # - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
 # - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634
 # - Intel optimization https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/sha-256-implementations-paper.pdf
 # - Parallelizing message schedules
 #   to accelerate the computations of hash functions
 #   Shay Gueron, Vlad Krasnov, 2012
 #   https://eprint.iacr.org/2012/067.pdf
 #
 # Vectors:
 # - https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf
 # Types and constants
 # ----------------------------------------------------------------
 const
  DigestSize = 32 # Size in bytes of the message digest written by `finish`
  BlockSize = 64 # Size in bytes of one message block (and of the context buffer)
  HashSize = DigestSize div sizeof(uint32) # 8 words of 32 bits in the hash state
 type
  Sha256Context* = object
    ## State of an incremental SHA256 computation.
    ## Align to 64 for cache line and SIMD friendliness
    H{.align: 64}: array[HashSize, uint32] # Running hash state, set by `init`, updated block by block
    buf{.align: 64}: array[BlockSize, byte] # Message bytes waiting for a full block
    msgLen: uint64 # Total number of message bytes received by `update`
    bufIdx: uint8 # Number of bytes currently waiting in `buf`
  sha256* = Sha256Context # Alias, used as the type argument of `sha256.hash(...)`
 # Internal
 # ----------------------------------------------------------------
 # TODO: vectorized implementations
 # No exceptions allowed in core cryptographic operations
 {.push raises: [].}
 {.push checks: off.}
 template rotr(x, n: uint32): uint32 =
  ## Rotate the 32-bit word `x` to the right by `n` bits.
  # Only used with constant rotation amounts strictly between 0 and 32,
  # so neither of the two shifts below is ever out of range.
  (x shl (32 - n)) or (x shr n)
 template ch(x, y, z: uint32): uint32 =
  ## SHA256 "Choose" function: for every bit position, take the bit
  ## of `y` where `x` holds a 1 and the bit of `z` where `x` holds a 0.
  # FIPS 180-4 spells it (x and y) xor (not(x) and z);
  # the RFC4634 form used here gives the same result with 3 operations.
  z xor (x and (y xor z))
 template maj(x, y, z: uint32): uint32 =
  ## SHA256 "Majority" function: every output bit is the value taken
  ## by at least two of the matching bits of `x`, `y` and `z`.
  # FIPS 180-4 spells it (x and y) xor (x and z) xor (y and z);
  # the and/or form used here is the one from RFC4634.
  (y and z) or (x and (y or z))
 template S0(x: uint32): uint32 =
  ## Σ₀: xor of `x` rotated right by 2, 13 and 22 bits
  rotr(x, 22) xor rotr(x, 13) xor rotr(x, 2)
 template S1(x: uint32): uint32 =
  ## Σ₁: xor of `x` rotated right by 6, 11 and 25 bits
  rotr(x, 25) xor rotr(x, 11) xor rotr(x, 6)
 template s0(x: uint32): uint32 =
  ## σ₀: xor of `x` rotated right by 7 and 18 bits and `x` shifted right by 3 bits
  (x shr 3) xor rotr(x, 18) xor rotr(x, 7)
 template s1(x: uint32): uint32 =
  ## σ₁: xor of `x` rotated right by 17 and 19 bits and `x` shifted right by 10 bits
  (x shr 10) xor rotr(x, 19) xor rotr(x, 17)
 func setZero[N](a: var array[N, SomeNumber]){.inline.} =
  ## Overwrite every element of `a` with zero
  for item in a.mitems:
    item = 0
 func hashMessageBlocks[T: byte|char](
       H: var array[HashSize, uint32],
       message: openarray[T]): uint =
  ## Hash a message block by block
  ## Sha256 block size is 64 bytes hence
  ## a message will be processed 64 by 64 bytes.
  ## FIPS.180-4 6.2.2. SHA-256 Hash Computation
  ##
  ## Only the complete blocks of `message` are processed and folded into `H`.
  ## Returns the number of bytes consumed (a multiple of 64, 0 if `message`
  ## is shorter than one block); the remaining tail is left to the caller.
  result = 0 # Doubles as the read cursor into `message`
  let numBlocks = message.len.uint div BlockSize
  if numBlocks == 0:
    return 0
  # Round constants, one per round (64 rounds per block)
  const K256 = [
    0x428a2f98'u32, 0x71374491'u32, 0xb5c0fbcf'u32, 0xe9b5dba5'u32, 0x3956c25b'u32, 0x59f111f1'u32, 0x923f82a4'u32, 0xab1c5ed5'u32,
    0xd807aa98'u32, 0x12835b01'u32, 0x243185be'u32, 0x550c7dc3'u32, 0x72be5d74'u32, 0x80deb1fe'u32, 0x9bdc06a7'u32, 0xc19bf174'u32,
    0xe49b69c1'u32, 0xefbe4786'u32, 0x0fc19dc6'u32, 0x240ca1cc'u32, 0x2de92c6f'u32, 0x4a7484aa'u32, 0x5cb0a9dc'u32, 0x76f988da'u32,
    0x983e5152'u32, 0xa831c66d'u32, 0xb00327c8'u32, 0xbf597fc7'u32, 0xc6e00bf3'u32, 0xd5a79147'u32, 0x06ca6351'u32, 0x14292967'u32,
    0x27b70a85'u32, 0x2e1b2138'u32, 0x4d2c6dfc'u32, 0x53380d13'u32, 0x650a7354'u32, 0x766a0abb'u32, 0x81c2c92e'u32, 0x92722c85'u32,
    0xa2bfe8a1'u32, 0xa81a664b'u32, 0xc24b8b70'u32, 0xc76c51a3'u32, 0xd192e819'u32, 0xd6990624'u32, 0xf40e3585'u32, 0x106aa070'u32,
    0x19a4c116'u32, 0x1e376c08'u32, 0x2748774c'u32, 0x34b0bcb5'u32, 0x391c0cb3'u32, 0x4ed8aa4a'u32, 0x5b9cca4f'u32, 0x682e6ff3'u32,
    0x748f82ee'u32, 0x78a5636f'u32, 0x84c87814'u32, 0x8cc70208'u32, 0x90befffa'u32, 0xa4506ceb'u32, 0xbef9a3f7'u32, 0xc67178f2'u32
  ]
  # Working variables. They are loaded from H once, before the loop:
  # at the end of each block they are written back to H, so they
  # already hold the right starting value for the next block.
  var
    a = H[0]
    b = H[1]
    c = H[2]
    d = H[3]
    e = H[4]
    f = H[5]
    g = H[6]
    h = H[7]
  for _ in 0 ..< numBlocks:
    # The first 16 rounds have different handling
    # from rounds 16..<64: they read their word directly from the message.
    # A full array[64, uint32] schedule would take 256 bytes
    # (several cache lines), impacting performance, so only
    # the last 16 words are kept, indexed modulo 16.
    # Workspace with message schedule Wₜ
    var W{.noInit.}: array[16, uint32]
    var t = 0'u32
    while t < 16: # Wₜ = Mⁱₜ
      # Reads one big-endian 32-bit word and advances `result` by 4 bytes
      W[t].parseFromBlob(message, result, bigEndian)
      let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t]
      let T2 = S0(a) + maj(a, b, c)
      h = g
      g = f
      f = e
      e = d + T1
      d = c
      c = b
      b = a
      a = T1+T2
      t += 1
    while t < 64:
      # Wₜ = σ₁(Wₜ₋₂) + Wₜ₋₇ + σ₀(Wₜ₋₁₅) + Wₜ₋₁₆
      # The slot `t mod 16` still holds Wₜ₋₁₆, hence the in-place `+=`
      W[t mod 16] += s1(W[(t-2) mod 16]) +
                     W[(t-7) mod 16] +
                     s0(W[(t-15) mod 16])
      let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t mod 16]
      let T2 = S0(a) + maj(a, b, c)
      h = g
      g = f
      f = e
      e = d + T1
      d = c
      c = b
      b = a
      a = T1+T2
      t += 1
    # Add the block result to the previous hash state and store it back.
    # After this, each working variable equals its H counterpart.
    a += H[0]; H[0] = a
    b += H[1]; H[1] = b
    c += H[2]; H[2] = c
    d += H[3]; H[3] = d
    e += H[4]; H[4] = e
    f += H[5]; H[5] = f
    g += H[6]; H[6] = g
    h += H[7]; H[7] = h
 func dumpHash(
       digest: var array[DigestSize, byte],
       H: array[HashSize, uint32]) =
  ## Serialize the internal hash state into a message digest:
  ## the words of `H` are written one after the other,
  ## each one in big-endian byte order.
  for i, word in H:
    digest.dumpRawInt(word, uint(i * sizeof(uint32)), bigEndian)
 func copy[N: static int, T: byte|char](
       dst: var array[N, byte],
       dStart: SomeInteger,
       src: openArray[T],
       sStart: SomeInteger,
       len: SomeInteger
     ) =
  ## Item by item copy:
  ## dst[dStart ..< dStart+len] = src[sStart ..< sStart+len]
  ## Contrary to the standard library, this cannot throw,
  ## not even a defect.
  ## The source may hold chars, each item is converted to a byte.
  debug:
    doAssert 0 <= dStart and dStart+len <= dst.len.uint
    doAssert 0 <= sStart and sStart+len <= src.len.uint
  var offset: typeof(len) = 0
  while offset < len:
    dst[dStart + offset] = byte src[sStart + offset]
    offset += 1
 func hashBuffer(ctx: var Sha256Context) =
  ## Fold the content of the internal buffer into the hash state,
  ## then zero the buffer and mark it as empty.
  discard hashMessageBlocks(ctx.H, ctx.buf)
  ctx.bufIdx = 0
  setZero(ctx.buf)
 # Public API
 # ----------------------------------------------------------------
 func init*(ctx: var Sha256Context) =
  ## Initialize or reinitialize a Sha256 context:
  ## the buffer, its index and the message length are zeroed
  ## and the hash state is loaded with the 8 initial words.
  ctx.H = [
    0x6a09e667'u32, 0xbb67ae85'u32, 0x3c6ef372'u32, 0xa54ff53a'u32,
    0x510e527f'u32, 0x9b05688c'u32, 0x1f83d9ab'u32, 0x5be0cd19'u32
  ]
  ctx.bufIdx = 0
  ctx.msgLen = 0
  setZero(ctx.buf)
 func update*[T: char|byte](ctx: var Sha256Context, message: openarray[T]) =
  ## Append a message to a SHA256 context
  ## for incremental SHA256 computation
  ##
  ## Complete 64-byte blocks are hashed immediately;
  ## a tail shorter than a block is kept in the context buffer
  ## until a later `update` or `finish`.
  ##
  ## Security note: the tail of your message might be stored
  ## in an internal buffer.
  ## if sensitive content is used, ensure that
  ## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible.
  ## Additionally ensure that the message(s) passed were stored
  ## in memory considered secure for your threat model.
  ##
  ## For passwords and secret keys, you MUST NOT use raw SHA-256
  ## use a Key Derivation Function instead (KDF)
  debug:
    # Invariant: the buffer is never full on entry
    # and everything past the stored bytes is zero
    doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len
    for i in ctx.bufIdx ..< ctx.buf.len:
      doAssert ctx.buf[i] == 0
  var # Message processing state machine
    cur = 0'u # Index of the next unread byte of `message`
    bytesLeft = message.len.uint # Number of bytes of `message` not yet consumed
  ctx.msgLen += bytesLeft
  if ctx.bufIdx != 0: # Previous partial update
    let bufIdx = ctx.bufIdx.uint
    let free = ctx.buf.sizeof().uint - bufIdx
    if free > bytesLeft:
      # Enough free space, store in buffer
      # (strict comparison: the buffer stays partially filled)
      ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = bytesLeft)
      ctx.bufIdx += bytesLeft.uint8
      return
    else:
      # Fill the buffer and do one sha256 hash
      # hashBuffer also zeroes the buffer and resets ctx.bufIdx to 0
      ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free)
      ctx.hashBuffer()
      # Update message state for further processing
      cur = free
      bytesLeft -= free
  # Process n blocks (64 byte each)
  # directly from `message`, without going through the buffer
  let consumed = ctx.H.hashMessageBlocks(
    message.toOpenArray(int cur, message.len-1))
  cur += consumed
  bytesLeft -= consumed
  if bytesLeft != 0:
    # Store the tail in buffer
    debug: # TODO: state machine formal verification - https://nim-lang.org/docs/drnim.html
      doAssert ctx.bufIdx == 0
      doAssert cur + bytesLeft == message.len.uint
    ctx.buf.copy(dStart = 0'u, message, sStart = cur, len = bytesLeft)
    ctx.bufIdx = uint8 bytesLeft
 func finish*(ctx: var Sha256Context, digest: var array[32, byte]) =
  ## Finalize a SHA256 computation: pad the pending bytes,
  ## hash the last block(s) and write the message digest
  ## to the `digest` buffer.
  ##
  ## Security note: this does not clear the internal buffer.
  ## if sensitive content is used, use "ctx.clear()"
  ## and also make sure that the message(s) passed were stored
  ## in memory considered secure for your threat model.
  ##
  ## For passwords and secret keys, you MUST NOT use raw SHA-256
  ## use a Key Derivation Function instead (KDF)
  debug:
    doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len
    for i in ctx.bufIdx ..< ctx.buf.len:
      doAssert ctx.buf[i] == 0
  # The padding is a '1' bit, then k zero bits
  # so that msgLenBits + 1 + k ≡ 448 mod 512,
  # i.e. in bytes msgLen + 1 + K ≡ 56 mod 64,
  # then the message length in bits on the last 8 bytes of the block.
  const lengthOffset = 56
  # '1' bit followed by 7 zero bits, right after the message
  ctx.buf[ctx.bufIdx] = 0b1000_0000
  if ctx.bufIdx >= lengthOffset:
    # No room left for the length in the 56..<64 zone:
    # hash this block and continue in a fresh zeroed one
    ctx.hashBuffer()
  ctx.buf.dumpRawInt(ctx.msgLen.uint64 * 8, lengthOffset, bigEndian)
  discard hashMessageBlocks(ctx.H, ctx.buf)
  dumpHash(digest, ctx.H)
 func clear*(ctx: var Sha256Context) =
  ## Clear the context internal buffers:
  ## the pending message bytes, the hash state,
  ## the message length and the buffer index are all zeroed,
  ## so that no data derived from the message stays in the context
  ## and the buffer index never points into a wiped buffer.
  ## Call `init` before using the context again.
  ## Security note:
  ## For passwords and secret keys, you MUST NOT use raw SHA-256
  ## use a Key Derivation Function instead (KDF)
  # TODO: ensure compiler cannot optimize the code away
  ctx.buf.setZero()
  ctx.H.setZero()
  ctx.msgLen = 0
  ctx.bufIdx = 0
 func hash*[T: char|byte](
       HashKind: type sha256,
       digest: var array[32, byte],
       message: openarray[T],
       clearMem = false) =
  ## One-shot SHA256: hash `message` and write the digest to `digest`.
  ## With `clearMem = true` the temporary context is cleared before returning.
  var state {.noInit.}: HashKind
  init(state)
  update(state, message)
  finish(state, digest)
  if clearMem:
    clear(state)
 func hash*[T: char|byte](
       HashKind: type sha256,
       message: openarray[T],
       clearmem = false): array[32, byte] =
  ## One-shot SHA256 returning the digest by value.
  ## Forwards to the overload that writes into a caller-provided buffer.
  hash(HashKind, result, message, clearmem)
--- a/constantine/io/endians.nim
+++ b/constantine/io/endians.nim
@ -0,0 +1,78 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import ../config/common
 # perf critical we don't want bound checks here
 # So no checks and we avoid signed int to ensure no exceptions.
 # TODO: Nim formal verification: https://nim-lang.org/docs/drnim.html
 {.push checks:off, raises: [].}
 template toByte*(x: SomeUnsignedInt): byte =
  ## At compile-time, conversion to bytes checks the range
  ## we want to ensure this is done at the register level
  ## at runtime in a single "mov byte" instruction
  when nimvm:
    # Compile-time evaluation: mask to the low 8 bits first
    # so that the range check of the conversion always passes
    byte(x and 0xFF)
  else:
    # Runtime: plain conversion
    byte(x)
 func parseFromBlob*[T: byte|char](
           dst: var SomeUnsignedInt,
           src: openArray[T],
           cursor: var uint, endian: static Endianness) {.inline.} =
  ## Read an unsigned integer from a raw binary blob.
  ## The `cursor` represents the current index in the array and is updated
  ## by N bytes where N is the size of `dst` type in bytes.
  ## The binary blob is interpreted as:
  ## - an array of words traversed from 0 ..< len (little-endian), via an incremented `cursor`
  ## - with each word being of `endian` ordering for deserialization purpose.
  debug:
    # The whole word must lie inside `src`
    doAssert 0 <= cursor and cursor < src.len.uint
    doAssert cursor + sizeof(dst).uint <= src.len.uint,
      "cursor (" & $cursor & ") + sizeof(dst) (" & $sizeof(dst) &
      ") <= src.len (" & $src.len & ")"
  type U = typeof(dst)
  const L = sizeof(dst)
  var accum: U = 0
  when endian == littleEndian:
    # Byte i of the blob is the i-th least significant byte of the word
    for i in 0'u ..< L:
      accum = accum or (U(src[cursor+i]) shl (i * 8))
  else:
    # Byte 0 of the blob is the most significant byte of the word
    for i in 0'u ..< L:
      accum = accum or (U(src[cursor+i]) shl ((L - 1 - i) * 8))
  dst = accum
  # Move the caller's cursor past the word just read
  cursor.inc(L)
 func dumpRawInt*[T: byte|char](
           dst: var openArray[T],
           src: SomeUnsignedInt,
           cursor: uint, endian: static Endianness) {.inline.} =
  ## Dump an integer into raw binary form
  ## `cursor` is the index in `dst` of the first byte written,
  ## N bytes are written from there, where N is the size of `src` type in bytes.
  ## `cursor` is passed by value and is not updated:
  ## the caller has to advance its own index by N.
  ## The binary blob is interpreted as:
  ## - an array of words traversed from 0 ..< len (little-endian)
  ## - with each word being of `endian` ordering for serialization purpose.
  debug:
    # The whole word must fit inside `dst`
    doAssert 0 <= cursor and cursor < dst.len.uint
    doAssert cursor + sizeof(src).uint <= dst.len.uint,
      "cursor (" & $cursor & ") + sizeof(src) (" & $sizeof(src) &
      ") <= dst.len (" & $dst.len & ")"
  const L = uint sizeof(src)
  when endian == littleEndian:
    # Least significant byte first
    for i in 0'u ..< L:
      dst[cursor+i] = toByte(src shr (i * 8))
  else:
    # Most significant byte first
    for i in 0'u ..< L:
      dst[cursor+i] = toByte(src shr ((L-i-1) * 8))
--- a/constantine/io/io_bigints.nim
+++ b/constantine/io/io_bigints.nim
@ -12,7 +12,8 @@
 import
  ../primitives/constant_time,
-  ../config/[common, type_bigint]
+  ../config/[common, type_bigint],
  ./endians
 # ############################################################
 #
@ -152,24 +153,17 @@ func fromUint*(
 #
 # ############################################################
 template toByte(x: SomeUnsignedInt): byte =
  ## At compile-time, conversion to bytes checks the range
  ## we want to ensure this is done at the register level
  ## at runtime in a single "mov byte" instruction
  when nimvm:
    byte(x and 0xFF)
  else:
    byte(x)
 template blobFrom(dst: var openArray[byte], src: SomeUnsignedInt, startIdx: int, endian: static Endianness) =
  ## Write an integer into a raw binary blob
  ## Swapping endianness if needed
  ## startidx is the first written array item if littleEndian is requested
  ## or the last if bigEndian is requested
  when endian == cpuEndian:
    for i in 0 ..< sizeof(src):
-      dst[startIdx+i] = toByte((src shr (i * 8)))
+      dst[startIdx+i] = toByte(src shr (i * 8))
  else:
    for i in 0 ..< sizeof(src):
-      dst[startIdx+sizeof(src)-1-i] = toByte((src shr (i * 8)))
+      dst[startIdx+sizeof(src)-1-i] = toByte(src shr (i * 8))
 func exportRawUintLE(
        dst: var openarray[byte],
--- a/helpers/prng_unsafe.nim
+++ b/helpers/prng_unsafe.nim
@ -364,6 +364,14 @@ func random_long01Seq_with_randZ*(rng: var RngState, T: typedesc[ECP_ShortW_Proj
  ## Skewed towards long bitstrings of 0 or 1
  rng.random_long01Seq_with_randZ(result)
 # Byte sequences
 # ------------------------------------------------------------
 func random_byte_seq*(rng: var RngState, length: int): seq[byte] =
  ## Return a sequence of `length` bytes,
  ## each one converted from a separate call to `rng.next()`
  result = newSeq[byte](length)
  for i in 0 ..< length:
    result[i] = byte rng.next()
 # Sanity checks
 # ------------------------------------------------------------
--- a/tests/t_hash_sha256_vs_openssl.nim
+++ b/tests/t_hash_sha256_vs_openssl.nim
@ -0,0 +1,115 @@
 import
  # Internals
  ../constantine/hashes/h_sha256,
  # Helpers
  ../helpers/prng_unsafe,
  # Third-party
  stew/byteutils
 # Binding to the `SHA256` symbol, loaded at runtime from "libssl.so".
 # NOTE(review): this presumably relies on `openarray` being passed
 # to the imported C function as a (pointer, length) pair, followed by
 # the output pointer - confirm against the OpenSSL prototype.
 # `digest` defaults to nil; the meaning of nil and of the returned pointer
 # is defined by OpenSSL, the tests here always pass a buffer.
 proc SHA256[T: byte|char](
       msg: openarray[T],
       digest: ptr array[32, byte] = nil
     ): ptr array[32, byte] {.cdecl, dynlib: "libssl.so", importc.}
 proc SHA256_OpenSSL[T: byte|char](
       digest: var array[32, byte],
       s: openarray[T]) =
  ## Hash `s` through the OpenSSL binding and store the result in `digest`.
  ## The pointer returned by the binding is ignored.
  discard SHA256(s, addr(digest))
 echo "\n------------------------------------------------------\n"
 const SmallSizeIters = 128 # Number of random tests on messages below 1024 bytes
 const LargeSizeIters =  10 # Number of random tests on messages of 1 to 50 million bytes
 proc sanityABC =
  ## Known-answer test: SHA256 of the 3-character message "abc"
  let expected = hexToByteArray[32](
    "BA7816BF8F01CFEA414140DE5DAE2223" &
    "B00361A396177A9CB410FF61F20015AD")
  let message = "abc"
  var digest: array[32, byte]
  sha256.hash(digest, message)
  doAssert digest == expected
 proc sanityABC2 =
  ## Known-answer test: SHA256 of a 56-character message,
  ## long enough that the length cannot share a block with the message.
  let expected = hexToByteArray[32](
    "248D6A61D20638B8E5C026930C3E6039" &
    "A33CE45964FF2167F6ECEDD419DB06C1")
  let message = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
  var digest: array[32, byte]
  sha256.hash(digest, message)
  doAssert digest == expected
 proc innerTest(rng: var RngState, sizeRange: Slice[int]) =
  ## Differential test: hash a random message, whose length is drawn
  ## from `sizeRange`, with this library and with OpenSSL,
  ## and check that both digests are equal.
  let msgLen = rng.random_unsafe(sizeRange)
  let message = rng.random_byte_seq(msgLen)
  var ours, reference: array[32, byte]
  SHA256_OpenSSL(reference, message)
  sha256.hash(ours, message)
  doAssert ours == reference
 proc chunkTest(rng: var RngState, sizeRange: Slice[int]) =
  ## Check that hashing a message incrementally, in chunks of random size,
  ## gives the same digest as hashing it in one pass.
  ## The message length is drawn from `sizeRange`.
  let size = rng.random_unsafe(sizeRange)
  let msg = rng.random_byte_seq(size)
  var bufOnePass: array[32, byte]
  sha256.hash(bufOnePass, msg)
  var bufChunked: array[32, byte]
  let maxChunk = max(2, sizeRange.b div 10) # Consume up to 10% at once
  var ctx: Sha256Context
  ctx.init()
  var cur = 0
  while cur < size:
    # A chunk may be empty: `update` is then called with no data
    let chunkSize = rng.random_unsafe(0 ..< maxChunk)
    # Last index of the chunk, clamped to the end of the message
    let stop = min(cur+chunkSize-1, size-1)
    ctx.update(msg.toOpenArray(cur, stop))
    cur = stop + 1
  ctx.finish(bufChunked)
  doAssert bufOnePass == bufChunked
 proc main() =
  ## Test driver: known-answer checks first, then differential tests
  ## against OpenSSL and one-pass vs chunked consistency tests,
  ## with a fixed seed so that runs are reproducible.
  echo "SHA256 - sanity checks"
  sanityABC()
  sanityABC2()
  echo "SHA256 - Starting differential testing vs OpenSSL"
  var rng: RngState
  rng.seed(0xFACADE)
  echo "SHA256 - 0 <= size < 64 - exhaustive"
  for i in 0 ..< 64:
    rng.innerTest(i .. i)
  echo "SHA256 - 0 <= size < 64 - exhaustive chunked"
  for i in 0 ..< 64:
    rng.chunkTest(i .. i)
  # The two random tests below draw sizes from 0 ..< 1024,
  # the labels state that actual range
  echo "SHA256 - 0 <= size < 1024B"
  for _ in 0 ..< SmallSizeIters:
    rng.innerTest(0 ..< 1024)
  echo "SHA256 - 0 <= size < 1024B - chunked"
  for _ in 0 ..< SmallSizeIters:
    rng.chunkTest(0 ..< 1024)
  echo "SHA256 - 1MB <= size < 50MB"
  for _ in 0 ..< LargeSizeIters:
    rng.innerTest(1_000_000 ..< 50_000_000)
  echo "SHA256 - Differential testing vs OpenSSL - SUCCESS"
 main()