C API for Ethereum BLS signatures (#228)

* [testsuite] Rework parallel test runner to buffer beyond 65536 chars and properly wait for process exit * [testsuite] improve error reporting * rework openArray[byte/char] for BLS signature C API * Prepare for optimized library and bindings * properly link to constantine * Compiler fixes, global sanitizers, GCC bug with --opt:size * workaround/fix #229: don't inline field reduction in Fp2 * fix clang running out of registers with LTO * [C API] missed length parameters for ctt_eth_bls_fast_aggregate_verify * double-precision asm is too large for inlining, try to fix Linux and MacOS woes at https://github.com/mratsim/constantine/pull/228#issuecomment-1512773460 * Use FORTIFY_SOURCE for testing * Fix #230 - gcc miscompiles Fp6 mul with LTO * disable LTO for now, PR is too long
2023-04-18 22:02:23 +02:00 · 2023-04-18 22:02:23 +02:00 · 9a7137466e
parent 93dac2503c
commit 9a7137466e
57 changed files with 1693 additions and 858 deletions
--- a/benchmarks/bench_ec_g1_msm_bls12_381.nim
+++ b/benchmarks/bench_ec_g1_msm_bls12_381.nim
@ -32,7 +32,7 @@ const AvailableCurves = [
 ]

 # const testNumPoints = [10, 100, 1000, 10000, 100000]
-const testNumPoints = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192,
+const testNumPoints = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192,
                       16384, 32768, 65536, 131072, 262144]

 proc main() =
--- a/benchmarks/bench_ec_g1_scalar_mul.nim
+++ b/benchmarks/bench_ec_g1_scalar_mul.nim
@ -44,7 +44,7 @@ proc main() =
  separator()
  staticFor i, 0, AvailableCurves.len:
    const curve = AvailableCurves[i]
-    const bits = 64 # curve.getCurveOrderBitwidth()
+    const bits = curve.getCurveOrderBitwidth()
    scalarMulUnsafeDoubleAddBench(ECP_ShortW_Prj[Fp[curve], G1], bits, MulIters)
    scalarMulUnsafeDoubleAddBench(ECP_ShortW_Jac[Fp[curve], G1], bits, MulIters)
    separator()
--- a/benchmarks/bench_ec_g2_scalar_mul.nim
+++ b/benchmarks/bench_ec_g2_scalar_mul.nim
@ -45,7 +45,7 @@ proc main() =
  separator()
  staticFor i, 0, AvailableCurves.len:
    const curve = AvailableCurves[i]
-    const bits = 64 # curve.getCurveOrderBitwidth()
+    const bits = curve.getCurveOrderBitwidth()
    scalarMulUnsafeDoubleAddBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, MulIters)
    scalarMulUnsafeDoubleAddBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, MulIters)
    separator()
--- a/benchmarks/bench_ethereum_bls_signatures.nim
+++ b/benchmarks/bench_ethereum_bls_signatures.nim
@ -9,7 +9,7 @@
 import
  # Internals
  ../constantine/[
-    blssig_pop_on_bls12381_g2,
+    ethereum_bls_signatures,
    ethereum_eip2333_bls12381_key_derivation],
  ../constantine/math/arithmetic,
  # Helpers
@ -33,10 +33,10 @@ template bench(op: string, curve: string, iters: int, body: untyped): untyped =
 proc demoKeyGen(): tuple[seckey: SecretKey, pubkey: PublicKey] =
  # Don't do this at home, this is for benchmarking purposes
  # The RNG is NOT cryptographically secure
-  # The API for keygen is not ready in blssig_pop_on_bls12381_g2
+  # The API for keygen is not ready in ethereum_bls_signatures
  let ikm = rng.random_byte_seq(32)
  doAssert cast[ptr BigInt[255]](result.seckey.addr)[].derive_master_secretKey(ikm)
-  let ok = result.pubkey.derive_public_key(result.seckey)
+  let ok = result.pubkey.derive_pubkey(result.seckey)
  doAssert ok == cttBLS_Success

 proc benchDeserPubkey*(iters: int) =
@ -44,26 +44,26 @@ proc benchDeserPubkey*(iters: int) =
  var pk_comp{.noInit.}: array[48, byte]

  # Serialize compressed
-  let ok = pk_comp.serialize_public_key_compressed(pk)
+  let ok = pk_comp.serialize_pubkey_compressed(pk)
  doAssert ok == cttBLS_Success

  var pk2{.noInit.}: PublicKey

  bench("Pubkey deserialization (full checks)", "BLS12_381 G1", iters):
-    let status = pk2.deserialize_public_key_compressed(pk_comp)
+    let status = pk2.deserialize_pubkey_compressed(pk_comp)

 proc benchDeserPubkeyUnchecked*(iters: int) =
  let (sk, pk) = demoKeyGen()
  var pk_comp{.noInit.}: array[48, byte]

  # Serialize compressed
-  let ok = pk_comp.serialize_public_key_compressed(pk)
+  let ok = pk_comp.serialize_pubkey_compressed(pk)
  doAssert ok == cttBLS_Success

  var pk2{.noInit.}: PublicKey

  bench("Pubkey deserialization (skip checks)", "BLS12_381 G1", iters):
-    let status = pk2.deserialize_public_key_compressed_unchecked(pk_comp)
+    let status = pk2.deserialize_pubkey_compressed_unchecked(pk_comp)

 proc benchDeserSig*(iters: int) =
  let (sk, pk) = demoKeyGen()
@ -139,7 +139,7 @@ proc benchFastAggregateVerify*(numKeys, iters: int) =
    let status = sigs[i].sign(sk, msg)
    doAssert status == cttBLS_Success

-  aggSig.aggregate_signatures(sigs)
+  aggSig.aggregate_signatures_unstable_api(sigs)

  bench("BLS agg verif of 1 msg by " & $numKeys & " pubkeys", "BLS12_381", iters):
    let valid = validators.fast_aggregate_verify(msg, aggSig)
--- a/bindings_generators/README.md
+++ b/bindings_generators/README.md
--- a/bindings_generators/constantine_bls12_381.nim
+++ b/bindings_generators/constantine_bls12_381.nim
--- a/bindings_generators/constantine_pasta.nim
+++ b/bindings_generators/constantine_pasta.nim
--- a/bindings_generators/gen_bindings.nim
+++ b/bindings_generators/gen_bindings.nim
@ -18,8 +18,11 @@ export curves, curves_primitives
 # This files provides template for C bindings generation

 template genBindingsField*(Field: untyped) =
-  {.push cdecl, dynlib, exportc,  raises: [].} # No exceptions allowed
-  
+  when appType == "lib":
+    {.push cdecl, dynlib, exportc,  raises: [].} # No exceptions allowed
+  else:
+    {.push cdecl, exportc,  raises: [].} # No exceptions allowed
+
  func `ctt _ Field _ unmarshalBE`(dst: var Field, src: openarray[byte]) =
    ## Deserialize
    unmarshalBE(dst, src)
@ -77,7 +80,7 @@ template genBindingsField*(Field: untyped) =

  func `ctt _ Field _ mul_in_place`(a: var Field, b: Field) =
    a *= b
-  
+
  func `ctt _ Field _ square`(r: var Field, a: Field) =
    r.square(a)

@ -86,10 +89,10 @@ template genBindingsField*(Field: untyped) =
  # --------------------------------------------------------------------------------------
  func `ctt _ Field _ div2`(a: var Field) =
    a.div2()
-  
+
  func `ctt _ Field _ inv`(r: var Field, a: Field) =
    r.inv(a)
-  
+
  func `ctt _ Field _ inv_in_place`(a: var Field) =
    a.inv()
  # --------------------------------------------------------------------------------------
@ -98,10 +101,10 @@ template genBindingsField*(Field: untyped) =

  func `ctt _ Field _ cswap`(a, b: var Field, ctl: SecretBool) =
    a.cswap(b, ctl)
-  
+
  func `ctt _ Field _ cset_zero`(a: var Field, ctl: SecretBool) =
    a.csetZero(ctl)
-  
+
  func `ctt _ Field _ cset_one`(a: var Field, ctl: SecretBool) =
    a.csetOne(ctl)

@ -118,7 +121,10 @@ template genBindingsField*(Field: untyped) =


 template genBindingsFieldSqrt*(Field: untyped) =
-  {.push cdecl, dynlib, exportc,  raises: [].} # No exceptions allowed
+  when appType == "lib":
+    {.push cdecl, dynlib, exportc,  raises: [].} # No exceptions allowed
+  else:
+    {.push cdecl, exportc,  raises: [].} # No exceptions allowed

  func `ctt _ Field _ is_square`(a: Field): SecretBool =
    a.isSquare()
@ -148,7 +154,10 @@ template genBindingsFieldSqrt*(Field: untyped) =


 template genBindingsExtField*(Field: untyped) =
-  {.push cdecl, dynlib, exportc,  raises: [].} # No exceptions allowed
+  when appType == "lib":
+    {.push cdecl, dynlib, exportc,  raises: [].} # No exceptions allowed
+  else:
+    {.push cdecl, exportc,  raises: [].} # No exceptions allowed

  # --------------------------------------------------------------------------------------
  func `ctt _ Field _ is_eq`(a, b: Field): SecretBool =
@ -195,13 +204,13 @@ template genBindingsExtField*(Field: untyped) =

  func `ctt _ Field _ conj`(r: var Field, a: Field) =
    r.conj(a)
-  
+
  func `ctt _ Field _ conj_in_place`(a: var Field) =
    a.conj()

  func `ctt _ Field _ conjneg`(r: var Field, a: Field) =
    r.conjneg(a)
-  
+
  func `ctt _ Field _ conjneg_in_place`(a: var Field) =
    a.conjneg()

@ -211,7 +220,7 @@ template genBindingsExtField*(Field: untyped) =

  func `ctt _ Field _ mul_in_place`(a: var Field, b: Field) =
    a *= b
-  
+
  func `ctt _ Field _ square`(r: var Field, a: Field) =
    r.square(a)

@ -220,10 +229,10 @@ template genBindingsExtField*(Field: untyped) =
  # --------------------------------------------------------------------------------------
  func `ctt _ Field _ div2`(a: var Field) =
    a.div2()
-  
+
  func `ctt _ Field _ inv`(r: var Field, a: Field) =
    r.inv(a)
-  
+
  func `ctt _ Field _ inv_in_place`(a: var Field) =
    a.inv()
  # --------------------------------------------------------------------------------------
@ -232,7 +241,7 @@ template genBindingsExtField*(Field: untyped) =

  func `ctt _ Field _ cset_zero`(a: var Field, ctl: SecretBool) =
    a.csetZero(ctl)
-  
+
  func `ctt _ Field _ cset_one`(a: var Field, ctl: SecretBool) =
    a.csetOne(ctl)

@ -248,7 +257,10 @@ template genBindingsExtField*(Field: untyped) =
  {.pop.}

 template genBindingsExtFieldSqrt*(Field: untyped) =
-  {.push cdecl, dynlib, exportc,  raises: [].} # No exceptions allowed
+  when appType == "lib":
+    {.push cdecl, dynlib, exportc,  raises: [].} # No exceptions allowed
+  else:
+    {.push cdecl, exportc,  raises: [].} # No exceptions allowed

  func `ctt _ Field _ is_square`(a: Field): SecretBool =
    a.isSquare()
@ -262,12 +274,15 @@ template genBindingsExtFieldSqrt*(Field: untyped) =
  {.pop}

 template genBindings_EC_ShortW_Affine*(ECP, Field: untyped) =
-  {.push cdecl, dynlib, exportc,  raises: [].} # No exceptions allowed
+  when appType == "lib":
+    {.push cdecl, dynlib, exportc,  raises: [].} # No exceptions allowed
+  else:
+    {.push cdecl, exportc,  raises: [].} # No exceptions allowed

  # --------------------------------------------------------------------------------------
  func `ctt _ ECP _ is_eq`(P, Q: ECP): SecretBool =
    P == Q
-  
+
  func `ctt _ ECP _ is_inf`(P: ECP): SecretBool =
    P.isInf()

@ -276,7 +291,7 @@ template genBindings_EC_ShortW_Affine*(ECP, Field: untyped) =

  func `ctt _ ECP _ ccopy`(P: var ECP, Q: ECP, ctl: SecretBool) =
    P.ccopy(Q, ctl)
-  
+
  func `ctt _ ECP _ is_on_curve`(x, y: Field): SecretBool =
    isOnCurve(x, y, ECP.G)

@ -289,12 +304,15 @@ template genBindings_EC_ShortW_Affine*(ECP, Field: untyped) =
  {.pop.}

 template genBindings_EC_ShortW_NonAffine*(ECP, ECP_Aff, Field: untyped) =
-  {.push cdecl, dynlib, exportc,  raises: [].} # No exceptions allowed
+  when appType == "lib":
+    {.push cdecl, dynlib, exportc,  raises: [].} # No exceptions allowed
+  else:
+    {.push cdecl, exportc,  raises: [].} # No exceptions allowed

  # --------------------------------------------------------------------------------------
  func `ctt _ ECP _ is_eq`(P, Q: ECP): SecretBool =
    P == Q
-  
+
  func `ctt _ ECP _ is_inf`(P: ECP): SecretBool =
    P.isInf()

@ -303,7 +321,7 @@ template genBindings_EC_ShortW_NonAffine*(ECP, ECP_Aff, Field: untyped) =

  func `ctt _ ECP _ ccopy`(P: var ECP, Q: ECP, ctl: SecretBool) =
    P.ccopy(Q, ctl)
-  
+
  func `ctt _ ECP _ neg`(P: var ECP, Q: ECP) =
    P.neg(Q)

@ -327,7 +345,7 @@ template genBindings_EC_ShortW_NonAffine*(ECP, ECP_Aff, Field: untyped) =

  func `ctt _ ECP _ double_in_place`(P: var ECP) =
    P.double()
-  
+
  func `ctt _ ECP _ affine`(dst: var ECP_Aff, src: ECP) =
    dst.affine(src)

--- a/bindings_generators/gen_header.nim
+++ b/bindings_generators/gen_header.nim
@ -16,14 +16,13 @@ import

 proc genHeaderLicense*(): string =
  """
-/*
- * Constantine
- * Copyright (c) 2018-2019    Status Research & Development GmbH
- * Copyright (c) 2020-Present Mamy André-Ratsimbazafy
- * Licensed and distributed under either of
- *   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
- *   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
- * at your option. This file may not be copied, modified, or distributed except according to those terms.
+/** Constantine
+ *  Copyright (c) 2018-2019    Status Research & Development GmbH
+ *  Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+ *  Licensed and distributed under either of
+ *    * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+ *    * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+ *  at your option. This file may not be copied, modified, or distributed except according to those terms.
 */
 """

@ -102,7 +101,7 @@ proc declNimMain*(libName: string): string =
  ## - the Nim runtime if seqs, strings or heap-allocated types are used,
  ##   this is the case only if Constantine is multithreaded.
  ## - runtime CPU features detection
-  ## 
+  ##
  ## Assumes library is compiled with --nimMainPrefix:ctt_{libName}_
  &"""

@ -124,9 +123,9 @@ proc toCrettype(node: NimNode): string =
  node.expectKind({nnkEmpty, nnkSym})
  if node.kind == nnkEmpty:
    # align iwth secret_bool and secret_word
-    "void       "  
+    "void       "
  else:
-    TypeMap[$node] 
+    TypeMap[$node]

 proc toCtrivialParam(name: string, typ: NimNode): string =
  typ.expectKind({nnkVarTy, nnkSym})
@ -181,16 +180,16 @@ macro collectBindings*(cBindingsStr: untyped, body: typed): untyped =
    for fnDef in generator:
      if fnDef.kind notin {nnkProcDef, nnkFuncDef}:
        continue
-    
+
      cBindings &= "\n"
-      # rettype name(pType0* pName0, pType1* pName1, ...);    
+      # rettype name(pType0* pName0, pType1* pName1, ...);
      cBindings &= fnDef.params[0].toCrettype()
      cBindings &= ' '
      cBindings &= $fnDef.name
      cBindings &= '('
      for i in 1 ..< fnDef.params.len:
        if i != 1: cBindings &= ", "
-        
+
        let paramDef = fnDef.params[i]
        paramDef.expectKind(nnkIdentDefs)
        let pType = paramDef[^2]
@ -198,7 +197,7 @@ macro collectBindings*(cBindingsStr: untyped, body: typed): untyped =
        paramDef[^1].expectKind(nnkEmpty)

        for j in 0 ..< paramDef.len - 2:
-          if j != 0: cBindings &= ", " 
+          if j != 0: cBindings &= ", "
          var name = $paramDef[j]
          cBindings &= toCparam(name.split('`')[0], pType)

--- a/constantine.nimble
+++ b/constantine.nimble
@ -7,7 +7,185 @@ license       = "MIT or Apache License 2.0"
 # Dependencies
 # ----------------------------------------------------------------

-requires "nim >= 1.1.0"
+requires "nim >= 1.6.12"
+
+# Nimscript imports
+# ----------------------------------------------------------------
+
+import std/strformat
+
+# Library compilation
+# ----------------------------------------------------------------
+
+proc releaseBuildOptions: string =
+  # -d:danger --opt:size
+  #           to avoid boundsCheck and overflowChecks that would trigger exceptions or allocations in a crypto library.
+  #           Those are internally guaranteed at compile-time by fixed-sized array
+  #           and checked at runtime with an appropriate error code if any for user-input.
+  #
+  #           Furthermore we optimize for size, the performance critical procedures
+  #           either use assembly or are unrolled manually with staticFor,
+  #           Optimizations at -O3 deal with loops and branching
+  #           which we mostly don't have. It's better to optimize
+  #           for instructions cache.
+  #
+  # --panics:on -d:noSignalHandler
+  #           Even with `raises: []`, Nim still has an exception path
+  #           for defects, for example array out-of-bound accesses (though deactivated with -d:danger)
+  #           This turns them into panics, removing exceptions from the library.
+  #           We also remove signal handlers as it's not our business.
+  #
+  # -mm:arc -d:useMalloc
+  #           Constantine stack allocates everything (except for multithreading).
+  #           Inputs are through unmanaged ptr+len. So we don't want any runtime.
+  #           Combined with -d:useMalloc, sanitizers and valgrind work as in C,
+  #           even for test cases that needs to allocate (json inputs).
+  #
+  # -fno-semantic-interposition
+  #           https://fedoraproject.org/wiki/Changes/PythonNoSemanticInterpositionSpeedup
+  #           Default in Clang, not default in GCC, prevents optimizations, not portable to non-Linux.
+  #           Also disabling this prevents overriding symbols which might actually be wanted in a cryptographic library
+  #
+  # -falign-functions=64
+  #           Reduce instructions cache misses.
+  #           https://lkml.org/lkml/2015/5/21/443
+  #           Our non-inlined functions are large so size cost is minimal.
+  " -d:danger --opt:size " &
+  " --panics:on -d:noSignalHandler " &
+  " --mm:arc -d:useMalloc " &
+  " --verbosity:0 --hints:off --warnings:off " &
+  # " --passC:-flto --passL:-flto " &
+  " --passC:-fno-semantic-interposition " &
+  " --passC:-falign-functions=64 "
+
+type BindingsKind = enum
+  kCurve
+  kProtocol
+
+proc genDynamicBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain: string) =
+  proc compile(libName: string, flags = "") =
+    echo "Compiling dynamic library: lib/" & libName
+    exec "nim c " &
+         " --noMain --app:lib " &
+         flags &
+         releaseBuildOptions() &
+         &" --nimMainPrefix:{prefixNimMain} " &
+         &" --out:{libName} --outdir:lib " &
+         (block:
+           case bindingsKind
+           of kCurve:
+             &" --nimcache:nimcache/bindings_curves/{bindingsName}" &
+             &" bindings_generators/{bindingsName}.nim"
+           of kProtocol:
+             &" --nimcache:nimcache/bindings_protocols/{bindingsName}" &
+             &" constantine/{bindingsName}.nim")
+
+  let bindingsName = block:
+    case bindingsKind
+    of kCurve: bindingsName
+    of kProtocol: "constantine_" & bindingsName
+
+  when defined(windows):
+    compile bindingsName & ".dll"
+
+  elif defined(macosx):
+    compile "lib" & bindingsName & ".dylib.arm", "--cpu:arm64 -l:'-target arm64-apple-macos11' -t:'-target arm64-apple-macos11'"
+    compile "lib" & bindingsName & ".dylib.x64", "--cpu:amd64 -l:'-target x86_64-apple-macos10.12' -t:'-target x86_64-apple-macos10.12'"
+    exec "lipo lib/lib" & bindingsName & ".dylib.arm " &
+             " lib/lib" & bindingsName & ".dylib.x64 " &
+             " -output lib/lib" & bindingsName & ".dylib -create"
+
+  else:
+    compile "lib" & bindingsName & ".so"
+
+proc genStaticBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain: string) =
+  proc compile(libName: string, flags = "") =
+    echo "Compiling static library:  lib/" & libName
+    exec "nim c " &
+         " --noMain --app:staticLib " &
+         flags &
+         releaseBuildOptions() &
+         " --nimMainPrefix:" & prefixNimMain &
+         " --out:" & libName & " --outdir:lib " &
+         (block:
+           case bindingsKind
+           of kCurve:
+             " --nimcache:nimcache/bindings_curves/" & bindingsName &
+             " bindings_generators/" & bindingsName & ".nim"
+           of kProtocol:
+             " --nimcache:nimcache/bindings_protocols/" & bindingsName &
+             " constantine/" & bindingsName & ".nim"
+         )
+
+  let bindingsName = block:
+    case bindingsKind
+    of kCurve: bindingsName
+    of kProtocol: "constantine_" & bindingsName
+
+  when defined(windows):
+    compile bindingsName & ".lib"
+
+  elif defined(macosx):
+    compile "lib" & bindingsName & ".a.arm", "--cpu:arm64 -l:'-target arm64-apple-macos11' -t:'-target arm64-apple-macos11'"
+    compile "lib" & bindingsName & ".a.x64", "--cpu:amd64 -l:'-target x86_64-apple-macos10.12' -t:'-target x86_64-apple-macos10.12'"
+    exec "lipo lib/lib" & bindingsName & ".a.arm " &
+             " lib/lib" & bindingsName & ".a.x64 " &
+             " -output lib/lib" & bindingsName & ".a -create"
+
+  else:
+    compile "lib" & bindingsName & ".a"
+
+proc genHeaders(bindingsName: string) =
+  echo "Generating header:         include/" & bindingsName & ".h"
+  exec "nim c -d:CttGenerateHeaders " &
+       releaseBuildOptions() &
+       " --out:" & bindingsName & "_gen_header.exe --outdir:build " &
+       " --nimcache:nimcache/bindings_curves_headers/" & bindingsName & "_header" &
+       " bindings_generators/" & bindingsName & ".nim"
+  exec "build/" & bindingsName & "_gen_header.exe include"
+
+task bindings, "Generate Constantine bindings":
+  # Curve arithmetic
+  genStaticBindings(kCurve, "constantine_bls12_381", "ctt_bls12381_init_")
+  genDynamicBindings(kCurve, "constantine_bls12_381", "ctt_bls12381_init_")
+  genHeaders("constantine_bls12_381")
+  echo ""
+  genStaticBindings(kCurve, "constantine_pasta", "ctt_pasta_init_")
+  genDynamicBindings(kCurve, "constantine_pasta", "ctt_pasta_init_")
+  genHeaders("constantine_pasta")
+  echo ""
+
+  # Protocols
+  genStaticBindings(kProtocol, "ethereum_bls_signatures", "ctt_eth_bls_init_")
+  genDynamicBindings(kProtocol, "ethereum_bls_signatures", "ctt_eth_bls_init_")
+
+proc testLib(path, testName, libName: string, useGMP: bool) =
+  let dynlibName = if defined(windows): libName & ".dll"
+                   elif defined(macosx): "lib" & libName & ".dylib"
+                   else: "lib" & libName & ".so"
+  let staticlibName = if defined(windows): libName & ".lib"
+                      else: "lib" & libName & ".a"
+
+  echo &"\n[Bindings: {path}/{testName}.c] Testing dynamically linked library {dynlibName}"
+  exec &"gcc -Iinclude -Llib -o build/testbindings/{testName}_dynlink.exe {path}/{testName}.c -l{libName} " & (if useGMP: "-lgmp" else: "")
+  when defined(windows):
+    # Put DLL near the exe as LD_LIBRARY_PATH doesn't work even in a POSIX compatible shell
+    exec &"./build/testbindings/{testName}_dynlink.exe"
+  else:
+    exec &"LD_LIBRARY_PATH=lib ./build/testbindings/{testName}_dynlink.exe"
+
+
+  echo &"\n[Bindings: {path}/{testName}.c] Testing statically linked library: {staticlibName}"
+  # Beware MacOS annoying linker with regards to static libraries
+  # The following standard way cannot be used on MacOS
+  # exec "gcc -Iinclude -Llib -o build/t_libctt_bls12_381_sl.exe examples_c/t_libctt_bls12_381.c -lgmp -Wl,-Bstatic -lconstantine_bls12_381 -Wl,-Bdynamic"
+  exec &"gcc -Iinclude -o build/testbindings/{testName}_staticlink.exe {path}/{testName}.c lib/{staticlibName} " & (if useGMP: "-lgmp" else: "")
+  exec &"./build/testbindings/{testName}_staticlink.exe"
+
+task test_bindings, "Test C bindings":
+  exec "mkdir -p build/testbindings"
+  testLib("examples_c", "t_libctt_bls12_381", "constantine_bls12_381", useGMP = true)
+  testLib("examples_c", "ethereum_bls_signatures", "constantine_ethereum_bls_signatures", useGMP = false)

 # Test config
 # ----------------------------------------------------------------
@ -232,7 +410,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
  # Protocols
  # ----------------------------------------------------------
  ("tests/t_ethereum_evm_precompiles.nim", false),
-  ("tests/t_blssig_pop_on_bls12381_g2.nim", false),
+  ("tests/t_ethereum_bls_signatures.nim", false),
  ("tests/t_ethereum_eip2333_bls12381_key_derivation.nim", false),
 ]

@ -291,7 +469,7 @@ const benchDesc = [
  "bench_poly1305",
  "bench_sha256",
  "bench_hash_to_curve",
-  "bench_blssig_on_bls12_381_g2"
+  "bench_ethereum_bls_signatures"
 ]

 # For temporary (hopefully) investigation that can only be reproduced in CI
@ -300,22 +478,9 @@ const useDebug = [
  "tests/math/t_hash_sha256_vs_openssl.nim",
 ]

-# Tests that uses sequences require Nim GC, stack scanning and nil pointer passed to openarray
-# In particular the tests that uses the json test vectors, don't sanitize them.
-# we do use gc:none to help
+# Skip sanitizers for specific tests
 const skipSanitizers = [
-  "tests/math/t_ec_sage_bn254_nogami.nim",
-  "tests/math/t_ec_sage_bn254_snarks.nim",
-  "tests/math/t_ec_sage_bls12_377.nim",
-  "tests/math/t_ec_sage_bls12_381.nim",
-  "tests/t_blssig_pop_on_bls12381_g2.nim",
-  "tests/t_hash_to_field.nim",
-  "tests/t_hash_to_curve.nim",
-  "tests/t_hash_to_curve_random.nim",
-  "tests/t_mac_poly1305.nim",
-  "tests/t_mac_hmac.nim",
-  "tests/t_kdf_hkdf.nim",
-  "tests/t_ethereum_eip2333_bls12381_key_derivation.nim"
+  "tests/t_"
 ]

 when defined(windows):
@ -323,13 +488,19 @@ when defined(windows):
  const sanitizers = ""
 else:
  const sanitizers =
-    " --passC:-fsanitize=undefined --passL:-fsanitize=undefined" &
-    " --passC:-fno-sanitize-recover" & # Enforce crash on undefined behaviour
-    " --gc:none" # The conservative stack scanning of Nim default GC triggers, alignment UB and stack-buffer-overflow check.
-    # " --passC:-fsanitize=address --passL:-fsanitize=address" & # Requires too much stack for the inline assembly
+    # Sanitizers are incompatible with nim default GC
+    # The conservative stack scanning of Nim default GC triggers alignment UB and stack-buffer-overflow checks.
+    # Address sanitizer requires free registers and needs to be disabled for some inline assembly files.
+    # Ensure you use --mm:arc -d:useMalloc
+    #
+    # Sanitizers are deactivated by default as they slow down CI by at least 6x
+
+    # " --passC:-fsanitize=undefined --passL:-fsanitize=undefined" &
+    # " --passC:-fsanitize=address --passL:-fsanitize=address" &
+    " --passC:-fno-sanitize-recover" # Enforce crash on undefined behaviour


-# Helper functions
+# Tests & Benchmarks helper functions
 # ----------------------------------------------------------------

 proc clearParallelBuild() =
@ -337,7 +508,7 @@ proc clearParallelBuild() =
  if fileExists(buildParallel):
    rmFile(buildParallel)

-template setupCommand(): untyped {.dirty.} =
+template setupTestCommand(): untyped {.dirty.} =
  var lang = "c"
  if existsEnv"TEST_LANG":
    lang = getEnv"TEST_LANG"
@ -349,10 +520,12 @@ template setupCommand(): untyped {.dirty.} =
  var flags = flags
  when not defined(windows):
    # Not available in MinGW https://github.com/libressl-portable/portable/issues/54
-    flags &= " --passC:-fstack-protector-strong"
-  let command = "nim " & lang & cc & " -d:release " & flags &
-    " --panics:on " & # Defects are not catchable
-    " --verbosity:0 --outdir:build/testsuite -r --hints:off --warnings:off " &
+    flags &= " --passC:-fstack-protector-strong --passC:-D_FORTIFY_SOURCE=2 "
+  let command = "nim " & lang & cc &
+    " -r " &
+    flags &
+    releaseBuildOptions() &
+    " --outdir:build/testsuite " &
    " --nimcache:nimcache/" & path & " " &
    path

@ -363,7 +536,7 @@ proc test(cmd: string) =
  exec cmd

 proc testBatch(commands: var string, flags, path: string) =
-  setupCommand()
+  setupTestCommand()
  commands &= command & '\n'

 template setupBench(): untyped {.dirty.} =
@ -383,10 +556,10 @@ template setupBench(): untyped {.dirty.} =
  if not useAsm:
    cc &= " -d:CttASM=false"
  let command = "nim " & lang & cc &
-       " --panics:on " & # Defects are not catchable
-       " -d:danger --verbosity:0 -o:build/bench/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
+       releaseBuildOptions() &
+       " -o:build/bench/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
       " --nimcache:nimcache/benches/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
-       runFlag & "--hints:off --warnings:off benchmarks/" & benchName & ".nim"
+       runFlag & " benchmarks/" & benchName & ".nim"

 proc runBench(benchName: string, compiler = "", useAsm = true) =
  if not dirExists "build":
@ -410,11 +583,11 @@ proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, testAS
    if not(td.useGMP and not requireGMP):
      var flags = ""
      if not testASM:
-        flags &= " -d:CttASM=false"
+        flags &= " -d:CttASM=false "
      if test32bit:
-        flags &= " -d:Constantine32"
+        flags &= " -d:Constantine32 "
      if td.path in useDebug:
-        flags &= " -d:debugConstantine"
+        flags &= " -d:debugConstantine "
      if td.path notin skipSanitizers:
        flags &= sanitizers

@ -425,8 +598,11 @@ proc addTestSetNvidia(cmdFile: var string) =
    mkDir "build"
  echo "Found " & $testDescNvidia.len & " tests to run."

-  for path in testDescNvidia:
-    cmdFile.testBatch(flags = "", path)
+  for path in testDescNvidia: # iterate the Nvidia test list whose length is echoed above
+    var flags = ""
+    if path notin skipSanitizers:
+      flags &= sanitizers
+    cmdFile.testBatch(flags, path)

 proc addTestSetThreadpool(cmdFile: var string) =
  if not dirExists "build":
@ -434,7 +610,10 @@ proc addTestSetThreadpool(cmdFile: var string) =
  echo "Found " & $testDescThreadpool.len & " tests to run."

  for path in testDescThreadpool:
-    cmdFile.testBatch(flags = "--threads:on --linetrace:on --debugger:native", path)
+    var flags = " --threads:on --debugger:native "
+    if path notin skipSanitizers:
+      flags &= sanitizers
+    cmdFile.testBatch(flags, path)

 proc addTestSetMultithreadedCrypto(cmdFile: var string, test32bit = false, testASM = true) =
  if not dirExists "build":
@ -461,115 +640,12 @@ proc addBenchSet(cmdFile: var string, useAsm = true) =
  for bd in benchDesc:
    cmdFile.buildBenchBatch(bd, useASM = useASM)

-proc genDynamicBindings(bindingsName, prefixNimMain: string) =
-  proc compile(libName: string, flags = "") =
-    # -d:danger to avoid boundsCheck, overflowChecks that would trigger exceptions or allocations in a crypto library.
-    #           Those are internally guaranteed at compile-time by fixed-sized array
-    #           and checked at runtime with an appropriate error code if any for user-input.
-    # -gc:arc   Constantine stack allocates everything. Inputs are through unmanaged ptr+len.
-    #           In the future, Constantine might use:
-    #             - heap-allocated sequences and objects manually managed or managed by destructors for multithreading.
-    #             - heap-allocated strings for hex-string or decimal strings
-    echo "Compiling dynamic library: lib/" & libName
-    exec "nim c -f " & flags & " --noMain -d:danger --app:lib --gc:arc " &
-         " --panics:on " & # Defects are not catchable
-         " --verbosity:0 --hints:off --warnings:off " &
-         " --nimMainPrefix:" & prefixNimMain &
-         " --out:" & libName & " --outdir:lib " &
-         " --nimcache:nimcache/bindings/" & bindingsName &
-         " bindings/" & bindingsName & ".nim"
-
-  when defined(windows):
-    compile bindingsName & ".dll"
-
-  elif defined(macosx):
-    compile "lib" & bindingsName & ".dylib.arm", "--cpu:arm64 -l:'-target arm64-apple-macos11' -t:'-target arm64-apple-macos11'"
-    compile "lib" & bindingsName & ".dylib.x64", "--cpu:amd64 -l:'-target x86_64-apple-macos10.12' -t:'-target x86_64-apple-macos10.12'"
-    exec "lipo lib/lib" & bindingsName & ".dylib.arm " &
-             " lib/lib" & bindingsName & ".dylib.x64 " &
-             " -output lib/lib" & bindingsName & ".dylib -create"
-
-  else:
-    compile "lib" & bindingsName & ".so"
-
-proc genStaticBindings(bindingsName, prefixNimMain: string) =
-  proc compile(libName: string, flags = "") =
-    # -d:danger to avoid boundsCheck, overflowChecks that would trigger exceptions or allocations in a crypto library.
-    #           Those are internally guaranteed at compile-time by fixed-sized array
-    #           and checked at runtime with an appropriate error code if any for user-input.
-    # -gc:arc   Constantine stack allocates everything. Inputs are through unmanaged ptr+len.
-    #           In the future, Constantine might use:
-    #             - heap-allocated sequences and objects manually managed or managed by destructors for multithreading.
-    #             - heap-allocated strings for hex-string or decimal strings
-    echo "Compiling static library:  lib/" & libName
-    exec "nim c -f " & flags & " --noMain -d:danger --app:staticLib --gc:arc " &
-         " --panics:on " & # Defects are not catchable
-         " --verbosity:0 --hints:off --warnings:off " &
-         " --nimMainPrefix:" & prefixNimMain &
-         " --out:" & libName & " --outdir:lib " &
-         " --nimcache:nimcache/bindings/" & bindingsName &
-         " bindings/" & bindingsName & ".nim"
-
-  when defined(windows):
-    compile bindingsName & ".lib"
-
-  elif defined(macosx):
-    compile "lib" & bindingsName & ".a.arm", "--cpu:arm64 -l:'-target arm64-apple-macos11' -t:'-target arm64-apple-macos11'"
-    compile "lib" & bindingsName & ".a.x64", "--cpu:amd64 -l:'-target x86_64-apple-macos10.12' -t:'-target x86_64-apple-macos10.12'"
-    exec "lipo lib/lib" & bindingsName & ".a.arm " &
-             " lib/lib" & bindingsName & ".a.x64 " &
-             " -output lib/lib" & bindingsName & ".a -create"
-
-  else:
-    compile "lib" & bindingsName & ".a"
-
-proc genHeaders(bindingsName: string) =
-  echo "Generating header:         include/" & bindingsName & ".h"
-  exec "nim c -d:release -d:CttGenerateHeaders " &
-       " --verbosity:0 --hints:off --warnings:off " &
-       " --out:" & bindingsName & "_gen_header.exe --outdir:build " &
-       " --nimcache:nimcache/bindings/" & bindingsName & "_header" &
-       " bindings/" & bindingsName & ".nim"
-  exec "build/" & bindingsName & "_gen_header.exe include"
-
 proc genParallelCmdRunner() =
  exec "nim c --verbosity:0 --hints:off --warnings:off -d:release --out:build/pararun --nimcache:nimcache/pararun helpers/pararun.nim"

 # Tasks
 # ----------------------------------------------------------------

-task bindings, "Generate Constantine bindings":
-  genDynamicBindings("constantine_bls12_381", "ctt_bls12381_init_")
-  genStaticBindings("constantine_bls12_381", "ctt_bls12381_init_")
-  genHeaders("constantine_bls12_381")
-  echo ""
-  genDynamicBindings("constantine_pasta", "ctt_pasta_init_")
-  genStaticBindings("constantine_pasta", "ctt_pasta_init_")
-  genHeaders("constantine_pasta")
-
-task test_bindings, "Test C bindings":
-  exec "mkdir -p build/testsuite"
-  echo "--> Testing dynamically linked library"
-  when not defined(windows):
-    exec "gcc -Iinclude -Llib -o build/testsuite/t_libctt_bls12_381_dl examples_c/t_libctt_bls12_381.c -lgmp -lconstantine_bls12_381"
-    exec "LD_LIBRARY_PATH=lib ./build/testsuite/t_libctt_bls12_381_dl"
-  else:
-    # Put DLL near the exe as LD_LIBRARY_PATH doesn't work even in an POSIX compatible shell
-    exec "gcc -Iinclude -Llib -o build/testsuite/t_libctt_bls12_381_dl.exe examples_c/t_libctt_bls12_381.c -lgmp -lconstantine_bls12_381"
-    exec "./build/testsuite/t_libctt_bls12_381_dl.exe"
-
-  echo "--> Testing statically linked library"
-  when not defined(windows):
-    # Beware MacOS annoying linker with regards to static libraries
-    # The following standard way cannot be used on MacOS
-    # exec "gcc -Iinclude -Llib -o build/t_libctt_bls12_381_sl.exe examples_c/t_libctt_bls12_381.c -lgmp -Wl,-Bstatic -lconstantine_bls12_381 -Wl,-Bdynamic"
-
-    exec "gcc -Iinclude -o build/testsuite/t_libctt_bls12_381_sl examples_c/t_libctt_bls12_381.c lib/libconstantine_bls12_381.a -lgmp"
-    exec "./build/testsuite/t_libctt_bls12_381_sl"
-  else:
-    exec "gcc -Iinclude -o build/testsuite/t_libctt_bls12_381_sl.exe examples_c/t_libctt_bls12_381.c lib/constantine_bls12_381.lib -lgmp"
-    exec "./build/testsuite/t_libctt_bls12_381_sl.exe"
-
 task test, "Run all tests":
  # -d:testingCurves is configured in a *.nim.cfg for convenience
  var cmdFile: string
@ -1123,17 +1199,17 @@ task bench_hash_to_curve_clang_noasm, "Run Hash-to-Curve benchmarks":

 # BLS signatures
 # ------------------------------------------
-task bench_blssig_on_bls12_381_g2, "Run Hash-to-Curve benchmarks":
-  runBench("bench_blssig_on_bls12_381_g2")
+task bench_ethereum_bls_signatures, "Run Ethereum BLS signatures benchmarks":
+  runBench("bench_ethereum_bls_signatures")

-task bench_blssig_on_bls12_381_g2_gcc, "Run Hash-to-Curve benchmarks":
-  runBench("bench_blssig_on_bls12_381_g2", "gcc")
+task bench_ethereum_bls_signatures_gcc, "Run Ethereum BLS signatures benchmarks":
+  runBench("bench_ethereum_bls_signatures", "gcc")

-task bench_blssig_on_bls12_381_g2_clang, "Run Hash-to-Curve benchmarks":
-  runBench("bench_blssig_on_bls12_381_g2", "clang")
+task bench_ethereum_bls_signatures_clang, "Run Ethereum BLS signatures benchmarks":
+  runBench("bench_ethereum_bls_signatures", "clang")

-task bench_blssig_on_bls12_381_g2_gcc_noasm, "Run Hash-to-Curve benchmarks":
-  runBench("bench_blssig_on_bls12_381_g2", "gcc", useAsm = false)
+task bench_ethereum_bls_signatures_gcc_noasm, "Run Ethereum BLS signatures benchmarks":
+  runBench("bench_ethereum_bls_signatures", "gcc", useAsm = false)

-task bench_blssig_on_bls12_381_g2_clang_noasm, "Run Hash-to-Curve benchmarks":
-  runBench("bench_blssig_on_bls12_381_g2", "clang", useAsm = false)
+task bench_ethereum_bls_signatures_clang_noasm, "Run Ethereum BLS signatures benchmarks":
+  runBench("bench_ethereum_bls_signatures", "clang", useAsm = false)
--- a/constantine/ciphers/chacha20.nim
+++ b/constantine/ciphers/chacha20.nim
@ -6,7 +6,7 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

-import ../platforms/endians
+import ../platforms/[endians, views]

 # ############################################################
 #
@ -79,18 +79,18 @@ func chacha20_block(

  # uint32 are 4 bytes so multiply destination by 4
  for i in 0'u ..< 4:
-    key_stream.dumpRawInt(state[i] + cccc[i], i shl 2, littleEndian) 
+    key_stream.dumpRawInt(state[i] + cccc[i], i shl 2, littleEndian)
  for i in 4'u ..< 12:
    key_stream.dumpRawInt(state[i] + key[i-4], i shl 2, littleEndian)
  key_stream.dumpRawInt(state[12] + block_counter, 12 shl 2, littleEndian)
  for i in 13'u ..< 16:
    key_stream.dumpRawInt(state[i] + nonce[i-13], i shl 2, littleEndian)

-func chacha20_cipher*[T: byte|char](
+func chacha20_cipher*(
       key: array[32, byte],
       counter: uint32,
       nonce: array[12, byte],
-       data: var openarray[T]): uint32 =
+       data: var openArray[byte]): uint32 {.genCharAPI.} =
  ## Encrypt or decrypt `data` using the ChaCha20 cipher
  ## - `key` is a 256-bit (32 bytes) secret shared encryption/decryption key.
  ## - `counter`. A monotonically increasing value per encryption.
--- a/constantine/blssig_pop_on_bls12381_g2.nim
+++ b/constantine/blssig_pop_on_bls12381_g2.nim
@ -6,34 +6,15 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

-import
-    ./platforms/abstractions,
-    ./math/config/curves,
-    ./math/[
-      ec_shortweierstrass,
-      extension_fields,
-      arithmetic,
-      constants/zoo_subgroups
-    ],
-    ./math/io/[io_bigints, io_fields],
-    hashes,
-    signatures/bls_signatures
-
-export
-  abstractions, # generic sandwich on SecretBool and SecretBool in Jacobian sumImpl
-  curves, # generic sandwich on matchingBigInt
-  extension_fields, # generic sandwich on extension field access
-  hashes, # generic sandwich on sha256
-  ec_shortweierstrass # generic sandwich on affine
-
 ## ############################################################
 ##
-##              BLS Signatures on BLS12-381 G2
+##              BLS Signatures for Ethereum
 ##
 ## ############################################################
 ##
 ## This module implements BLS Signatures (Boneh-Lynn-Shacham)
-## on top of the BLS12-381 curve (Barreto-Lynn-Scott).
+## on top of the BLS12-381 curve (Barreto-Lynn-Scott), with signatures on G2,
+## for the Ethereum blockchain.
 ##
 ## Ciphersuite:
 ##
@ -45,33 +26,83 @@ export
 ## - Domain separation tag: "BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_"
 ## - Hash function: SHA256
 ##
-## Currently Constantine does not provide popProve and popVerify
-## which are thin wrapper over sign/verify with
-## - the message to sign or verify being the compressed or uncompressed public key
-##   or another application-specific "hash_pubkey_to_point" scheme
-## - domain-separation-tag: "BLS_POP_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_"
+## Specs:
+## - https://github.com/ethereum/consensus-specs/blob/v1.2.0/specs/phase0/beacon-chain.md#bls-signatures
+## - https://github.com/ethereum/consensus-specs/blob/v1.2.0/specs/altair/bls.md
+## - https://www.ietf.org/archive/id/draft-irtf-cfrg-bls-signature-05.html
 ##
-## Constantine currently assumes that proof-of-possessions are handled at the application-level
+## Test vectors:
+## - https://github.com/ethereum/bls12-381-tests
 ##
-## In proof-of-stake blockchains, being part of the staker/validator sets
-## already serve as proof-of-possession.
+## The Ethereum blockchain uses the proof-of-possession scheme (PoP).
+## Each public key is associated with a deposit proof required to participate
+## in the blockchain consensus protocol, hence PopProve and PopVerify
+## as defined in the IETF spec are not needed.

 const DST = "BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_"
-const ffi_prefix {.used.} = "ctt_blssig_pop_on_bls12381_g2_"
+const prefix_ffi = "ctt_eth_bls_"

-{.push raises: [].} # No exceptions allowed in core cryptographic operations
-# {.push cdecl, dynlib, exportc:ffi_prefix & "$1".} # TODO, C API
+# Dependencies exports for C FFI
+# ------------------------------------------------------------------------------------------------
+
+import ./zoo_exports
+
+static:
+  # Export SHA256 routines with a protocol-specific prefix
+  # This exports sha256.init(), sha256.update(), sha256.finish() and sha256.clear()
+  prefix_sha256 = prefix_ffi & "_sha256_"
+
+import hashes
+export hashes # generic sandwich on sha256
+
+func sha256_hash*(digest: var array[32, byte], message: openArray[byte], clearMem: bool) {.libPrefix: prefix_ffi.} =
+  ## Compute the SHA-256 hash of message
+  ## and store the result in digest.
+  ## Optionally, clear the memory buffer used.
+
+  # There is an extra indirect function call as we use a generic `hash` concept but:
+  # - the indirection saves space (instead of duplicating `hash`)
+  # - minimal overhead compared to hashing time
+  # - Can be tail-call optimized into a goto jump instead of call/return
+  # - Can be LTO-optimized
+  sha256.hash(digest, message, clearMem)
+
+# Imports
+# ------------------------------------------------------------------------------------------------
+
+import
+    ./platforms/[abstractions, views],
+    ./math/config/curves,
+    ./math/[
+      ec_shortweierstrass,
+      extension_fields,
+      arithmetic,
+      constants/zoo_subgroups
+    ],
+    ./math/io/[io_bigints, io_fields],
+    signatures/bls_signatures
+
+export
+  abstractions, # generic sandwich on SecretBool and SecretBool in Jacobian sumImpl
+  curves, # generic sandwich on matchingBigInt
+  extension_fields, # generic sandwich on extension field access
+  ec_shortweierstrass # generic sandwich on affine
+
+# Protocol types
+# ------------------------------------------------------------------------------------------------
+
+{.checks: off.} # No exceptions allowed in core cryptographic operations

 type
-  SecretKey* {.byref.} = object
+  SecretKey* {.byref, exportc: prefix_ffi & "seckey".} = object
    ## A BLS12_381 secret key
    raw: matchingOrderBigInt(BLS12_381)

-  PublicKey* {.byref.} = object
+  PublicKey* {.byref, exportc: prefix_ffi & "pubkey".} = object
    ## A BLS12_381 public key for BLS signature schemes with public keys on G1 and signatures on G2
    raw: ECP_ShortW_Aff[Fp[BLS12_381], G1]

-  Signature* {.byref.} = object
+  Signature* {.byref, exportc: prefix_ffi & "signature".} = object
    ## A BLS12_381 signature for BLS signature schemes with public keys on G1 and signatures on G2
    raw: ECP_ShortW_Aff[Fp2[BLS12_381], G2]

@ -91,18 +122,26 @@ type
 # Comparisons
 # ------------------------------------------------------------------------------------------------

-func isZero*(elem: PublicKey or Signature): bool =
+func pubkey_is_zero*(pubkey: PublicKey): bool {.libPrefix: prefix_ffi.} =
  ## Returns true if input is 0
-  bool(elem.raw.isInf())
+  bool(pubkey.raw.isInf())

-func `==`*(a, b: PublicKey or Signature): bool =
+func signature_is_zero*(sig: Signature): bool {.libPrefix: prefix_ffi.} =
+  ## Returns true if input is 0
+  bool(sig.raw.isInf())
+
+func pubkeys_are_equal*(a, b: PublicKey): bool {.libPrefix: prefix_ffi.} =
+  ## Returns true if inputs are equal
+  bool(a.raw == b.raw)
+
+func signatures_are_equal*(a, b: Signature): bool {.libPrefix: prefix_ffi.} =
  ## Returns true if inputs are equal
  bool(a.raw == b.raw)

 # Input validation
 # ------------------------------------------------------------------------------------------------

-func validate_seckey*(secret_key: SecretKey): CttBLSStatus =
+func validate_seckey*(secret_key: SecretKey): CttBLSStatus {.libPrefix: prefix_ffi.} =
  ## Validate the secret key.
  ## Regarding timing attacks, this will leak timing information only if the key is invalid.
  ## Namely, the secret key is 0 or the secret key is too large.
@ -112,7 +151,7 @@ func validate_seckey*(secret_key: SecretKey): CttBLSStatus =
    return cttBLS_SecretKeyLargerThanCurveOrder
  return cttBLS_Success

-func validate_pubkey*(public_key: PublicKey): CttBLSStatus =
+func validate_pubkey*(public_key: PublicKey): CttBLSStatus {.libPrefix: prefix_ffi.} =
  ## Validate the public key.
  ## This is an expensive operation that can be cached
  if public_key.raw.isInf().bool():
@ -122,7 +161,7 @@ func validate_pubkey*(public_key: PublicKey): CttBLSStatus =
  if not public_key.raw.isInSubgroup().bool():
    return cttBLS_PointNotInSubgroup

-func validate_sig*(signature: Signature): CttBLSStatus =
+func validate_signature*(signature: Signature): CttBLSStatus {.libPrefix: prefix_ffi.} =
  ## Validate the signature.
  ## This is an expensive operation that can be cached
  if signature.raw.isInf().bool():
@ -153,17 +192,17 @@ func validate_sig*(signature: Signature): CttBLSStatus =
 ##     The third-most significant bit is set if (and only if) this point is in compressed form
 ##     and it is not the point at infinity and its y-coordinate is the lexicographically largest of the two associated with the encoded x-coordinate.
 ##
-## - https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-bls-signature-04#appendix-A
+## - https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-bls-signature-05#appendix-A
 ## - https://docs.rs/bls12_381/latest/bls12_381/notes/serialization/index.html
 ##   - https://github.com/zkcrypto/bls12_381/blob/0.6.0/src/notes/serialization.rs

-func serialize_secret_key*(dst: var array[32, byte], secret_key: SecretKey): CttBLSStatus =
+func serialize_seckey*(dst: var array[32, byte], secret_key: SecretKey): CttBLSStatus {.libPrefix: prefix_ffi.} =
  ## Serialize a secret key
  ## Returns cttBLS_Success if successful
  dst.marshal(secret_key.raw, bigEndian)
  return cttBLS_Success

-func serialize_public_key_compressed*(dst: var array[48, byte], public_key: PublicKey): CttBLSStatus =
+func serialize_pubkey_compressed*(dst: var array[48, byte], public_key: PublicKey): CttBLSStatus {.libPrefix: prefix_ffi.} =
  ## Serialize a public key in compressed (Zcash) format
  ##
  ## Returns cttBLS_Success if successful
@ -184,7 +223,7 @@ func serialize_public_key_compressed*(dst: var array[48, byte], public_key: Publ

  return cttBLS_Success

-func serialize_signature_compressed*(dst: var array[96, byte], signature: Signature): CttBLSStatus =
+func serialize_signature_compressed*(dst: var array[96, byte], signature: Signature): CttBLSStatus {.libPrefix: prefix_ffi.} =
  ## Serialize a signature in compressed (Zcash) format
  ##
  ## Returns cttBLS_Success if successful
@ -206,8 +245,9 @@ func serialize_signature_compressed*(dst: var array[96, byte], signature: Signat

  return cttBLS_Success

-func deserialize_secret_key*(dst: var SecretKey, src: array[32, byte]): CttBLSStatus =
-  ## deserialize a secret key
+func deserialize_seckey*(dst: var SecretKey, src: array[32, byte]): CttBLSStatus {.libPrefix: prefix_ffi.} =
+  ## Deserialize a secret key
+  ## This also validates the secret key.
  ##
  ## This is protected against side-channel unless your key is invalid.
  ## In that case it will leak whether it's all zeros or larger than the curve order.
@ -218,7 +258,7 @@ func deserialize_secret_key*(dst: var SecretKey, src: array[32, byte]): CttBLSSt
    return status
  return cttBLS_Success

-func deserialize_public_key_compressed_unchecked*(dst: var PublicKey, src: array[48, byte]): CttBLSStatus =
+func deserialize_pubkey_compressed_unchecked*(dst: var PublicKey, src: array[48, byte]): CttBLSStatus {.libPrefix: prefix_ffi.} =
  ## Deserialize a public_key in compressed (Zcash) format.
  ##
  ## Warning ⚠:
@ -260,19 +300,20 @@ func deserialize_public_key_compressed_unchecked*(dst: var PublicKey, src: array
  let srcIsLargest = SecretBool((src[0] shr 5) and byte 1)
  dst.raw.y.cneg(isLexicographicallyLargest xor srcIsLargest)

-func deserialize_public_key_compressed*(dst: var PublicKey, src: array[48, byte]): CttBLSStatus =
+func deserialize_pubkey_compressed*(dst: var PublicKey, src: array[48, byte]): CttBLSStatus {.libPrefix: prefix_ffi.} =
  ## Deserialize a public_key in compressed (Zcash) format
+  ## This also validates the public key.
  ##
  ## Returns cttBLS_Success if successful

-  result = deserialize_public_key_compressed_unchecked(dst, src)
+  result = deserialize_pubkey_compressed_unchecked(dst, src)
  if result != cttBLS_Success:
    return result

-  if not(bool dst.raw.isInSubgroup):
+  if not(bool dst.raw.isInSubgroup()):
    return cttBLS_PointNotInSubgroup

-func deserialize_signature_compressed_unchecked*(dst: var Signature, src: array[96, byte]): CttBLSStatus =
+func deserialize_signature_compressed_unchecked*(dst: var Signature, src: array[96, byte]): CttBLSStatus {.libPrefix: prefix_ffi.} =
  ## Deserialize a signature in compressed (Zcash) format.
  ##
  ## Warning ⚠:
@ -325,7 +366,7 @@ func deserialize_signature_compressed_unchecked*(dst: var Signature, src: array[
  let srcIsLargest = SecretBool((src[0] shr 5) and byte 1)
  dst.raw.y.cneg(isLexicographicallyLargest xor srcIsLargest)

-func deserialize_signature_compressed*(dst: var Signature, src: array[96, byte]): CttBLSStatus =
+func deserialize_signature_compressed*(dst: var Signature, src: array[96, byte]): CttBLSStatus {.libPrefix: prefix_ffi.} =
  ## Deserialize a signature in compressed (Zcash) format
  ##
  ## Returns cttBLS_Success if successful
@ -334,13 +375,13 @@ func deserialize_signature_compressed*(dst: var Signature, src: array[96, byte])
  if result != cttBLS_Success:
    return result

-  if not(bool dst.raw.isInSubgroup):
+  if not(bool dst.raw.isInSubgroup()):
    return cttBLS_PointNotInSubgroup

-# Signatures
+# BLS Signatures
 # ------------------------------------------------------------------------------------------------

-func derive_public_key*(public_key: var PublicKey, secret_key: SecretKey): CttBLSStatus =
+func derive_pubkey*(public_key: var PublicKey, secret_key: SecretKey): CttBLSStatus {.libPrefix: prefix_ffi.} =
  ## Derive the public key matching with a secret key
  ##
  ## Secret protection:
@ -356,7 +397,7 @@ func derive_public_key*(public_key: var PublicKey, secret_key: SecretKey): CttBL
    return cttBLS_InvalidEncoding
  return cttBLS_Success

-func sign*[T: byte|char](signature: var Signature, secret_key: SecretKey, message: openArray[T]): CttBLSStatus =
+func sign*(signature: var Signature, secret_key: SecretKey, message: openArray[byte]): CttBLSStatus {.libPrefix: prefix_ffi, genCharAPI.} =
  ## Produce a signature for the message under the specified secret key
  ## Signature is on BLS12-381 G2 (and public key on G1)
  ##
@ -382,7 +423,7 @@ func sign*[T: byte|char](signature: var Signature, secret_key: SecretKey, messag
  coreSign(signature.raw, secretKey.raw, message, sha256, 128, augmentation = "", DST)
  return cttBLS_Success

-func verify*[T: byte|char](public_key: PublicKey, message: openarray[T], signature: Signature): CttBLSStatus =
+func verify*(public_key: PublicKey, message: openArray[byte], signature: Signature): CttBLSStatus {.libPrefix: prefix_ffi, genCharAPI.} =
  ## Check that a signature is valid for a message
  ## under the provided public key.
  ## returns `true` if the signature is valid, `false` otherwise.
@ -394,9 +435,13 @@ func verify*[T: byte|char](public_key: PublicKey, message: openarray[T], signatu
  ##   Or validated via validate_pubkey
  ## - A message
  ## - A signature initialized by one of the key derivation or deserialization procedure.
-  ##   Or validated via validate_pubkey
+  ##   Or validated via validate_signature
  ##
-  ## In particular, the public key and signature are assumed to be on curve subgroup checked.
+  ## Output:
+  ## - a status code with verification success if signature is valid
+  ##   or indicating verification failure
+  ##
+  ## In particular, the public key and signature are assumed to be on curve and subgroup-checked.

  # Deal with cases where the pubkey or signature was mistakenly zero-initialized, for example by a generic aggregation attempt
  if bool(public_key.raw.isInf() or signature.raw.isInf()):
@ -411,25 +456,29 @@ template unwrap[T: PublicKey|Signature](elems: openArray[T]): auto =
  # Unwrap collection of high-level type into collection of low-level type
  toOpenArray(cast[ptr UncheckedArray[typeof elems[0].raw]](elems[0].raw.unsafeAddr), elems.low, elems.high)

-func aggregate_pubkeys*(aggregate_pubkey: var PublicKey, pubkeys: openArray[PublicKey]) =
+func aggregate_pubkeys_unstable_api*(aggregate_pubkey: var PublicKey, pubkeys: openArray[PublicKey]) =
  ## Aggregate public keys into one
  ## The individual public keys are assumed to be validated, either during deserialization
  ## or by validate_pubkey
+  #
+  # TODO: Return a bool or status code or nothing?
  if pubkeys.len == 0:
    aggregate_pubkey.raw.setInf()
    return
  aggregate_pubkey.raw.aggregate(pubkeys.unwrap())

-func aggregate_signatures*(aggregate_sig: var Signature, signatures: openArray[Signature]) =
+func aggregate_signatures_unstable_api*(aggregate_sig: var Signature, signatures: openArray[Signature]) =
  ## Aggregate signatures into one
  ## The individual signatures are assumed to be validated, either during deserialization
  ## or by validate_signature
+  #
+  # TODO: Return a bool or status code or nothing?
  if signatures.len == 0:
    aggregate_sig.raw.setInf()
    return
  aggregate_sig.raw.aggregate(signatures.unwrap())

-func fast_aggregate_verify*[T: byte|char](pubkeys: openArray[PublicKey], message: openarray[T], aggregate_sig: Signature): CttBLSStatus =
+func fast_aggregate_verify*(pubkeys: openArray[PublicKey], message: openArray[byte], aggregate_sig: Signature): CttBLSStatus {.libPrefix: prefix_ffi, genCharAPI.} =
  ## Check that a signature is valid for a message
  ## under the aggregate of provided public keys.
  ## returns `true` if the signature is valid, `false` otherwise.
@ -441,7 +490,7 @@ func fast_aggregate_verify*[T: byte|char](pubkeys: openArray[PublicKey], message
  ##   Or validated via validate_pubkey
  ## - A message
  ## - A signature initialized by one of the key derivation or deserialization procedure.
-  ##   Or validated via validate_sig
+  ##   Or validated via validate_signature
  ##
  ## In particular, the public keys and signature are assumed to be on curve and subgroup-checked.

@ -465,7 +514,11 @@ func fast_aggregate_verify*[T: byte|char](pubkeys: openArray[PublicKey], message
    return cttBLS_Success
  return cttBLS_VerificationFailure

-func aggregate_verify*[M](pubkeys: openArray[PublicKey], messages: openarray[M], aggregate_sig: Signature): CttBLSStatus =
+# C FFI
+func aggregate_verify*(pubkeys: ptr UncheckedArray[PublicKey],
+                       messages: ptr UncheckedArray[View[byte]],
+                       len: int,
+                       aggregate_sig: Signature): CttBLSStatus {.libPrefix: prefix_ffi.} =
  ## Verify the aggregated signature of multiple (pubkey, message) pairs
  ## Returns cttBLS_Success if the signature is valid, an error status otherwise.
  ##
@ -476,12 +529,53 @@ func aggregate_verify*[M](pubkeys: openArray[PublicKey], messages: openarray[M],
  ##   Or validated via validate_pubkey
  ## - Messages
  ## - a signature initialized by one of the key derivation or deserialization procedure.
-  ##   Or validated via validate_sig
+  ##   Or validated via validate_signature
  ##
  ## In particular, the public keys and signature are assumed to be on curve and subgroup-checked.
  ##
  ## To avoid splitting zeros and rogue keys attack:
-  ## 1. Public keys signing the same message MUST be aggregated and checked for 0 before calling BLSAggregateSigAccumulator.update()
+  ## 1. Public keys signing the same message MUST be aggregated and checked for 0 before calling this function.
+  ## 2. Augmentation or proofs of possession must be used for each public key.
+
+  if len == 0:
+    # IETF spec precondition
+    return cttBLS_ZeroLengthAggregation
+
+  # Deal with cases where a pubkey or the signature was mistakenly zero-initialized, for example by a generic aggregation attempt
+  if aggregate_sig.raw.isInf().bool:
+    return cttBLS_PointAtInfinity
+
+  for i in 0 ..< len:
+    if pubkeys[i].raw.isInf().bool:
+      return cttBLS_PointAtInfinity
+
+  let verified = aggregateVerify(
+    pubkeys.toOpenArray(len).unwrap(),
+    messages.toOpenArray(len),
+    aggregate_sig.raw,
+    sha256, 128, DST)
+  if verified:
+    return cttBLS_Success
+  return cttBLS_VerificationFailure
+
+# Nim
+func aggregate_verify*[Msg](pubkeys: openArray[PublicKey], messages: openArray[Msg], aggregate_sig: Signature): CttBLSStatus =
+  ## Verify the aggregated signature of multiple (pubkey, message) pairs
+  ## returns `true` if the signature is valid, `false` otherwise.
+  ##
+  ## For message domain separation purpose, the tag is `BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_`
+  ##
+  ## Input:
+  ## - Public keys initialized by one of the key derivation or deserialization procedure.
+  ##   Or validated via validate_pubkey
+  ## - Messages
+  ## - a signature initialized by one of the key derivation or deserialization procedure.
+  ##   Or validated via validate_signature
+  ##
+  ## In particular, the public keys and signature are assumed to be on curve and subgroup-checked.
+  ##
+  ## To avoid splitting zeros and rogue keys attack:
+  ## 1. Public keys signing the same message MUST be aggregated and checked for 0 before calling this function.
  ## 2. Augmentation or proofs of possession must be used for each public key.

  if pubkeys.len == 0:
@ -507,7 +601,12 @@ func aggregate_verify*[M](pubkeys: openArray[PublicKey], messages: openarray[M],
    return cttBLS_Success
  return cttBLS_VerificationFailure

-func batch_verify*[M](pubkeys: openArray[PublicKey], messages: openarray[M], signatures: openArray[Signature], secureRandomBytes: array[32, byte]): CttBLSStatus =
+# C FFI
+func batch_verify*[Msg](pubkeys: ptr UncheckedArray[PublicKey],
+                        messages: ptr UncheckedArray[View[byte]],
+                        signatures: ptr UncheckedArray[Signature],
+                        len: int,
+                        secureRandomBytes: array[32, byte]): CttBLSStatus {.libPrefix: prefix_ffi.} =
  ## Verify that all (pubkey, message, signature) triplets are valid
  ## Returns cttBLS_Success if all signatures are valid, an error status if at least one is invalid.
  ##
@ -518,7 +617,54 @@ func batch_verify*[M](pubkeys: openArray[PublicKey], messages: openarray[M], sig
  ##   Or validated via validate_pubkey
  ## - Messages
  ## - Signatures initialized by one of the key derivation or deserialization procedure.
-  ##   Or validated via validate_sig
+  ##   Or validated via validate_signature
+  ##
+  ## In particular, the public keys and signatures are assumed to be on curve and subgroup-checked.
+  ##
+  ## To avoid splitting zeros and rogue keys attack:
+  ## 1. Cryptographically-secure random bytes must be provided.
+  ## 2. Augmentation or proofs of possession must be used for each public key.
+  ##
+  ## The secureRandomBytes will serve as input not under the attacker control to foil potential splitting zeros inputs.
+  ## The scheme assumes that the attacker cannot
+  ## resubmit 2^64 times forged (publickey, message, signature) triplets
+  ## against the same `secureRandomBytes`
+
+  if len == 0:
+    # IETF spec precondition
+    return cttBLS_ZeroLengthAggregation
+
+  # Deal with cases where a pubkey or signature was mistakenly zero-initialized, for example by a generic aggregation attempt
+  for i in 0 ..< len:
+    if pubkeys[i].raw.isInf().bool:
+      return cttBLS_PointAtInfinity
+
+  for i in 0 ..< len:
+    if signatures[i].raw.isInf().bool:
+      return cttBLS_PointAtInfinity
+
+  let verified = batchVerify(
+    pubkeys.toOpenArray(len).unwrap(),
+    messages,
+    signatures.toOpenArray(len).unwrap(),
+    sha256, 128, DST, secureRandomBytes)
+  if verified:
+    return cttBLS_Success
+  return cttBLS_VerificationFailure
+
+# Nim
+func batch_verify*[Msg](pubkeys: openArray[PublicKey], messages: openarray[Msg], signatures: openArray[Signature], secureRandomBytes: array[32, byte]): CttBLSStatus =
+  ## Verify that all (pubkey, message, signature) triplets are valid
+  ## returns `true` if all signatures are valid, `false` if at least one is invalid.
+  ##
+  ## For message domain separation purpose, the tag is `BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_`
+  ##
+  ## Input:
+  ## - Public keys initialized by one of the key derivation or deserialization procedure.
+  ##   Or validated via validate_pubkey
+  ## - Messages
+  ## - Signatures initialized by one of the key derivation or deserialization procedure.
+  ##   Or validated via validate_signature
  ##
  ## In particular, the public keys and signatures are assumed to be on curve and subgroup-checked.
  ##
--- a/constantine/ethereum_eip2333_bls12381_key_derivation.nim
+++ b/constantine/ethereum_eip2333_bls12381_key_derivation.nim
@ -12,18 +12,18 @@ import
  ./math/config/[curves, type_ff],
  ./math/arithmetic/[bigints, limbs_montgomery],
  ./math/io/io_bigints,
-  ./platforms/endians
+  ./platforms/[primitives, endians]

 # EIP2333: BLS12-381 Key Generation
 # ------------------------------------------------------------
 #
 # https://eips.ethereum.org/EIPS/eip-2333

-{.push raises: [].} # No exceptions
+{.push raises: [], checks: off.} # No exceptions

 type SecretKey = matchingOrderBigInt(BLS12_381)

-func hkdf_mod_r[T: char|byte](secretKey: var SecretKey, ikm: openArray[byte], key_info: openArray[T]) =
+func hkdf_mod_r(secretKey: var SecretKey, ikm: openArray[byte], key_info: openArray[byte]) =
  ## Ethereum 2 EIP-2333, extracts this from the BLS signature schemes
  # 1. salt = "BLS-SIG-KEYGEN-SALT-"
  # 2. SK = 0
@ -52,7 +52,7 @@ func hkdf_mod_r[T: char|byte](secretKey: var SecretKey, ikm: openArray[byte], ke
    const L = 48
    var okm{.noInit.}: array[L, byte]
    const L_octetstring = L.uint16.toBytesBE()
-    ctx.hkdfExpand(okm, prk, key_info, append = L_octetstring)
+    ctx.hkdfExpand(okm, prk, key_info, append = L_octetstring, clearMem = true)
    #  7. x = OS2IP(OKM) mod r
    #  We reduce mod r via Montgomery reduction, instead of bigint division
    #  as constant-time division works bits by bits (384 bits) while
@ -64,10 +64,10 @@ func hkdf_mod_r[T: char|byte](secretKey: var SecretKey, ikm: openArray[byte], ke
    seckeyDbl.unmarshal(okm, bigEndian)
    # secretKey.reduce(seckeyDbl, BLS12_381.getCurveOrder())
    secretKey.limbs.redc2xMont(seckeyDbl.limbs,                                      # seckey/R
-                               BLS12_381.getCurveOrder().limbs, Fr[BLS12_381].getNegInvModWord(), 
+                               BLS12_381.getCurveOrder().limbs, Fr[BLS12_381].getNegInvModWord(),
                               Fr[BLS12_381].getSpareBits())
    secretKey.limbs.mulMont(secretKey.limbs, Fr[BLS12_381].getR2modP().limbs,        # (seckey/R) * R² * R⁻¹ = seckey
-                            BLS12_381.getCurveOrder().limbs, Fr[BLS12_381].getNegInvModWord(), 
+                            BLS12_381.getCurveOrder().limbs, Fr[BLS12_381].getNegInvModWord(),
                            Fr[BLS12_381].getSpareBits())

    if bool secretKey.isZero():
@ -90,19 +90,20 @@ iterator ikm_to_lamport_SK(

  # 1. OKM = HKDF-Expand(PRK, "" , L)
  #    with L = K * 255 and K = 32 (sha256 output)
-  {.push checks: off.} # No OverflowError or IndexError allowed
  for i in ctx.hkdfExpandChunk(
            lamportSecretKeyChunk,
-            prk, "",""):
+            prk, default(array[0, byte]), default(array[0, byte])):
    yield i

+  ctx.clear()
+
 func parent_SK_to_lamport_PK(
       lamportPublicKey: var array[32, byte],
       parentSecretKey: SecretKey,
       index: uint32) =
  ## Derives the index'th child's lamport PublicKey
  ## from the parent SecretKey
-  
+
  # 0. salt = I2OSP(index, 4)
  let salt{.noInit.} = index.toBytesBE()

@ -119,8 +120,6 @@ func parent_SK_to_lamport_PK(

  var tmp{.noInit.}, chunk{.noInit.}: array[32, byte]

-  {.push checks: off.} # No OverflowError or IndexError allowed
-
  # 2. lamport_0 = IKM_to_lamport_SK(IKM, salt)
  # 6. for i = 1, .., 255 (inclusive)
  #        lamport_PK = lamport_PK | SHA256(lamport_0[i])
@ -130,7 +129,7 @@ func parent_SK_to_lamport_PK(
    if i == 254:
      # We iterate from 0
      break
-  
+
  # 3. not_IKM = flip_bits(parent_SK)
  for i in 0 ..< 32:
    ikm[i] = not ikm[i]
@ -152,26 +151,26 @@ func parent_SK_to_lamport_PK(
 func derive_child_secretKey*(
        childSecretKey: var SecretKey,
        parentSecretKey: SecretKey,
-        index: uint32
-     ): bool =
+        index: uint32): bool =
  ## EIP2333 Child Key derivation function
  var compressed_lamport_PK{.noInit.}: array[32, byte]
  # 0. compressed_lamport_PK = parent_SK_to_lamport_PK(parent_SK, index)
  parent_SK_to_lamport_PK(
    compressed_lamport_PK,
    parentSecretKey,
-    index,
-  )
-  childSecretKey.hkdf_mod_r(compressed_lamport_PK, key_info = "")
+    index)
+  childSecretKey.hkdf_mod_r(compressed_lamport_PK, key_info = default(array[0, byte]))
+  compressed_lamport_PK.setZero()
  return true

 func derive_master_secretKey*(
        masterSecretKey: var SecretKey,
-        ikm: openArray[byte]
-     ): bool =
+        ikm: openArray[byte]): bool =
  ## EIP2333 Master key derivation
+  ## The input keying material SHOULD be cleared after use
+  ## to prevent leakage.
  if ikm.len < 32:
    return false

-  masterSecretKey.hkdf_mod_r(ikm, key_info = "")
+  masterSecretKey.hkdf_mod_r(ikm, key_info = default(array[0, byte]))
  return true
--- a/constantine/ethereum_evm_precompiles.nim
+++ b/constantine/ethereum_evm_precompiles.nim
@ -113,7 +113,7 @@ func eth_evm_ecadd*(r: var array[64, byte], inputs: openarray[byte]): CttEVMStat

  # Auto-pad with zero
  var padded: array[128, byte]
-  padded.copy(0, inputs, 0, min(inputs.len, 128))
+  padded.rawCopy(0, inputs, 0, min(inputs.len, 128))

  var P{.noInit.}, Q{.noInit.}, R{.noInit.}: ECP_ShortW_Prj[Fp[BN254_Snarks], G1]

@ -168,7 +168,7 @@ func eth_evm_ecmul*(r: var array[64, byte], inputs: openarray[byte]): CttEVMStat

  # Auto-pad with zero
  var padded: array[128, byte]
-  padded.copy(0, inputs, 0, min(inputs.len, 128))
+  padded.rawCopy(0, inputs, 0, min(inputs.len, 128))

  var P{.noInit.}: ECP_ShortW_Prj[Fp[BN254_Snarks], G1]

--- a/constantine/hash_to_curve/h2c_hash_to_field.nim
+++ b/constantine/hash_to_curve/h2c_hash_to_field.nim
@ -8,7 +8,7 @@

 import
  # Internals
-  ../platforms/[abstractions, endians],
+  ../platforms/[abstractions, endians, views],
  ../hashes,
  ../math/io/[io_bigints, io_fields],
  ../math/config/curves,
@ -37,10 +37,10 @@ template strxor(b_i: var array, b0: array): untyped =
    b_i[i] = b_i[i] xor b0[i]
 # ----------------------------------------------------------------

-func shortDomainSepTag*[DigestSize: static int, B: byte|char](
+func shortDomainSepTag*[DigestSize: static int](
       H: type CryptoHash,
       output: var array[DigestSize, byte],
-       oversizedDST: openarray[B]) =
+       oversizedDST: openArray[byte]) {.genCharAPI.} =
  ## Compute a short Domain Separation Tag
  ## from a domain separation tag larger than 255 bytes
  ##
@ -52,13 +52,13 @@ func shortDomainSepTag*[DigestSize: static int, B: byte|char](
  ctx.update oversizedDST
  ctx.finish(output)

-func expandMessageXMD*[B1, B2, B3: byte|char, len_in_bytes: static int](
+func expandMessageXMD*[len_in_bytes: static int](
       H: type CryptoHash,
       output: var array[len_in_bytes, byte],
-       augmentation: openarray[B1],
-       message: openarray[B2],
-       domainSepTag: openarray[B3]
-     ) =
+       augmentation: openArray[byte],
+       message: openArray[byte],
+       domainSepTag: openArray[byte]
+     ) {.genCharAPI.} =
  ## The expand_message_xmd function produces a uniformly random byte
  ## string using a cryptographic hash function H that outputs "b" bits,
  ## with b >= 2*k and k the target security level (for example 128-bit)
@ -77,7 +77,7 @@ func expandMessageXMD*[B1, B2, B3: byte|char, len_in_bytes: static int](
  ## - `augmentation`, an optional augmentation to the message. This will be prepended,
  ##   prior to hashing.
  ##   This is used for building the "message augmentation" variant of BLS signatures
-  ##   https://tools.ietf.org/html/draft-irtf-cfrg-bls-signature-04#section-3.2
+  ##   https://www.ietf.org/archive/id/draft-irtf-cfrg-bls-signature-05.html#section-3.2
  ##   which requires `CoreSign(SK, PK || message)`
  ##   and `CoreVerify(PK, PK || message, signature)`
  ## - `message` is the message to hash
@ -163,14 +163,14 @@ func mulMont(r: var BigInt, a, b: BigInt, FF: type) {.inline.} =
    FF.getSpareBits()
  )

-func hashToField*[Field; B1, B2, B3: byte|char, count: static int](
+func hashToField*[Field; count: static int](
       H: type CryptoHash,
       k: static int,
       output: var array[count, Field],
-       augmentation: openarray[B1],
-       message: openarray[B2],
-       domainSepTag: openarray[B3]
-     ) =
+       augmentation: openArray[byte],
+       message: openArray[byte],
+       domainSepTag: openArray[byte]
+     ) {.genCharAPI.} =
  ## Hash to a field or an extension field
  ## https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-hash-to-curve-11#section-5.3
  ##
@ -186,7 +186,7 @@ func hashToField*[Field; B1, B2, B3: byte|char, count: static int](
  ## - `augmentation`, an optional augmentation to the message. This will be prepended,
  ##   prior to hashing.
  ##   This is used for building the "message augmentation" variant of BLS signatures
-  ##   https://tools.ietf.org/html/draft-irtf-cfrg-bls-signature-04#section-3.2
+  ##   https://www.ietf.org/archive/id/draft-irtf-cfrg-bls-signature-05.html#section-3.2
  ##   which requires `CoreSign(SK, PK || message)`
  ##   and `CoreVerify(PK, PK || message, signature)`
  ## - `message` is the message to hash
--- a/constantine/hash_to_curve/hash_to_curve.nim
+++ b/constantine/hash_to_curve/hash_to_curve.nim
@ -8,7 +8,7 @@

 import
  # Internals
-  ../platforms/abstractions,
+  ../platforms/[abstractions, views],
  ../math/config/curves,
  ../math/[arithmetic, extension_fields],
  ../math/constants/[zoo_hash_to_curve, zoo_subgroups],
@ -43,7 +43,7 @@ func mapToCurve_svdw[F, G](
  ## Deterministically map a field element u
  ## to an elliptic curve point `r`
  ## https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-hash-to-curve-14#section-6.6.1
-  
+
  var
    tv1 {.noInit.}, tv2{.noInit.}, tv3{.noInit.}: F
    tv4{.noInit.}: F
@ -62,7 +62,7 @@ func mapToCurve_svdw[F, G](
    tv1.c1.neg()
  tv3.prod(tv1, tv2)
  tv3.inv()
-  
+
  tv4.prod(u, tv1)
  tv4 *= tv3
  tv4.mulCheckSparse(h2cConst(F.C, svdw, G, z3))
@ -87,7 +87,7 @@ func mapToCurve_svdw[F, G](

  r.y.curve_eq_rhs(r.x, G)
  r.y.sqrt()
-  
+
  r.y.cneg(sgn0(u) xor sgn0(r.y))

 func mapToIsoCurve_sswuG1_opt3mod4[F](
@ -101,8 +101,7 @@ func mapToIsoCurve_sswuG1_opt3mod4[F](
  mapToIsoCurve_sswuG1_opt3mod4(
    xn, xd,
    yn,
-    u, xd3
-  )
+    u, xd3)

  # Convert to Jacobian
  r.z = xd          # Z = xd
@ -120,8 +119,7 @@ func mapToIsoCurve_sswuG2_opt9mod16[F](
  mapToIsoCurve_sswuG2_opt9mod16(
    xn, xd,
    yn,
-    u, xd3
-  )
+    u, xd3)

  # Convert to Jacobian
  r.z = xd          # Z = xd
@ -167,7 +165,7 @@ func mapToCurve_sswu_fusedAdd[F; G: static Subgroup](
    # Simplified Shallue-van de Woestijne-Ulas method for AB == 0

    var P0{.noInit.}, P1{.noInit.}: ECP_ShortW_Jac[F, G]
-    
+
    # 1. Map to E' isogenous to E
    when F is Fp and F.C.has_P_3mod4_primeModulus():
      # 1. Map to E'1 isogenous to E1
@ -191,16 +189,13 @@ func mapToCurve_sswu_fusedAdd[F; G: static Subgroup](
 # Hash to curve
 # ----------------------------------------------------------------

-func hashToCurve_svdw*[
-         F; G: static Subgroup;
-         B1, B2, B3: byte|char](
+func hashToCurve_svdw*[F; G: static Subgroup](
       H: type CryptoHash,
       k: static int,
       output: var ECP_ShortW_Jac[F, G],
-       augmentation: openarray[B1],
-       message: openarray[B2],
-       domainSepTag: openarray[B3]
-     ) =
+       augmentation: openArray[byte],
+       message: openArray[byte],
+       domainSepTag: openArray[byte]) {.genCharAPI.} =
  ## Hash a message to an elliptic curve
  ##
  ## Arguments:
@ -215,14 +210,14 @@ func hashToCurve_svdw*[
  ## - `augmentation`, an optional augmentation to the message. This will be prepended,
  ##   prior to hashing.
  ##   This is used for building the "message augmentation" variant of BLS signatures
-  ##   https://tools.ietf.org/html/draft-irtf-cfrg-bls-signature-04#section-3.2
+  ##   https://www.ietf.org/archive/id/draft-irtf-cfrg-bls-signature-05.html#section-3.2
  ##   which requires `CoreSign(SK, PK || message)`
  ##   and `CoreVerify(PK, PK || message, signature)`
  ## - `message` is the message to hash
  ## - `domainSepTag` is the protocol domain separation tag (DST).

  var u{.noInit.}: array[2, F]
-  if domainSepTag.len <= 255: 
+  if domainSepTag.len <= 255:
    H.hashToField(k, u, augmentation, message, domainSepTag)
  else:
    const N = H.type.digestSize()
@ -233,16 +228,13 @@ func hashToCurve_svdw*[
  output.mapToCurve_svdw_fusedAdd(u[0], u[1])
  output.clearCofactor()

-func hashToCurve_sswu*[
-         F; G: static Subgroup;
-         B1, B2, B3: byte|char](
+func hashToCurve_sswu*[F; G: static Subgroup](
       H: type CryptoHash,
       k: static int,
       output: var ECP_ShortW_Jac[F, G],
-       augmentation: openarray[B1],
-       message: openarray[B2],
-       domainSepTag: openarray[B3]
-     ) =
+       augmentation: openArray[byte],
+       message: openArray[byte],
+       domainSepTag: openArray[byte]) {.genCharAPI.} =
  ## Hash a message to an elliptic curve
  ##
  ## Arguments:
@ -257,14 +249,14 @@ func hashToCurve_sswu*[
  ## - `augmentation`, an optional augmentation to the message. This will be prepended,
  ##   prior to hashing.
  ##   This is used for building the "message augmentation" variant of BLS signatures
-  ##   https://tools.ietf.org/html/draft-irtf-cfrg-bls-signature-04#section-3.2
+  ##   https://www.ietf.org/archive/id/draft-irtf-cfrg-bls-signature-05.html#section-3.2
  ##   which requires `CoreSign(SK, PK || message)`
  ##   and `CoreVerify(PK, PK || message, signature)`
  ## - `message` is the message to hash
  ## - `domainSepTag` is the protocol domain separation tag (DST).

  var u{.noInit.}: array[2, F]
-  if domainSepTag.len <= 255: 
+  if domainSepTag.len <= 255:
    H.hashToField(k, u, augmentation, message, domainSepTag)
  else:
    const N = H.type.digestSize()
@ -275,16 +267,13 @@ func hashToCurve_sswu*[
  output.mapToCurve_sswu_fusedAdd(u[0], u[1])
  output.clearCofactor()

-func hashToCurve*[
-         F; G: static Subgroup;
-         B1, B2, B3: byte|char](
+func hashToCurve*[F; G: static Subgroup](
       H: type CryptoHash,
       k: static int,
       output: var ECP_ShortW_Jac[F, G],
-       augmentation: openarray[B1],
-       message: openarray[B2],
-       domainSepTag: openarray[B3]
-     ) {.inline.} =
+       augmentation: openArray[byte],
+       message: openArray[byte],
+       domainSepTag: openArray[byte]) {.inline, genCharAPI.} =
  ## Hash a message to an elliptic curve
  ##
  ## Arguments:
@ -299,7 +288,7 @@ func hashToCurve*[
  ## - `augmentation`, an optional augmentation to the message. This will be prepended,
  ##   prior to hashing.
  ##   This is used for building the "message augmentation" variant of BLS signatures
-  ##   https://tools.ietf.org/html/draft-irtf-cfrg-bls-signature-04#section-3.2
+  ##   https://www.ietf.org/archive/id/draft-irtf-cfrg-bls-signature-05.html#section-3.2
  ##   which requires `CoreSign(SK, PK || message)`
  ##   and `CoreVerify(PK, PK || message, signature)`
  ## - `message` is the message to hash
@ -313,16 +302,13 @@ func hashToCurve*[
  else:
    {.error: "Not implemented".}

-func hashToCurve*[
-         F; G: static Subgroup;
-         B1, B2, B3: byte|char](
+func hashToCurve*[F; G: static Subgroup](
       H: type CryptoHash,
       k: static int,
       output: var (ECP_ShortW_Prj[F, G] or ECP_ShortW_Aff[F, G]),
-       augmentation: openarray[B1],
-       message: openarray[B2],
-       domainSepTag: openarray[B3]
-     ) {.inline.} =
+       augmentation: openArray[byte],
+       message: openArray[byte],
+       domainSepTag: openArray[byte]) {.inline, genCharAPI.} =
  ## Hash a message to an elliptic curve
  ##
  ## Arguments:
@ -337,12 +323,12 @@ func hashToCurve*[
  ## - `augmentation`, an optional augmentation to the message. This will be prepended,
  ##   prior to hashing.
  ##   This is used for building the "message augmentation" variant of BLS signatures
-  ##   https://tools.ietf.org/html/draft-irtf-cfrg-bls-signature-04#section-3.2
+  ##   https://www.ietf.org/archive/id/draft-irtf-cfrg-bls-signature-05.html#section-3.2
  ##   which requires `CoreSign(SK, PK || message)`
  ##   and `CoreVerify(PK, PK || message, signature)`
  ## - `message` is the message to hash
  ## - `domainSepTag` is the protocol domain separation tag (DST).
-  
+
  var Pjac{.noInit.}: ECP_ShortW_Jac[F, G]
  H.hashToCurve(k, Pjac, augmentation, message, domainSepTag)
  when output is ECP_ShortW_Prj:
--- a/constantine/hashes.nim
+++ b/constantine/hashes.nim
@ -6,6 +6,8 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

+import platforms/views
+
 # ############################################################
 #
 #                Hash Function concept
@ -30,23 +32,19 @@ type

    # Context
    # -------------------------------------------
-    # update/finish are not matching properly
-
-    # type B = char or byte
    ctx.init()
-    # ctx.update(openarray[B])
-    # ctx.finish(var array[H.digestSize, byte])
+    ctx.update(openarray[byte])
+    ctx.finish(var array[H.digestSize, byte])
    ctx.clear()

-func hash*[DigestSize: static int, T: char|byte](
+func hash*[DigestSize: static int](
       HashKind: type CryptoHash,
       digest: var array[DigestSize, byte],
-       message: openarray[T],
-       clearMem = false) =
+       message: openArray[byte],
+       clearMem = false) {.genCharAPI.} =
  ## Produce a digest from a message
  static: doAssert DigestSize == HashKind.type.digestSize

-  mixin update, finish
  var ctx {.noInit.}: HashKind
  ctx.init()
  ctx.update(message)
@ -55,10 +53,10 @@ func hash*[DigestSize: static int, T: char|byte](
  if clearMem:
    ctx.clear()

-func hash*[T: char|byte](
+func hash*(
       HashKind: type CryptoHash,
-       message: openarray[T],
-       clearmem = false): array[HashKind.digestSize, byte] {.noInit.} =
+       message: openArray[byte],
+       clearmem = false): array[HashKind.digestSize, byte] {.noInit, genCharAPI.} =
  ## Produce a digest from a message
  HashKind.hash(result, message, clearMem)

--- a/constantine/hashes/h_sha256.nim
+++ b/constantine/hashes/h_sha256.nim
@ -6,8 +6,10 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

+import ../zoo_exports
+
 import
-  ../platforms/[abstractions, endians],
+  ../platforms/[abstractions, endians, views],
  ./sha256/sha256_generic

 when UseASM_X86_32:
@ -82,7 +84,7 @@ template internalBlockSize*(H: type sha256): int =
  ## Returns the byte size of the hash function ingested blocks
  BlockSize

-func init*(ctx: var Sha256Context) =
+func init*(ctx: var Sha256Context) {.libPrefix: prefix_sha256.} =
  ## Initialize or reinitialize a Sha256 context

  ctx.msgLen = 0
@ -119,7 +121,7 @@ func initZeroPadded*(ctx: var Sha256Context) =
  ctx.s.H[6] = 0xbafef9ea'u32
  ctx.s.H[7] = 0x1837a9d8'u32

-func update*(ctx: var Sha256Context, message: openarray[byte]) =
+func update*(ctx: var Sha256Context, message: openarray[byte]) {.libPrefix: prefix_sha256, genCharAPI.} =
  ## Append a message to a SHA256 context
  ## for incremental SHA256 computation
  ##
@ -132,7 +134,7 @@ func update*(ctx: var Sha256Context, message: openarray[byte]) =
  ##
  ## For passwords and secret keys, you MUST NOT use raw SHA-256
  ## use a Key Derivation Function instead (KDF)
-  
+
  # Message processing state machine
  var bufIdx = uint(ctx.msgLen mod BlockSize)
  var cur = 0'u
@ -141,12 +143,12 @@ func update*(ctx: var Sha256Context, message: openarray[byte]) =
  if bufIdx != 0 and bufIdx+bytesLeft >= BlockSize:
    # Previous partial update, fill the buffer and do one sha256 hash
    let free = BlockSize - bufIdx
-    ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free)
+    ctx.buf.rawCopy(dStart = bufIdx, message, sStart = 0, len = free)
    ctx.hashBuffer()
    bufIdx = 0
    cur = free
    bytesLeft -= free
-  
+
  if bytesLeft >= BlockSize:
    # Process n blocks (64 byte each)
    let numBlocks = bytesLeft div BlockSize
@ -156,26 +158,11 @@ func update*(ctx: var Sha256Context, message: openarray[byte]) =

  if bytesLeft != 0:
    # Store the tail in buffer
-    ctx.buf.copy(dStart = bufIdx, message, sStart = cur, len = bytesLeft)
+    ctx.buf.rawCopy(dStart = bufIdx, message, sStart = cur, len = bytesLeft)

  ctx.msgLen += message.len.uint

-func update*(ctx: var Sha256Context, message: openarray[char]) {.inline.} =
-  ## Append a message to a SHA256 context
-  ## for incremental SHA256 computation
-  ##
-  ## Security note: the tail of your message might be stored
-  ## in an internal buffer.
-  ## if sensitive content is used, ensure that
-  ## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible.
-  ## Additionally ensure that the message(s) passed were stored
-  ## in memory considered secure for your threat model.
-  ##
-  ## For passwords and secret keys, you MUST NOT use raw SHA-256
-  ## use a Key Derivation Function instead (KDF)
-  ctx.update(message.toOpenArrayByte(message.low, message.high))
-
-func finish*(ctx: var Sha256Context, digest: var array[32, byte]) =
+func finish*(ctx: var Sha256Context, digest: var array[32, byte]) {.libPrefix: prefix_sha256.} =
  ## Finalize a SHA256 computation and output the
  ## message digest to the `digest` buffer.
  ##
@ -205,7 +192,7 @@ func finish*(ctx: var Sha256Context, digest: var array[32, byte]) =
  ctx.s.hashMessageBlocks(ctx.buf.asUnchecked(), numBlocks = 1)
  digest.dumpHash(ctx.s)

-func clear*(ctx: var Sha256Context) =
+func clear*(ctx: var Sha256Context) {.libPrefix: prefix_sha256.} =
  ## Clear the context internal buffers
  ## Security note:
  ## For passwords and secret keys, you MUST NOT use raw SHA-256
--- a/constantine/kdf/kdf_hkdf.nim
+++ b/constantine/kdf/kdf_hkdf.nim
@ -9,7 +9,7 @@
 import
  ../hashes,
  ../mac/mac_hmac,
-  ../platforms/primitives
+  ../platforms/[primitives, views]

 # HMAC-based Extract-and-Expand Key Derivation Function (HKDF)
 # ------------------------------------------------------------
@ -21,15 +21,18 @@ import
 type HKDF*[H: CryptoHash] = object
  hmac: HMAC[H]

-func hkdf_extract_init*[H: CryptoHash, S, I: char|byte](
+func clear*(ctx: var HKDF) {.inline.} =
+  ctx.hmac.clear()
+
+func hkdf_extract_init*[H: CryptoHash](
       ctx: var HKDF[H],
-       salt: openArray[S],
-       ikm: openArray[I]) {.inline.}=
+       salt: openArray[byte],
+       ikm: openArray[byte]) {.inline.}=
  ctx.hmac.init(salt)
  ctx.hmac.update(ikm)

-func hkdf_extract_append_to_IKM*[H: CryptoHash, T: char|byte](
-       ctx: var HKDF[H], append: openArray[T]) {.inline.} =
+func hkdf_extract_append_to_IKM*[H: CryptoHash](
+       ctx: var HKDF[H], append: openArray[byte]) {.inline.} =
  ctx.hmac.update(append)

 func hkdf_extract_finish*[H: CryptoHash, N: static int](
@ -38,11 +41,11 @@ func hkdf_extract_finish*[H: CryptoHash, N: static int](
  static: doAssert H.digestSize == N
  ctx.hmac.finish(prk)

-func hkdfExtract*[H: CryptoHash;S,I: char|byte, N: static int](
+func hkdfExtract*[H: CryptoHash; N: static int](
                     ctx: var HKDF[H],
                     prk: var array[N, byte],
-                     salt: openArray[S],
-                     ikm: openArray[I]) {.inline.} =
+                     salt: openArray[byte],
+                     ikm: openArray[byte]) {.inline.} =
  ## "Extract" step of HKDF.
  ## Extract a fixed size pseudom-random key
  ## from an optional salt value
@ -69,17 +72,17 @@ func hkdfExtract*[H: CryptoHash;S,I: char|byte, N: static int](
  ctx.hkdf_extract_init(salt, ikm)
  ctx.hkdf_extract_finish(prk)

-iterator hkdfExpandChunk*[H: CryptoHash; N: static int; I, A: char|byte](
+iterator hkdfExpandChunk*[H: CryptoHash; N: static int](
          ctx: var HKDF[H],
          chunk: var array[N, byte],
          prk: array[N, byte],
-          info: openArray[I],
-          append: openArray[A]): int =
+          info: openArray[byte],
+          append: openArray[byte]): int =
  ## "Expand" step of HKDF, with an iterator with up to 255 iterations.
-  ## 
+  ##
  ## Note: The output MUST be at most 255 iterations as per RFC5869
  ##       https://datatracker.ietf.org/doc/html/rfc5869
-  ## 
+  ##
  ## Expand a fixed size pseudo random-key
  ## into several pseudo-random keys
  ##
@ -94,12 +97,15 @@ iterator hkdfExpandChunk*[H: CryptoHash; N: static int; I, A: char|byte](
  ## - chunk:
  ##   In:  OKMᵢ₋₁ (output keying material chunk i-1)
  ##   Out: OKMᵢ (output keying material chunk i).
-  ## 
+  ##
  ## Output:
  ## - returns the current chunk number i
-  ## 
+  ##
  ## Temporary:
  ## - ctx: a HMAC["cryptographic-hash"] context, for example HMAC[sha256].
+  ##
+  ## After iterating, the HKDF context should be cleared
+  ## if secret keying material was used.

  const HashLen = H.digestSize()
  static: doAssert N == HashLen
@ -117,12 +123,13 @@ iterator hkdfExpandChunk*[H: CryptoHash; N: static int; I, A: char|byte](

    yield i

-func hkdfExpand*[H: CryptoHash; K: static int; I, A: char|byte](
+func hkdfExpand*[H: CryptoHash; K: static int](
                    ctx: var HKDF[H],
                    output: var openArray[byte],
                    prk: array[K, byte],
-                    info: openArray[I],
-                    append: openArray[A]) =
+                    info: openArray[byte],
+                    append: openArray[byte],
+                    clearMem = false) =
  ## "Expand" step of HKDF
  ## Expand a fixed size pseudo random-key
  ## into several pseudo-random keys
@ -153,18 +160,20 @@ func hkdfExpand*[H: CryptoHash; K: static int; I, A: char|byte](
  for i in ctx.hkdfExpandChunk(t, prk, info, append):
    let iStart = i * HashLen
    let size = min(HashLen, output.len - iStart)
-    copy(output, iStart, t, 0, size)
-   
+    rawCopy(output, iStart, t, 0, size)
+
    if iStart+HashLen >= output.len:
      break

-  # ctx.clear() - TODO: very expensive
+  if clearMem:
+    ctx.clear()

-func hkdfExpand*[H: CryptoHash; K: static int; I: char|byte](
+func hkdfExpand*[H: CryptoHash; K: static int](
                    ctx: var HKDF[H],
                    output: var openArray[byte],
                    prk: array[K, byte],
-                    info: openArray[I]) {.inline.} =
+                    info: openArray[byte],
+                    clearMem = false) {.inline.} =
  ## "Expand" step of HKDF
  ## Expand a fixed size pseudo random-key
  ## into several pseudo-random keys
@ -178,17 +187,18 @@ func hkdfExpand*[H: CryptoHash; K: static int; I: char|byte](
  ##
  ## Temporary:
  ## - ctx: a HMAC["cryptographic-hash"] context, for example HMAC[sha256].
-  hkdfExpand(ctx, output, prk, info, default(array[0, byte]))
+  hkdfExpand(ctx, output, prk, info, default(array[0, byte]), clearMem)

-func hkdf*[H: CryptoHash, N: static int, O, S, K, I: char|byte](
+func hkdf*[H: CryptoHash, N: static int](
       Hash: typedesc[H],
-       output: var openArray[O],
-       salt: openArray[S],
-       ikm: openArray[K],
-       info: openArray[I]) {.inline.} =
+       output: var openArray[byte],
+       salt: openArray[byte],
+       ikm: openArray[byte],
+       info: openArray[byte],
+       clearMem = false) {.inline, genCharAPI.} =
  ## HKDF
  ## Inputs:
-  ## - A hash function, with an output digest length HashLen 
+  ## - A hash function, with an output digest length HashLen
  ## - An opttional salt value (non-secret random value), if not provided,
  ##   it is set to an array of HashLen zero bytes
  ## - A secret Input Keying Material
@ -197,4 +207,4 @@ func hkdf*[H: CryptoHash, N: static int, O, S, K, I: char|byte](
  var ctx{.noInit.}: HMAC[H]
  var prk{.noInit.}: array[H.digestSize(), byte]
  ctx.hkdfExtract(prk, salt, ikm)
-  ctx.hkdfExpand(output, prk, info)
+  ctx.hkdfExpand(output, prk, info, clearMem)
--- a/constantine/mac/mac_hmac.nim
+++ b/constantine/mac/mac_hmac.nim
@ -8,7 +8,7 @@

 import
  ../hashes,
-  ../platforms/primitives
+  ../platforms/[primitives, views]

 # HMAC: Keyed-Hashing for Message Authentication
 # ----------------------------------------------
@ -26,19 +26,19 @@ type HMAC*[H: CryptoHash] = object
  inner: H
  outer: H

-func init*[H: CryptoHash, T: char|byte](ctx: var HMAC[H], secretKey: openArray[T]) =
+func init*[H: CryptoHash](ctx: var HMAC[H], secretKey: openArray[byte]) {.genCharAPI.} =
  ## Initialize a HMAC-based Message Authentication Code
  ## with a pre-shared secret key
  ## between the parties that want to authenticate messages between each other.
-  ## 
+  ##
  ## Keys should be at least the same size as the hash function output size.
-  ## 
+  ##
  ## Keys need to be chosen at random (or using a cryptographically strong
  ## pseudo-random generator seeded with a random seed), and periodically
  ## refreshed.
  var key{.noInit.}: array[H.internalBlockSize(), byte]
  if secretKey.len <= key.len:
-    copy(key, 0, secretKey, 0, secretKey.len)
+    rawCopy(key, 0, secretKey, 0, secretKey.len)
    for i in secretKey.len ..< key.len:
      key[i] = byte 0
  else:
@ -62,15 +62,15 @@ func init*[H: CryptoHash, T: char|byte](ctx: var HMAC[H], secretKey: openArray[T
  ctx.outer.init()
  ctx.outer.update(key)

-func update*[H: CryptoHash, T: char|byte](ctx: var HMAC[H], message: openArray[T]) =
+func update*[H: CryptoHash](ctx: var HMAC[H], message: openArray[byte]) {.genCharAPI.} =
  ## Append a message to a HMAC authentication context.
  ## for incremental HMAC computation.
  ctx.inner.update(message)

-func finish*[H: CryptoHash, T: char|byte, N: static int](ctx: var HMAC[H], tag: var array[N, T]) =
+func finish*[H: CryptoHash, N: static int](ctx: var HMAC[H], tag: var array[N, byte]) =
  ## Finalize a HMAC authentication
  ## and output an authentication tag to the `tag` buffer
-  ## 
+  ##
  ## Output may be used truncated, with the leftmost bits are kept.
  ## It is recommended that the tag length is at least half the length of the hash output
  ## and at least 80-bits.
@ -85,17 +85,18 @@ func clear*[H: CryptoHash](ctx: var HMAC[H]) =
  ctx.inner.clear()
  ctx.outer.clear()

-func mac*[T: char|byte, H: CryptoHash, N: static int](
+func mac*[T0, T1: char|byte, H: CryptoHash, N: static int](
       Hash: type HMAC[H],
       tag: var array[N, byte],
-       message: openArray[T],
-       secretKey: openarray[T],
+       message: openArray[T0],
+       secretKey: openArray[T1],
       clearMem = false) =
  ## Produce an authentication tag from a message
  ## and a preshared unique non-reused secret key
-  
+  # TODO: we can't use the {.genCharAPI.} macro
+  #       due to the 2 openArray[byte] parameters and the CryptoHash concept
  static: doAssert N == H.digestSize()
-  
+
  var ctx {.noInit.}: HMAC[H]
  ctx.init(secretKey)
  ctx.update(message)
--- a/constantine/mac/mac_poly1305.nim
+++ b/constantine/mac/mac_poly1305.nim
@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

 import
-  ../platforms/abstractions,
+  ../platforms/[abstractions, views],
  ../math/arithmetic/bigints,
  ../math/arithmetic/[limbs, limbs_extmul],
  ../math/io/io_bigints
@ -42,7 +42,7 @@ func partialReduce_1305[N1, N2: static int](r: var Limbs[N1], a: Limbs[N2]) =
  ##        2ᵐ-c ≡  0     (mod p)
  ##   <=>  2ᵐ   ≡  c     (mod p)   [1]
  ##   <=> a2ᵐ+b ≡ ac + b (mod p)
-  ## 
+  ##
  ## This partially reduces the input in range [0, 2¹³⁰)
  #
  # Assuming 64-bit words,
@ -51,25 +51,25 @@ func partialReduce_1305[N1, N2: static int](r: var Limbs[N1], a: Limbs[N2]) =
  # Assuming 32-bit words,
  #   N1 = 5 words (160-bit necessary for 2¹³⁰-1)
  #   N2 = 8 words (288-bit necessary for 2¹³¹.2¹²⁴)
-  # 
+  #
  # from 64-bit, starting from [1]
  #   2ᵐ      ≡  c     (mod p)
  #   2¹³⁰    ≡  5     (mod p)
  # 2¹³⁰.2⁶²  ≡  5.2⁶² (mod p)
  #   2¹⁹²    ≡  5.2⁶² (mod p)
-  # 
+  #
  # Hence if we call a the [2¹⁹², 2²⁶⁰) range
  # and b the [0, 2¹⁹²) range
  # we have
  # a2¹⁹²+b ≡ a.5.2⁶² + b (mod p)
-  # 
+  #
  # Then we can handle the highest word which has
  # 62 bits that should be folded back as well
-  # 
+  #
  # Similarly for 32-bit
  #   2¹⁶⁰    ≡  5.2³⁰ (mod p)
  # and we need to fold back the top 30 bits
-  # 
+  #
  # But there is a twist. 5.2⁶² need 65-bit not 64
  # and 5.2³⁰ need 33-bit not 32

@ -77,7 +77,7 @@ func partialReduce_1305[N1, N2: static int](r: var Limbs[N1], a: Limbs[N2]) =
    static:
      doAssert N1 == 3
      doAssert N2 == 4
-    
+
    block:
      # First pass, fold everything greater than 2¹⁹²-1
      # a2¹⁹²+b ≡ a.5.2⁶² + b (mod p)
@ -99,7 +99,7 @@ func partialReduce_1305[N1, N2: static int](r: var Limbs[N1], a: Limbs[N2]) =
    static:
      doAssert N1 == 5
      doAssert N2 == 8
-    
+
    block:
      # First pass, fold everything greater than 2¹⁶⁰-1
      # a2¹⁶⁰+b ≡ a.5.2³⁰ + b (mod p)
@ -109,7 +109,7 @@ func partialReduce_1305[N1, N2: static int](r: var Limbs[N1], a: Limbs[N2]) =

      staticFor i, 0, N1:
        r[i] = a[i]
-      
+
      mulDoubleAcc(r[2], r[1], r[0], a[5], cExcess)
      mulDoubleAcc(r[3], r[2], r[1], a[6], cExcess)
      mulDoubleAcc(r[4], r[3], r[2], a[7], cExcess)
@ -122,7 +122,7 @@ func partialReduce_1305[N1, N2: static int](r: var Limbs[N1], a: Limbs[N2]) =
  var carry, carry2: Carry
  var hi = r[N1-1] shr (WordBitWidth - excessBits)
  r[N1-1] = r[N1-1] and (MaxWord shr excessBits)
-  
+
  # hi *= 5, with overflow stored in carry
  let hi4 = hi shl 2                   # Cannot overflow as we have 2 spare bits
  addC(carry2, hi, hi, hi4, Carry(0))  # Use the carry bit for storing a 63/31 bit result
@ -132,7 +132,7 @@ func partialReduce_1305[N1, N2: static int](r: var Limbs[N1], a: Limbs[N2]) =
  addC(carry, r[1], r[1], SecretWord(carry2), carry)
  staticFor i, 2, N1:
    addC(carry, r[i], r[i], Zero, carry)
-  
+
 func finalReduce_1305[N: static int](a: var Limbs[N]) =
  ## Maps an input in redundant representation [0, 2¹³¹-10)
  ## to the canonical representation in [0, 2¹³⁰-5)
@ -157,10 +157,10 @@ type Poly1305_CTX = object

 type poly1305* = Poly1305_CTX

-func macMessageBlocks[T: byte|char](
+func macMessageBlocks(
       acc: var BigInt[130+1],
       r: BigInt[124],
-       message: openArray[T],
+       message: openArray[byte],
       blockSize = BlockSize): uint =
  ## Authenticate a message block by block
  ## Poly1305 block size is 16 bytes.
@ -180,20 +180,13 @@ func macMessageBlocks[T: byte|char](

  for curBlock in 0 ..< numBlocks:
    # range [0, 2¹²⁸-1)
-    when T is byte:
-      input.unmarshal(
-        message.toOpenArray(curBlock*BlockSize, curBlock*BlockSize + BlockSize - 1),
-        littleEndian
-      )
-    else:
-      input.unmarshal(
-        message.toOpenArrayByte(curBlock*BlockSize, curBlock*BlockSize + BlockSize - 1),
-        littleEndian
-      )
+    input.unmarshal(
+      message.toOpenArray(curBlock*BlockSize, curBlock*BlockSize + BlockSize - 1),
+      littleEndian)
    input.setBit(8*blockSize) # range [2¹²⁸, 2¹²⁸+2¹²⁸-1)
    acc += input              # range [2¹²⁸, 2¹³⁰-1+2¹²⁸+2¹²⁸-1)
    t.prod(acc, r)            # range [2²⁵⁶, (2¹²⁴-1)(2¹³⁰+2(2¹²⁸-1)))
-    
+
    acc.limbs.partialReduce_1305(t.limbs)

  return BlockSize * numBlocks.uint
@ -213,7 +206,7 @@ func init*(ctx: var Poly1305_CTX, nonReusedKey: array[32, byte]) =
  ## nonReusedKey is a unique not-reused pre-shared key
  ## between the parties that want to authenticate messages between each other
  ctx.acc.setZero()
-  
+
  const clamp = BigInt[128].fromHex"0x0ffffffc0ffffffc0ffffffc0fffffff"
  ctx.r.unmarshal(nonReusedKey.toOpenArray(0, 15), littleEndian)
  staticFor i, 0, ctx.r.limbs.len:
@ -224,7 +217,7 @@ func init*(ctx: var Poly1305_CTX, nonReusedKey: array[32, byte]) =
  ctx.msgLen = 0
  ctx.bufIdx = 0

-func update*[T: char|byte](ctx: var Poly1305_CTX, message: openArray[T]) =
+func update*(ctx: var Poly1305_CTX, message: openArray[byte]) {.genCharAPI.} =
  ## Append a message to a Poly1305 authentication context.
  ## for incremental Poly1305 computation
  ##
@ -246,7 +239,7 @@ func update*[T: char|byte](ctx: var Poly1305_CTX, message: openArray[T]) =
  var # Message processing state machine
    cur = 0'u
    bytesLeft = message.len.uint
-  
+
  ctx.msgLen += bytesLeft

  if ctx.bufIdx != 0: # Previous partial update
@ -255,21 +248,21 @@ func update*[T: char|byte](ctx: var Poly1305_CTX, message: openArray[T]) =

    if free > bytesLeft:
      # Enough free space, store in buffer
-      ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = bytesLeft)
+      ctx.buf.rawCopy(dStart = bufIdx, message, sStart = 0, len = bytesLeft)
      ctx.bufIdx += bytesLeft.uint8
      return
    else:
      # Fill the buffer and do one Poly1305 MAC
-      ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free)
+      ctx.buf.rawCopy(dStart = bufIdx, message, sStart = 0, len = free)
      ctx.macBuffer(blockSize = BlockSize)

      # Update message state for further processing
      cur = free
      bytesLeft -= free
-  
+
  # Process n blocks (16 bytes each)
  let consumed = ctx.acc.macMessageBlocks(
-    ctx.r, 
+    ctx.r,
    message.toOpenArray(int cur, message.len-1),
    blockSize = BlockSize
  )
@ -282,7 +275,7 @@ func update*[T: char|byte](ctx: var Poly1305_CTX, message: openArray[T]) =
      doAssert ctx.bufIdx == 0
      doAssert cur + bytesLeft == message.len.uint

-    ctx.buf.copy(dStart = 0'u, message, sStart = cur, len = bytesLeft)
+    ctx.buf.rawCopy(dStart = 0'u, message, sStart = cur, len = bytesLeft)
    ctx.bufIdx = uint8 bytesLeft

 func finish*(ctx: var Poly1305_CTX, tag: var array[16, byte]) =
@ -305,7 +298,7 @@ func finish*(ctx: var Poly1305_CTX, tag: var array[16, byte]) =
  # Input is only partially reduced to [0, 2¹³⁰)
  # Map it to [0, 2¹³⁰-5)
  ctx.acc.limbs.finalReduce_1305()
-  
+
  # Starting from now, we only care about the 128 least significant bits
  var acc128{.noInit.}: BigInt[128]
  acc128.copyTruncatedFrom(ctx.acc)
@ -328,15 +321,15 @@ func clear*(ctx: var Poly1305_CTX) =
  ctx.msgLen = 0
  ctx.bufIdx = 0

-func mac*[T: char|byte](
+func mac*(
       _: type poly1305,
       tag: var array[16, byte],
-       message: openArray[T],
+       message: openArray[byte],
       nonReusedKey: array[32, byte],
-       clearMem = false) =
+       clearMem = false) {.genCharAPI.} =
  ## Produce an authentication tag from a message
  ## and a preshared unique non-reused secret key
-  
+
  var ctx {.noInit.}: poly1305
  ctx.init(nonReusedKey)
  ctx.update(message)
@ -345,11 +338,11 @@ func mac*[T: char|byte](
  if clearMem:
    ctx.clear()

-func mac*[T: char|byte](
+func mac*(
       _: type poly1305,
-       message: openArray[T],
+       message: openArray[byte],
       nonReusedKey: array[32, byte],
-       clearMem = false): array[16, byte]{.noInit.}=
+       clearMem = false): array[16, byte]{.noInit, genCharAPI.}=
  ## Produce an authentication tag from a message
  ## and a preshared unique non-reused secret key
  poly1305.mac(result, message, nonReusedKey, clearMem)
--- a/constantine/math/arithmetic/assembly/limbs_asm_modular_dbl_prec_x86.nim
+++ b/constantine/math/arithmetic/assembly/limbs_asm_modular_dbl_prec_x86.nim
@ -26,7 +26,8 @@ import
 # and so FpDbl would be 768 bits.

 static: doAssert UseASM_X86_64
-{.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers
+# Necessary for the compiler to find enough registers
+{.localPassC:"-fomit-frame-pointer".}  # (enabled at -O1)

 # Double-precision field addition
 # ------------------------------------------------------------
@ -93,7 +94,7 @@ macro addmod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di

  result.add ctx.generate

-func addmod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) =
+func addmod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) {.noInline.} =
  ## Constant-time double-precision addition
  ## Output is conditionally reduced by 2ⁿp
  ## to stay in the [0, 2ⁿp) range
@ -159,7 +160,7 @@ macro submod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di

  result.add ctx.generate

-func submod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) =
+func submod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) {.noInline.} =
  ## Constant-time double-precision subtraction
  ## Output is conditionally reduced by 2ⁿp
  ## to stay in the [0, 2ⁿp) range
@ -233,6 +234,6 @@ macro negmod2x_gen[N: static int](R: var Limbs[N], A: Limbs[N], m: Limbs[N div 2
    var `usym`{.noinit, used.}: typeof(`A`)
  result.add ctx.generate

-func negmod2x_asm*[N: static int](r: var Limbs[N], a: Limbs[N], M: Limbs[N div 2]) =
+func negmod2x_asm*[N: static int](r: var Limbs[N], a: Limbs[N], M: Limbs[N div 2]) {.noInline.} =
  ## Constant-time double-precision negation
  negmod2x_gen(r, a, M)
--- a/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim
+++ b/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim
@ -25,22 +25,22 @@ import

 static: doAssert UseASM_X86_32

-{.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers
+# Necessary for the compiler to find enough registers
+{.localPassC:"-fomit-frame-pointer".}  # (enabled at -O1)

 proc finalSubNoOverflowImpl*(
       ctx: var Assembler_x86,
       r: Operand or OperandArray,
-       a, M, scratch: OperandArray
-     ) =
+       a, M, scratch: OperandArray) =
  ## Reduce `a` into `r` modulo `M`
  ## To be used when the modulus does not use the full bitwidth of the storing words
  ## for example a 255-bit modulus in n words of total max size 2^256
-  ## 
+  ##
  ## r, a, scratch are mutated
  ## M is read-only
  let N = M.len
  ctx.comment "Final substraction (cannot overflow its limbs)"
-  
+
  # Subtract the modulus, and test a < p with the last borrow
  ctx.mov scratch[0], a[0]
  ctx.sub scratch[0], M[0]
@ -58,12 +58,11 @@ proc finalSubMayOverflowImpl*(
       ctx: var Assembler_x86,
       r: Operand or OperandArray,
       a, M, scratch: OperandArray,
-       scratchReg: Operand or Register or OperandReuse
-     ) =
+       scratchReg: Operand or Register or OperandReuse) =
  ## Reduce `a` into `r` modulo `M`
  ## To be used when the final subtraction can
  ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
-  ## 
+  ##
  ## r, a, scratch, scratchReg are mutated
  ## M is read-only
  let N = M.len
@ -97,7 +96,7 @@ macro finalSub_gen*[N: static int](
  ## Returns:
  ##   a-M if a > M
  ##   a otherwise
-  ## 
+  ##
  ## - r_PIR is a pointer to the result array, mutated,
  ## - a_EIR is an array of registers, mutated,
  ## - M_PIR is a pointer to an array, read-only,
@ -173,8 +172,9 @@ macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], spareBits: s

  result.add ctx.generate()

-func addmod_asm*(r: var Limbs, a, b, m: Limbs, spareBits: static int) =
+func addmod_asm*(r: var Limbs, a, b, m: Limbs, spareBits: static int) {.noInline.} =
  ## Constant-time modular addition
+  # This MUST be noInline or Clang will run out of registers with LTO
  addmod_gen(r, a, b, m, spareBits)

 # Field subtraction
@ -233,9 +233,10 @@ macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =

  result.add ctx.generate

-func submod_asm*(r: var Limbs, a, b, M: Limbs) =
+func submod_asm*(r: var Limbs, a, b, M: Limbs) {.noInline.} =
  ## Constant-time modular subtraction
  ## Warning, does not handle aliasing of a and b
+  # This MUST be noInline or Clang will run out of registers with LTO
  submod_gen(r, a, b, M)

 # Field negation
--- a/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86.nim
+++ b/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86.nim
@ -28,8 +28,9 @@ import

 static: doAssert UseASM_X86_64

-# Necessary for the compiler to find enough registers (enabled at -O1)
-{.localPassC:"-fomit-frame-pointer".}
+# Necessary for the compiler to find enough registers
+{.localPassC:"-fomit-frame-pointer".}  # (enabled at -O1)
+{.localPassC:"-fno-sanitize=address".} # need 15 registers out of 16 (1 reserved for stack pointer, none available for Address Sanitizer)

 # Montgomery multiplication
 # ------------------------------------------------------------
@ -37,8 +38,7 @@ static: doAssert UseASM_X86_64
 macro mulMont_CIOS_sparebit_gen[N: static int](
        r_PIR: var Limbs[N], a_PIR, b_PIR,
        M_PIR: Limbs[N], m0ninv_REG: BaseType,
-        skipFinalSub: static bool
-      ): untyped =
+        skipFinalSub: static bool): untyped =
  ## Generate an optimized Montgomery Multiplication kernel
  ## using the CIOS method
  ##
@ -184,26 +184,19 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
    )
  result.add ctx.generate()

-func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) =
+func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) {.noInline.} =
  ## Constant-time Montgomery multiplication
  ## If "skipFinalSub" is set
  ## the result is in the range [0, 2M)
  ## otherwise the result is in the range [0, M)
-  ## 
+  ##
  ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
+  # This MUST be noInline or Clang will run out of registers with LTO
  r.mulMont_CIOS_sparebit_gen(a, b, M, m0ninv, skipFinalSub)

 # Montgomery Squaring
 # ------------------------------------------------------------

-func square_asm_inline[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) {.inline.} =
-  ## Multi-precision Squaring
-  ## Assumes r doesn't alias a
-  ## Extra indirection as the generator assumes that
-  ## arrays are pointers, which is true for parameters
-  ## but not for stack variables
-  sqr_gen(r, a)
-
 func squareMont_CIOS_asm*[N](
       r: var Limbs[N],
       a, M: Limbs[N],
@ -211,8 +204,8 @@ func squareMont_CIOS_asm*[N](
       spareBits: static int, skipFinalSub: static bool) =
  ## Constant-time modular squaring
  var r2x {.noInit.}: Limbs[2*N]
-  r2x.square_asm_inline(a)
-  r.redcMont_asm_inline(r2x, M, m0ninv, spareBits, skipFinalSub)
+  square_asm(r2x, a)
+  r.redcMont_asm(r2x, M, m0ninv, spareBits, skipFinalSub)

 # Montgomery Sum of Products
 # ------------------------------------------------------------
@ -220,11 +213,10 @@ func squareMont_CIOS_asm*[N](
 macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
        r_PIR: var Limbs[N], a_PIR, b_PIR: array[K, Limbs[N]],
        M_PIR: Limbs[N], m0ninv_REG: BaseType,
-        skipFinalSub: static bool
-      ): untyped =
+        skipFinalSub: static bool): untyped =
  ## Generate an optimized Montgomery merged sum of products ⅀aᵢ.bᵢ kernel
  ## using the CIOS method
-  ## 
+  ##
  ## This requires 2 spare bits in the most significant word
  ## so that we can skip the intermediate reductions

@ -276,7 +268,7 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
    tN = scratch[2]                                  # High part of extended precision multiplication
    C = scratch[3]                                   # Carry during reduction step
    r = scratch[4]                                   # Stores the `r` operand
-    S = scratch[5]                                   # Mul step: Stores the carry A 
+    S = scratch[5]                                   # Mul step: Stores the carry A
                                                     # Red step: Stores (t[0] * m0ninv) mod 2ʷ

  # Registers used:
@ -338,7 +330,7 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
        ctx.add t[0], rax
        ctx.adc rdx, 0
      ctx.mov A, rdx
-      
+
      for j in 1 ..< N:
        ctx.comment "        (A,t[j])  := t[j] + a[k][j]*b[k][i] + A"
        ctx.mov rax, a[k, j]
@ -351,7 +343,7 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
        ctx.`xor` A, A
        ctx.add t[j], rax
        ctx.adc A, rdx
-      
+
      ctx.comment "    tN += A"
      ctx.add tN, A

@ -407,6 +399,6 @@ func sumprodMont_CIOS_spare2bits_asm*[N, K: static int](
  ## If "skipFinalSub" is set
  ## the result is in the range [0, 2M)
  ## otherwise the result is in the range [0, M)
-  ## 
+  ##
  ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
  r.sumprodMont_CIOS_spare2bits_gen(a, b, M, m0ninv, skipFinalSub)
--- a/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim
+++ b/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_x86_adx_bmi2.nim
@ -30,8 +30,9 @@ static: doAssert UseASM_X86_64

 # MULX/ADCX/ADOX
 {.localPassC:"-madx -mbmi2".}
-# Necessary for the compiler to find enough registers (enabled at -O1)
-{.localPassC:"-fomit-frame-pointer".}
+# Necessary for the compiler to find enough registers
+{.localPassC:"-fomit-frame-pointer".}  # (enabled at -O1)
+{.localPassC:"-fno-sanitize=address".} # need 15 registers out of 16 (1 reserved for stack pointer, none available for Address Sanitizer)

 # Montgomery Multiplication
 # ------------------------------------------------------------
@ -42,8 +43,7 @@ proc mulx_by_word(
       t: OperandArray,
       a: Operand, # Pointer in scratchspace
       word0: Operand,
-       lo: Operand
-     ) =
+       lo: Operand) =
  ## Multiply the `a[0..<N]` by `word` and store in `t[0..<N]`
  ## and carry register `C` (t[N])
  ## `t` and `C` overwritten
@ -89,8 +89,7 @@ proc mulaccx_by_word(
       a: Operand, # Pointer in scratchspace
       i: int,
       word: Operand,
-       lo: Operand
-     ) =
+       lo: Operand) =
  ## Multiply the `a[0..<N]` by `word`
  ## and accumulate in `t[0..<N]`
  ## and carry register `C` (t[N])
@ -131,8 +130,7 @@ proc partialRedx(
       M: OperandArray,
       m0ninv: Operand,
       lo: Operand or Register,
-       S: Operand
-     ) =
+       S: Operand) =
    ## Partial Montgomery reduction
    ## For CIOS method
    ## `C` the update carry flag (represents t[N])
@ -284,7 +282,7 @@ func mulMont_CIOS_sparebit_asm_adx_inline*(r: var Limbs, a, b, M: Limbs, m0ninv:
  ## If "skipFinalSub" is set
  ## the result is in the range [0, 2M)
  ## otherwise the result is in the range [0, M)
-  ## 
+  ##
  ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
  r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, skipFinalSub)

@ -293,7 +291,7 @@ func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseTy
  ## If "skipFinalSub" is set
  ## the result is in the range [0, 2M)
  ## otherwise the result is in the range [0, M)
-  ## 
+  ##
  ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
  r.mulMont_CIOS_sparebit_asm_adx_inline(a, b, M, m0ninv, skipFinalSub)

@ -307,7 +305,7 @@ func squareMont_CIOS_asm_adx*[N](
       spareBits: static int, skipFinalSub: static bool) =
  ## Constant-time modular squaring
  var r2x {.noInit.}: Limbs[2*N]
-  r2x.square_asm_adx_inline(a)
+  r2x.square_asm_adx(a)
  r.redcMont_asm_adx(r2x, M, m0ninv, spareBits, skipFinalSub)

 # Montgomery Sum of Products
@ -316,11 +314,10 @@ func squareMont_CIOS_asm_adx*[N](
 macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
        r_PIR: var Limbs[N], a_PIR, b_PIR: array[K, Limbs[N]],
        M_PIR: Limbs[N], m0ninv_REG: BaseType,
-        skipFinalSub: static bool
-      ): untyped =
+        skipFinalSub: static bool): untyped =
  ## Generate an optimized Montgomery merged sum of products ⅀aᵢ.bᵢ kernel
  ## using the CIOS method
-  ## 
+  ##
  ## This requires 2 spare bits in the most significant word
  ## so that we can skip the intermediate reductions

@ -372,7 +369,7 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
    tN = scratch[2]                                  # High part of extended precision multiplication
    C = scratch[3]                                   # Carry during reduction step
    r = scratch[4]                                   # Stores the `r` operand
-    S = scratch[5]                                   # Mul step: Stores the carry A 
+    S = scratch[5]                                   # Mul step: Stores the carry A
                                                     # Red step: Stores (t[0] * m0ninv) mod 2ʷ

  # Registers used:
@ -433,7 +430,7 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
        ctx.mulx A, rax, a[k, 0], rdx
        ctx.adcx t[0], rax
        ctx.adox t[1], A
-      
+
      for j in 1 ..< N-1:
        ctx.comment "        (A,t[j])  := t[j] + a[k][j]*b[k][i] + A"
        if i == 0 and k == 0:
@ -449,7 +446,7 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](

      # Last limb
      ctx.mulx A, rax, a[k, N-1], rdx
-      if i == 0 and k == 0:  
+      if i == 0 and k == 0:
        ctx.adc t[N-1], rax
        ctx.comment "    tN += A"
        ctx.adc tN, A
@ -490,6 +487,6 @@ func sumprodMont_CIOS_spare2bits_asm_adx*[N, K: static int](
  ## If "skipFinalSub" is set
  ## the result is in the range [0, 2M)
  ## otherwise the result is in the range [0, M)
-  ## 
+  ##
  ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
  r.sumprodMont_CIOS_spare2bits_adx_gen(a, b, M, m0ninv, skipFinalSub)
--- a/constantine/math/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim
+++ b/constantine/math/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim
@ -27,8 +27,8 @@ static: doAssert UseASM_X86_64

 # MULX/ADCX/ADOX
 {.localPassC:"-madx -mbmi2".}
-# Necessary for the compiler to find enough registers (enabled at -O1)
-# {.localPassC:"-fomit-frame-pointer".}
+# Necessary for the compiler to find enough registers
+# {.localPassC:"-fomit-frame-pointer".}  # (enabled at -O1)

 # Multiplication
 # ------------------------------------------------------------
@ -36,8 +36,7 @@ proc mulx_by_word(
       ctx: var Assembler_x86,
       r0: Operand,
       a, t: OperandArray,
-       word0: Operand
-     ) =
+       word0: Operand) =
  ## Multiply the `a[0..<N]` by `word`
  ## and store in `[t[n..1]:r0]`
  ## with [t[n..1]:r0] = tn, tn-1, ... t1, r0
@ -74,8 +73,7 @@ proc mulaccx_by_word(
       r: OperandArray,
       i: int,
       a, t: OperandArray,
-       word: Operand
-     ) =
+       word: Operand) =
  ## Multiply the `a[0..<N]` by `word`
  ## and store in `[t[n..0]:r0]`
  ## with [t[n..0]:r0] = tn, tn-1, ... t1, r0
@ -603,12 +601,7 @@ macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLe
  # Codegen
  result.add ctx.generate

-func square_asm_adx_inline*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) {.inline.} =
-  ## Multi-precision Squaring
-  ## inline version
-  sqrx_gen(r, a)
-
 func square_asm_adx*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
  ## Multi-precision Squaring
  ## Assumes r doesn't alias a
-  square_asm_adx_inline(r, a)
+  sqrx_gen(r, a)
--- a/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86.nim
+++ b/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86.nim
@ -22,8 +22,8 @@ import

 static: doAssert UseASM_X86_32

-# Necessary for the compiler to find enough registers (enabled at -O1)
-{.localPassC:"-fomit-frame-pointer".}
+# Necessary for the compiler to find enough registers
+{.localPassC:"-fomit-frame-pointer".}  # (enabled at -O1)

 # Montgomery reduction
 # ------------------------------------------------------------
@ -33,9 +33,7 @@ macro redc2xMont_gen*[N: static int](
       a_PIR: array[N*2, SecretWord],
       M_PIR: array[N, SecretWord],
       m0ninv_REG: BaseType,
-       spareBits: static int, skipFinalSub: static bool
-      ) =
-
+       spareBits: static int, skipFinalSub: static bool) =
  # No register spilling handling
  doAssert N > 2, "The Assembly-optimized montgomery reduction requires a minimum of 2 limbs."
  doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."
@ -152,7 +150,7 @@ macro redc2xMont_gen*[N: static int](

  # v is invalidated from now on
  let t = repackRegisters(v, u[N], u[N+1])
-  
+
  if spareBits >= 2 and skipFinalSub:
    for i in 0 ..< N:
      ctx.mov r_temp[i], u[i]
@ -164,29 +162,17 @@ macro redc2xMont_gen*[N: static int](
  # Code generation
  result.add ctx.generate()

-func redcMont_asm_inline*[N: static int](
-       r: var array[N, SecretWord],
-       a: array[N*2, SecretWord],
-       M: array[N, SecretWord],
-       m0ninv: BaseType,
-       spareBits: static int,
-       skipFinalSub: static bool = false
-      ) {.inline.} =
-  ## Constant-time Montgomery reduction
-  ## Inline-version
-  redc2xMont_gen(r, a, M, m0ninv, spareBits, skipFinalSub)
-
 func redcMont_asm*[N: static int](
       r: var array[N, SecretWord],
       a: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType,
       spareBits: static int,
-       skipFinalSub: static bool
-      ) =
+       skipFinalSub: static bool) {.noInline.}  =
  ## Constant-time Montgomery reduction
+  # This MUST be noInline or Clang will run out of registers with LTO
  static: doAssert UseASM_X86_64, "This requires x86-64."
-  redcMont_asm_inline(r, a, M, m0ninv, spareBits, skipFinalSub)
+  redc2xMont_gen(r, a, M, m0ninv, spareBits, skipFinalSub)

 # Montgomery conversion
 # ----------------------------------------------------------
@ -230,7 +216,7 @@ macro mulMont_by_1_gen[N: static int](
    m = scratch[1] # Stores (t[0] * m0ninv) mod 2ʷ

  let scratchSym = scratch.nimSymbol
-  
+
  # Copy a in t
  result.add quote do:
    var `scratchSym` {.noInit, used.}: Limbs[`scratchSlots`]
--- a/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86_adx_bmi2.nim
+++ b/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_x86_adx_bmi2.nim
@ -23,8 +23,8 @@ static: doAssert UseASM_X86_64

 # MULX/ADCX/ADOX
 {.localPassC:"-madx -mbmi2".}
-# Necessary for the compiler to find enough registers (enabled at -O1)
-{.localPassC:"-fomit-frame-pointer".}
+# Necessary for the compiler to find enough registers
+{.localPassC:"-fomit-frame-pointer".} # (enabled at -O1)

 # No exceptions allowed
 {.push raises: [].}
@ -37,8 +37,7 @@ macro redc2xMont_adx_gen[N: static int](
       a_PIR: array[N*2, SecretWord],
       M_PIR: array[N, SecretWord],
       m0ninv_REG: BaseType,
-       spareBits: static int, skipFinalSub: static bool
-      ) =
+       spareBits: static int, skipFinalSub: static bool) =

  # No register spilling handling
  doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
@ -141,28 +140,18 @@ macro redc2xMont_adx_gen[N: static int](
  # Code generation
  result.add ctx.generate()

-func redcMont_asm_adx_inline*[N: static int](
-       r: var array[N, SecretWord],
-       a: array[N*2, SecretWord],
-       M: array[N, SecretWord],
-       m0ninv: BaseType,
-       spareBits: static int,
-       skipFinalSub: static bool = false
-      ) {.inline.} =
-  ## Constant-time Montgomery reduction
-  ## Inline-version
-  redc2xMont_adx_gen(r, a, M, m0ninv, spareBits, skipFinalSub)
-
 func redcMont_asm_adx*[N: static int](
       r: var array[N, SecretWord],
       a: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType,
       spareBits: static int,
-       skipFinalSub: static bool = false
-      ) =
+       skipFinalSub: static bool = false) {.noInline.} =
  ## Constant-time Montgomery reduction
-  redcMont_asm_adx_inline(r, a, M, m0ninv, spareBits, skipFinalSub)
+  # Inlining redcMont_asm_adx twice in mul_fp2_complex_asm_adx
+  # causes GCC to miscompile with -Os (--opt:size)
+  # see https://github.com/mratsim/constantine/issues/229
+  redc2xMont_adx_gen(r, a, M, m0ninv, spareBits, skipFinalSub)

 # Montgomery conversion
 # ----------------------------------------------------------
@ -205,7 +194,7 @@ macro mulMont_by_1_adx_gen[N: static int](
    C = scratch[0] # Stores the high-part of multiplication

  let scratchSym = scratch.nimSymbol
-  
+
  # Copy a in t
  result.add quote do:
    var `scratchSym` {.noInit, used.}: Limbs[`scratchSlots`]
--- a/constantine/math/arithmetic/finite_fields.nim
+++ b/constantine/math/arithmetic/finite_fields.nim
@ -249,8 +249,7 @@ func sumprod*[N: static int](r: var FF, a, b: array[N, FF], skipFinalSub: static
  r.mres.sumprodMont(
    cast[ptr array[N, typeof(a[0].mres)]](a.unsafeAddr)[],
    cast[ptr array[N, typeof(b[0].mres)]](b.unsafeAddr)[],
-    FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits(), skipFinalSub
-  )
+    FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits(), skipFinalSub)

 # ############################################################
 #
--- a/constantine/math/arithmetic/limbs_exgcd.nim
+++ b/constantine/math/arithmetic/limbs_exgcd.nim
@ -117,7 +117,6 @@ debug:
      r = SecretWord r

    var a, b: array[2, SecretWord]
-    var e: array[2, SecretWord]
    smul(a[1], a[0], u, r)
    smul(b[1], b[0], v, q)

@ -373,8 +372,8 @@ template matVecMul_shr_k_impl(

  # First iteration of [u v] [f]
  #                    [q r].[g]
-  cf.ssumprodAccNoCarry(u, f[0], v, g[0])
-  cg.ssumprodAccNoCarry(q, f[0], r, g[0])
+  ssumprodAccNoCarry(cf, u, f[0], v, g[0])
+  ssumprodAccNoCarry(cg, q, f[0], r, g[0])
  # bottom k bits are zero by construction
  debug:
    doAssert BaseType(cf.lo and Max) == 0, "bottom k bits should be 0, cf.lo: " & $BaseType(cf.lo)
@ -384,8 +383,8 @@ template matVecMul_shr_k_impl(
  cg.ashr(k)

  for i in 1 ..< numLimbsLeft:
-    cf.ssumprodAccNoCarry(u, f[i], v, g[i])
-    cg.ssumprodAccNoCarry(q, f[i], r, g[i])
+    ssumprodAccNoCarry(cf, u, f[i], v, g[i])
+    ssumprodAccNoCarry(cg, q, f[i], r, g[i])
    f[i-1] = cf.lo and Max
    g[i-1] = cg.lo and Max
    cf.ashr(k)
--- a/constantine/math/arithmetic/limbs_montgomery.nim
+++ b/constantine/math/arithmetic/limbs_montgomery.nim
@ -56,11 +56,11 @@ func redc2xMont_CIOS[N: static int](
       M: array[N, SecretWord],
       m0ninv: BaseType, skipFinalSub: static bool = false) =
  ## Montgomery reduce a double-precision bigint modulo M
-  ## 
+  ##
  ## This maps
  ## - [0, 4p²) -> [0, 2p) with skipFinalSub
  ## - [0, 4p²) -> [0, p) without
-  ## 
+  ##
  ## skipFinalSub skips the final subtraction step.
  # - Analyzing and Comparing Montgomery Multiplication Algorithms
  #   Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
@ -125,11 +125,11 @@ func redc2xMont_Comba[N: static int](
       M: array[N, SecretWord],
       m0ninv: BaseType, skipFinalSub: static bool = false) {.used.} =
  ## Montgomery reduce a double-precision bigint modulo M
-  ## 
+  ##
  ## This maps
  ## - [0, 4p²) -> [0, 2p) with skipFinalSub
  ## - [0, 4p²) -> [0, p) without
-  ## 
+  ##
  ## skipFinalSub skips the final subtraction step.
  # We use Product Scanning / Comba multiplication
  var t, u, v = Zero
@ -179,11 +179,11 @@ func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipF
  ## This requires the most significant word of the Modulus
  ##   M[^1] < high(SecretWord) shr 1 (i.e. less than 0b01111...1111)
  ## https://hackmd.io/@gnark/modular_multiplication
-  ## 
+  ##
  ## This maps
  ## - [0, 2p) -> [0, 2p) with skipFinalSub
  ## - [0, 2p) -> [0, p) without
-  ## 
+  ##
  ## skipFinalSub skips the final subtraction step.

  # We want all the computation to be kept in registers
@ -262,11 +262,11 @@ func mulMont_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =

 func mulMont_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) =
  ## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS)
-  ## 
+  ##
  ## This maps
  ## - [0, 2p) -> [0, 2p) with skipFinalSub
  ## - [0, 2p) -> [0, p) without
-  ## 
+  ##
  ## skipFinalSub skips the final subtraction step.
  # - Architectural Enhancements for Montgomery
  #   Multiplication on Embedded RISC Processors
@ -310,11 +310,11 @@ func sumprodMont_CIOS_spare2bits[K: static int](
       skipFinalSub: static bool = false) =
  ## Compute r = ⅀aᵢ.bᵢ (mod M) (sum of products)
  ## This requires 2 unused bits in the field element representation
-  ## 
+  ##
  ## This maps
  ## - [0, 2p) -> [0, 2p) with skipFinalSub
  ## - [0, 2p) -> [0, p) without
-  ## 
+  ##
  ## skipFinalSub skips the final subtraction step.

  # We want all the computation to be kept in registers
@ -398,7 +398,7 @@ func sumprodMont_CIOS_spare2bits[K: static int](

 # Montgomery Conversion
 # ------------------------------------------------------------
-# 
+#
 # In Montgomery form, inputs are scaled by a constant R
 # so a' = aR (mod p) and b' = bR (mod p)
 #
@ -453,7 +453,7 @@ func redc2xMont*[N: static int](
       m0ninv: BaseType,
       spareBits: static int, skipFinalSub: static bool = false) {.inline.} =
  ## Montgomery reduce a double-precision bigint modulo M
-  
+
  const skipFinalSub = skipFinalSub and spareBits >= 2

  when UseASM_X86_64 and r.len <= 6:
@ -543,14 +543,17 @@ func sumprodMont*[N: static int](
        r: var Limbs, a, b: array[N, Limbs],
        M: Limbs, m0ninv: BaseType,
        spareBits: static int,
-        skipFinalSub: static bool = false) {.inline.} =
+        skipFinalSub: static bool = false) {.noInline.} =
+  ## Compute r <- ⅀aᵢ.bᵢ (mod M) (sum of products)
+  # This function must be noInline or GCC miscompiles
+  # with LTO, see https://github.com/mratsim/constantine/issues/230
  when spareBits >= 2:
    when UseASM_X86_64 and r.len in {2 .. 6}:
      if ({.noSideEffect.}: hasAdx()):
        r.sumprodMont_CIOS_spare2bits_asm_adx(a, b, M, m0ninv, skipFinalSub)
      else:
        r.sumprodMont_CIOS_spare2bits_asm(a, b, M, m0ninv, skipFinalSub)
-    else:  
+    else:
      r.sumprodMont_CIOS_spare2bits(a, b, M, m0ninv, skipFinalSub)
  else:
    r.mulMont(a[0], b[0], M, m0ninv, spareBits, skipFinalSub = false)
@ -719,7 +722,7 @@ func powMontSquarings(

  # We have k bits and can do k squaring
  for i in 0 ..< k:
-    a.squareMont(a, M, m0ninv, spareBits)  
+    a.squareMont(a, M, m0ninv, spareBits)

  return (k, bits)

--- a/constantine/math/config/curves_prop_field_core.nim
+++ b/constantine/math/config/curves_prop_field_core.nim
@ -45,10 +45,6 @@ func has_P_3mod4_primeModulus*(C: static Curve): static bool =
  ## Returns true iff p ≡ 3 (mod 4)
  (BaseType(C.Mod.limbs[0]) and 3) == 3

-func has_P_3mod8_primeModulus*(C: static Curve): static bool =
-  ## Returns true iff p ≡ 3 (mod 8)
-  (BaseType(C.Mod.limbs[0]) and 7) == 3
-
 func has_P_5mod8_primeModulus*(C: static Curve): static bool =
  ## Returns true iff p ≡ 5 (mod 8)
  (BaseType(C.Mod.limbs[0]) and 7) == 5
--- a/constantine/math/extension_fields/assembly/fp2_asm_x86_adx_bmi2.nim
+++ b/constantine/math/extension_fields/assembly/fp2_asm_x86_adx_bmi2.nim
@ -28,8 +28,8 @@ static: doAssert UseASM_X86_64

 # MULX/ADCX/ADOX
 {.localPassC:"-madx -mbmi2".}
-# Necessary for the compiler to find enough registers (enabled at -O1)
-{.localPassC:"-fomit-frame-pointer".}
+# Necessary for the compiler to find enough registers
+{.localPassC:"-fomit-frame-pointer".} # (enabled at -O1)

 # No exceptions allowed
 {.push raises: [].}
@ -48,8 +48,7 @@ func has1extraBit(F: type Fp): bool =

 func sqrx2x_complex_asm_adx*(
        r: var array[2, FpDbl],
-        a: array[2, Fp]
-      ) =
+        a: array[2, Fp]) =
  ## Complex squaring on 𝔽p2
  # This specialized proc inlines all calls and avoids many ADX support checks.
  # and push/pop for parameter passing.
@ -69,8 +68,7 @@ func sqrx2x_complex_asm_adx*(

 func sqrx_complex_sparebit_asm_adx*(
        r: var array[2, Fp],
-        a: array[2, Fp]
-      ) =
+        a: array[2, Fp]) =
  ## Complex squaring on 𝔽p2
  # This specialized proc inlines all calls and avoids many ADX support checks.
  # and push/pop for parameter passing.
@ -91,8 +89,7 @@ func sqrx_complex_sparebit_asm_adx*(

 func mul2x_fp2_complex_asm_adx*(
        r: var array[2, FpDbl],
-        a, b: array[2, Fp]
-      ) =
+        a, b: array[2, Fp]) =
  ## Complex multiplication on 𝔽p2
  var D {.noInit.}: typeof(r.c0)
  var t0 {.noInit.}, t1 {.noInit.}: typeof(a.c0)
@ -121,15 +118,15 @@ func mul_fp2_complex_asm_adx*(
  ## Complex multiplication on 𝔽p2
  var d {.noInit.}: array[2,doublePrec(Fp)]
  d.mul2x_fp2_complex_asm_adx(a, b)
-  r.c0.mres.limbs.redcMont_asm_adx_inline(
+  # Inlining redcMont_asm_adx causes GCC to miscompile with -Os (--opt:size)
+  # see https://github.com/mratsim/constantine/issues/229
+  r.c0.mres.limbs.redcMont_asm_adx(
    d.c0.limbs2x,
    Fp.fieldMod().limbs,
    Fp.getNegInvModWord(),
-    Fp.getSpareBits()
-  )
-  r.c1.mres.limbs.redcMont_asm_adx_inline(
+    Fp.getSpareBits())
+  r.c1.mres.limbs.redcMont_asm_adx(
    d.c1.limbs2x,
    Fp.fieldMod().limbs,
    Fp.getNegInvModWord(),
-    Fp.getSpareBits()
-  )
+    Fp.getSpareBits())
--- a/constantine/math/extension_fields/exponentiations.nim
+++ b/constantine/math/extension_fields/exponentiations.nim
@ -43,7 +43,7 @@ func getWindowLen(bufLen: int): uint =
 func powPrologue[F](a: var F, scratchspace: var openarray[F]): uint =
  ## Setup the scratchspace, then set a to 1.
  ## Returns the fixed-window size for exponentiation with window optimization
-  result = scratchspace.len.getWindowLen
+  result = scratchspace.len.getWindowLen()
  # Precompute window content, special case for window = 1
  # (i.e scratchspace has only space for 2 temporaries)
  # The content scratchspace[2+k] is set at [k]P
@ -62,8 +62,7 @@ func powSquarings[F](
       tmp: var F,
       window: uint,
       acc, acc_len: var uint,
-       e: var int
-     ): tuple[k, bits: uint] {.inline.}=
+       e: var int): tuple[k, bits: uint] {.inline.}=
  ## Squaring step of exponentiation by squaring
  ## Get the next k bits in range [1, window)
  ## Square k times
@ -105,8 +104,7 @@ func powSquarings[F](
 func powUnsafeExponent[F](
       a: var F,
       exponent: openArray[byte],
-       scratchspace: var openArray[F]
-     ) =
+       scratchspace: var openArray[F]) =
  ## Extension field exponentiation r = a^exponent (mod p^m)
  ##
  ## Warning ⚠️ :
--- a/constantine/math/extension_fields/towers.nim
+++ b/constantine/math/extension_fields/towers.nim
@ -979,12 +979,17 @@ func square2x_disjoint*[Fdbl, F](
 # Multiplications (specializations)
 # -------------------------------------------------------------------

-func prodImpl_fp4o2_p3mod8[C: static Curve](r: var Fp4[C], a, b: Fp4[C]) =
+func prodImpl_fp4o2_complex_snr_1pi[C: static Curve](r: var Fp4[C], a, b: Fp4[C]) =
  ## Returns r = a * b
-  ## For 𝔽p4/𝔽p2 with p ≡ 3 (mod 8),
-  ##   hence 𝔽p QNR is 𝑖 = √-1 as p ≡ 3 (mod 8) implies p ≡ 3 (mod 4)
-  ##   and 𝔽p SNR is (1 + i)
-  static: doAssert C.has_P_3mod8_primeModulus()
+  ## For 𝔽p4/𝔽p2 with the following non-residue (NR) constraints:
+  ##   * -1 is a quadratic non-residue in 𝔽p hence 𝔽p2 has coordinates a+𝑖b with i = √-1. This implies p ≡ 3 (mod 4)
+  ##   * (1 + 𝑖) is a quadratic non-residue in 𝔽p2 hence 𝔽p4 has coordinates a+vb with v = √(1+𝑖).
+  ##
+  ## According to Benger-Scott 2009 (https://eprint.iacr.org/2009/556.pdf)
+  ## About 2/3 of the p ≡ 3 (mod 8) primes are in this case
+  static:
+    doAssert C.getNonResidueFp() == -1
+    doAssert C.getNonresidueFp2() == (1, 1)
  var
    b10_m_b11{.noInit.}, b10_p_b11{.noInit.}: Fp[C]
    n_a01{.noInit.}, n_a11{.noInit.}: Fp[C]
@ -1374,8 +1379,8 @@ func prod*(r: var QuadraticExt, a, b: QuadraticExt) =
    when QuadraticExt is Fp12 or r.typeof.F.C.has_large_field_elem():
      # BW6-761 requires too many registers for Dbl width path
      r.prod_generic(a, b)
-    elif QuadraticExt is Fp4 and QuadraticExt.C.has_P_3mod8_primeModulus():
-      r.prodImpl_fp4o2_p3mod8(a, b)
+    elif QuadraticExt is Fp4 and QuadraticExt.C.getNonResidueFp() == -1 and QuadraticExt.C.getNonResidueFp2() == (1, 1):
+      r.prodImpl_fp4o2_complex_snr_1pi(a, b)
    else:
      var d {.noInit.}: doublePrec(typeof(r))
      d.prod2x_disjoint(a.c0, a.c1, b.c0, b.c1)
@ -1628,13 +1633,18 @@ func square_Chung_Hasan_SQR3(r: var CubicExt, a: CubicExt) =
 # Multiplications (specializations)
 # -------------------------------------------------------------------

-func prodImpl_fp6o2_p3mod8[C: static Curve](r: var Fp6[C], a, b: Fp6[C]) =
+func prodImpl_fp6o2_complex_snr_1pi[C: static Curve](r: var Fp6[C], a, b: Fp6[C]) =
  ## Returns r = a * b
-  ## For 𝔽p6/𝔽p2 with p ≡ 3 (mod 8),
-  ##   hence 𝔽p QNR is 𝑖 = √-1 as p ≡ 3 (mod 8) implies p ≡ 3 (mod 4)
-  ##   and 𝔽p SNR is (1 + i)
+  ## For 𝔽p6/𝔽p2 with the following non-residue (NR) constraints:
+  ##   * -1 is a quadratic non-residue in 𝔽p hence 𝔽p2 has coordinates a+𝑖b with i = √-1. This implies p ≡ 3 (mod 4)
+  ##   * (1 + 𝑖) is a cubic non-residue in 𝔽p2 hence 𝔽p6 has coordinates a+vb+v²c with v = ∛(1+𝑖).
+  ##
+  ## According to Benger-Scott 2009 (https://eprint.iacr.org/2009/556.pdf)
+  ## About 2/3 of the p ≡ 3 (mod 8) primes are in this case
  # https://eprint.iacr.org/2022/367 - Equation 8
-  static: doAssert C.has_P_3mod8_primeModulus()
+  static:
+    doAssert C.getNonResidueFp() == -1
+    doAssert C.getNonresidueFp2() == (1, 1)
  var
    b10_p_b11{.noInit.}, b10_m_b11{.noInit.}: Fp[C]
    b20_p_b21{.noInit.}, b20_m_b21{.noInit.}: Fp[C]
@ -2133,8 +2143,8 @@ func prod*(r: var CubicExt, a, b: CubicExt) =
  ## Out-of-place multiplication
  when CubicExt.C.has_large_field_elem():
    r.prodImpl(a, b)
-  elif r is Fp6 and CubicExt.C.has_P_3mod8_primeModulus():
-    r.prodImpl_fp6o2_p3mod8(a, b)
+  elif r is Fp6 and CubicExt.C.getNonResidueFp() == -1 and CubicExt.C.getNonResidueFp2() == (1, 1):
+    r.prodImpl_fp6o2_complex_snr_1pi(a, b)
  else:
    var d {.noInit.}: doublePrec(typeof(r))
    d.prod2x(a, b)
--- a/constantine/platforms/endians.nim
+++ b/constantine/platforms/endians.nim
@ -34,9 +34,9 @@ template blobFrom*(dst: var openArray[byte], src: SomeUnsignedInt, startIdx: int
    for i in 0 ..< sizeof(src):
      dst[startIdx+sizeof(src)-1-i] = toByte(src shr (i * 8))

-func parseFromBlob*[T: byte|char](
+func parseFromBlob*(
           dst: var SomeUnsignedInt,
-           src: openArray[T],
+           src: openArray[byte],
           cursor: var uint, endian: static Endianness) {.inline.} =
  ## Read an unsigned integer from a raw binary blob.
  ## The `cursor` represents the current index in the array and is updated
@ -63,8 +63,8 @@ func parseFromBlob*[T: byte|char](
  dst = accum
  cursor.inc(L)

-func dumpRawInt*[T: byte|char](
-           dst: var openArray[T],
+func dumpRawInt*(
+           dst: var openArray[byte],
           src: SomeUnsignedInt,
           cursor: uint, endian: static Endianness) {.inline.} =
  ## Dump an integer into raw binary form
--- a/constantine/platforms/gpu/bindings/c_abi.nim
+++ b/constantine/platforms/gpu/bindings/c_abi.nim
--- a/constantine/platforms/gpu/bindings/llvm_abi.nim
+++ b/constantine/platforms/gpu/bindings/llvm_abi.nim
@ -6,7 +6,7 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

-import ./utils
+import ./c_abi

 {.passc: gorge("llvm-config --cflags").}
 {.passl: gorge("llvm-config --libs").}
@ -67,7 +67,7 @@ proc getBufferSize(buf: MemoryBufferRef): csize_t {.used, importc: "LLVMGetBuffe

 proc dispose(msg: ErrorMessageString) {.used, importc: "LLVMDisposeErrorMessage".}
 proc getErrorMessage(err: ErrorRef): ErrorMessageString {.used, importc: "LLVMGetErrorMessage".}
- 
+
 # ############################################################
 #
 #                         Module
@ -117,7 +117,7 @@ proc verify(module: ModuleRef, failureAction: VerifierFailureAction, msg: var LL
 # - initializeNativeTarget()
 # - initializeNativeAsmPrinter()
 # are implemented in the development header macros and aren't in the LLVM library
-# We want to only depend on the runtime for installation ease and size. 
+# We want to only depend on the runtime for installation ease and size.
 #
 # We can emulate the calls based on:
 # - /usr/include/llvm-c/Target.h
@ -375,7 +375,7 @@ proc getTypeOf*(v: ValueRef): TypeRef {.importc: "LLVMTypeOf".}
 proc getValueName2(v: ValueRef, rLen: var csize_t): cstring {.used, importc: "LLVMGetValueName2".}
  ## Returns the name of a value if it exists.
  ## `rLen` stores the returned string length
-  ## 
+  ##
  ## This is not free, it requires internal hash table access
  ## The return value does not have to be freed and is a pointer to an internal LLVM data structure

@ -473,7 +473,7 @@ proc getInlineAsm*(

 # Intermediate Representation
 # ------------------------------------------------------------
-# 
+#
 # - NSW: no signed wrap, signed value cannot over- or underflow.
 # - NUW: no unsigned wrap, unsigned value cannot over- or underflow.

--- a/constantine/platforms/gpu/bindings/nvidia_abi.nim
+++ b/constantine/platforms/gpu/bindings/nvidia_abi.nim
@ -12,7 +12,7 @@
 #
 # ############################################################

-import ./utils
+import ./c_abi

 # ############################################################
 #
@ -466,7 +466,7 @@ type
    CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121,        ## Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2 = 122,             ## 64-bit operations are supported in ::cuStreamBatchMemOp_v2 and related v2 MemOp APIs. */
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2 = 123,             ## ::CU_STREAM_WAIT_VALUE_NOR is supported by v2 MemOp APIs. */
-    CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124,                            ## Device supports buffer sharing with dma_buf mechanism. */ 
+    CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124,                            ## Device supports buffer sharing with dma_buf mechanism. */
    CU_DEVICE_ATTRIBUTE_MAX

  CUmemAttach_flags* = enum
--- a/constantine/platforms/gpu/nvidia.nim
+++ b/constantine/platforms/gpu/nvidia.nim
@ -8,7 +8,7 @@

 import
  ./bindings/nvidia_abi {.all.},
-  ./bindings/utils,
+  ./bindings/c_abi,
  ./llvm, ./ir,
  ./nvidia_inlineasm,
  ../primitives
@ -41,12 +41,12 @@ export
 #
 # Unified memory is fully supported starting from Pascal GPU (GTX 1080, 2016, Compute Capability SM6.0)
 # and require Kepler at minimum.
-# 
+#
 # Cuda 9 exposes the current explicit synchronization primitives (cooperative groups) and deprecated the old ones
 # Those primitives are particularly suitable for Volta GPUs (GTX 2080, 2018, Compute Capability SM7.5)
 # and requiring.
 #
-# Furthermore Pascal GPUs predates the high demand for deep learning and cryptocurrency mining 
+# Furthermore Pascal GPUs predates the high demand for deep learning and cryptocurrency mining
 # and were widely available at an affordable price point.
 # Also given that it's a 7-year-old architecture,
 # it is unlikely that users have an older Nvidia GPU available.
@ -64,7 +64,7 @@ export
 template check*(status: CUresult) =
  ## Check the status code of a CUDA operation
  ## Exit program with error if failure
-  
+
  let code = status # ensure that the input expression is evaluated once only
  if code != CUDA_SUCCESS:
    writeStackTrace()
@ -77,15 +77,15 @@ func cuModuleGetFunction*(kernel: var CUfunction, module: CUmodule, fnName: open
  cuModuleGetFunction(kernel, module, fnName[0].unsafeAddr)

 proc cudaDeviceInit*(deviceID = 0'i32): CUdevice =
-  
+
  check cuInit(deviceID.uint32)
-  
+
  var devCount: int32
  check cuDeviceGetCount(devCount)
  if devCount == 0:
    echo "cudaDeviceInit error: no devices supporting CUDA"
    quit 1
-  
+
  var cuDevice: CUdevice
  check cuDeviceGet(cuDevice, deviceID)
  var name = newString(128)
@ -99,7 +99,7 @@ proc cudaDeviceInit*(deviceID = 0'i32): CUdevice =
  if major < 6:
    echo "Error: Device ",deviceID," is not sm_60 (Pascal generation, GTX 1080) or later"
    quit 1
-  
+
  return cuDevice

 # ############################################################
@ -110,7 +110,7 @@ proc cudaDeviceInit*(deviceID = 0'i32): CUdevice =

 proc tagCudaKernel(module: ModuleRef, fn: FnDef) =
  ## Tag a function as a Cuda Kernel, i.e. callable from host
-  
+
  doAssert fn.fnTy.getReturnType().isVoid(), block:
    "Kernels must not return values but function returns " & $fn.fnTy.getReturnType().getTypeKind()

@ -129,10 +129,10 @@ proc setCallableCudaKernel*(module: ModuleRef, fn: FnDef) =
  ##
  ## A function named `addmod` can be found by appending _public
  ##   check cuModuleGetFunction(fnPointer, cuModule, "addmod_public")
-  
+
  let pubName = fn.fnImpl.getName() & "_public"
  let pubFn = module.addFunction(cstring(pubName), fn.fnTy)
-  
+
  let ctx = module.getContext()
  let builder = ctx.createBuilder()
  defer: builder.dispose()
@ -160,11 +160,11 @@ proc codegenNvidiaPTX*(asy: Assembler_LLVM, sm: tuple[major, minor: int32]): str
  ## SM corresponds to the target GPU architecture Compute Capability
  ## - https://developer.nvidia.com/cuda-gpus
  ## - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
-  ## 
+  ##
  ## This requires the following function to be called beforehand:
  ## - initializePasses()
  ## - initializeFullNVPTXTarget()
-  
+
  debug: doAssert asy.backend == bkNvidiaPTX

  asy.module.verify(AbortProcessAction)
@ -242,9 +242,9 @@ proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) =

    "Most CPUs (x86-64, ARM) are little-endian, as are Nvidia GPUs, which allows naive copying of parameters.\n" &
    "Your architecture '" & $hostCPU & "' is big-endian and GPU offloading is unsupported on it."
-  
+
  # We assume that all arguments are passed by reference in the Cuda kernel, hence the need for GPU alloc.
-  
+
  var rGPU, aGPU, bGPU: CUdeviceptr
  check cuMemAlloc(rGPU, csize_t sizeof(r))
  check cuMemAlloc(aGPU, csize_t sizeof(a))
--- a/constantine/platforms/primitives.nim
+++ b/constantine/platforms/primitives.nim
@ -86,17 +86,16 @@ func setZero*[N](a: var array[N, SomeNumber]){.inline.} =
  for i in 0 ..< a.len:
    a[i] = 0

-func copy*[T: byte|char](
+func rawCopy*(
       dst: var openArray[byte],
       dStart: SomeInteger,
-       src: openArray[T],
+       src: openArray[byte],
       sStart: SomeInteger,
       len: SomeInteger
     ) {.inline.} =
  ## Copy dst[dStart ..< dStart+len] = src[sStart ..< sStart+len]
  ## Unlike the standard library, this cannot throw
  ## even a defect.
-  ## It also handles copy of char into byte arrays
  debug:
    doAssert 0 <= dStart and dStart+len <= dst.len.uint, "dStart: " & $dStart & ", dStart+len: " & $(dStart+len) & ", dst.len: " & $dst.len
    doAssert 0 <= sStart and sStart+len <= src.len.uint, "sStart: " & $sStart & ", sStart+len: " & $(sStart+len) & ", src.len: " & $src.len
--- a/constantine/platforms/views.nim
+++ b/constantine/platforms/views.nim
@ -0,0 +1,181 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import std/macros
+
+# OpenArray type
+# ---------------------------------------------------------
+
+# Present the first `len` elements behind a raw `ptr UncheckedArray[T]`
+# as an `openArray[T]`, i.e. the index range 0 .. len-1.
+# No check is made here that `len` elements are actually readable.
+template toOpenArray*[T](p: ptr UncheckedArray[T], len: int): openArray[T] =
+  p.toOpenArray(0, len-1)
+
+# View type
+# ---------------------------------------------------------
+#
+# This view type is equivalent to (pointer + length)
+# like openArray. Unlike openArray it can be stored in a type
+# Or can be used for nested views like openArray[View[byte]]
+
+# Plain (pointer, length) pair describing `len` contiguous elements of type `T`.
+# Both fields lack an export marker, so they are private to this module;
+# this module defines no allocation or cleanup for the pointed-to memory.
+type View*[T] = object
+  # TODO, use `lent UncheckedArray[T]` for proper borrow-checking - https://github.com/nim-lang/Nim/issues/21674
+  data: ptr UncheckedArray[T]
+  len: int
+
+# Present a `View[T]` as an `openArray[T]` covering indices 0 .. v.len-1
+# of the memory `v.data` points to.
+template toOpenArray*[T](v: View[T]): openArray[T] =
+  v.data.toOpenArray(0, v.len-1)
+
+# Binary blob API
+# ---------------------------------------------------------
+#
+# High-level API needs to provide functions of the form
+# - func verify[T: byte|char](pubkey: PubKey, message: openArray[T], signature: Signature)
+# - func update[T: byte|char](ctx: var Sha256Context, message: openarray[T])
+#
+# for all APIs that ingest bytes/strings including:
+# - Ciphers
+# - Signature protocols
+# - Hashing algorithms
+# - Message Authentication code
+# - Key derivation functions
+#
+# This causes the following issues:
+# - Code explosion due to monomorphization. The code for bytes and char will be duplicated needlessly.
+# - Cannot be exported to C. Generic code cannot be exported to C and so will need manual split
+# - Longer compile-times. The inner functions can be byte-only instead of using generics.
+#
+# Instead we create a `genCharAPI` macro that generates the same function as an openArray[byte]
+# but with openArray[char] inputs
+
+# Private helper: normalise an `openArray[byte]` or `openArray[char]`
+# argument to `openArray[byte]`. `genCharAPI` binds this symbol to wrap
+# the arguments it forwards to the byte-only implementation.
+template toOpenArrayByte[T: byte|char](oa: openArray[T]): openArray[byte] =
+  when T is byte:
+    # Already bytes: pass the argument through untouched.
+    oa
+  else:
+    # `char` input: call the 3-argument `toOpenArrayByte(x, first, last)`
+    # (a different routine than this 1-argument template, presumably the
+    # `system` one) over the full index range.
+    oa.toOpenArrayByte(oa.low, oa.high)
+
+macro genCharAPI*(procAst: untyped): untyped =
+  ## For each openArray[byte] parameter in the input proc
+  ## generate an openArray[char] variation.
+  ##
+  ## The macro accepts a `proc` or `func` definition and returns a statement
+  ## list holding:
+  ## 1. the input definition, unchanged
+  ## 2. a wrapper with the same name (export marker included) and return type
+  ##    that forwards every argument to the input definition, converting the
+  ##    byte-array arguments through `toOpenArrayByte`.
+  ##
+  ## If exactly one parameter group is `openArray[byte]` / `var openArray[byte]`
+  ## the wrapper is non-generic and takes `openArray[char]` instead.
+  ## With several such groups, each parameter gets its own generic type
+  ## `API_<i>_<j>: byte|char`.
+  ## Using the macro on a routine without any such parameter is a compile-time error.
+  procAst.expectKind({nnkProcDef, nnkFuncDef})
+
+  result = newStmtList()
+  result.add procAst
+
+  # procAst[2] is the generic parameter list of the input definition
+  var genericParams = procAst[2].copyNimTree()
+  # params[0] is the return type, parameters are appended below
+  var wrapperParams = nnkFormalParams.newTree(procAst.params[0].copyNimTree())
+  # The wrapper body is a single call to the input routine
+  var wrapperBody = newCall(ident($procAst.name))
+
+  proc matchBytes(node: NimNode): bool =
+    # Purely syntactic match on the literal type expression `openArray[byte]`
+    node.kind == nnkBracketExpr and
+      node[0].eqIdent"openArray" and
+      node[1].eqIdent"byte"
+
+  # We do 2 passes:
+  # If a single params is openArray[byte], we instantiate a non-generic proc.
+  # - This should make for faster compile-times.
+  # - It is also necessary for `hash` and `mac`, as it seems like overloading
+  #   a concept function with an argument that matches both the generic and a concrete param
+  #   crashes. i.e. either you use full generic (with genCharAPI) or you instantiate 2 concrete procs
+
+  # First pass: count the parameter groups (IdentDefs) typed as byte arrays.
+  # A group such as "a, b: openArray[byte]" counts once.
+  let countBytesParams = block:
+    var count = 0
+    for i in 1 ..< procAst.params.len:
+      if procAst.params[i][^2].matchBytes():
+        count += 1
+      elif procAst.params[i][^2].kind == nnkVarTy and procAst.params[i][^2][0].matchBytes():
+        count += 1
+    count
+
+  if countBytesParams == 0:
+    error "Using genCharAPI on an input without any openArray[byte] parameter."
+
+  # Second pass: build the wrapper parameters and the forwarding call.
+  if countBytesParams == 1:
+    # Non-generic wrapper: byte arrays become char arrays
+    for i in 1 ..< procAst.params.len:
+      # Unfortunately, even in typed macro, .sameType(getType(openArray[byte])) doesn't match
+      if procAst.params[i][^2].matchBytes():
+        # Handle "a, b: openArray[byte]"
+        for j in 0 ..< procAst.params[i].len - 2:
+          wrapperParams.add newIdentDefs(
+            procAst.params[i][j].copyNimTree(),
+            nnkBracketExpr.newTree(ident"openArray", ident"char"))
+          wrapperBody.add newCall(bindSym"toOpenArrayByte", procAst.params[i][j])
+      elif procAst.params[i][^2].kind == nnkVarTy and procAst.params[i][^2][0].matchBytes():
+        # Handle "a, b: var openArray[byte]"
+        for j in 0 ..< procAst.params[i].len - 2:
+          wrapperParams.add newIdentDefs(
+            procAst.params[i][j].copyNimTree(),
+            nnkVarTy.newTree(nnkBracketExpr.newTree(ident"openArray", ident"char")))
+          wrapperBody.add newCall(bindSym"toOpenArrayByte", procAst.params[i][j])
+      else:
+        # Any other parameter group is copied as-is and forwarded by name
+        wrapperParams.add procAst.params[i].copyNimTree()
+        # Handle "a, b: int"
+        for j in 0 ..< procAst.params[i].len - 2:
+          wrapperBody.add ident($procAst.params[i][j])
+
+  else:
+    # Generic wrapper: one `byte|char` type parameter per byte-array parameter,
+    # appended to the generic parameters of the input definition (if any)
+    if genericParams.kind == nnkEmpty:
+      genericParams = nnkGenericParams.newTree()
+
+    for i in 1 ..< procAst.params.len:
+      # Unfortunately, even in typed macro, .sameType(getType(openArray[byte])) doesn't match
+      if procAst.params[i][^2].matchBytes():
+        # Handle "a, b: openArray[byte]"
+        for j in 0 ..< procAst.params[i].len - 2:
+          # Unique generic name from the group index and the position in the group
+          let genericId = ident("API_" & $i & "_" & $j)
+          wrapperParams.add newIdentDefs(
+            procAst.params[i][j].copyNimTree(),
+            nnkBracketExpr.newTree(ident"openArray", genericId))
+          genericParams.add newIdentDefs(
+            genericId,
+            nnkInfix.newTree(ident("|"), ident("byte"), ident("char")))
+          wrapperBody.add newCall(bindSym"toOpenArrayByte", procAst.params[i][j])
+      elif procAst.params[i][^2].kind == nnkVarTy and procAst.params[i][^2][0].matchBytes():
+        # Handle "a, b: var openArray[byte]"
+        # NOTE(review): this branch builds the type with bindSym"openArray"
+        # whereas the sibling branches use ident"openArray".
+        for j in 0 ..< procAst.params[i].len - 2:
+          let genericId = ident("API_" & $i & "_" & $j)
+          wrapperParams.add newIdentDefs(
+            procAst.params[i][j].copyNimTree(),
+            nnkVarTy.newTree(nnkBracketExpr.newTree(bindSym"openArray", genericId)))
+          genericParams.add newIdentDefs(
+            genericId,
+            nnkInfix.newTree(ident("|"), ident("byte"), ident("char")))
+          wrapperBody.add newCall(bindSym"toOpenArrayByte", procAst.params[i][j])
+      else:
+        # Any other parameter group is copied as-is and forwarded by name
+        wrapperParams.add procAst.params[i].copyNimTree()
+        # Handle "a, b: int"
+        for j in 0 ..< procAst.params[i].len - 2:
+          wrapperBody.add ident($procAst.params[i][j])
+
+  # The wrapper is always `inline`. Pragmas of the input definition are copied
+  # over except those listed in `skipPragmas`.
+  var pragmas = nnkPragma.newTree(ident"inline")
+  let skipPragmas = ["inline", "noinline", "noInline", "exportc", "exportcpp", "extern", "cdecl", "stdcall", "dynlib", "libPrefix"]
+  for i in 0 ..< procAst.pragma.len:
+    if procAst.pragma[i].kind == nnkIdent:
+      # Bare pragma, e.g. {.foo.}
+      if $procAst.pragma[i] notin skipPragmas:
+        pragmas.add procAst.pragma[i].copyNimTree()
+    else:
+      # Only the `key: value` form is accepted besides bare identifiers
+      procAst.pragma[i].expectKind(nnkExprColonExpr)
+      if $procAst.pragma[i][0] notin skipPragmas:
+        pragmas.add procAst.pragma[i].copyNimTree()
+
+  let wrapper = newTree(
+    procAst.kind,             # proc or func
+    procAst[0].copyNimTree(), # name: Keep export marker if any
+    newEmptyNode(),           # term-rewriting macros
+    genericParams,
+    wrapperParams,
+    pragmas,
+    newEmptyNode(),
+    wrapperBody)
+  result.add wrapper
+
+# Manual smoke test, only active when this module is compiled as the main module:
+# `expandMacros` displays the code produced by `genCharAPI` at compile time.
+when isMainModule:
+  expandMacros:
+
+    # One openArray[byte] parameter: takes the `countBytesParams == 1` path
+    proc foo(x: int, a: openArray[byte]) {.genCharAPI.} =
+      discard
+
+    # Two openArray[byte] parameters: takes the generic `byte|char` path
+    proc bar(x: int, a: openArray[byte], b: openArray[byte]) {.genCharAPI.} =
+      discard
--- a/constantine/signatures/bls_signatures.nim
+++ b/constantine/signatures/bls_signatures.nim
@ -14,7 +14,8 @@ import
    ../math/constants/zoo_generators,
    ../math/config/curves,
    ../hash_to_curve/[hash_to_curve, h2c_hash_to_field],
-    ../hashes
+    ../hashes,
+    ../platforms/views

 # ############################################################
 #
@ -23,7 +24,7 @@ import
 # ############################################################

 # This module implements generic BLS signatures
-# https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-bls-signature-04
+# https://www.ietf.org/archive/id/draft-irtf-cfrg-bls-signature-05.html
 # https://github.com/cfrg/draft-irtf-cfrg-bls-signature
 #
 # We use generic shortnames SecKey, PubKey, Sig
@ -56,14 +57,14 @@ func derivePubkey*[Pubkey, SecKey](pubkey: var Pubkey, seckey: SecKey): bool =
  pubkey.affine(pk)
  return true

-func coreSign*[B1, B2, B3: byte|char, Sig, SecKey](
+func coreSign*[Sig, SecKey](
    signature: var Sig,
    secretKey: SecKey,
-    message: openarray[B1],
+    message: openArray[byte],
    H: type CryptoHash,
    k: static int,
-    augmentation: openarray[B2],
-    domainSepTag: openarray[B3]) =
+    augmentation: openArray[byte],
+    domainSepTag: openArray[byte]) {.genCharAPI.} =
  ## Computes a signature for the message from the specified secret key.
  ##
  ## Output:
@ -81,7 +82,7 @@ func coreSign*[B1, B2, B3: byte|char, Sig, SecKey](
  ## - `augmentation`, an optional augmentation to the message. This will be prepended,
  ##   prior to hashing.
  ##   This is used for building the "message augmentation" variant of BLS signatures
-  ##   https://tools.ietf.org/html/draft-irtf-cfrg-bls-signature-04#section-3.2
+  ##   https://www.ietf.org/archive/id/draft-irtf-cfrg-bls-signature-05.html#section-3.2
  ##   which requires `CoreSign(SK, PK || message)`
  ##   and `CoreVerify(PK, PK || message, signature)`
  ## - `message` is the message to hash
@ -95,14 +96,14 @@ func coreSign*[B1, B2, B3: byte|char, Sig, SecKey](

  signature.affine(sig)

-func coreVerify*[B1, B2, B3: byte|char, Pubkey, Sig](
+func coreVerify*[Pubkey, Sig](
    pubkey: Pubkey,
-    message: openarray[B1],
+    message: openarray[byte],
    signature: Sig,
    H: type CryptoHash,
    k: static int,
-    augmentation: openarray[B2],
-    domainSepTag: openarray[B3]): bool =
+    augmentation: openarray[byte],
+    domainSepTag: openarray[byte]): bool {.genCharAPI.} =
  ## Check that a signature is valid
  ## for a message under the provided public key
  ## This assumes that the PublicKey and Signatures
@ -165,8 +166,7 @@ type
    domainSepTag{.align: 64.}: array[255, byte] # Alignment to enable SIMD
    dst_len: uint8

-func init*[T: char|byte](
-       ctx: var BLSAggregateSigAccumulator, domainSepTag: openArray[T]) =
+func init*(ctx: var BLSAggregateSigAccumulator, domainSepTag: openArray[byte]) {.genCharAPI.} =
  ## Initializes a BLS Aggregate Signature accumulator context.

  type H = BLSAggregateSigAccumulator.H
@ -176,22 +176,18 @@ func init*[T: char|byte](
  if domainSepTag.len > 255:
    var t {.noInit.}: array[H.digestSize(), byte]
    H.shortDomainSepTag(output = t, domainSepTag)
-    copy(ctx.domainSepTag, dStart = 0,
-        t, sStart = 0,
-        H.digestSize())
+    rawCopy(ctx.domainSepTag, dStart = 0, t, sStart = 0, H.digestSize())
    ctx.dst_len = uint8 H.digestSize()
  else:
-    copy(ctx.domainSepTag, dStart = 0,
-        domainSepTag, sStart = 0,
-        domainSepTag.len)
+    rawCopy(ctx.domainSepTag, dStart = 0, domainSepTag, sStart = 0, domainSepTag.len)
    ctx.dst_len = uint8 domainSepTag.len
  for i in ctx.dst_len ..< ctx.domainSepTag.len:
    ctx.domainSepTag[i] = byte 0

-func update*[T: char|byte, Pubkey: ECP_ShortW_Aff](
+func update*[Pubkey: ECP_ShortW_Aff](
       ctx: var BLSAggregateSigAccumulator,
       pubkey: Pubkey,
-       message: openArray[T]): bool =
+       message: openArray[byte]): bool {.genCharAPI.} =
  ## Add a (public key, message) pair
  ## to a BLS aggregate signature accumulator
  ##
@ -224,6 +220,12 @@ func update*[T: char|byte, Pubkey: ECP_ShortW_Aff](

    ctx.millerAccum.update(hmsgG1_aff, pubkey)

+func update*[Pubkey: ECP_ShortW_Aff](
+       ctx: var BLSAggregateSigAccumulator,
+       pubkey: Pubkey,
+       message: View[byte]): bool {.inline.} =
+  ## Add a (public key, message) pair to a BLS aggregate signature
+  ## accumulator, with the message given as a `View[byte]`.
+  ##
+  ## The view is converted with `toOpenArray` and the call is forwarded
+  ## to the array-based `update` overload, whose result is returned.
+  result = update(ctx, pubkey, toOpenArray(message))
+
 func merge*(ctxDst: var BLSAggregateSigAccumulator, ctxSrc: BLSAggregateSigAccumulator): bool =
  ## Merge 2 BLS signature accumulators: ctxDst <- ctxDst + ctxSrc
  ##
@ -318,8 +320,8 @@ type
    # 20*1 (blinding 64-bit) + 50 (Miller) + 50 (final exp) = 120
    secureBlinding{.align: 32.}: array[32, byte]

-func hash[DigestSize: static int, T0, T1: char|byte](
-      H: type CryptoHash, digest: var array[DigestSize, byte], input0: openArray[T0], input1: openArray[T1]) =
+func hash[DigestSize: static int](
+      H: type CryptoHash, digest: var array[DigestSize, byte], input0: openArray[byte], input1: openArray[byte]) =

  static: doAssert DigestSize == H.digestSize()

@ -329,9 +331,9 @@ func hash[DigestSize: static int, T0, T1: char|byte](
  h.update(input1)
  h.finish(digest)

-func init*[T0, T1: char|byte](
-       ctx: var BLSBatchSigAccumulator, domainSepTag: openArray[T0],
-       secureRandomBytes: array[32, byte], accumSepTag: openArray[T1]) =
+func init*(
+       ctx: var BLSBatchSigAccumulator, domainSepTag: openArray[byte],
+       secureRandomBytes: array[32, byte], accumSepTag: openArray[byte]) {.genCharAPI.} =
  ## Initializes a Batch BLS Signature accumulator context.
  ##
  ## This requires cryptographically secure random bytes
@ -352,25 +354,21 @@ func init*[T0, T1: char|byte](
  if domainSepTag.len > 255:
    var t {.noInit.}: array[H.digestSize(), byte]
    H.shortDomainSepTag(output = t, domainSepTag)
-    copy(ctx.domainSepTag, dStart = 0,
-        t, sStart = 0,
-        H.digestSize())
+    rawCopy(ctx.domainSepTag, dStart = 0, t, sStart = 0, H.digestSize())
    ctx.dst_len = uint8 H.digestSize()
  else:
-    copy(ctx.domainSepTag, dStart = 0,
-        domainSepTag, sStart = 0,
-        domainSepTag.len)
+    rawCopy(ctx.domainSepTag, dStart = 0, domainSepTag, sStart = 0, domainSepTag.len)
    ctx.dst_len = uint8 domainSepTag.len
  for i in ctx.dst_len ..< ctx.domainSepTag.len:
    ctx.domainSepTag[i] = byte 0

  H.hash(ctx.secureBlinding, secureRandomBytes, accumSepTag)

-func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
+func update*[Pubkey, Sig: ECP_ShortW_Aff](
       ctx: var BLSBatchSigAccumulator,
       pubkey: Pubkey,
-       message: openArray[T],
-       signature: Sig): bool =
+       message: openArray[byte],
+       signature: Sig): bool {.genCharAPI.} =
  ## Add a (public key, message, signature) triplet
  ## to a BLS signature accumulator
  ##
@ -480,6 +478,13 @@ func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
    hmsgG1_aff.affine(hmsgG1_jac)
    ctx.millerAccum.update(hmsgG1_aff, pubkey)

+func update*[Pubkey, Sig: ECP_ShortW_Aff](
+       ctx: var BLSBatchSigAccumulator,
+       pubkey: Pubkey,
+       message: View[byte],
+       signature: Sig): bool {.inline.} =
+  ctx.update(pubkey, message.toOpenArray(), signature) # View[byte] convenience overload: convert the view so the openArray[byte] `update` is selected instead of recursing into this one
+
 func merge*(ctxDst: var BLSBatchSigAccumulator, ctxSrc: BLSBatchSigAccumulator): bool =
  ## Merge 2 BLS signature accumulators: ctxDst <- ctxDst + ctxSrc
  ##
@ -548,13 +553,13 @@ func aggregate*[T: ECP_ShortW_Aff](r: var T, points: openarray[T]) =
  accum.sum_reduce_vartime(points)
  r.affine(accum)

-func fastAggregateVerify*[B1, B2: byte|char, Pubkey, Sig](
+func fastAggregateVerify*[Pubkey, Sig](
    pubkeys: openArray[Pubkey],
-    message: openarray[B1],
+    message: openArray[byte],
    aggregateSignature: Sig,
    H: type CryptoHash,
    k: static int,
-    domainSepTag: openarray[B2]): bool =
+    domainSepTag: openArray[byte]): bool {.genCharAPI.} =
  ## Verify the aggregate of multiple signatures on the same message by multiple pubkeys
  ## Assumes pubkeys and sig have been checked for non-infinity and group-checked.

@ -563,15 +568,19 @@ func fastAggregateVerify*[B1, B2: byte|char, Pubkey, Sig](

  var aggPubkey {.noinit.}: Pubkey
  aggPubkey.aggregate(pubkeys)
+
+  if bool(aggPubkey.isInf()):
+    return false
+
  aggPubkey.coreVerify(message, aggregateSignature, H, k, augmentation = "", domainSepTag)

-func aggregateVerify*[Msg; B: byte|char, Pubkey, Sig](
+func aggregateVerify*[Msg, Pubkey, Sig](
    pubkeys: openArray[Pubkey],
    messages: openArray[Msg],
    aggregateSignature: Sig,
    H: type CryptoHash,
    k: static int,
-    domainSepTag: openarray[B]): bool =
+    domainSepTag: openarray[byte]): bool {.genCharAPI.} =
  ## Verify the aggregated signature of multiple (pubkey, message) pairs
  ## Assumes pubkeys and the aggregated signature have been checked for non-infinity and group-checked.
  ##
@ -598,14 +607,14 @@ func aggregateVerify*[Msg; B: byte|char, Pubkey, Sig](

  return accum.finalVerify(aggregateSignature)

-func batchVerify*[Msg; B: byte|char, Pubkey, Sig](
+func batchVerify*[Msg, Pubkey, Sig](
    pubkeys: openArray[Pubkey],
    messages: openArray[Msg],
    signatures: openArray[Sig],
    H: type CryptoHash,
    k: static int,
-    domainSepTag: openarray[B],
-    secureRandomBytes: array[32, byte]): bool =
+    domainSepTag: openarray[byte],
+    secureRandomBytes: array[32, byte]): bool {.genCharAPI.} =
  ## Verify that all (pubkey, message, signature) triplets are valid
  ##
  ## Returns false if there is at least one incorrect signature
--- a/constantine/zoo_exports.nim
+++ b/constantine/zoo_exports.nim
@ -0,0 +1,52 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# This module allows flexible exports of procedures.
+# 1. This allows configuring all exported names from the protocol files
+#    instead of having those in many different places.
+# 2. No extra public wrapper proc are needed, reducing function call/return overhead.
+#    i.e. if we have an inner sha256.hash function
+#         and we need an exported `ctt_sha256_hash` and we also have a `hash_to_curve` function
+#         that internally uses `sha256.hash`,
+#         the ideal outcome is for `sha256.hash` to be exported as `ctt_sha256_hash` and
+#         have `hash_to_curve` directly use that.
+# 3. Furthermore while compiling Nim only, no export marker (cdecl, dynlib, exportc) are used.
+#
+# Each prefix must be modified before importing the module to export
+
+# Exportable functions
+# ----------------------------------------------------------------------------------------------
+
+var prefix_sha256* {.compileTime.} = ""
+
+# Conditional exports
+# ----------------------------------------------------------------------------------------------
+
+import std/macros
+
+macro libPrefix*(prefix: static string, procAst: untyped): untyped =
+  if prefix == "":
+    return procAst # Empty prefix: hand back the proc AST unchanged, no export pragmas are attached
+  else:
+    var pragmas = procAst.pragma
+    if pragmas.kind == nnkEmpty:
+      pragmas = nnkPragma.newTree() # The proc had no pragma list yet: start a fresh one
+
+    pragmas.add ident"cdecl"
+    pragmas.add nnkExprColonExpr.newTree(
+      ident"exportc",
+      newLit(prefix & "$1")) # Builds {.exportc: "<prefix>$1".}
+    pragmas.add nnkExprColonExpr.newTree(
+      ident"raises",
+      nnkBracket.newTree()) # Builds {.raises: [].}
+
+    if appType == "lib":
+      pragmas.add ident"dynlib" # Only attached when the application type is "lib"
+
+    result = procAst
+    result.pragma = pragmas
--- a/examples_c/ethereum_bls_signatures.c
+++ b/examples_c/ethereum_bls_signatures.c
@ -0,0 +1,63 @@
+/** Constantine
+ *  Copyright (c) 2018-2019    Status Research & Development GmbH
+ *  Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+ *  Licensed and distributed under either of
+ *    * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+ *    * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+ *  at your option. This file may not be copied, modified, or distributed except according to those terms.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <constantine_ethereum_bls_signatures.h>
+
+int main(){
+
+  // Initialize the runtime. For Constantine, it populates the CPU runtime detection dispatch.
+  ctt_eth_bls_init_NimMain();
+
+  ctt_eth_bls_status status;
+
+  // Declare an example insecure non-cryptographically random non-secret key. DO NOT USE IN PRODUCTION.
+  byte raw_seckey[32] = "Security pb becomes key mgmt pb!";
+  ctt_eth_bls_seckey seckey;
+
+  status = ctt_eth_bls_deserialize_seckey(&seckey, raw_seckey);
+  if (status != cttBLS_Success) {
+    printf("Secret key deserialization failure: status %d - %s\n", status, ctt_eth_bls_status_to_string(status));
+    exit(1);
+  }
+
+  // Derive the matching public key
+  ctt_eth_bls_pubkey pubkey;
+
+  status = ctt_eth_bls_derive_pubkey(&pubkey, &seckey);
+  if (status != cttBLS_Success) {
+    printf("Public key derivation failure: status %d - %s\n", status, ctt_eth_bls_status_to_string(status));
+    exit(1);
+  }
+
+  // Sign a message
+  byte message[32];
+  ctt_eth_bls_signature sig;
+
+  ctt_eth_bls_sha256_hash(message, (const byte*) "Mr F was here", 13, /* clear_memory = */ 0); // cast: string literals are char[], the API takes const byte*
+
+  status = ctt_eth_bls_sign(&sig, &seckey, message, 32);
+  if (status != cttBLS_Success) {
+    printf("Message signing failure: status %d - %s\n", status, ctt_eth_bls_status_to_string(status));
+    exit(1);
+  }
+
+  // Verify that a signature is valid for a message under the provided public key
+  status = ctt_eth_bls_verify(&pubkey, message, 32, &sig);
+  if (status != cttBLS_Success) {
+    printf("Signature verification failure: status %d - %s\n", status, ctt_eth_bls_status_to_string(status));
+    exit(1);
+  }
+
+  printf("Example BLS signature/verification protocol completed successfully\n");
+  return 0;
+}
--- a/examples_c/t_libctt_bls12_381.c
+++ b/examples_c/t_libctt_bls12_381.c
@ -1,21 +1,21 @@
-// Constantine
-// Copyright (c) 2018-2019    Status Research & Development GmbH
-// Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-// Licensed and distributed under either of
-//   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-//   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-// at your option. This file may not be copied, modified, or distributed except according to those terms.
+/** Constantine
+ *  Copyright (c) 2018-2019    Status Research & Development GmbH
+ *  Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+ *  Licensed and distributed under either of
+ *    * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+ *    * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+ *  at your option. This file may not be copied, modified, or distributed except according to those terms.
+ */

 // This is a test to ensure Constantine's modular arithmetic is consistent with GMP.
 // While not intended as a tutorial, it showcases serialization, deserialization and computation.

 #include <assert.h>
-#include <gmp.h>
-#include <constantine_bls12_381.h>
 #include <stdio.h>
 #include <stdlib.h>

-typedef unsigned char byte;
+#include <gmp.h>
+#include <constantine_bls12_381.h>

 // https://gmplib.org/manual/Integer-Import-and-Export.html
 const int GMP_WordLittleEndian = -1;
--- a/helpers/explain_bin_size.sh
+++ b/helpers/explain_bin_size.sh
@ -0,0 +1,4 @@
+#!/bin/sh
+# Explain size of ELF .o files (does not work with gcc -flto).
+nm -oS --defined-only -fposix -td "$@" |
+    sort -nk5 | awk '{print $1,$2,$3,$5}'
--- a/helpers/pararun.nim
+++ b/helpers/pararun.nim
@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

 import
-  std/[os, strutils, cpuinfo, strformat, deques, terminal],
+  std/[os, strutils, cpuinfo, strformat, deques],
  std/[asyncfutures, asyncdispatch],
  asynctools/[asyncproc, asyncpipe, asyncsync]

@ -46,7 +46,7 @@ proc release(s: AsyncSemaphore) =
  if s.waiters.len > 0:
    let waiter = s.waiters.popFirst()
    waiter.complete()
-  
+
  doAssert s.slots in {0..s.max}

 # Task runner
@ -55,72 +55,90 @@ proc release(s: AsyncSemaphore) =
 type WorkQueue = ref object
  sem: AsyncSemaphore
  cmdQueue: Deque[string]
-  outputQueue: AsyncQueue[tuple[cmd: string, p: AsyncProcess]]
-  lineBuf: string
+  outputQueue: AsyncQueue[tuple[cmd: string, p: AsyncProcess, output: AsyncQueue[string]]]

-proc releaseOnProcessExit(sem: AsyncSemaphore, p: AsyncProcess) {.async.} =
-  # TODO: addProcess callback on exit is cleaner but locks the AsyncPipe "readInto"
-  #
-  # p.processID.addProcess do (fd: AsyncFD) -> bool:
-  #   sem.release()
-  #
-  # see also: https://forum.nim-lang.org/t/5565
-  # and https://github.com/cheatfate/asynctools/issues/20
+proc monitorProcessLoop(output: AsyncQueue[string], cmd: string, id, total: int, p: AsyncProcess, sem: AsyncSemaphore) {.async.} =
+  # Ideally we want AsyncStreams but that requires chronos, which doesn't support processes/pipes
+  # Or the nimboost package that hasn't been updated since 2019. So poor man's streams.
+  template doBuffering: untyped =
+    while true:
+      buf.setLen(256)
+      let charsRead = await p.outputHandle.readInto(buf[0].addr, buf.len)
+      if charsRead > 0:
+        buf.setLen(charsRead)
+        output.putNoWait(buf)
+      else:
+        break

+  var buf = newString(256)
+  doBuffering()
+
+  # Despite the output being empty we might still get STILL_ACTIVE: https://github.com/cheatfate/asynctools/blob/84ced6d/asynctools/asyncproc.nim#L24
+  # Unfortunately this gives "Resource temporarily unavailable" so we use exponential backoff.
+  # See also:
+  #  - https://github.com/cheatfate/asynctools/issues/20
+  #  - https://forum.nim-lang.org/t/5565
+  #
+  # let exitCode = await p.waitForExit()
  var backoff = 8
  while p.running():
    backoff = min(backoff*2, 1024) # Exponential backoff
    await sleepAsync(backoff)
-  sem.release()
+
+  doBuffering()
+  buf.setLen(0)
+
+  let exitCode = p.peekExitCode()
+  if exitCode != 0:
+    buf.add("\n" & '='.repeat(26) & " Command exited with code " & $exitCode & " " & '='.repeat(26) & '\n')
+    buf.add("[FAIL]: '" & cmd & "' (#" & $id & "/" & $total & ")\n")
+    buf.add("[FAIL]: Command #" & $id & " exited with error " & $exitCode & '\n')
+    buf.add('='.repeat(80) & '\n')
+    output.putNoWait(buf)
+
+  # close not exported: https://github.com/cheatfate/asynctools/issues/16
+  p.inputHandle.close()
+  p.outputHandle.close()
+  p.errorHandle.close()
+
+  output.putNoWait("")
+  if exitCode == 0:
+    sem.release()

 proc enqueuePendingCommands(wq: WorkQueue) {.async.} =
+  var id = 0
+  let total = wq.cmdQueue.len
  while wq.cmdQueue.len > 0:
+    id += 1
+
    await wq.sem.acquire()
    let cmd = wq.cmdQueue.popFirst()
-    let p = cmd.startProcess(
-      options = {poStdErrToStdOut, poUsePath, poEvalCommand}
-    )
-    p.inputHandle.close()
+    let p = cmd.startProcess(options = {poStdErrToStdOut, poUsePath, poEvalCommand})

-    asyncCheck wq.sem.releaseOnProcessExit(p)
-    wq.outputQueue.putNoWait((cmd, p))
+    let bufOut = newAsyncQueue[string]()
+    asyncCheck bufOut.monitorProcessLoop(cmd, id, total, p, wq.sem)

-proc flushCommandsOutput(wq: WorkQueue) {.async.} =
+    wq.outputQueue.putNoWait((cmd, p, bufOut))
+
+proc flushCommandsOutput(wq: WorkQueue, total: int) {.async.} =
  var id = 0
  while true:
-    let (cmd, p) = await wq.outputQueue.get()
-    
-    echo '\n', '='.repeat(80)
-    echo "||\n|| Running: ", cmd ,"\n||"
-    echo '='.repeat(80)
-    
-    while true:
-      let charsRead = await p.outputHandle.readInto(wq.lineBuf[0].addr, wq.lineBuf.len)
-      if charsRead == 0:
-        break
-      let charsWritten = stdout.writeBuffer(wq.lineBuf[0].addr, charsRead)
-      doAssert charsRead == charsWritten
-    
-    # close not exported: https://github.com/cheatfate/asynctools/issues/16
-    p.outputHandle.close()
-    
-    let exitCode = p.peekExitCode()
-    if exitCode == 259:
-      echo "==== Command exited with code 259 ===="
-      echo "[SKIP]: '", cmd, "' (#", id, ")"
-      echo "==== Custom stacktrace ===="
-      writeStackTrace()
-      echo "==== Custom stacktrace ===="
-      echo "[SKIP]: Assuming process was unregistered when trying to retrieve its exit code"
-    elif exitCode != 0:
-      echo "==== Command exited with code ", exitCode, " ===="
-      echo "[FAIL]: '", cmd, "' (#", id, ")"
-      echo "==== Custom stacktrace ===="
-      writeStackTrace()
-      echo "==== Custom stacktrace ===="
-      quit "[FAIL]: Command #" & $id & " exited with error " & $exitCode, exitCode
-
    id += 1
+    let (cmd, p, processOutput) = await wq.outputQueue.get()
+
+    echo '\n', '='.repeat(80)
+    echo "||\n|| Running #", id, "/", total, ": ", cmd ,"\n||"
+    echo '='.repeat(80)
+
+    while true:
+      let output = await processOutput.get()
+      if output == "":
+        break
+      stdout.write(output)
+
+    let exitCode = p.peekExitCode()
+    if exitCode != 0:
+      quit exitCode

    if wq.cmdQueue.len == 0 and wq.outputQueue.len == 0:
      return
@ -132,9 +150,7 @@ proc runCommands(commandFile: string, numWorkers: int) =
  let wq = WorkQueue(
    sem: AsyncSemaphore.new(numWorkers),
    cmdQueue: initDeque[string](),
-    outputQueue: newAsyncQueue[tuple[cmd: string, p: AsyncProcess]](),
-    lineBuf: newString(max(80, terminalWidth()))
-  )
+    outputQueue: newAsyncQueue[tuple[cmd: string, p: AsyncProcess, output: AsyncQueue[string]]]())

  # Parse the file
  # --------------
@ -142,16 +158,17 @@ proc runCommands(commandFile: string, numWorkers: int) =
    if cmd.len == 0: continue
    wq.cmdQueue.addLast(cmd)

-  echo "Found ", wq.cmdQueue.len, " commands to run"
-  
+  let total = wq.cmdQueue.len
+  echo "Found ", total, " commands to run"
+
  # Run the commands
  # ----------------
  asyncCheck wq.enqueuePendingCommands()
-  waitFor wq.flushCommandsOutput()
+  waitFor wq.flushCommandsOutput(total)

 # Main
 # ----------------------------------------------------------------
-  
+
 proc main() =
  var commandFile: string
  var numWorkers = countProcessors()
@ -162,7 +179,7 @@ proc main() =

  if paramCount() >= 1:
    commandFile = paramStr(1)
-  
+
  if paramCount() == 2:
    numWorkers = paramStr(2).parseInt()

--- a/include/constantine_bls12_381.h
+++ b/include/constantine_bls12_381.h
@ -1,11 +1,10 @@
-/*
- * Constantine
- * Copyright (c) 2018-2019    Status Research & Development GmbH
- * Copyright (c) 2020-Present Mamy André-Ratsimbazafy
- * Licensed and distributed under either of
- *   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
- *   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
- * at your option. This file may not be copied, modified, or distributed except according to those terms.
+/** Constantine
+ *  Copyright (c) 2018-2019    Status Research & Development GmbH
+ *  Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+ *  Licensed and distributed under either of
+ *    * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+ *    * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+ *  at your option. This file may not be copied, modified, or distributed except according to those terms.
 */
 #ifndef __CTT_H_BLS12381__
 #define __CTT_H_BLS12381__
--- a/include/constantine_ethereum_bls_signatures.h
+++ b/include/constantine_ethereum_bls_signatures.h
@ -0,0 +1,353 @@
+/** Constantine
+ *  Copyright (c) 2018-2019    Status Research & Development GmbH
+ *  Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+ *  Licensed and distributed under either of
+ *    * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+ *    * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+ *  at your option. This file may not be copied, modified, or distributed except according to those terms.
+ */
+#ifndef __CTT_H_ETHEREUM_BLS_SIGNATURES__
+#define __CTT_H_ETHEREUM_BLS_SIGNATURES__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Basic Types
+// ------------------------------------------------------------------------------------------------
+
+#if defined(__SIZE_TYPE__) && defined(__PTRDIFF_TYPE__)
+typedef __SIZE_TYPE__    size_t;
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+#else
+#include <stddef.h>
+#endif
+
+#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) && defined(__UINT64_TYPE__)
+typedef __UINT8_TYPE__   uint8_t;
+typedef __UINT32_TYPE__  uint32_t;
+typedef __UINT64_TYPE__  uint64_t;
+#else
+#include <stdint.h>
+#endif
+
+// https://github.com/nim-lang/Nim/blob/v1.6.12/lib/nimbase.h#L318
+#if defined(__STDC_VERSION__) && __STDC_VERSION__>=199901
+# define bool _Bool
+#else
+# define bool unsigned char
+#endif
+
+typedef uint8_t          byte;
+
+// Attributes
+// ------------------------------------------------------------------------------------------------
+
+#if defined(_MSC_VER)
+#  define ctt_pure __declspec(noalias)
+#elif defined(__GNUC__)
+#  define ctt_pure __attribute__((pure))
+#else
+#  define ctt_pure
+#endif
+
+#if defined(_MSC_VER)
+#  define align(x)  __declspec(align(x))
+#else
+#  define align(x)  __attribute__((aligned(x)))
+#endif
+
+// BLS signature types
+// ------------------------------------------------------------------------------------------------
+
+#define FIELD_BITS 381
+#define ORDER_BITS 255
+#define BYTES(bits) ((int) ((bits) + 8 - 1) / 8)
+
+struct ctt_eth_bls_fp { byte raw[BYTES(FIELD_BITS)]; };
+struct ctt_eth_bls_fp2 { struct ctt_eth_bls_fp coords[2]; };
+
+typedef struct { byte raw[BYTES(ORDER_BITS)]; } ctt_eth_bls_seckey;
+typedef struct { struct ctt_eth_bls_fp  x, y; } ctt_eth_bls_pubkey;
+typedef struct { struct ctt_eth_bls_fp2 x, y; } ctt_eth_bls_signature;
+
+typedef enum __attribute__((__packed__)) {
+    cttBLS_Success,
+    cttBLS_VerificationFailure,
+    cttBLS_InvalidEncoding,
+    cttBLS_CoordinateGreaterOrEqualThanModulus,
+    cttBLS_PointAtInfinity,
+    cttBLS_PointNotOnCurve,
+    cttBLS_PointNotInSubgroup,
+    cttBLS_ZeroSecretKey,
+    cttBLS_SecretKeyLargerThanCurveOrder,
+    cttBLS_ZeroLengthAggregation,
+    cttBLS_InconsistentLengthsOfInputs,
+} ctt_eth_bls_status;
+
+static const char* ctt_eth_bls_status_to_string(ctt_eth_bls_status status) {
+  static const char* const statuses[] = { // Indexed by status value: entries must stay in the same order as the ctt_eth_bls_status enum
+    "cttBLS_Success",
+    "cttBLS_VerificationFailure",
+    "cttBLS_InvalidEncoding",
+    "cttBLS_CoordinateGreaterOrEqualThanModulus",
+    "cttBLS_PointAtInfinity",
+    "cttBLS_PointNotOnCurve",
+    "cttBLS_PointNotInSubgroup",
+    "cttBLS_ZeroSecretKey",
+    "cttBLS_SecretKeyLargerThanCurveOrder",
+    "cttBLS_ZeroLengthAggregation",
+    "cttBLS_InconsistentLengthsOfInputs",
+  };
+  size_t length = sizeof statuses / sizeof *statuses; // Number of entries in the table
+  if (0 <= status && status < length) {
+    return statuses[status];
+  }
+  return "cttBLS_InvalidStatusCode"; // Fallback for any value outside the table
+}
+
+// Initialization
+// ------------------------------------------------------------------------------------------------
+
+/** Initializes the library:
+ *  - detect CPU features like ADX instructions support (MULX, ADCX, ADOX)
+ */
+void ctt_eth_bls_init_NimMain(void);
+
+// SHA-256
+// ------------------------------------------------------------------------------------------------
+
+typedef struct {
+  align(64) uint32_t message_schedule[16];
+  align(64) byte     buf[64];
+            uint64_t msgLen;
+} ctt_eth_bls_sha256_context;
+
+/** Initialize or reinitialize a Sha256 context.
+ */
+void ctt_eth_bls_sha256_init(ctt_eth_bls_sha256_context* ctx);
+
+/** Append a message to a SHA256 context
+ *  for incremental SHA256 computation
+ *
+ *  Security note: the tail of your message might be stored
+ *  in an internal buffer.
+ *  if sensitive content is used, ensure that
+ *  `ctx.finish(...)` and `ctx.clear()` are called as soon as possible.
+ *  Additionally ensure that the message(s) passed were stored
+ *  in memory considered secure for your threat model.
+ *
+ *  For passwords and secret keys, you MUST NOT use raw SHA-256
+ *  use a Key Derivation Function instead (KDF)
+ */
+void ctt_eth_bls_sha256_update(ctt_eth_bls_sha256_context* ctx, const byte* message, ptrdiff_t message_len);
+
+/** Finalize a SHA256 computation and output the
+ *  message digest to the `digest` buffer.
+ *
+ *  Security note: this does not clear the internal buffer.
+ *  if sensitive content is used, use "ctx.clear()"
+ *  and also make sure that the message(s) passed were stored
+ *  in memory considered secure for your threat model.
+ *
+ *  For passwords and secret keys, you MUST NOT use raw SHA-256
+ *  use a Key Derivation Function instead (KDF)
+ */
+void ctt_eth_bls_sha256_finish(ctt_eth_bls_sha256_context* ctx, byte digest[32]);
+
+/** Clear the context internal buffers
+ *  Security note:
+ *  For passwords and secret keys, you MUST NOT use raw SHA-256
+ *  use a Key Derivation Function instead (KDF)
+ */
+void ctt_eth_bls_sha256_clear(ctt_eth_bls_sha256_context* ctx);
+
+/** Compute the SHA-256 hash of message
+ *  and store the result in digest.
+ *  Optionally, clear the memory buffer used.
+ */
+void ctt_eth_bls_sha256_hash(byte digest[32], const byte* message, ptrdiff_t message_len, bool clear_memory);
+
+// Comparisons
+// ------------------------------------------------------------------------------------------------
+
+ctt_pure bool ctt_eth_bls_pubkey_is_zero(const ctt_eth_bls_pubkey* pubkey);
+ctt_pure bool ctt_eth_bls_signature_is_zero(const ctt_eth_bls_signature* sig);
+
+ctt_pure bool ctt_eth_bls_pubkeys_are_equal(const ctt_eth_bls_pubkey* a,
+                                            const ctt_eth_bls_pubkey* b);
+ctt_pure bool ctt_eth_bls_signatures_are_equal(const ctt_eth_bls_signature* a,
+                                               const ctt_eth_bls_signature* b);
+
+// Input validation
+// ------------------------------------------------------------------------------------------------
+
+/** Validate the secret key.
+ *
+ *  Regarding timing attacks, this will leak timing information only if the key is invalid.
+ *  Namely, the secret key is 0 or the secret key is too large.
+ */
+ctt_pure ctt_eth_bls_status ctt_eth_bls_validate_seckey(const ctt_eth_bls_seckey* seckey);
+
+/** Validate the public key.
+ *
+ *  This is an expensive operation that can be cached.
+ */
+ctt_pure ctt_eth_bls_status ctt_eth_bls_validate_pubkey(const ctt_eth_bls_pubkey* pubkey);
+
+/** Validate the signature.
+ *
+ *  This is an expensive operation that can be cached.
+ */
+ctt_pure ctt_eth_bls_status ctt_eth_bls_validate_signature(const ctt_eth_bls_signature* sig);
+
+// Codecs
+// ------------------------------------------------------------------------------------------------
+/** Serialize a secret key
+ *
+ *  Returns cttBLS_Success if successful
+ */
+ctt_eth_bls_status ctt_eth_bls_serialize_seckey(byte dst[32], const ctt_eth_bls_seckey* seckey);
+
+/** Serialize a public key in compressed (Zcash) format
+ *
+ *  Returns cttBLS_Success if successful
+ */
+ctt_eth_bls_status ctt_eth_bls_serialize_pubkey_compressed(byte dst[48], const ctt_eth_bls_pubkey* pubkey);
+
+/** Serialize a signature in compressed (Zcash) format
+ *
+ *  Returns cttBLS_Success if successful
+ */
+ctt_eth_bls_status ctt_eth_bls_serialize_signature_compressed(byte dst[96], const ctt_eth_bls_signature* sig);
+
+/** Deserialize a secret key
+ *  This also validates the secret key.
+ *
+ *  This is protected against side-channel unless your key is invalid.
+ *  In that case it will leak whether it's all zeros or larger than the curve order.
+ */
+ctt_eth_bls_status ctt_eth_bls_deserialize_seckey(ctt_eth_bls_seckey* seckey, const byte src[32]);
+
+/** Deserialize a public key in compressed (Zcash) format.
+ *  This does not validate the public key.
+ *  It is intended for cases where public keys are stored in a trusted location
+ *  and validation can be cached.
+ *
+ *  Warning ⚠:
+ *    This procedure skips the very expensive subgroup checks.
+ *    Not checking subgroup exposes a protocol to small subgroup attacks.
+ *
+ *  Returns cttBLS_Success if successful
+ */
+ctt_eth_bls_status ctt_eth_bls_deserialize_pubkey_compressed_unchecked(ctt_eth_bls_pubkey* pubkey, const byte src[48]);
+
+/** Deserialize a public_key in compressed (Zcash) format.
+ *  This also validates the public key.
+ *
+ *  Returns cttBLS_Success if successful
+ */
+ctt_eth_bls_status ctt_eth_bls_deserialize_pubkey_compressed(ctt_eth_bls_pubkey* pubkey, const byte src[48]);
+
+/** Deserialize a signature in compressed (Zcash) format.
+ *  This does not validate the signature.
+ *  It is intended for cases where signatures are stored in a trusted location
+ *  and validation can be cached.
+ *
+ *  Warning ⚠:
+ *    This procedure skips the very expensive subgroup checks.
+ *    Not checking subgroup exposes a protocol to small subgroup attacks.
+ *
+ *  Returns cttBLS_Success if successful
+ */
+ctt_eth_bls_status ctt_eth_bls_deserialize_signature_compressed_unchecked(ctt_eth_bls_signature* sig, const byte src[96]);
+
+/** Deserialize a signature in compressed (Zcash) format.
+ *  This also validates the signature.
+ *
+ *  Returns cttBLS_Success if successful
+ */
+ctt_eth_bls_status ctt_eth_bls_deserialize_signature_compressed(ctt_eth_bls_signature* sig, const byte src[96]);
+
+// BLS signatures
+// ------------------------------------------------------------------------------------------------
+
+/** Derive the public key matching with a secret key
+ *
+ *  Secret protection:
+ *  - A valid secret key will only leak that it is valid.
+ *  - An invalid secret key will leak whether it's all zero or larger than the curve order.
+ */
+ctt_eth_bls_status ctt_eth_bls_derive_pubkey(ctt_eth_bls_pubkey* pubkey, const ctt_eth_bls_seckey* seckey);
+
+/** Produce a signature for the message under the specified secret key
+ *  Signature is on BLS12-381 G2 (and public key on G1)
+ *
+ *  For message domain separation purpose, the tag is `BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_`
+ *
+ *  Input:
+ *  - A secret key
+ *  - A message
+ *
+ *  Output:
+ *  - `signature` is overwritten with `message` signed with `secretKey`
+ *    with the scheme
+ *  - A status code indicating success or if the secret key is invalid.
+ *
+ *  Secret protection:
+ *  - A valid secret key will only leak that it is valid.
+ *  - An invalid secret key will leak whether it's all zero or larger than the curve order.
+ */
+ctt_eth_bls_status ctt_eth_bls_sign(ctt_eth_bls_signature* sig,
+                                    const ctt_eth_bls_seckey* seckey,
+                                    const byte* message, ptrdiff_t message_len);
+
+/** Check that a signature is valid for a message
+ *  under the provided public key.
+ *  returns `cttBLS_Success` if the signature is valid, an error status otherwise.
+ *
+ *  For message domain separation purpose, the tag is `BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_`
+ *
+ *  Input:
+ *  - A public key initialized by one of the key derivation or deserialization procedure.
+ *    Or validated via validate_pubkey
+ *  - A message
+ *  - A signature initialized by one of the key derivation or deserialization procedure.
+ *    Or validated via validate_signature
+ *
+ *  Output:
+ *  - a status code with verification success if signature is valid
+ *    or indicating verification failure
+ *
+ *  In particular, the public key and signature are assumed to be on curve and subgroup-checked.
+ */
+ctt_pure ctt_eth_bls_status ctt_eth_bls_verify(const ctt_eth_bls_pubkey* pubkey,
+                                               const byte* message, ptrdiff_t message_len,
+                                               const ctt_eth_bls_signature* sig);
+
+// TODO: API for pubkeys and signature aggregation. Return a bool or a status code or nothing?
+
+/** Check that a signature is valid for a message
+ *  under the aggregate of provided public keys.
+ *  returns `cttBLS_Success` if the signature is valid, an error status otherwise.
+ *
+ *  For message domain separation purpose, the tag is `BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_`
+ *
+ *  Input:
+ *  - Public keys initialized by one of the key derivation or deserialization procedure.
+ *    Or validated via validate_pubkey
+ *  - A message
+ *  - A signature initialized by one of the key derivation or deserialization procedure.
+ *    Or validated via validate_signature
+ *
+ *  In particular, the public keys and signature are assumed to be on curve subgroup checked.
+ */
+ctt_pure ctt_eth_bls_status ctt_eth_bls_fast_aggregate_verify(const ctt_eth_bls_pubkey pubkeys[], ptrdiff_t pubkeys_len,
+                                                              const byte* message, ptrdiff_t message_len,
+                                                              const ctt_eth_bls_signature* aggregate_sig);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/include/constantine_pasta.h
+++ b/include/constantine_pasta.h
@ -1,11 +1,10 @@
-/*
- * Constantine
- * Copyright (c) 2018-2019    Status Research & Development GmbH
- * Copyright (c) 2020-Present Mamy André-Ratsimbazafy
- * Licensed and distributed under either of
- *   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
- *   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
- * at your option. This file may not be copied, modified, or distributed except according to those terms.
+/** Constantine
+ *  Copyright (c) 2018-2019    Status Research & Development GmbH
+ *  Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+ *  Licensed and distributed under either of
+ *    * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+ *    * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+ *  at your option. This file may not be copied, modified, or distributed except according to those terms.
 */
 #ifndef __CTT_H_PASTA__
 #define __CTT_H_PASTA__
--- a/lib/.gitignore
+++ b/lib/.gitignore
@ -0,0 +1,5 @@
+# Ignore everything in this directory
+*
+# Except this file and README
+!.gitignore
+!README.md
--- a/tests/gpu/hello_world_nvidia.nim
+++ b/tests/gpu/hello_world_nvidia.nim
@ -6,7 +6,7 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

-import ../../constantine/platforms/gpu/[llvm, nvidia, bindings/utils]
+import ../../constantine/platforms/gpu/[llvm, nvidia, bindings/c_abi]

 # ############################################################
 #
@ -60,8 +60,8 @@ proc nvvmIRVersion*(majorIR, minorIR, majorDbg, minorDbg: var int32): NvvmResult

 proc nvvmCreateProgram*(prog: var NvvmProgram): NvvmResult
 proc nvvmDestroyProgram*(prog: var NvvmProgram): NvvmResult
-proc nvvmAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult {.wrapOpenArrayLenType: csize_t.} 
-proc nvvmLazyAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult {.wrapOpenArrayLenType: csize_t.} 
+proc nvvmAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult {.wrapOpenArrayLenType: csize_t.}
+proc nvvmLazyAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult {.wrapOpenArrayLenType: csize_t.}
 proc nvvmCompileProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult
 proc nvvmVerifyProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult
 proc nvvmGetCompiledResultSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
@ -93,7 +93,7 @@ proc getNvvmLog(prog: NvvmProgram): string {.used.} =

 proc ptxCodegenViaNvidiaNvvm(module: ModuleRef, sm: tuple[major, minor: int32]): string =
  ## PTX codegen via Nvidia NVVM
-  
+
  # ######################################
  # LLVM -> NNVM handover

@ -120,7 +120,7 @@ proc ptxCodegenViaNvidiaNvvm(module: ModuleRef, sm: tuple[major, minor: int32]):

 proc ptxCodegenViaLlvmNvptx(module: ModuleRef, sm: tuple[major, minor: int32]): string =
  ## PTX codegen via LLVM NVPTX
-  
+
  module.verify(AbortProcessAction)

  initializeFullNVPTXTarget()
--- a/tests/math/t_ec_template.nim
+++ b/tests/math/t_ec_template.nim
@ -85,10 +85,7 @@ func random_point*(rng: var RngState, EC: typedesc, randZ: bool, gen: RandomGen)
 proc run_EC_addition_tests*(
       ec: typedesc,
       Iters: static int,
-       moduleName: string
-     ) =
-
-
+       moduleName: string) =
  var rng: RngState
  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
  rng.seed(seed)
@ -274,9 +271,7 @@ proc run_EC_addition_tests*(
 proc run_EC_mul_sanity_tests*(
       ec: typedesc,
       ItersMul: static int,
-       moduleName: string
-     ) =
-
+       moduleName: string) =
  # Random seed for reproducibility
  var rng: RngState
  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
@ -306,16 +301,16 @@ proc run_EC_mul_sanity_tests*(
            bool(reference.isInf())
            bool(refMinWeight.isInf())

-          proc refWNaf(w: static int) = # workaround staticFor symbol visibility
+          proc refWNaf(bits, w: static int) = # workaround staticFor symbol visibility
            var refWNAF = a
-            refWNAF.scalarMul_minHammingWeight_windowed_vartime(exponent, window = w)
+            refWNAF.scalarMul_minHammingWeight_windowed_vartime(BigInt[bits](), window = w)
            check: bool(refWNAF.isInf())

-          refWNaf(2)
-          refWNaf(3)
-          refWNaf(5)
-          refWNaf(8)
-          refWNaf(13)
+          refWNaf(bits, w = 2)
+          refWNaf(bits, w = 3)
+          refWNaf(bits, w = 5)
+          refWNaf(bits, w = 8)
+          refWNaf(bits, w = 13)

      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Uniform)
      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Uniform)
@ -381,9 +376,7 @@ proc run_EC_mul_sanity_tests*(
 proc run_EC_mul_distributive_tests*(
       ec: typedesc,
       ItersMul: static int,
-       moduleName: string
-     ) =
-
+       moduleName: string) =
  # Random seed for reproducibility
  var rng: RngState
  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
@ -446,9 +439,7 @@ proc run_EC_mul_distributive_tests*(
 proc run_EC_mul_vs_ref_impl*(
       ec: typedesc,
       ItersMul: static int,
-       moduleName: string
-     ) =
-
+       moduleName: string) =
  # Random seed for reproducibility
  var rng: RngState
  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
@ -501,9 +492,7 @@ proc run_EC_mul_vs_ref_impl*(
 proc run_EC_mixed_add_impl*(
       ec: typedesc,
       Iters: static int,
-       moduleName: string
-     ) =
-
+       moduleName: string) =
  # Random seed for reproducibility
  var rng: RngState
  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
@ -634,8 +623,7 @@ proc run_EC_mixed_add_impl*(
 proc run_EC_subgroups_cofactors_impl*(
       ec: typedesc,
       ItersMul: static int,
-       moduleName: string
-     ) =
+       moduleName: string) =
  # Random seed for reproducibility
  var rng: RngState
  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
@ -706,9 +694,7 @@ proc run_EC_subgroups_cofactors_impl*(
 proc run_EC_affine_conversion*(
       ec: typedesc,
       Iters: static int,
-       moduleName: string
-     ) =
-
+       moduleName: string) =
  # Random seed for reproducibility
  var rng: RngState
  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
@ -869,9 +855,7 @@ proc run_EC_conversion_failures*(
 proc run_EC_batch_add_impl*[N: static int](
       ec: typedesc,
       numPoints: array[N, int],
-       moduleName: string
-     ) =
-
+       moduleName: string) =
  # Random seed for reproducibility
  var rng: RngState
  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
@ -942,9 +926,7 @@ proc run_EC_batch_add_impl*[N: static int](
 proc run_EC_multi_scalar_mul_impl*[N: static int](
       ec: typedesc,
       numPoints: array[N, int],
-       moduleName: string
-     ) =
-
+       moduleName: string) =
  # Random seed for reproducibility
  var rng: RngState
  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
--- a/tests/math/t_fp_tower_frobenius_template.nim
+++ b/tests/math/t_fp_tower_frobenius_template.nim
@ -60,8 +60,7 @@ proc runFrobeniusTowerTests*[N](
      Iters: static int,
      TestCurves: static array[N, Curve],
      moduleName: string,
-      testSuiteDesc: string
-    ) =
+      testSuiteDesc: string) =
  # Random seed for reproducibility
  var rng: RngState
  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
@ -75,7 +74,6 @@ proc runFrobeniusTowerTests*[N](
          var a = rng.random_elem(Field, gen)
          var fa {.noInit.}: typeof(a)
          fa.frobenius_map(a, k = 1)
-
          a.powUnsafeExponent(Field.fieldMod(), window = 3)
          check: bool(a == fa)

--- a/tests/t_blssig_pop_on_bls12381_g2.nim
+++ b/tests/t_blssig_pop_on_bls12381_g2.nim
@ -9,7 +9,7 @@
 import
  std/[os, unittest, strutils],
  pkg/jsony,
-  ../constantine/blssig_pop_on_bls12381_g2,
+  ../constantine/ethereum_bls_signatures,
  ../constantine/platforms/codecs,
  ../constantine/hashes

@ -115,7 +115,7 @@ template testGen*(name, testData, TestType, body: untyped): untyped =
 testGen(deserialization_G1, testVector, DeserG1_test):
  var pubkey{.noInit.}: PublicKey

-  let status = pubkey.deserialize_public_key_compressed(testVector.input.pubkey)
+  let status = pubkey.deserialize_pubkey_compressed(testVector.input.pubkey)
  let success = status == cttBLS_Success or status == cttBLS_PointAtInfinity

  doAssert success == testVector.output, block:
@ -126,7 +126,7 @@ testGen(deserialization_G1, testVector, DeserG1_test):
  if success: # Roundtrip
    var s{.noInit.}: array[48, byte]

-    let status2 = s.serialize_public_key_compressed(pubkey)
+    let status2 = s.serialize_pubkey_compressed(pubkey)
    doAssert status2 == cttBLS_Success
    doAssert s == testVector.input.pubkey, block:
      "\nSerialization roundtrip differs from expected \n" &
@ -158,7 +158,7 @@ testGen(sign, testVector, Sign_test):
  var seckey{.noInit.}: SecretKey
  var sig{.noInit.}: Signature

-  let status = seckey.deserialize_secret_key(testVector.input.privkey)
+  let status = seckey.deserialize_seckey(testVector.input.privkey)
  if status != cttBLS_Success:
    doAssert testVector.output == default(array[96, byte])
    let status2 = sig.sign(seckey, testVector.input.message)
@ -171,7 +171,7 @@ testGen(sign, testVector, Sign_test):
      var output{.noInit.}: Signature
      let status3 = output.deserialize_signature_compressed(testVector.output)
      doAssert status3 == cttBLS_Success
-      doAssert sig == output, block:
+      doAssert signatures_are_equal(sig, output), block:
        var sig_bytes{.noInit.}: array[96, byte]
        var roundtrip{.noInit.}: array[96, byte]
        let sb_status = sig_bytes.serialize_signature_compressed(sig)
@ -198,7 +198,7 @@ testGen(verify, testVector, Verify_test):
    status = cttBLS_VerificationFailure

  block testChecks:
-    status = pubkey.deserialize_public_key_compressed(testVector.input.pubkey)
+    status = pubkey.deserialize_pubkey_compressed(testVector.input.pubkey)
    if status notin {cttBLS_Success, cttBLS_PointAtInfinity}:
      # For point at infinity, we want to make sure that "verify" itself handles them.
      break testChecks
@ -218,7 +218,7 @@ testGen(verify, testVector, Verify_test):
  if success: # Extra codec testing
    block:
      var output{.noInit.}: array[48, byte]
-      let s = output.serialize_public_key_compressed(pubkey)
+      let s = output.serialize_pubkey_compressed(pubkey)
      doAssert s == cttBLS_Success
      doAssert output == testVector.input.pubkey

@ -236,7 +236,7 @@ testGen(fast_aggregate_verify, testVector, FastAggregateVerify_test):

  block testChecks:
    for i in 0 ..< testVector.input.pubkeys.len:
-      status = pubkeys[i].deserialize_public_key_compressed(testVector.input.pubkeys[i])
+      status = pubkeys[i].deserialize_pubkey_compressed(testVector.input.pubkeys[i])
      if status notin {cttBLS_Success, cttBLS_PointAtInfinity}:
        # For point at infinity, we want to make sure that "verify" itself handles them.
        break testChecks
@ -262,7 +262,7 @@ testGen(aggregate_verify, testVector, AggregateVerify_test):

  block testChecks:
    for i in 0 ..< testVector.input.pubkeys.len:
-      status = pubkeys[i].deserialize_public_key_compressed(testVector.input.pubkeys[i])
+      status = pubkeys[i].deserialize_pubkey_compressed(testVector.input.pubkeys[i])
      if status notin {cttBLS_Success, cttBLS_PointAtInfinity}:
        # For point at infinity, we want to make sure that "verify" itself handles them.
        break testChecks
@ -288,7 +288,7 @@ testGen(batch_verify, testVector, BatchVerify_test):

  block testChecks:
    for i in 0 ..< testVector.input.pubkeys.len:
-      status = pubkeys[i].deserialize_public_key_compressed(testVector.input.pubkeys[i])
+      status = pubkeys[i].deserialize_pubkey_compressed(testVector.input.pubkeys[i])
      if status notin {cttBLS_Success, cttBLS_PointAtInfinity}:
        # For point at infinity, we want to make sure that "verify" itself handles them.
        break testChecks
--- a/tests/t_hash_sha256_vs_openssl.nim
+++ b/tests/t_hash_sha256_vs_openssl.nim
@ -39,7 +39,7 @@ when not defined(windows):
        digest: ptr array[32, byte] = nil
      ): ptr array[32, byte] {.cdecl, dynlib: DLLSSLName, importc.}

-  # proc EVP_Q_digest[T: byte| char](
+  # proc EVP_Q_digest[T: byte|char](
  #                 ossl_libctx: pointer,
  #                 algoName: cstring,
  #                 propq: cstring,
@ -49,7 +49,7 @@ when not defined(windows):

  proc SHA256_OpenSSL[T: byte|char](
        digest: var array[32, byte],
-        s: openarray[T]) =
+        s: openArray[T]) =
    discard SHA256(s, digest.addr)
    # discard EVP_Q_digest(nil, "SHA256", nil, s, digest, nil)