[Backend] Add support for Nvidia GPUs (#210)

* Add PoC of JIT exec on Nvidia GPUs [skip ci]

* Split GPU bindings into low-level (ABI) and high-level [skip ci]

* small typedef reorg [skip ci]

* refine LLVM IR/Nvidia GPU hello worlds

* [Nvidia GPU] PoC implementation of field addition [skip ci]

* prod-ready field addition + tests on Nvidia GPUs via LLVM codegen
Mamy Ratsimbazafy 2023-01-12 01:01:57 +01:00 committed by GitHub
parent c0b30a08be
commit 1f4bb174a3
29 changed files with 3685 additions and 183 deletions


@ -1,10 +0,0 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# TODO
# export public proc


@ -229,6 +229,10 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/t_ethereum_eip2333_bls12381_key_derivation.nim", false),
]
const testDescNvidia: seq[string] = @[
"tests/gpu/t_nvidia_fp.nim",
]
const benchDesc = [
"bench_fp",
"bench_fp_double_precision",
@ -378,6 +382,14 @@ proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, testAS
cmdFile.testBatch(flags, td.path)
proc addTestSetNvidia(cmdFile: var string) =
if not dirExists "build":
mkDir "build"
echo "Found " & $testDescNvidia.len & " tests to run."
for path in testDescNvidia:
cmdFile.testBatch(flags = "", path)
proc addBenchSet(cmdFile: var string, useAsm = true) =
if not dirExists "build":
mkDir "build"
@ -568,6 +580,12 @@ task test_parallel_no_gmp_no_asm, "Run all tests in parallel (via GNU parallel)"
writeFile(buildParallel, cmdFile)
exec "build/pararun " & buildParallel
task test_nvidia, "Run all tests for Nvidia GPUs":
var cmdFile: string
cmdFile.addTestSetNvidia()
for cmd in cmdFile.splitLines():
exec cmd
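(With this task defined, the Nvidia test suite can be launched with `nimble test_nvidia`; this assumes a CUDA-capable GPU and an LLVM installation with the NVPTX backend are available on the machine.)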
# Finite field 𝔽p
# ------------------------------------------


@ -60,11 +60,10 @@ macro addmod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
# Addition
# u = a[0..<H] + b[0..<H], v = a[H..<N]
for i in 0 ..< H:
if i == 0:
ctx.add u[0], b[0]
else:
ctx.adc u[i], b[i]
ctx.add u[0], b[0]
ctx.mov r[0], u[0]
for i in 1 ..< H:
ctx.adc u[i], b[i]
ctx.mov r[i], u[i]
# v = a[H..<N] + b[H..<N], a[0..<H] = u, u = v
@ -78,11 +77,9 @@ macro addmod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
ctx.sbb overflowed, overflowed
# Now substract the modulus to test a < 2ⁿp
for i in 0 ..< H:
if i == 0:
ctx.sub v[0], M[0]
else:
ctx.sbb v[i], M[i]
ctx.sub v[0], M[0]
for i in 1 ..< H:
ctx.sbb v[i], M[i]
# If it overflows here, it means that it was
# smaller than the modulus and we don't need v
@ -134,11 +131,10 @@ macro submod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
# Substraction
# u = a[0..<H] - b[0..<H], v = a[H..<N]
for i in 0 ..< H:
if i == 0:
ctx.sub u[0], b[0]
else:
ctx.sbb u[i], b[i]
ctx.sub u[0], b[0]
ctx.mov r[0], u[0]
for i in 1 ..< H:
ctx.sbb u[i], b[i]
ctx.mov r[i], u[i]
# v = a[H..<N] - b[H..<N], a[0..<H] = u, u = M
@ -155,11 +151,10 @@ macro submod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
ctx.`and` u[i], underflowed
# Add the masked modulus
for i in 0 ..< H:
if i == 0:
ctx.add u[0], v[0]
else:
ctx.adc u[i], v[i]
ctx.add u[0], v[0]
ctx.mov r[H], u[0]
for i in 1 ..< H:
ctx.adc u[i], v[i]
ctx.mov r[i+H], u[i]
result.add ctx.generate
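The hunks above all apply the same refactor: the first limb is peeled out of the loop so a plain `add`/`sub` starts the carry or borrow chain and `adc`/`sbb` continue it, instead of branching on `i == 0` inside the code generator. For reference, a plain-Nim sketch of the limb addition these macros unroll (illustrative only; the generated assembly relies on the CPU carry flag rather than explicit comparisons):

  func addLimbs(r: var openArray[uint64], a, b: openArray[uint64]): uint64 =
    ## r <- a + b, returns the final carry (0 or 1)
    var carry = 0'u64
    for i in 0 ..< a.len:
      let t = a[i] + b[i]                         # wraps on overflow
      let s = t + carry
      r[i] = s
      carry = uint64(t < a[i]) or uint64(s < t)   # carry out of this limb
    result = carry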


@ -27,23 +27,26 @@ static: doAssert UseASM_X86_32
{.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers
proc finalSubNoCarryImpl*(
proc finalSubNoOverflowImpl*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
a, M, scratch: OperandArray
) =
## Reduce `a` into `r` modulo `M`
## To be used when the modulus does not use the full bitwidth of the storing words
## for example a 255-bit modulus in n words of total max size 2^256
##
## r, a, scratch, scratchReg are mutated
## M is read-only
let N = M.len
ctx.comment "Final substraction (no carry)"
for i in 0 ..< N:
ctx.comment "Final substraction (cannot overflow its limbs)"
# Substract the modulus, and test a < p with the last borrow
ctx.mov scratch[0], a[0]
ctx.sub scratch[0], M[0]
for i in 1 ..< N:
ctx.mov scratch[i], a[i]
if i == 0:
ctx.sub scratch[i], M[i]
else:
ctx.sbb scratch[i], M[i]
ctx.sbb scratch[i], M[i]
# If we borrowed it means that we were smaller than
# the modulus and we don't need "scratch"
@ -51,7 +54,7 @@ proc finalSubNoCarryImpl*(
ctx.cmovnc a[i], scratch[i]
ctx.mov r[i], a[i]
proc finalSubMayCarryImpl*(
proc finalSubMayOverflowImpl*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
a, M, scratch: OperandArray,
@ -59,24 +62,22 @@ proc finalSubMayCarryImpl*(
) =
## Reduce `a` into `r` modulo `M`
## To be used when the final substraction can
## also depend on the carry flag
## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
##
## r, a, scratch, scratchReg are mutated
## M is read-only
let N = M.len
ctx.comment "Final substraction (may carry)"
# Mask: scratchReg contains 0xFFFF or 0x0000
ctx.sbb scratchReg, scratchReg
# Now substract the modulus to test a < p
let N = M.len
for i in 0 ..< N:
# Now substract the modulus, and test a < p with the last borrow
ctx.mov scratch[0], a[0]
ctx.sub scratch[0], M[0]
for i in 1 ..< N:
ctx.mov scratch[i], a[i]
if i == 0:
ctx.sub scratch[i], M[i]
else:
ctx.sbb scratch[i], M[i]
ctx.sbb scratch[i], M[i]
# If it overflows here, it means that it was
# smaller than the modulus and we don't need `scratch`
@ -92,7 +93,7 @@ macro finalSub_gen*[N: static int](
r_PIR: var array[N, SecretWord],
a_EIR, M_PIR: array[N, SecretWord],
scratch_EIR: var array[N, SecretWord],
mayCarry: static bool): untyped =
mayOverflow: static bool): untyped =
## Returns:
## a-M if a > M
## a otherwise
@ -101,7 +102,7 @@ macro finalSub_gen*[N: static int](
## - a_EIR is an array of registers, mutated,
## - M_PIR is a pointer to an array, read-only,
## - scratch_EIR is an array of registers, mutated
## - mayCarry is set to true when the carry flag also needs to be read
## - mayOverflow is set to true when the carry flag also needs to be read
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
@ -113,12 +114,12 @@ macro finalSub_gen*[N: static int](
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
t = init(OperandArray, nimSymbol = scratch_EIR, N, ElemsInReg, Output_EarlyClobber)
if mayCarry:
ctx.finalSubMayCarryImpl(
if mayOverflow:
ctx.finalSubMayOverflowImpl(
r, a, M, t, rax
)
else:
ctx.finalSubNoCarryImpl(
ctx.finalSubNoOverflowImpl(
r, a, M, t
)
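In plain Nim, the reduction performed by finalSubNoOverflowImpl corresponds roughly to the sketch below (not constant-time, for illustration only; finalSubMayOverflowImpl additionally folds the carry of the preceding addition into the decision):

  func finalSubSketch(r: var openArray[uint64], a, M: openArray[uint64]) =
    ## r <- a-M if a >= M, else r <- a
    var scratch = newSeq[uint64](a.len)
    var borrow = 0'u64
    for i in 0 ..< a.len:
      scratch[i] = a[i] - M[i] - borrow                                   # wraps on underflow
      borrow = uint64(a[i] < M[i]) or uint64(a[i] == M[i] and borrow == 1)
    for i in 0 ..< a.len:
      r[i] = if borrow == 1: a[i] else: scratch[i]                        # a final borrow means a < M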
@ -156,18 +157,17 @@ macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], spareBits: s
`usym`[i] = `A`[i]
# Addition
for i in 0 ..< N:
if i == 0:
ctx.add u[0], b[0]
else:
ctx.adc u[i], b[i]
ctx.add u[0], b[0]
ctx.mov v[0], u[0]
for i in 1 ..< N:
ctx.adc u[i], b[i]
# Interleaved copy in a second buffer as well
ctx.mov v[i], u[i]
if spareBits >= 1:
ctx.finalSubNoCarryImpl(r, u, M, v)
ctx.finalSubNoOverflowImpl(r, u, M, v)
else:
ctx.finalSubMayCarryImpl(
ctx.finalSubMayOverflowImpl(
r, u, M, v, b.reuseRegister()
)
@ -209,11 +209,10 @@ macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
`usym`[i] = `A`[i]
# Substraction
for i in 0 ..< N:
if i == 0:
ctx.sub u[0], b[0]
else:
ctx.sbb u[i], b[i]
ctx.sub u[0], b[0]
ctx.mov v[0], M[0]
for i in 1 ..< N:
ctx.sbb u[i], b[i]
# Interleaved copy the modulus to hide SBB latencies
ctx.mov v[i], M[i]
@ -226,11 +225,10 @@ macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
ctx.`and` v[i], underflowed
# Add the masked modulus
for i in 0 ..< N:
if i == 0:
ctx.add u[0], v[0]
else:
ctx.adc u[i], v[i]
ctx.add u[0], v[0]
ctx.mov r[0], u[0]
for i in 1 ..< N:
ctx.adc u[i], v[i]
ctx.mov r[i], u[i]
result.add ctx.generate
@ -258,12 +256,11 @@ macro negmod_gen[N: static int](R: var Limbs[N], A, m: Limbs[N]): untyped =
M = init(OperandArray, nimSymbol = m, N, PointerInReg, InputOutput)
# Substraction m - a
for i in 0 ..< N:
ctx.mov u[0], M[0]
ctx.sub u[0], a[0]
for i in 1 ..< N:
ctx.mov u[i], M[i]
if i == 0:
ctx.sub u[0], a[0]
else:
ctx.sbb u[i], a[i]
ctx.sbb u[i], a[i]
# Deal with a == 0
let isZero = M.reuseRegister()


@ -178,7 +178,7 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
for i in 0 ..< N:
ctx.mov r2[i], t[i]
else:
ctx.finalSubNoCarryImpl(
ctx.finalSubNoOverflowImpl(
r2, t, M,
scratch
)
@ -393,7 +393,7 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
ctx.mov r2[i], t[i]
else:
ctx.comment " Final substraction"
ctx.finalSubNoCarryImpl(
ctx.finalSubNoOverflowImpl(
r2, t, M,
scratch
)


@ -272,7 +272,7 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int](
for i in 0 ..< N:
ctx.mov r[i], t[i]
else:
ctx.finalSubNoCarryImpl(
ctx.finalSubNoOverflowImpl(
r, t, M,
scratch
)
@ -476,7 +476,7 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
ctx.mov r2[i], t[i]
else:
ctx.comment " Final substraction"
ctx.finalSubNoCarryImpl(
ctx.finalSubNoOverflowImpl(
r2, t, M,
scratch
)


@ -157,9 +157,9 @@ macro redc2xMont_gen*[N: static int](
for i in 0 ..< N:
ctx.mov r_temp[i], u[i]
elif spareBits >= 1:
ctx.finalSubNoCarryImpl(r, u, M, t)
ctx.finalSubNoOverflowImpl(r, u, M, t)
else:
ctx.finalSubMayCarryImpl(r, u, M, t, rax)
ctx.finalSubMayOverflowImpl(r, u, M, t, rax)
# Code generation
result.add ctx.generate()
@ -282,4 +282,4 @@ func fromMont_asm*(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
block: # Map from [0, 2p) to [0, p)
var workspace{.noInit.}: typeof(r)
r.finalSub_gen(t, M, workspace, mayCarry = false)
r.finalSub_gen(t, M, workspace, mayOverflow = false)


@ -134,9 +134,9 @@ macro redc2xMont_adx_gen[N: static int](
for i in 0 ..< N:
ctx.mov r[i], t[i]
elif spareBits >= 1:
ctx.finalSubNoCarryImpl(r, u, M, t)
ctx.finalSubNoOverflowImpl(r, u, M, t)
else:
ctx.finalSubMayCarryImpl(r, u, M, t, hi)
ctx.finalSubMayOverflowImpl(r, u, M, t, hi)
# Code generation
result.add ctx.generate()
@ -264,4 +264,4 @@ func fromMont_asm_adx*(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
block: # Map from [0, 2p) to [0, p)
var workspace{.noInit.}: typeof(r)
r.finalSub_gen(t, M, workspace, mayCarry = false)
r.finalSub_gen(t, M, workspace, mayOverflow = false)


@ -70,6 +70,9 @@ declareCurves:
curve P224: # NIST P-224
bitwidth: 224
modulus: "0xffffffffffffffffffffffffffffffff000000000000000000000001"
order: "0xffffffffffffffffffffffffffff16a2e0b8f03e13dd29455c5c2a3d"
orderBitwidth: 224
curve BN254_Nogami: # Integer Variable χBased Ate Pairing, 2008, Nogami et al
bitwidth: 254
modulus: "0x2523648240000001ba344d80000000086121000000000013a700000000000013"
@ -162,7 +165,7 @@ declareCurves:
# or for use in Hisil, Wong, Carter, and Dawson extended coordinates
# ax² + y² = 1+dx²y² with a = -1 d=-121665/121666
order: "0x1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed"
orderBItwidth: 253
orderBitwidth: 253
cofactor: 8
eq_form: TwistedEdwards
coef_a: -1
@ -188,9 +191,13 @@ declareCurves:
curve P256: # secp256r1 / NIST P-256
bitwidth: 256
modulus: "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff"
order: "0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551"
orderBitwidth: 256
curve Secp256k1: # Bitcoin curve
bitwidth: 256
modulus: "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f"
order: "0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141"
orderBitwidth: 256
curve BLS12_377:
# Zexe curve
# (p41) https://eprint.iacr.org/2018/962.pdf


@ -257,7 +257,7 @@ func countSpareBits*(M: BigInt): int =
let msb = log2_vartime(BaseType(M.limbs[M.limbs.len-1]))
result = WordBitWidth - 1 - msb.int
func invModBitwidth[T: SomeUnsignedInt](a: T): T =
func invModBitwidth*[T: SomeUnsignedInt](a: T): T =
# We use BaseType for return value because static distinct type
# confuses Nim semchecks [UPSTREAM BUG]
# We don't enforce compile-time evaluation here


@ -23,8 +23,10 @@ export BigInt, wordsRequired
#
# ############################################################
# No exceptions for the byte API
{.push raises: [].}
# No exceptions for the byte API.
# In particular we don't want if-branches when indexing an array
# that contains secret data
{.push raises: [], checks: off.}
# Note: the parsing/serialization routines were initially developed
# with an internal representation that used 31 bits out of a uint32
@ -36,9 +38,11 @@ export BigInt, wordsRequired
# prototyping, research and debugging purposes,
# and can use exceptions.
func unmarshalLE(
dst: var BigInt,
src: openarray[byte]) =
func unmarshalLE[T](
dst: var openArray[T],
src: openarray[byte],
wordBitWidth: static int
) =
## Parse an unsigned integer from its canonical
## little-endian unsigned representation
## and store it into a BigInt
@ -47,36 +51,42 @@ func unmarshalLE(
## - no leaks
##
## Can work at compile-time
##
## It is possible to use a 63-bit representation out of 64-bit words
## by setting `wordBitWidth` to something different from sizeof(T) * 8
## This might be useful for architectures with no add-with-carry instructions.
# TODO: error on destination to small
var
dst_idx = 0
acc = Zero
acc = T(0)
acc_len = 0
for src_idx in 0 ..< src.len:
let src_byte = SecretWord(src[src_idx])
let src_byte = T(src[src_idx])
# buffer reads
acc = acc or (src_byte shl acc_len)
acc_len += 8 # We count bit by bit
# if full, dump
if acc_len >= WordBitWidth:
dst.limbs[dst_idx] = acc
if acc_len >= wordBitWidth:
dst[dst_idx] = acc
inc dst_idx
acc_len -= WordBitWidth
acc_len -= wordBitWidth
acc = src_byte shr (8 - acc_len)
if dst_idx < dst.limbs.len:
dst.limbs[dst_idx] = acc
if dst_idx < dst.len:
dst[dst_idx] = acc
for i in dst_idx + 1 ..< dst.limbs.len:
dst.limbs[i] = Zero
for i in dst_idx + 1 ..< dst.len:
dst[i] = T(0)
func unmarshalBE(
dst: var BigInt,
src: openarray[byte]) =
func unmarshalBE[T](
dst: var openArray[T],
src: openarray[byte],
wordBitWidth: static int
) =
## Parse an unsigned integer from its canonical
## big-endian unsigned representation (octet string)
## and store it into a BigInt.
@ -88,36 +98,61 @@ func unmarshalBE(
## - no leaks
##
## Can work at compile-time
##
## It is possible to use a 63-bit representation out of 64-bit words
## by setting `wordBitWidth` to something different from sizeof(T) * 8
## This might be useful for architectures with no add-with-carry instructions.
var
dst_idx = 0
acc = Zero
acc = T(0)
acc_len = 0
const wordBitWidth = sizeof(T) * 8
for src_idx in countdown(src.len-1, 0):
let src_byte = SecretWord(src[src_idx])
let src_byte = T(src[src_idx])
# buffer reads
acc = acc or (src_byte shl acc_len)
acc_len += 8 # We count bit by bit
# if full, dump
if acc_len >= WordBitWidth:
dst.limbs[dst_idx] = acc
if acc_len >= wordBitWidth:
dst[dst_idx] = acc
inc dst_idx
acc_len -= WordBitWidth
acc_len -= wordBitWidth
acc = src_byte shr (8 - acc_len)
if dst_idx < dst.limbs.len:
dst.limbs[dst_idx] = acc
if dst_idx < dst.len:
dst[dst_idx] = acc
for i in dst_idx + 1 ..< dst.limbs.len:
dst.limbs[i] = Zero
for i in dst_idx + 1 ..< dst.len:
dst[i] = T(0)
func unmarshal*[T](
dst: var openArray[T],
src: openarray[byte],
wordBitWidth: static int,
srcEndianness: static Endianness) {.inline.} =
## Parse an unsigned integer from its canonical
## big-endian or little-endian unsigned representation
##
## Constant-Time:
## - no leaks
##
## Can work at compile-time to embed curve moduli
## from a canonical integer representation
when srcEndianness == littleEndian:
dst.unmarshalLE(src, wordBitWidth)
else:
dst.unmarshalBE(src, wordBitWidth)
func unmarshal*(
dst: var BigInt,
src: openarray[byte],
srcEndianness: static Endianness) =
srcEndianness: static Endianness) {.inline.}=
## Parse an unsigned integer from its canonical
## big-endian or little-endian unsigned representation
## And store it into a BigInt of size `bits`
@ -127,11 +162,7 @@ func unmarshal*(
##
## Can work at compile-time to embed curve moduli
## from a canonical integer representation
when srcEndianness == littleEndian:
dst.unmarshalLE(src)
else:
dst.unmarshalBE(src)
dst.limbs.unmarshal(src, WordBitWidth, srcEndianness)
func unmarshal*(
T: type BigInt,
@ -146,7 +177,7 @@ func unmarshal*(
##
## Can work at compile-time to embed curve moduli
## from a canonical integer representation
result.unmarshal(src, srcEndianness)
result.limbs.unmarshal(src, WordBitWidth, srcEndianness)
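A minimal usage sketch of the new word-level unmarshal (only names introduced in this diff are used; 64-bit words assumed, values chosen for illustration):

  var limbs: array[4, uint64]
  let raw = [byte 0x01, 0x02, 0x03, 0x04]
  limbs.unmarshal(raw, 64, littleEndian)   # wordBitWidth = 64; limbs[0] == 0x04030201'u64, upper limbs zeroed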
func fromUint*(
T: type BigInt,
@ -168,53 +199,54 @@ func fromUint*(
#
# ############################################################
template blobFrom(dst: var openArray[byte], src: SomeUnsignedInt, startIdx: int, endian: static Endianness) =
## Write an integer into a raw binary blob
## Swapping endianness if needed
## startidx is the first written array item if littleEndian is requested
## or the last if bigEndian is requested
when endian == cpuEndian:
for i in 0 ..< sizeof(src):
dst[startIdx+i] = toByte(src shr (i * 8))
else:
for i in 0 ..< sizeof(src):
dst[startIdx+sizeof(src)-1-i] = toByte(src shr (i * 8))
func marshalLE(
func marshalLE[T](
dst: var openarray[byte],
src: BigInt) =
src: openArray[T],
wordBitWidth: static int) =
## Serialize a bigint into its canonical little-endian representation
## I.e least significant bit first
##
## It is possible to use a 63-bit representation out of 64-bit words
## by setting `wordBitWidth` to something different from sizeof(T) * 8
## This might be useful for architectures with no add-with-carry instructions.
var
src_idx, dst_idx = 0
acc: BaseType = 0
acc_len = 0
when sizeof(T) == 8:
type BT = uint64
elif sizeof(T) == 4:
type BT = uint32
else:
{.error "Unsupported word size uint" & $(sizeof(T) * 8).}
var acc = BT(0)
var tail = dst.len
while tail > 0:
let w = if src_idx < src.limbs.len: BaseType(src.limbs[src_idx])
let w = if src_idx < src.len: BT(src[src_idx])
else: 0
inc src_idx
if acc_len == 0:
# We need to refill the buffer to output 64-bit
acc = w
acc_len = WordBitWidth
acc_len = wordBitWidth
else:
when WordBitWidth == sizeof(SecretWord) * 8:
when wordBitWidth == sizeof(T) * 8:
let lo = acc
acc = w
else: # If using 63-bit (or less) out of uint64
let lo = (w shl acc_len) or acc
dec acc_len
acc = w shr (WordBitWidth - acc_len)
acc = w shr (wordBitWidth - acc_len)
if tail >= sizeof(SecretWord):
if tail >= sizeof(T):
# Unrolled copy
dst.blobFrom(src = lo, dst_idx, littleEndian)
dst_idx += sizeof(SecretWord)
tail -= sizeof(SecretWord)
dst_idx += sizeof(T)
tail -= sizeof(T)
else:
# Process the tail and exit
when cpuEndian == littleEndian:
@ -229,43 +261,56 @@ func marshalLE(
dst[dst_idx+i] = toByte(lo shr ((tail-i)*8))
return
func marshalBE(
func marshalBE[T](
dst: var openarray[byte],
src: BigInt) =
src: openArray[T],
wordBitWidth: static int) =
## Serialize a bigint into its canonical big-endian representation
## (octet string)
## I.e most significant bit first
##
## In cryptography specifications, this is often called
## "Octet string to Integer"
##
## It is possible to use a 63-bit representation out of 64-bit words
## by setting `wordBitWidth` to something different from sizeof(T) * 8
## This might be useful for architectures with no add-with-carry instructions.
var
src_idx = 0
acc: BaseType = 0
acc_len = 0
when sizeof(T) == 8:
type BT = uint64
elif sizeof(T) == 4:
type BT = uint32
else:
{.error "Unsupported word size uint" & $(sizeof(T) * 8).}
var acc = BT(0)
var tail = dst.len
while tail > 0:
let w = if src_idx < src.limbs.len: BaseType(src.limbs[src_idx])
let w = if src_idx < src.len: BT(src[src_idx])
else: 0
inc src_idx
if acc_len == 0:
# We need to refill the buffer to output 64-bit
acc = w
acc_len = WordBitWidth
acc_len = wordBitWidth
else:
when WordBitWidth == sizeof(SecretWord) * 8:
when wordBitWidth == sizeof(T) * 8:
let lo = acc
acc = w
else: # If using 63-bit (or less) out of uint64
let lo = (w shl acc_len) or acc
dec acc_len
acc = w shr (WordBitWidth - acc_len)
acc = w shr (wordBitWidth - acc_len)
if tail >= sizeof(SecretWord):
if tail >= sizeof(T):
# Unrolled copy
tail -= sizeof(SecretWord)
tail -= sizeof(T)
dst.blobFrom(src = lo, tail, bigEndian)
else:
# Process the tail and exit
@ -281,10 +326,26 @@ func marshalBE(
dst[tail-1-i] = toByte(lo shr ((tail-i)*8))
return
func marshal*[T](
dst: var openArray[byte],
src: openArray[T],
wordBitWidth: static int,
dstEndianness: static Endianness) {.inline.} =
## Serialize a bigint into its canonical big-endian or little endian
## representation.
##
## If the buffer is bigger, output will be zero-padded left for big-endian
## or zero-padded right for little-endian.
## I.e least significant bit is aligned to buffer boundary
when dstEndianness == littleEndian:
marshalLE(dst, src, wordBitWidth)
else:
marshalBE(dst, src, wordBitWidth)
func marshal*(
dst: var openarray[byte],
dst: var openArray[byte],
src: BigInt,
dstEndianness: static Endianness) =
dstEndianness: static Endianness) {.inline.} =
## Serialize a bigint into its canonical big-endian or little endian
## representation.
## A destination buffer of size "(BigInt.bits + 7) div 8" at minimum is needed,
@ -293,17 +354,16 @@ func marshal*(
## If the buffer is bigger, output will be zero-padded left for big-endian
## or zero-padded right for little-endian.
## I.e least significant bit is aligned to buffer boundary
debug:
doAssert dst.len >= (BigInt.bits + 7) div 8, "BigInt -> Raw int conversion: destination buffer is too small"
doAssert dst.len >= (BigInt.bits + 7) div 8, block:
"BigInt -> Raw int conversion: destination buffer is too small\n" &
" bits: " & $BigInt.bits & "\n" &
" bytes allocated: " & $dst.len & '\n'
when BigInt.bits == 0:
zeroMem(dst, dst.len)
when dstEndianness == littleEndian:
marshalLE(dst, src)
else:
marshalBE(dst, src)
dst.marshal(src.limbs, WordBitWidth, dstEndianness)
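And the matching serialization direction through the new openArray-of-words overload (a sketch; buffer sized for 4×64-bit words):

  let limbs = [0x04030201'u64, 0'u64, 0'u64, 0'u64]
  var buf: array[32, byte]
  buf.marshal(limbs, 64, bigEndian)   # wordBitWidth = 64; 32-byte big-endian output, zero-padded on the left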
{.pop.} # {.push raises: [].}
@ -379,7 +439,7 @@ func hexToPaddedByteArray*(hexStr: string, output: var openArray[byte], order: s
shift = (shift + 4) and 4
dstIdx += shift shr 2
func nativeEndianToHex(bytes: openarray[byte], order: static[Endianness]): string =
func nativeEndianToHex*(bytes: openarray[byte], order: static[Endianness]): string =
## Convert a byte-array to its hex representation
## Output is in lowercase and not prefixed.
## This assumes that input is in platform native endianness
@ -465,9 +525,6 @@ func appendHex*(dst: var string, big: BigInt, order: static Endianness = bigEndi
# 2 Convert canonical uint to hex
dst.add bytes.nativeEndianToHex(order)
func toHex*(a: openArray[byte]): string =
nativeEndianToHex(a, system.cpuEndian)
func toHex*(big: BigInt, order: static Endianness = bigEndian): string =
## Stringify an int to hex.
## Note. Leading zeros are not removed.


@ -0,0 +1,117 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
../platforms/gpu/[llvm, nvidia, ir]
# ############################################################
#
# Field arithmetic on Nvidia GPU
#
# ############################################################
# Loads from global (kernel params) take over 100 cycles
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#operand-costs
proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) =
## If a >= Modulus: r <- a-M
## else: r <- a
##
## This is constant-time straightline code.
## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU.
##
## To be used when the final substraction can
## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
let bld = asy.builder
let fieldTy = cm.getFieldType(field)
let scratch = bld.makeArray(fieldTy)
let M = cm.getModulus(field)
let N = M.len
# Contains 0x0001 (if overflowed limbs) or 0x0000
let overflowedLimbs = bld.add_ci(0'u32, 0'u32)
# Now substract the modulus, and test a < M with the last borrow
scratch[0] = bld.sub_bo(a[0], M[0])
for i in 1 ..< N:
scratch[i] = bld.sub_bio(a[i], M[i])
# 1. if `overflowedLimbs`, underflowedModulus >= 0
# 2. if a >= M, underflowedModulus >= 0
# if underflowedModulus >= 0: a-M else: a
let underflowedModulus = bld.sub_bi(overflowedLimbs, 0'u32)
for i in 0 ..< N:
r[i] = bld.slct(scratch[i], a[i], underflowedModulus)
proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) =
## If a >= Modulus: r <- a-M
## else: r <- a
##
## This is constant-time straightline code.
## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU.
##
## To be used when the modulus does not use the full bitwidth of the storing words
## (say using 255 bits for the modulus out of 256 available in words)
let bld = asy.builder
let fieldTy = cm.getFieldType(field)
let scratch = bld.makeArray(fieldTy)
let M = cm.getModulus(field)
let N = M.len
# Now substract the modulus, and test a < M with the last borrow
scratch[0] = bld.sub_bo(a[0], M[0])
for i in 1 ..< N:
scratch[i] = bld.sub_bio(a[i], M[i])
# If it underflows here a was smaller than the modulus, which is what we want
let underflowedModulus = bld.sub_bi(0'u32, 0'u32)
for i in 0 ..< N:
r[i] = bld.slct(scratch[i], a[i], underflowedModulus)
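(Reading note: the `add_co`/`add_cio`, `sub_bo`/`sub_bio` and `add_ci`/`sub_bi` builder calls used here are presumably carry/borrow-chained variants — carry-out, carry-in+out, carry-in only — mirroring PTX's add.cc/addc.cc and sub.cc/subc.cc; their definitions live in the `ir`/`nvidia` modules imported above, not in this file.)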
proc field_add_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef =
## Generate an optimized modular addition kernel
## with parameters `a, b, modulus: Limbs -> Limbs`
let procName = cm.genSymbol(block:
case field
of fp: opFpAdd
of fr: opFrAdd)
let fieldTy = cm.getFieldType(field)
let pFieldTy = pointer_t(fieldTy)
let addModTy = function_t(asy.void_t, [pFieldTy, pFieldTy, pFieldTy])
let addModKernel = asy.module.addFunction(cstring procName, addModTy)
let blck = asy.ctx.appendBasicBlock(addModKernel, "addModBody")
asy.builder.positionAtEnd(blck)
let bld = asy.builder
let r = bld.asArray(addModKernel.getParam(0), fieldTy)
let a = bld.asArray(addModKernel.getParam(1), fieldTy)
let b = bld.asArray(addModKernel.getParam(2), fieldTy)
let t = bld.makeArray(fieldTy)
let N = cm.getNumWords(field)
t[0] = bld.add_co(a[0], b[0])
for i in 1 ..< N:
t[i] = bld.add_cio(a[i], b[i])
if cm.getSpareBits(field) >= 1:
asy.finalSubNoOverflow(cm, field, t, t)
else:
asy.finalSubMayOverflow(cm, field, t, t)
bld.store(r, t)
bld.retVoid()
return (addModTy, addModKernel)
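As a tiny worked example of what the generated kernel computes: with a modulus p = 11, adding 7 and 9 gives 16, and 16 - 11 = 5 completes without borrowing, so the reduced value 5 is selected; adding 3 and 4 gives 7, and 7 - 11 borrows, so the unreduced 7 is kept. The `slct` calls in the final-subtraction helpers above make exactly this keep-or-replace choice, branchlessly.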


@ -6,6 +6,8 @@
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ./compilers/bitops
# ############################################################
#
# Bit hacks
@ -14,19 +16,19 @@
# Bithacks
# ------------------------------------------------------------
# TODO: Nim std/bitops is unsatisfactory
# in particular the "noUndefined" flag
# for countLeadingZeroBits/countTrailingZeroBits
# is returning zero instead of the integer bitwidth
# Nim std/bitops is unsatisfactory
# in particular the "noUndefined" flag
# for countLeadingZeroBits/countTrailingZeroBits
# is returning zero instead of the integer bitwidth
#
# Furthermore it is not guaranteed constant-time
# And lastly, even compiler builtin may be slightly inefficient
# for example when doing fastLog2
# which is "31 - builtin_clz" we get
# `bsr + xor (from clz) + sub`
# instead of plain `bsr`
# Furthermore it is not guaranteed constant-time
# And lastly, even compiler builtin may be slightly inefficient
# for example when doing fastLog2
# which is "31 - builtin_clz" we get
# `bsr + xor (from clz) + sub`
# instead of plain `bsr`
#
# At the moment we don't need them to operate on secret data
# At the moment we don't need them to operate on secret data
#
# See: https://www.chessprogramming.org/BitScan
# https://www.chessprogramming.org/General_Setwise_Operations
@ -80,11 +82,20 @@ func log2impl_vartime(x: uint64): uint64 {.inline.} =
func log2_vartime*[T: SomeUnsignedInt](n: T): T {.inline.} =
## Find the log base 2 of an integer
when sizeof(T) == sizeof(uint64):
T(log2impl_vartime(uint64(n)))
##
## ⚠ With GCC and Clang compilers on x86, if n is zero, result is undefined.
when nimvm:
when sizeof(T) == sizeof(uint64):
T(log2impl_vartime(uint64(n)))
else:
static: doAssert sizeof(T) <= sizeof(uint32)
T(log2impl_vartime(uint32(n)))
else:
static: doAssert sizeof(T) <= sizeof(uint32)
T(log2impl_vartime(uint32(n)))
when sizeof(T) == sizeof(uint64):
T(log2_c_compiler_vartime(uint64(n)))
else:
static: doAssert sizeof(T) <= sizeof(uint32)
T(log2_c_compiler_vartime(uint32(n)))
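A quick sanity check of the intended behaviour (remember the zero-input caveat above):

  doAssert log2_vartime(8'u32) == 3
  doAssert log2_vartime(255'u64) == 7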
func hammingWeight*(x: uint32): uint {.inline.} =
## Counts the set bits in integer.


@ -0,0 +1,87 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../constant_time/ct_types
when GCC_Compatible:
func builtin_clz(n: uint32): cint {.importc: "__builtin_clz", nodecl.}
## Count the number of leading zeros
## undefined if n is zero
func builtin_clzll(n: uint64): cint {.importc: "__builtin_clzll", nodecl.}
## Count the number of leading zeros
## undefined if n is zero
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
cast[int](31 - cast[cuint](builtin_clz(n.uint32)))
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
cast[int](63 - cast[cuint](builtin_clzll(n)))
elif defined(icc):
func bitScanReverse(r: ptr uint32, n: uint32): uint8 {.importc: "_BitScanReverse", header: "<immintrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
func bitScanReverse64(r: ptr uint32, n: uint64): uint8 {.importc: "_BitScanReverse64", header: "<immintrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
template bitscan(fnc: untyped; v: untyped): int {.inline.} =
var index: uint32
if fnc(index.addr, v) == 0:
return 0
return index.int
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse, n.uint32)
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse64, n)
elif defined(vcc):
func bitScanReverse(p: ptr uint32, b: uint32): uint8 {.importc: "_BitScanReverse", header: "<intrin.h>".}
## Returns 0 if n has no set bit and non-zero otherwise
## Returns the position of the first set bit in `r`
func bitScanReverse64(p: ptr uint32, b: uint64): uint8 {.importc: "_BitScanReverse64", header: "<intrin.h>".}
## Returns 0 if n has no set bit and non-zero otherwise
## Returns the position of the first set bit in `r`
template bitscan(fnc: untyped; v: untyped): int =
var index: uint32
if fnc(index.addr, v) == 0:
return 0
return index.int
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse, n.uint32)
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse64, n)
else:
{. error: "Unsupported compiler".}


@ -22,6 +22,18 @@ template toByte*(x: SomeUnsignedInt): byte =
else:
byte(x)
template blobFrom*(dst: var openArray[byte], src: SomeUnsignedInt, startIdx: int, endian: static Endianness) =
## Write an integer into a raw binary blob
## Swapping endianness if needed
## startidx is the first written array item if littleEndian is requested
## or the last if bigEndian is requested
when endian == cpuEndian:
for i in 0 ..< sizeof(src):
dst[startIdx+i] = toByte(src shr (i * 8))
else:
for i in 0 ..< sizeof(src):
dst[startIdx+sizeof(src)-1-i] = toByte(src shr (i * 8))
func parseFromBlob*[T: byte|char](
dst: var SomeUnsignedInt,
src: openArray[T],


@ -0,0 +1,16 @@
# GPU compilation targets
For GPUs, Constantine JIT-compiles the cryptographic kernels via LLVM.
This allows targeting several devices with a single frontend, the LLVM IR.
Current use cases are large-scale aggregation, large-scale multi-scalar multiplication and large-scale FFTs.
Those are important primitives for:
- aggregation of public keys or signatures from a large number of nodes
- protocols based on polynomial commitments
- zero-knowledge proof systems
Potential future use-cases include erasure coding and lattice-based cryptography acceleration.
⚠️ GPU usage is not constant-time and requires allocation of dynamic memory. It MUST NOT be used for secret data.


@ -0,0 +1,581 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ./utils
{.passc: gorge("llvm-config --cflags").}
{.passl: gorge("llvm-config --libs").}
const libLLVM = gorge("llvm-config --libnames")
static: echo "[Constantine] Using library " & libLLVM
# ############################################################
#
# Bindings to LLVM JIT
#
# ############################################################
# https://llvm.org/doxygen/group__LLVMC.html
# Constantine is a library. It is possible that applications relying on Constantine
# also link to libLLVM, for example if they implement a virtual machine (for the EVM, for Snarks/zero-knowledge, ...).
# Hence Constantine should always use LLVM context to "namespace" its own codegen and avoid collisions in the global context.
{.push used, cdecl, dynlib: libLLVM.}
# ############################################################
#
# LLVM
#
# ############################################################
type
LlvmBool = distinct int32
ErrorRef = distinct pointer
MemoryBufferRef = distinct pointer
ContextRef* = distinct pointer
ModuleRef* = distinct pointer
TargetRef* = distinct pointer
ExecutionEngineRef* = distinct pointer
TargetMachineRef* = distinct pointer
PassManagerRef* = distinct pointer
PassManagerBuilderRef* = distinct pointer
PassBuilderOptionsRef* = distinct pointer
PassRegistryRef* = distinct pointer
TypeRef* = distinct pointer
ValueRef* = distinct pointer
MetadataRef = distinct pointer
LLVMstring = distinct cstring
ErrorMessageString = distinct cstring
## A string with a buffer owned by LLVM
# <llvm-c/Core.h>
proc createContext*(): ContextRef {.importc: "LLVMContextCreate".}
proc dispose*(ctx: ContextRef) {.importc: "LLVMContextDispose".}
proc dispose(msg: LLVMstring) {.importc: "LLVMDisposeMessage".}
## cstring in LLVM are owned by LLVM and must be destroyed with a specific function
proc dispose(buf: MemoryBufferRef){.importc: "LLVMDisposeMemoryBuffer".}
proc getBufferStart(buf: MemoryBufferRef): ptr byte {.importc: "LLVMGetBufferStart".}
proc getBufferSize(buf: MemoryBufferRef): csize_t {.importc: "LLVMGetBufferSize".}
proc dispose(msg: ErrorMessageString) {.importc: "LLVMDisposeErrorMessage".}
proc getErrorMessage(err: ErrorRef): ErrorMessageString {.importc: "LLVMGetErrorMessage".}
# ############################################################
#
# Module
#
# ############################################################
# {.push header: "<llvm-c/Core.h>".}
proc createModule(name: cstring, ctx: ContextRef): ModuleRef {.importc: "LLVMModuleCreateWithNameInContext".}
proc dispose*(m: ModuleRef) {.importc: "LLVMDisposeModule".}
## Destroys a module
## Note: destroying an Execution Engine will also destroy modules attached to it
proc toIR_LLVMstring(m: ModuleRef): LLVMstring {.used, importc: "LLVMPrintModuleToString".}
## Print a module IR to textual IR string. The string must be disposed with LLVM "dispose" or memory will leak.
proc getContext*(m: ModuleRef): ContextRef {.importc: "LLVMGetModuleContext".}
proc getIdentifier*(m: ModuleRef, rLen: var csize_t): cstring {.used, importc: "LLVMGetModuleIdentifier".}
proc addNamedMetadataOperand*(m: ModuleRef, name: cstring, val: ValueRef) {.importc: "LLVMAddNamedMetadataOperand".}
proc metadataNode*(ctx: ContextRef, metadataNodes: openArray[MetadataRef]): MetadataRef {.wrapOpenArrayLenType: csize_t, importc: "LLVMMDNodeInContext2".}
proc metadataNode*(ctx: ContextRef, str: openArray[char]): MetadataRef {.wrapOpenArrayLenType: csize_t, importc: "LLVMMDStringInContext2".}
proc asMetadataRef*(val: ValueRef): MetadataRef {.importc: "LLVMValueAsMetadata".}
proc asValueRef*(ctx: ContextRef, md: MetadataRef): ValueRef {.importc: "LLVMMetadataAsValue".}
# <llvm-c/BitWriter.h>
proc writeBitcodeToFile*(m: ModuleRef, path: cstring) {.importc: "LLVMWriteBitcodeToFile".}
proc writeBitcodeToMemoryBuffer(m: ModuleRef): MemoryBufferRef {.used, importc: "LLVMWriteBitcodeToMemoryBuffer".}
## Write bitcode to a memory buffer
## The MemoryBuffer must be disposed appropriately or memory will leak
type VerifierFailureAction* {.size: sizeof(cint).} = enum
AbortProcessAction # verifier will print to stderr and abort()
PrintMessageAction # verifier will print to stderr and return 1
ReturnStatusAction # verifier will just return 1
# {.push header: "<llvm-c/Analysis.h>".}
proc verify(module: ModuleRef, failureAction: VerifierFailureAction, msg: var LLVMstring): LlvmBool {.used, importc: "LLVMVerifyModule".}
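A minimal sketch of how these context/module wrappers compose (only procs declared in this file are used; error handling omitted):

  let ctx = createContext()
  let module = createModule("example", ctx)
  # ... declare functions and emit IR through a BuilderRef (see the IR builder section below) ...
  module.writeBitcodeToFile("example.bc")
  module.dispose()
  ctx.dispose()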
# ############################################################
#
# Target
#
# ############################################################
# "<llvm-c/Target.h>"
# The following procedures:
# - initializeNativeTarget()
# - initializeNativeAsmPrinter()
# are implemented in the development header macros and aren't in the LLVM library
# We want to only depend on the runtime for installation ease and size.
#
# We can emulate the calls based on:
# - /usr/include/llvm-c/Target.h
# - /usr/include/llvm/Config/llvm-config-64.h
# proc initializeNativeTarget*(): LlvmBool {.discardable, importc: "LLVMInitializeNativeTarget".}
# proc initializeNativeAsmPrinter*(): LlvmBool {.discardable, importc: "LLVMInitializeNativeAsmPrinter".}
proc initializeX86AsmPrinter() {.importc: "LLVMInitializeX86AsmPrinter".}
proc initializeX86Target() {.importc: "LLVMInitializeX86Target".}
proc initializeX86TargetInfo() {.importc: "LLVMInitializeX86TargetInfo".}
proc initializeX86TargetMC() {.importc: "LLVMInitializeX86TargetMC".}
proc initializeNVPTXAsmPrinter() {.importc: "LLVMInitializeNVPTXAsmPrinter".}
proc initializeNVPTXTarget() {.importc: "LLVMInitializeNVPTXTarget".}
proc initializeNVPTXTargetInfo() {.importc: "LLVMInitializeNVPTXTargetInfo".}
proc initializeNVPTXTargetMC() {.importc: "LLVMInitializeNVPTXTargetMC".}
proc getTargetFromName*(name: cstring): TargetRef {.importc: "LLVMGetTargetFromName".}
proc getTargetFromTriple*(triple: cstring, target: var TargetRef, errorMessage: var LLVMstring
): LLVMBool {.importc: "LLVMGetTargetFromTriple".}
proc getTargetDescription*(t: TargetRef): cstring {.importc: "LLVMGetTargetDescription".}
proc hasJIT*(t: TargetRef): LLVMBool {.importc: "LLVMTargetHasJIT".}
proc hasTargetMachine*(t: TargetRef): LLVMBool {.importc: "LLVMTargetHasTargetMachine".}
proc hasAsmBackend*(t: TargetRef): LLVMBool {.importc: "LLVMTargetHasAsmBackend".}
# {.push header: "<llvm-c/Core.h>".}
proc setTarget*(module: ModuleRef, triple: cstring) {.importc: "LLVMSetTarget".}
proc setDataLayout*(module: ModuleRef, layout: cstring) {.importc: "LLVMSetDataLayout".}
# ############################################################
#
# Execution Engine
#
# ############################################################
# "<llvm-c/ExecutionEngine.h>"
proc createJITCompilerForModule(
engine: var ExecutionEngineRef,
module: ModuleRef,
optLevel: uint32,
err: var LLVMstring): LlvmBool {.used, importc: "LLVMCreateJITCompilerForModule".}
proc dispose*(engine: ExecutionEngineRef) {.importc: "LLVMDisposeExecutionEngine".}
## Destroys an execution engine
## Note: destroying an Execution Engine will also destroy modules attached to it
proc getFunctionAddress*(engine: ExecutionEngineRef, name: cstring): distinct pointer {.importc: "LLVMGetFunctionAddress".}
# ############################################################
#
# Target Machine
#
# ############################################################
type
CodeGenOptLevel* {.size: sizeof(cint).} = enum
CodeGenLevelNone, CodeGenLevelLess, CodeGenLevelDefault, CodeGenLevelAggressive
RelocMode* {.size: sizeof(cint).} = enum
RelocDefault, RelocStatic, RelocPIC, RelocDynamicNoPic, RelocROPI, RelocRWPI,
RelocROPI_RWPI
CodeModel* {.size: sizeof(cint).} = enum
CodeModelDefault, CodeModelJITDefault, CodeModelTiny, CodeModelSmall,
CodeModelKernel, CodeModelMedium, CodeModelLarge
CodeGenFileType* {.size: sizeof(cint).} = enum
AssemblyFile, ObjectFile
TargetDataRef* = distinct pointer
TargetLibraryInfoRef* = distinct pointer
# "<llvm-c/TargetMachine.h>"
proc createTargetMachine*(
target: TargetRef, triple, cpu, features: cstring,
level: CodeGenOptLevel, reloc: RelocMode, codeModel: CodeModel): TargetMachineRef {.importc: "LLVMCreateTargetMachine".}
proc dispose*(m: TargetMachineRef) {.importc: "LLVMDisposeTargetMachine".}
proc createTargetDataLayout*(t: TargetMachineRef): TargetDataRef {.importc: "LLVMCreateTargetDataLayout".}
proc dispose*(m: TargetDataRef) {.importc: "LLVMDisposeTargetData".}
proc setDataLayout*(module: ModuleRef, dataLayout: TargetDataRef) {.importc: "LLVMSetModuleDataLayout".}
proc targetMachineEmitToFile*(t: TargetMachineRef, m: ModuleRef, fileName: cstring,
codegen: CodeGenFileType, errorMessage: var LLVMstring): LLVMBool {.importc: "LLVMTargetMachineEmitToFile".}
proc targetMachineEmitToMemoryBuffer*(t: TargetMachineRef, m: ModuleRef,
codegen: CodeGenFileType,
errorMessage: var LLVMstring,
outMemBuf: var MemoryBufferRef): LLVMBool {.importc: "LLVMTargetMachineEmitToMemoryBuffer".}
# ############################################################
#
# Passes and transforms
#
# ############################################################
# - https://blog.llvm.org/posts/2021-03-26-the-new-pass-manager/
# - https://llvm.org/docs/NewPassManager.html
# https://llvm.org/doxygen/group__LLVMCCorePassManagers.html
# # header: "<llvm-c/Core.h>"
proc createPassManager*(): PassManagerRef {.importc: "LLVMCreatePassManager".}
proc dispose*(pm: PassManagerRef) {.importc: "LLVMDisposePassManager".}
proc run*(pm: PassManagerRef, module: ModuleRef) {. importc: "LLVMRunPassManager".}
# https://llvm.org/doxygen/group__LLVMCTransformsPassManagerBuilder.html
# header: "<llvm-c/Transforms/PassManagerBuilder.h>"
proc createPassManagerBuilder*(): PassManagerBuilderRef {.importc: "LLVMPassManagerBuilderCreate".}
proc dispose*(pmb: PassManagerBuilderRef) {.importc: "LLVMPassManagerBuilderDispose".}
proc setOptLevel*(pmb: PassManagerBuilderRef, level: uint32) {.importc: "LLVMPassManagerBuilderSetOptLevel".}
proc setSizeLevel*(pmb: PassManagerBuilderRef, level: uint32) {.importc: "LLVMPassManagerBuilderSetSizeLevel".}
proc populateModulePassManager*(pmb: PassManagerBuilderRef, legacyPM: PassManagerRef) {. importc: "LLVMPassManagerBuilderPopulateModulePassManager".}
# https://llvm.org/doxygen/group__LLVMCCoreNewPM.html
# header: "<llvm-c/Transforms/PassBuilder.h>"
proc createPassBuilderOptions*(): PassBuilderOptionsRef {.importc: "LLVMCreatePassBuilderOptions".}
proc dispose*(pbo: PassBuilderOptionsRef) {.importc: "LLVMDisposePassBuilderOptions".}
proc runPasses(module: ModuleRef, passes: cstring, machine: TargetMachineRef, pbo: PassBuilderOptionsRef): ErrorRef {.importc: "LLVMRunPasses".}
# https://llvm.org/docs/doxygen/group__LLVMCInitialization.html
# header: "<llvm-c/Initialization.h>"
proc getGlobalPassRegistry(): PassRegistryRef {.importc: "LLVMGetGlobalPassRegistry".}
proc initializeCore(registry: PassRegistryRef) {.importc: "LLVMInitializeCore".}
proc initializeTransformUtils(registry: PassRegistryRef) {.importc: "LLVMInitializeTransformUtils".}
proc initializeScalarOpts(registry: PassRegistryRef) {.importc: "LLVMInitializeScalarOpts".}
proc initializeObjCARCOpts(registry: PassRegistryRef) {.importc: "LLVMInitializeObjCARCOpts".}
proc initializeVectorization(registry: PassRegistryRef) {.importc: "LLVMInitializeVectorization".}
proc initializeInstCombine(registry: PassRegistryRef) {.importc: "LLVMInitializeInstCombine".}
proc initializeAggressiveInstCombiner(registry: PassRegistryRef) {.importc: "LLVMInitializeAggressiveInstCombiner".}
proc initializeIPO(registry: PassRegistryRef) {.importc: "LLVMInitializeIPO".}
proc initializeInstrumentation(registry: PassRegistryRef) {.importc: "LLVMInitializeInstrumentation".}
proc initializeAnalysis(registry: PassRegistryRef) {.importc: "LLVMInitializeAnalysis".}
proc initializeIPA(registry: PassRegistryRef) {.importc: "LLVMInitializeIPA".}
proc initializeCodeGen(registry: PassRegistryRef) {.importc: "LLVMInitializeCodeGen".}
proc initializeTarget(registry: PassRegistryRef) {.importc: "LLVMInitializeTarget".}
# https://llvm.org/doxygen/group__LLVMCTarget.html
proc addTargetLibraryInfo*(tli: TargetLibraryInfoRef, pm: PassManagerRef) {.importc: "LLVMAddTargetLibraryInfo".}
# There doesn't seem to be a way to instantiate TargetLibraryInfoRef :/
proc addAnalysisPasses*(machine: TargetMachineRef, pm: PassManagerRef) {.importc: "LLVMAddAnalysisPasses".}
# https://www.llvm.org/docs/Passes.html
# -------------------------------------
# https://llvm.org/doxygen/group__LLVMCTransformsInstCombine.html
proc addInstructionCombiningPass*(pm: PassManagerRef) {.importc: "LLVMAddInstructionCombiningPass".}
# https://llvm.org/doxygen/group__LLVMCTransformsUtils.html
proc addPromoteMemoryToRegisterPass*(pm: PassManagerRef) {.importc: "LLVMAddPromoteMemoryToRegisterPass".}
# https://llvm.org/doxygen/group__LLVMCTransformsScalar.html
proc addAggressiveDeadCodeEliminationPass*(pm: PassManagerRef) {.importc: "LLVMAddAggressiveDCEPass".}
proc addDeadStoreEliminationPass*(pm: PassManagerRef) {.importc: "LLVMAddDeadStoreEliminationPass".}
proc addGlobalValueNumberingPass*(pm: PassManagerRef) {.importc: "LLVMAddNewGVNPass".}
proc addMemCpyOptPass*(pm: PassManagerRef) {.importc: "LLVMAddMemCpyOptPass".}
proc addScalarReplacementOfAggregatesPass*(pm: PassManagerRef) {.importc: "LLVMAddScalarReplAggregatesPass".}
# https://llvm.org/doxygen/group__LLVMCTransformsIPO.html
proc addDeduceFunctionAttributesPass*(pm: PassManagerRef) {.importc: "LLVMAddFunctionAttrsPass".}
proc addFunctionInliningPass*(pm: PassManagerRef) {.importc: "LLVMAddFunctionInliningPass".}
# ############################################################
#
# Types
#
# ############################################################
# https://llvm.org/doxygen/group__LLVMCCoreType.html
type
TypeKind* {.size: sizeof(cint).} = enum
tkVoid, ## type with no size
tkHalf, ## 16 bit floating point type
tkFloat, ## 32 bit floating point type
tkDouble, ## 64 bit floating point type
tkX86_FP80, ## 80 bit floating point type (X87)
tkFP128, ## 128 bit floating point type (112-bit mantissa)
tkPPC_FP128, ## 128 bit floating point type (two 64-bits)
tkLabel, ## Labels
tkInteger, ## Arbitrary bit width integers
tkFunction, ## Functions
tkStruct, ## Structures
tkArray, ## Arrays
tkPointer, ## Pointers
tkVector, ## Fixed width SIMD vector type
tkMetadata, ## Metadata
tkX86_MMX, ## X86 MMX
tkToken, ## Tokens
tkScalableVector, ## Scalable SIMD vector type
tkBFloat, ## 16 bit brain floating point type
tkX86_AMX ## X86 AMX
# header: "<llvm-c/Core.h>"
proc getContext*(ty: TypeRef): ContextRef {.importc: "LLVMGetTypeContext".}
proc getTypeKind*(ty: TypeRef): TypeKind {.importc: "LLVMGetTypeKind".}
proc dumpType*(ty: TypeRef) {.sideeffect, importc: "LLVMDumpType".}
proc toLLVMstring(ty: TypeRef): LLVMstring {.used, importc: "LLVMPrintTypeToString".}
proc void_t*(ctx: ContextRef): TypeRef {.importc: "LLVMVoidTypeInContext".}
# Integers
# ------------------------------------------------------------
proc int1_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt1TypeInContext".}
proc int8_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt8TypeInContext".}
proc int16_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt16TypeInContext".}
proc int32_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt32TypeInContext".}
proc int64_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt64TypeInContext".}
proc int128_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt128TypeInContext".}
proc int_t*(ctx: ContextRef, numBits: uint32): TypeRef {.importc: "LLVMIntTypeInContext".}
proc getIntTypeWidth*(ty: TypeRef): uint32 {.importc: "LLVMGetIntTypeWidth".}
# Composite
# ------------------------------------------------------------
proc struct_t*(
ctx: ContextRef,
elemTypes: openArray[TypeRef],
packed: LlvmBool): TypeRef {.wrapOpenArrayLenType: cuint, importc: "LLVMStructTypeInContext".}
proc array_t*(elemType: TypeRef, elemCount: uint32): TypeRef {.importc: "LLVMArrayType".}
proc pointerType(elementType: TypeRef; addressSpace: cuint): TypeRef {.used, importc: "LLVMPointerType".}
proc getElementType*(arrayOrVectorTy: TypeRef): TypeRef {.importc: "LLVMGetElementType".}
# Functions
# ------------------------------------------------------------
proc function_t*(
returnType: TypeRef,
paramTypes: openArray[TypeRef],
isVarArg: LlvmBool): TypeRef {.wrapOpenArrayLenType: cuint, importc: "LLVMFunctionType".}
proc addFunction*(m: ModuleRef, name: cstring, ty: TypeRef): ValueRef {.importc: "LLVMAddFunction".}
## Declare a function `name` in a module.
## Returns a handle to specify its instructions
# TODO: Function and Parameter attributes:
# - https://www.llvm.org/docs/LangRef.html?highlight=attribute#function-attributes
# - https://www.llvm.org/docs/LangRef.html?highlight=attribute#parameter-attributes
#
# We can use attributes to specify additional guarantees of Constantine code, for instance:
# - "pure" function with: nounwind, readonly
# - pointer particularities: readonly, writeonly, noalias, inalloca, byval
proc getReturnType*(functionTy: TypeRef): TypeRef {.importc: "LLVMGetReturnType".}
proc countParamTypes*(functionTy: TypeRef): uint32 {.importc: "LLVMCountParamTypes".}
# ############################################################
#
# Values
#
# ############################################################
# {.push header: "<llvm-c/Core.h>".}
proc getTypeOf*(v: ValueRef): TypeRef {.importc: "LLVMTypeOf".}
proc getValueName2(v: ValueRef, rLen: var csize_t): cstring {.used, importc: "LLVMGetValueName2".}
## Returns the name of a value if it exists.
## `rLen` stores the returned string length
##
## This is not free, it requires internal hash table access
## The return value does not have to be freed and is a pointer to an internal LLVM data structure
proc dumpValue*(v: ValueRef) {.sideeffect, importc: "LLVMDumpValue".}
## Print the value to stderr
proc toLLVMstring(v: ValueRef): LLVMstring {.used, importc: "LLVMPrintValueToString".}
# Constants
# ------------------------------------------------------------
# https://llvm.org/doxygen/group__LLVMCCoreValueConstant.html
proc constInt(ty: TypeRef, n: culonglong, signExtend: LlvmBool): ValueRef {.importc: "LLVMConstInt".}
proc constReal*(ty: TypeRef, n: cdouble): ValueRef {.importc: "LLVMConstReal".}
proc constNull*(ty: TypeRef): ValueRef {.importc: "LLVMConstNull".}
proc constAllOnes*(ty: TypeRef): ValueRef {.importc: "LLVMConstAllOnes".}
proc constStruct(
constantVals: openArray[ValueRef],
packed: LlvmBool): ValueRef {.wrapOpenArrayLenType: cuint, importc: "LLVMConstStruct".}
proc constArray*(
ty: TypeRef,
constantVals: openArray[ValueRef],
): ValueRef {.wrapOpenArrayLenType: cuint, importc: "LLVMConstArray".}
# ############################################################
#
# IR builder
#
# ############################################################
# https://llvm.org/doxygen/group__LLVMCCoreInstructionBuilder.html
type
BasicBlockRef* = distinct pointer
BuilderRef* = distinct pointer
## An instruction builder represents a point within a basic block and is
## the exclusive means of building instructions using the C interface.
IntPredicate* {.size: sizeof(cint).} = enum
IntEQ = 32 ## equal
IntNE ## not equal
IntUGT ## unsigned greater than
IntUGE ## unsigned greater or equal
IntULT ## unsigned less than
IntULE ## unsigned less or equal
IntSGT ## signed greater than
IntSGE ## signed greater or equal
IntSLT ## signed less than
IntSLE ## signed less or equal
InlineAsmDialect* {.size: sizeof(cint).} = enum
InlineAsmDialectATT
InlineAsmDialectIntel
# "<llvm-c/Core.h>"
# Instantiation
# ------------------------------------------------------------
proc appendBasicBlock*(ctx: ContextRef, fn: ValueRef, name: cstring): BasicBlockRef {.importc: "LLVMAppendBasicBlockInContext".}
## Append a basic block to the end of a function
proc createBuilder*(ctx: ContextRef): BuilderRef {.importc: "LLVMCreateBuilderInContext".}
proc dispose*(builder: BuilderRef) {.importc: "LLVMDisposeBuilder".}
# Functions
# ------------------------------------------------------------
proc getParam*(fn: ValueRef, index: uint32): ValueRef {.importc: "LLVMGetParam".}
proc retVoid*(builder: BuilderRef): ValueRef {.discardable, importc: "LLVMBuildRetVoid".}
proc ret*(builder: BuilderRef, returnVal: ValueRef): ValueRef {.discardable, importc: "LLVMBuildRet".}
# Positioning
# ------------------------------------------------------------
proc position*(builder: BuilderRef, blck: BasicBlockRef, instr: ValueRef) {.importc: "LLVMPositionBuilder".}
proc positionBefore*(builder: BuilderRef, instr: ValueRef) {.importc: "LLVMPositionBuilderBefore".}
proc positionAtEnd*(builder: BuilderRef, blck: BasicBlockRef) {.importc: "LLVMPositionBuilderAtEnd".}
proc getInsertBlock(builder: BuilderRef): BasicBlockRef {.importc: "LLVMGetInsertBlock".}
## This function is not documented and probably for special use
## However due to https://github.com/llvm/llvm-project/issues/59875
## it's our workaround to get the context of a Builder
proc getBasicBlockParent*(blck: BasicBlockRef): ValueRef {.importc: "LLVMGetBasicBlockParent".}
## Obtains the function to which a basic block belongs
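Continuing the sketch above: emitting a trivial add64(a, b) = a + b function over i64 with the builder procs declared in this file (an illustrative sketch meant to sit next to these bindings, since LlvmBool is not exported; not code from this commit):

  let i64 = ctx.int64_t()
  let fnTy = function_t(i64, [i64, i64], isVarArg = LlvmBool(0))
  let fn = module.addFunction("add64", fnTy)
  let entry = ctx.appendBasicBlock(fn, "entry")
  let bld = ctx.createBuilder()
  bld.positionAtEnd(entry)
  bld.ret(bld.add(fn.getParam(0), fn.getParam(1), "sum"))
  bld.dispose()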
# Inline Assembly
# ------------------------------------------------------------
proc getInlineAsm*(
ty: TypeRef,
asmString: openArray[char],
constraints: openArray[char],
hasSideEffects, isAlignStack: LlvmBool,
dialect: InlineAsmDialect, canThrow: LlvmBool
): ValueRef {.importc: "LLVMGetInlineAsm"}
# Intermediate Representation
# ------------------------------------------------------------
#
# - NSW: no signed wrap, signed value cannot over- or underflow.
# - NUW: no unsigned wrap, unsigned value cannot over- or underflow.
proc call2*(
builder: BuilderRef,
ty: TypeRef,
fn: ValueRef,
args: openArray[ValueRef],
name: cstring = ""): ValueRef {.wrapOpenArrayLenType: cuint, importc: "LLVMBuildCall2".}
proc add*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildAdd".}
proc addNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWAdd".}
proc addNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWAdd".}
proc sub*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSub".}
proc subNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWSub".}
proc subNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWSub".}
proc neg*(builder: BuilderRef, val: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNeg".}
proc negNSW*(builder: BuilderRef, val: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWNeg".}
proc negNUW*(builder: BuilderRef, val: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWNeg".}
proc mul*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildMul".}
proc mulNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWMul".}
proc mulNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWMul".}
proc divU*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildUDiv".}
proc divU_exact*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildExactUDiv".}
proc divS*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSDiv".}
proc divS_exact*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildExactSDiv".}
proc remU*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildURem".}
proc remS*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSRem".}
proc lshl*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildShl".}
proc lshr*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildLShr".}
proc ashr*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildAShr".}
proc `and`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildAnd".}
proc `or`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildOr".}
proc `xor`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildXor".}
proc `not`*(builder: BuilderRef, val: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNot".}
proc select*(builder: BuilderRef, condition, then, otherwise: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSelect".}
proc icmp*(builder: BuilderRef, op: IntPredicate, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildICmp".}
proc bitcast*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildBitCast".}
proc trunc*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildTrunc".}
proc zext*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildZExt".}
## Zero-extend
proc sext*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSExt".}
## Sign-extend
proc malloc*(builder: BuilderRef, ty: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildMalloc".}
proc mallocArray*(builder: BuilderRef, ty: TypeRef, length: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildArrayMalloc".}
proc free*(builder: BuilderRef, `ptr`: ValueRef): ValueRef {.importc: "LLVMBuildFree".}
proc alloca*(builder: BuilderRef, ty: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildAlloca".}
proc allocaArray*(builder: BuilderRef, ty: TypeRef, length: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildArrayAlloca".}
proc extractValue*(builder: BuilderRef, aggVal: ValueRef, index: uint32, name: cstring = ""): ValueRef {.importc: "LLVMBuildExtractValue".}
proc insertValue*(builder: BuilderRef, aggVal: ValueRef, eltVal: ValueRef, index: uint32, name: cstring = ""): ValueRef {.discardable, importc: "LLVMBuildInsertValue".}
proc getElementPtr2*(
builder: BuilderRef,
ty: TypeRef,
`ptr`: ValueRef,
indices: openArray[ValueRef],
name: cstring = ""
): ValueRef {.wrapOpenArrayLenType: cuint, importc: "LLVMBuildGEP2".}
## https://www.llvm.org/docs/GetElementPtr.html
proc getElementPtr2_InBounds*(
builder: BuilderRef,
ty: TypeRef,
`ptr`: ValueRef,
indices: openArray[ValueRef],
name: cstring = ""
): ValueRef {.wrapOpenArrayLenType: cuint, importc: "LLVMBuildInBoundsGEP2".}
## https://www.llvm.org/docs/GetElementPtr.html
## If the GEP lacks the inbounds keyword, the value is the result from evaluating the implied two's complement integer computation.
## However, since there's no guarantee of where an object will be allocated in the address space, such values have limited meaning.
proc getElementPtr2_Struct*(
builder: BuilderRef,
ty: TypeRef,
`ptr`: ValueRef,
idx: uint32,
name: cstring = ""
): ValueRef {.importc: "LLVMBuildStructGEP2".}
## https://www.llvm.org/docs/GetElementPtr.html
## If the GEP lacks the inbounds keyword, the value is the result from evaluating the implied two's complement integer computation.
## However, since there's no guarantee of where an object will be allocated in the address space, such values have limited meaning.
proc load2*(builder: BuilderRef, ty: TypeRef, `ptr`: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildLoad2".}
proc store*(builder: BuilderRef, val, `ptr`: ValueRef): ValueRef {.discardable, importc: "LLVMBuildStore".}
proc memset*(builder: BuilderRef, `ptr`, val, len: ValueRef, align: uint32) {.importc: "LLVMBuildMemSet".}
proc memcpy*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemCpy".}
proc memmove*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemMove".}
{.pop.} # {.used, hint[Name]: off, cdecl, dynlib: libLLVM.}
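# Illustrative sketch of how these builder bindings fit together: emitting the body
# of an already-declared two-parameter integer function `fn` that returns a+b.
# The function/type creation APIs are assumed to live elsewhere in these bindings.
proc emitAddBodySketch(ctx: ContextRef, fn: ValueRef) =
  let blck = ctx.appendBasicBlock(fn, "entry")
  let bld = ctx.createBuilder()
  bld.positionAtEnd(blck)
  # Load the two parameters, add them and return the sum
  let sum = bld.add(fn.getParam(0), fn.getParam(1), "sum")
  bld.ret(sum)
  bld.dispose()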


@ -0,0 +1,518 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# ############################################################
#
# Bindings to Nvidia GPUs libraries
#
# ############################################################
import ./utils
# ############################################################
#
# Cuda
#
# ############################################################
static: echo "[Constantine] Using library libcuda.so"
{.passl: "-L/opt/cuda/lib64 -lcuda".}
# Cuda offers 2 APIs:
# - cuda.h the driver API
# - cuda_runtime.h the runtime API
#
# https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html
#
# We need to use the lower-level driver API for JIT modules loading and reloading
type
CUresult* {.size: sizeof(cint).} = enum
## The API call returned with no errors. In the case of query calls, this
## also means that the operation being queried is complete (see
## ::cuEventQuery() and ::cuStreamQuery()).
CUDA_SUCCESS = 0
## This indicates that one or more of the parameters passed to the API call
## is not within an acceptable range of values.
CUDA_ERROR_INVALID_VALUE = 1
## The API call failed because it was unable to allocate enough memory to
## perform the requested operation.
CUDA_ERROR_OUT_OF_MEMORY = 2
## This indicates that the CUDA driver has not been initialized with
## ::cuInit() or that initialization has failed.
CUDA_ERROR_NOT_INITIALIZED = 3
## This indicates that the CUDA driver is in the process of shutting down.
CUDA_ERROR_DEINITIALIZED = 4
## This indicates profiler is not initialized for this run. This can
## happen when the application is running with external profiling tools
## like visual profiler.
CUDA_ERROR_PROFILER_DISABLED = 5
## This error return is deprecated as of CUDA 5.0. It is no longer an error
## to attempt to enable/disable the profiling via ::cuProfilerStart or
## ::cuProfilerStop without initialization.
CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6
## This error return is deprecated as of CUDA 5.0. It is no longer an error
## to call cuProfilerStart() when profiling is already enabled.
CUDA_ERROR_PROFILER_ALREADY_STARTED = 7
## This error return is deprecated as of CUDA 5.0. It is no longer an error
## to call cuProfilerStop() when profiling is already disabled.
CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8
## This indicates that the CUDA driver that the application has loaded is a
## stub library. Applications that run with the stub rather than a real
## driver loaded will result in CUDA API returning this error.
CUDA_ERROR_STUB_LIBRARY = 34
## This indicates that requested CUDA device is unavailable at the current
## time. Devices are often unavailable due to use of
## ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED.
CUDA_ERROR_DEVICE_UNAVAILABLE = 46
## This indicates that no CUDA-capable devices were detected by the installed
## CUDA driver.
CUDA_ERROR_NO_DEVICE = 100
## This indicates that the device ordinal supplied by the user does not
## correspond to a valid CUDA device or that the action requested is
## invalid for the specified device.
CUDA_ERROR_INVALID_DEVICE = 101
## This error indicates that the Grid license is not applied.
CUDA_ERROR_DEVICE_NOT_LICENSED = 102
## This indicates that the device kernel image is invalid. This can also
## indicate an invalid CUDA module.
CUDA_ERROR_INVALID_IMAGE = 200
## This most frequently indicates that there is no context bound to the
## current thread. This can also be returned if the context passed to an
## API call is not a valid handle (such as a context that has had
## ::cuCtxDestroy() invoked on it). This can also be returned if a user
## mixes different API versions (i.e. 3010 context with 3020 API calls).
## See ::cuCtxGetApiVersion() for more details.
CUDA_ERROR_INVALID_CONTEXT = 201
## This indicated that the context being supplied as a parameter to the
## API call was already the active context.
## This error return is deprecated as of CUDA 3.2. It is no longer an
## error to attempt to push the active context via ::cuCtxPushCurrent().
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202
## This indicates that a map or register operation has failed.
CUDA_ERROR_MAP_FAILED = 205
## This indicates that an unmap or unregister operation has failed.
CUDA_ERROR_UNMAP_FAILED = 206
## This indicates that the specified array is currently mapped and thus
## cannot be destroyed.
CUDA_ERROR_ARRAY_IS_MAPPED = 207
## This indicates that the resource is already mapped.
CUDA_ERROR_ALREADY_MAPPED = 208
## This indicates that there is no kernel image available that is suitable
## for the device. This can occur when a user specifies code generation
## options for a particular CUDA source file that do not include the
## corresponding device configuration.
CUDA_ERROR_NO_BINARY_FOR_GPU = 209
## This indicates that a resource has already been acquired.
CUDA_ERROR_ALREADY_ACQUIRED = 210
## This indicates that a resource is not mapped.
CUDA_ERROR_NOT_MAPPED = 211
## This indicates that a mapped resource is not available for access as an
## array.
CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212
## This indicates that a mapped resource is not available for access as a
## pointer.
CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213
## This indicates that an uncorrectable ECC error was detected during
## execution.
CUDA_ERROR_ECC_UNCORRECTABLE = 214
## This indicates that the ::CUlimit passed to the API call is not
## supported by the active device.
CUDA_ERROR_UNSUPPORTED_LIMIT = 215
## This indicates that the ::CUcontext passed to the API call can
## only be bound to a single CPU thread at a time but is already
## bound to a CPU thread.
CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216
## This indicates that peer access is not supported across the given
## devices.
CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217
## This indicates that a PTX JIT compilation failed.
CUDA_ERROR_INVALID_PTX = 218
## This indicates an error with OpenGL or DirectX context.
CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219
## This indicates that an uncorrectable NVLink error was detected during the
## execution.
CUDA_ERROR_NVLINK_UNCORRECTABLE = 220
## This indicates that the PTX JIT compiler library was not found.
CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221
## This indicates that the provided PTX was compiled with an unsupported toolchain.
CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222
## This indicates that the PTX JIT compilation was disabled.
CUDA_ERROR_JIT_COMPILATION_DISABLED = 223
## This indicates that the ::CUexecAffinityType passed to the API call is not
## supported by the active device.
CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224
## This indicates that the device kernel source is invalid. This includes
## compilation/linker errors encountered in device code or user error.
CUDA_ERROR_INVALID_SOURCE = 300
## This indicates that the file specified was not found.
CUDA_ERROR_FILE_NOT_FOUND = 301
## This indicates that a link to a shared object failed to resolve.
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302
## This indicates that initialization of a shared object failed.
CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303
## This indicates that an OS call failed.
CUDA_ERROR_OPERATING_SYSTEM = 304
## This indicates that a resource handle passed to the API call was not
## valid. Resource handles are opaque types like ::CUstream and ::CUevent.
CUDA_ERROR_INVALID_HANDLE = 400
## This indicates that a resource required by the API call is not in a
## valid state to perform the requested operation.
CUDA_ERROR_ILLEGAL_STATE = 401
## This indicates that a named symbol was not found. Examples of symbols
## are global/constant variable names, driver function names, texture names,
## and surface names.
CUDA_ERROR_NOT_FOUND = 500
## This indicates that asynchronous operations issued previously have not
## completed yet. This result is not actually an error, but must be indicated
## differently than ::CUDA_SUCCESS (which indicates completion). Calls that
## may return this value include ::cuEventQuery() and ::cuStreamQuery().
CUDA_ERROR_NOT_READY = 600
## While executing a kernel, the device encountered a
## load or store instruction on an invalid memory address.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_ILLEGAL_ADDRESS = 700
## This indicates that a launch did not occur because it did not have
## appropriate resources. This error usually indicates that the user has
## attempted to pass too many arguments to the device kernel, or the
## kernel launch specifies too many threads for the kernel's register
## count. Passing arguments of the wrong size (i.e. a 64-bit pointer
## when a 32-bit int is expected) is equivalent to passing too many
## arguments and can also result in this error.
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
## This indicates that the device kernel took too long to execute. This can
## only occur if timeouts are enabled - see the device attribute
## ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_LAUNCH_TIMEOUT = 702
## This error indicates a kernel launch that uses an incompatible texturing
## mode.
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
## This error indicates that a call to ::cuCtxEnablePeerAccess() is
## trying to re-enable peer access to a context which has already
## had peer access to it enabled.
CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704
## This error indicates that ::cuCtxDisablePeerAccess() is
## trying to disable peer access which has not been enabled yet
## via ::cuCtxEnablePeerAccess().
CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705
## This error indicates that the primary context for the specified device
## has already been initialized.
CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708
## This error indicates that the context current to the calling thread
## has been destroyed using ::cuCtxDestroy, or is a primary context which
## has not yet been initialized.
CUDA_ERROR_CONTEXT_IS_DESTROYED = 709
## A device-side assert triggered during kernel execution. The context
## cannot be used anymore, and must be destroyed. All existing device
## memory allocations from this context are invalid and must be
## reconstructed if the program is to continue using CUDA.
CUDA_ERROR_ASSERT = 710
## This error indicates that the hardware resources required to enable
## peer access have been exhausted for one or more of the devices
## passed to ::cuCtxEnablePeerAccess().
CUDA_ERROR_TOO_MANY_PEERS = 711
## This error indicates that the memory range passed to ::cuMemHostRegister()
## has already been registered.
CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712
## This error indicates that the pointer passed to ::cuMemHostUnregister()
## does not correspond to any currently registered memory region.
CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713
## While executing a kernel, the device encountered a stack error.
## This can be due to stack corruption or exceeding the stack size limit.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_HARDWARE_STACK_ERROR = 714
## While executing a kernel, the device encountered an illegal instruction.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_ILLEGAL_INSTRUCTION = 715
## While executing a kernel, the device encountered a load or store instruction
## on a memory address which is not aligned.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_MISALIGNED_ADDRESS = 716
## While executing a kernel, the device encountered an instruction
## which can only operate on memory locations in certain address spaces
## (global, shared, or local), but was supplied a memory address not
## belonging to an allowed address space.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_INVALID_ADDRESS_SPACE = 717
## While executing a kernel, the device program counter wrapped its address space.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_INVALID_PC = 718
## An exception occurred on the device while executing a kernel. Common
## causes include dereferencing an invalid device pointer and accessing
## out of bounds shared memory. Less common cases can be system specific - more
## information about these cases can be found in the system specific user guide.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_LAUNCH_FAILED = 719
## This error indicates that the number of blocks launched per grid for a kernel that was
## launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice
## exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor
## or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
## as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720
## This error indicates that the attempted operation is not permitted.
CUDA_ERROR_NOT_PERMITTED = 800
## This error indicates that the attempted operation is not supported
## on the current system or device.
CUDA_ERROR_NOT_SUPPORTED = 801
## This error indicates that the system is not yet ready to start any CUDA
## work. To continue using CUDA, verify the system configuration is in a
## valid state and all required driver daemons are actively running.
## More information about this error can be found in the system specific
## user guide.
CUDA_ERROR_SYSTEM_NOT_READY = 802
## This error indicates that there is a mismatch between the versions of
## the display driver and the CUDA driver. Refer to the compatibility documentation
## for supported versions.
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803
## This error indicates that the system was upgraded to run with forward compatibility
## but the visible hardware detected by CUDA does not support this configuration.
## Refer to the compatibility documentation for the supported hardware matrix or ensure
## that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
## environment variable.
CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804
## This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
CUDA_ERROR_MPS_CONNECTION_FAILED = 805
## This error indicates that the remote procedural call between the MPS server and the MPS client failed.
CUDA_ERROR_MPS_RPC_FAILURE = 806
## This error indicates that the MPS server is not ready to accept new MPS client requests.
## This error can be returned when the MPS server is in the process of recovering from a fatal failure.
CUDA_ERROR_MPS_SERVER_NOT_READY = 807
## This error indicates that the hardware resources required to create MPS client have been exhausted.
CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808
## This error indicates that the hardware resources required to support device connections have been exhausted.
CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809
## This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.
CUDA_ERROR_MPS_CLIENT_TERMINATED = 810
## This error indicates that the operation is not permitted when
## the stream is capturing.
CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900
## This error indicates that the current capture sequence on the stream
## has been invalidated due to a previous error.
CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901
## This error indicates that the operation would have resulted in a merge
## of two independent capture sequences.
CUDA_ERROR_STREAM_CAPTURE_MERGE = 902
## This error indicates that the capture was not initiated in this stream.
CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903
## This error indicates that the capture sequence contains a fork that was
## not joined to the primary stream.
CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904
## This error indicates that a dependency would have been created which
## crosses the capture sequence boundary. Only implicit in-stream ordering
## dependencies are allowed to cross the boundary.
CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905
## This error indicates a disallowed implicit dependency on a current capture
## sequence from cudaStreamLegacy.
CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906
## This error indicates that the operation is not permitted on an event which
## was last recorded in a capturing stream.
CUDA_ERROR_CAPTURED_EVENT = 907
## A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED
## argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a
## different thread.
CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908
## This error indicates that the timeout specified for the wait operation has lapsed.
CUDA_ERROR_TIMEOUT = 909
## This error indicates that the graph update was not performed because it included
## changes which violated constraints specific to instantiated graph update.
CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910
## This indicates that an async error has occurred in a device outside of CUDA.
## If CUDA was waiting for an external device's signal before consuming shared data,
## the external device signaled an error indicating that the data is not valid for
## consumption. This leaves the process in an inconsistent state and any further CUDA
## work will return the same error. To continue using CUDA, the process must be
## terminated and relaunched.
CUDA_ERROR_EXTERNAL_DEVICE = 911
## Indicates a kernel launch error due to cluster misconfiguration.
CUDA_ERROR_INVALID_CLUSTER_SIZE = 912
## This indicates that an unknown internal error has occurred.
CUDA_ERROR_UNKNOWN = 999
CUdevice_attribute* {.size: sizeof(cint).} = enum
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, ## Maximum number of threads per block */
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, ## Maximum block dimension X */
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, ## Maximum block dimension Y */
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, ## Maximum block dimension Z */
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, ## Maximum grid dimension X */
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, ## Maximum grid dimension Y */
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, ## Maximum grid dimension Z */
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, ## Maximum shared memory available per block in bytes */
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, ## Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, ## Warp size in threads */
CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, ## Maximum pitch in bytes allowed by memory copies */
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, ## Maximum number of 32-bit registers available per block */
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, ## Typical clock frequency in kilohertz */
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, ## Alignment requirement for textures */
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, ## Number of multiprocessors on device */
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, ## Specifies whether there is a run time limit on kernels */
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, ## Device is integrated with host memory */
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, ## Device can map host memory into CUDA address space */
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, ## Compute mode (See ::CUcomputemode for details) */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, ## Maximum 1D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, ## Maximum 2D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, ## Maximum 2D texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, ## Maximum 3D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, ## Maximum 3D texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, ## Maximum 3D texture depth */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, ## Maximum 2D layered texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, ## Maximum 2D layered texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, ## Maximum layers in a 2D layered texture */
CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, ## Alignment requirement for surfaces */
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, ## Device can possibly execute multiple kernels concurrently */
CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, ## Device has ECC support enabled */
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, ## PCI bus ID of the device */
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, ## PCI device ID of the device */
CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, ## Device is using TCC driver model */
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, ## Peak memory clock frequency in kilohertz */
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, ## Global memory bus width in bits */
CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, ## Size of L2 cache in bytes */
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, ## Maximum resident threads per multiprocessor */
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, ## Number of asynchronous engines */
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, ## Device shares a unified address space with the host */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, ## Maximum 1D layered texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, ## Maximum layers in a 1D layered texture */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, ## Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, ## Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, ## Alternate maximum 3D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, ## Alternate maximum 3D texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, ## Alternate maximum 3D texture depth */
CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, ## PCI domain ID of the device */
CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, ## Pitch alignment requirement for textures */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, ## Maximum cubemap texture width/height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, ## Maximum cubemap layered texture width/height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, ## Maximum layers in a cubemap layered texture */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, ## Maximum 1D surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, ## Maximum 2D surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, ## Maximum 2D surface height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, ## Maximum 3D surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, ## Maximum 3D surface height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, ## Maximum 3D surface depth */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, ## Maximum 1D layered surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, ## Maximum layers in a 1D layered surface */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, ## Maximum 2D layered surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, ## Maximum 2D layered surface height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, ## Maximum layers in a 2D layered surface */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, ## Maximum cubemap surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, ## Maximum cubemap layered surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, ## Maximum layers in a cubemap layered surface */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, ## Maximum 2D linear texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, ## Maximum 2D linear texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, ## Maximum 2D linear texture pitch in bytes */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, ## Maximum mipmapped 2D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, ## Maximum mipmapped 2D texture height */
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, ## Major compute capability version number */
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, ## Minor compute capability version number */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, ## Maximum mipmapped 1D texture width */
CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, ## Device supports stream priorities */
CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, ## Device supports caching globals in L1 */
CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, ## Device supports caching locals in L1 */
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, ## Maximum shared memory available per multiprocessor in bytes */
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, ## Maximum number of 32-bit registers available per multiprocessor */
CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, ## Device can allocate managed memory on this system */
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, ## Device is on a multi-GPU board */
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, ## Unique id for a group of devices on the same multi-GPU board */
CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, ## Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/
CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, ## Ratio of single precision performance (in floating-point operations per second) to double precision performance */
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, ## Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, ## Device can coherently access managed memory concurrently with the CPU */
CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, ## Device supports compute preemption. */
CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, ## Device can access host registered memory at the same virtual address as the CPU */
CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, ## ::cuStreamBatchMemOp and related APIs are supported. */
CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, ## 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */
CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, ## ::CU_STREAM_WAIT_VALUE_NOR is supported. */
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, ## Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, ## Maximum optin shared memory per block */
CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, ## The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */
CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, ## Device supports host memory registration via ::cudaHostRegister. */
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, ## Device accesses pageable memory via the host's page tables. */
CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, ## The host can directly access managed memory on the device without migration. */
CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, ## Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, ## Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, ## Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, ## Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, ## Maximum number of blocks per multiprocessor */
CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, ## Device supports compression of memory */
CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, ## Maximum L2 persisting lines capacity setting in bytes. */
CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, ## Maximum value of CUaccessPolicyWindow::num_bytes. */
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, ## Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */
CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, ## Shared memory reserved by CUDA driver per block in bytes */
CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, ## Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, ## Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */
CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, ## External timeline semaphore interop is supported on the device */
CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, ## Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, ## Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, ## The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, ## GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. */
CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, ## Handle types supported with mempool based IPC */
CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH = 120, ## Indicates device supports cluster launch */
CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121, ## Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2 = 122, ## 64-bit operations are supported in ::cuStreamBatchMemOp_v2 and related v2 MemOp APIs. */
CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2 = 123, ## ::CU_STREAM_WAIT_VALUE_NOR is supported by v2 MemOp APIs. */
CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124, ## Device supports buffer sharing with dma_buf mechanism. */
CU_DEVICE_ATTRIBUTE_MAX
CUmemAttach_flags* = enum
CU_MEM_ATTACH_GLOBAL = 0x1, ## Memory can be accessed by any stream on any device
CU_MEM_ATTACH_HOST = 0x2, ## Memory cannot be accessed by any stream on any device
CU_MEM_ATTACH_SINGLE = 0x4
CUdevice* = distinct int32
## Compute Device handle
CUcontext* = distinct pointer
CUmodule* = distinct pointer
CUfunction* = distinct pointer
CUstream* = distinct pointer
CUdeviceptr* = distinct pointer
{.push cdecl, importc, dynlib: "libcuda.so".}
proc cuInit*(flags: uint32): CUresult
proc cuDeviceGetCount*(count: var int32): CUresult
proc cuDeviceGet*(device: var CUdevice, ordinal: int32): CUresult
proc cuDeviceGetName*(name: ptr char, len: int32, dev: CUdevice): CUresult
proc cuDeviceGetAttribute*(r: var int32, attrib: CUdevice_attribute, dev: CUdevice): CUresult
proc cuCtxCreate*(pctx: var CUcontext, flags: uint32, dev: CUdevice): CUresult
proc cuCtxDestroy*(ctx: CUcontext): CUresult
proc cuCtxSynchronize*(ctx: CUcontext): CUresult
proc cuModuleLoadData(module: var CUmodule, sourceCode: ptr char): CUresult {.used.}
proc cuModuleUnload*(module: CUmodule): CUresult
proc cuModuleGetFunction(kernel: var CUfunction, module: CUmodule, fnName: ptr char): CUresult {.used.}
proc cuLaunchKernel(
kernel: CUfunction,
gridDimX, gridDimY, gridDimZ: uint32,
blockDimX, blockDimY, blockDimZ: uint32,
sharedMemBytes: uint32,
stream: CUstream,
kernelParams: ptr pointer,
extra: ptr pointer
): CUresult {.used.}
proc cuMemAlloc*(devptr: var CUdeviceptr, size: csize_t): CUresult
proc cuMemAllocManaged*(devptr: var CUdeviceptr, size: csize_t, flags: Flag[CUmemAttach_flags]): CUresult
proc cuMemFree*(devptr: CUdeviceptr): CUresult
proc cuMemcpyHtoD*(dst: CUdeviceptr, src: pointer, size: csize_t): CUresult
proc cuMemcpyDtoH*(dst: pointer, src: CUdeviceptr, size: csize_t): CUresult
{.pop.} # {.push cdecl, importc, dynlib: "libcuda.so".}
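# Illustrative sketch of the driver-API JIT flow these bindings enable.
# Error handling is elided; `ptx` (a NUL-terminated PTX module) and the kernel
# name "my_kernel" are hypothetical placeholders, and the 32-byte buffer is arbitrary.
proc jitLaunchSketch(ptx: string) =
  var dev: CUdevice
  var ctx: CUcontext
  var cuMod: CUmodule
  var kernel: CUfunction
  var stream: CUstream              # default (nil) stream

  discard cuInit(0)
  discard cuDeviceGet(dev, 0)
  discard cuCtxCreate(ctx, 0, dev)
  discard cuModuleLoadData(cuMod, ptx[0].unsafeAddr)
  let kernelName = "my_kernel"
  discard cuModuleGetFunction(kernel, cuMod, kernelName[0].unsafeAddr)

  # One device buffer passed as the single kernel parameter
  var devBuf: CUdeviceptr
  discard cuMemAlloc(devBuf, csize_t(32))
  var params = [pointer(devBuf.addr)]
  discard cuLaunchKernel(
    kernel,
    1, 1, 1,                        # grid dimensions
    1, 1, 1,                        # block dimensions
    0,                              # dynamic shared memory (bytes)
    stream,
    params[0].addr,
    nil)
  discard cuCtxSynchronize(ctx)

  discard cuMemFree(devBuf)
  discard cuModuleUnload(cuMod)
  discard cuCtxDestroy(ctx)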


@ -0,0 +1,174 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import std/macros
# ############################################################
#
# Binding utilities
#
# ############################################################
# Flag parameters
# ------------------------------------------------------------
type Flag*[E: enum] = distinct cint
func flag*[E: enum](e: varargs[E]): Flag[E] {.inline.} =
## Enum should only have power of 2 fields
# static:
# for val in E:
# assert (ord(val) and (ord(val) - 1)) == 0, "Enum values should all be power of 2, found " &
# $val & " with value " & $ord(val) & "."
var flags = 0
for val in e:
flags = flags or ord(val)
result = Flag[E](flags)
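# Hypothetical usage sketch: OR power-of-2 enum values into a single flag word.
when isMainModule:
  type DemoAttachFlag = enum
    demoGlobal = 0x1
    demoHost = 0x2
    demoSingle = 0x4
  doAssert cint(flag(demoGlobal, demoSingle)) == 0x5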
# Macros
# ------------------------------------------------------------
proc replaceSymsByIdents*(ast: NimNode): NimNode =
proc inspect(node: NimNode): NimNode =
case node.kind:
of {nnkIdent, nnkSym}:
return ident($node)
of nnkEmpty:
return node
of nnkLiterals:
return node
of nnkHiddenStdConv:
if node[1].kind == nnkIntLit:
return node[1]
else:
expectKind(node[1], nnkSym)
return ident($node[1])
of nnkConv: # type conversion needs to be replaced by a function call in untyped AST
var rTree = nnkCall.newTree()
for child in node:
rTree.add inspect(child)
return rTree
else:
var rTree = node.kind.newTree()
for child in node:
rTree.add inspect(child)
return rTree
result = inspect(ast)
macro replacePragmasByInline(procAst: typed): untyped =
## Replace pragmas by the inline pragma
## We need a separate "typed" macro
## so that it is executed after the {.push mypragma.} calls
var params: seq[NimNode]
for i in 0 ..< procAst.params.len:
params.add procAst.params[i]
result = newStmtList()
# The push cdecl is applied multiple times :/, so fight push with push
result.add nnkPragma.newTree(ident"push", ident"nimcall", ident"inline")
result.add newProc(
name = procAst.name,
params = params,
body = procAst.body.replaceSymsByIdents(),
procType = nnkProcDef,
pragmas = nnkPragma.newTree(ident"inline", ident"nimcall")
)
result.add nnkPragma.newTree(ident"pop")
macro wrapOpenArrayLenType*(ty: typedesc, procAst: untyped): untyped =
## Wraps pointer+len library calls in properly typed and converted openArray calls
##
## ```
## {.push cdecl.}
## proc foo*(r: int, a: openArray[CustomType], b: int) {.wrapOpenArrayLenType: uint32, importc: "foo", dynlib: "libfoo.so".}
## {.pop.}
## ```
##
## is transformed into
##
## ```
## proc foo(r: int, a: ptr CustomType, aLen: uint32, b: int) {.cdecl, importc: "foo", dynlib: "libfoo.so".}
##
## proc foo*(r: int, a: openArray[CustomType], b: int) {.inline.} =
## foo(r, a[0].unsafeAddr, a.len.uint32, b)
## ```
procAst.expectKind(nnkProcDef)
var
wrappeeParams = @[procAst.params[0]]
wrapperParams = @[procAst.params[0]]
wrapperBody = newCall(ident($procAst.name))
for i in 1 ..< procAst.params.len:
if procAst.params[i][^2].kind == nnkBracketExpr and procAst.params[i][^2][0].eqident"openarray":
procAst.params[i].expectLen(3) # prevent `proc foo(a, b: openArray[int])`
wrappeeParams.add newIdentDefs(
ident($procAst.params[i][0] & "Ptr"),
nnkPtrTy.newTree(procAst.params[i][^2][1]),
newEmptyNode()
)
wrappeeParams.add newIdentDefs(
ident($procAst.params[i][0] & "Len"),
ty,
newEmptyNode()
)
wrapperParams.add procAst.params[i]
wrapperBody.add nnkIfExpr.newTree(
nnkElifExpr.newTree(
nnkInfix.newTree(
ident"==",
nnkDotExpr.newTree(ident($procAst.params[i][0]), bindSym"len"),
newLit 0
),
newNilLit()
),
nnkElseExpr.newTree(
newCall(
ident"unsafeAddr",
nnkBracketExpr.newTree(
ident($procAst.params[i][0]),
newLit 0
))
)
)
wrapperBody.add newCall(ty, nnkDotExpr.newTree(ident($procAst.params[i][0]), bindSym"len"))
else:
wrappeeParams.add procAst.params[i]
wrapperParams.add procAst.params[i]
# Handle "a, b: int"
for j in 0 ..< procAst.params[i].len - 2:
wrapperBody.add ident($procAst.params[i][j])
let wrappee = newProc(
name = ident($procAst.name), # Remove export marker if any
params = wrappeeParams,
body = procAst.body.copyNimTree(),
procType = nnkProcDef,
pragmas = procAst.pragma
)
let wrapper = newProc(
name = procAst[0], # keep export marker if any
params = wrapperParams,
body = newStmtList(procAst.body.copyNimTree(), wrapperBody), # original procAst body can contain comments that we copy
procType = nnkProcDef,
pragmas = nnkPragma.newTree(bindSym"replacePragmasByInline") # pragmas are for the wrappee
)
result = newStmtList(wrappee, wrapper)
when isMainModule:
expandMacros:
{.push cdecl.}
proc foo(x: int, a: openArray[uint32], name: cstring) {.wrapOpenArrayLenType: cuint.} =
discard
{.pop.}


@ -0,0 +1,345 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
../../math/config/[curves, precompute],
../../math/io/io_bigints,
../primitives, ../bithacks, ../endians,
./llvm,
std/hashes
# ############################################################
#
# Assembler
#
# ############################################################
type
Assembler_LLVM* = ref object
# LLVM
ctx*: ContextRef
module*: ModuleRef
builder*: BuilderRef
i1_t*, i32_t*, i64_t*, void_t*: TypeRef
backend*: Backend
Backend* = enum
bkNvidiaPTX
FnDef* = tuple[fnTy: TypeRef, fnImpl: ValueRef]
# calling getTypeOf on a ValueRef function
# loses type information like return value type or arity
proc finalizeAssemblerLLVM(asy: Assembler_LLVM) =
if not asy.isNil:
asy.builder.dispose()
asy.module.dispose()
asy.ctx.dispose()
proc new*(T: type Assembler_LLVM, backend: Backend, moduleName: cstring): Assembler_LLVM =
new result, finalizeAssemblerLLVM
result.ctx = createContext()
result.module = result.ctx.createModule(moduleName)
case backend
of bkNvidiaPTX:
result.module.setTarget("nvptx64-nvidia-cuda")
# Datalayout for NVVM IR 1.8 (CUDA 11.6)
result.module.setDataLayout("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64")
result.builder = result.ctx.createBuilder()
result.i1_t = result.ctx.int1_t()
result.i32_t = result.ctx.int32_t()
result.i64_t = result.ctx.int64_t()
result.void_t = result.ctx.void_t()
result.backend = backend
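# Illustrative usage sketch: create a PTX-targeting assembler.
# The module name below is a hypothetical placeholder.
when isMainModule:
  let asy = Assembler_LLVM.new(bkNvidiaPTX, "test_nvidia_module")
  doAssert asy.backend == bkNvidiaPTX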
# ############################################################
#
# Metadata precomputation
#
# ############################################################
# Constantine on CPU is configured at compile-time for several properties that
# need to be configured at runtime on GPUs:
# - word size (32-bit or 64-bit)
# - curve properties like the modulus bitsize or -1/M[0] a.k.a. m0ninv
# - constants are stored in freestanding `const`
#
# This is because it's not possible to store a BigInt[254] and a BigInt[384]
# in a generic way in the same structure, especially without using heap allocation.
# And with Nim's dead code elimination, unused curves are not compiled in.
#
# Also, there is no easy way to dynamically retrieve (via an array or a table) constants like
#   const BLS12_381_modulus = ...
#   const BN254_Snarks_modulus = ...
# If we tried:
# - We would need a macro to properly access each constant.
# - We would need to create a 32-bit and a 64-bit version.
# - Unused curves would be compiled in the program.
#
# Note: on GPU we don't manipulate secrets hence branches and dynamic memory allocations are allowed.
#
# As GPU is a niche usage, instead we recreate the relevant `precompute` and IO procedures
# with dynamic wordsize support.
type
DynWord = uint32 or uint64
BigNum[T: DynWord] = object
bits: uint32
limbs: seq[T]
# Serialization
# ------------------------------------------------
func byteLen(bits: SomeInteger): SomeInteger {.inline.} =
## Length in bytes to serialize BigNum
(bits + 7) shr 3 # (bits + 8 - 1) div 8
func wordsRequiredForBits(bits, wordBitwidth: SomeInteger): SomeInteger {.inline.} =
## Compute the number of limbs required
## from the announced bit length
debug: doAssert wordBitwidth == 32 or wordBitwidth == 64 # Power of 2
(bits + wordBitwidth - 1) shr log2_vartime(uint32 wordBitwidth) # 5x to 55x faster than dividing by wordBitwidth
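# For instance, a 254-bit modulus needs wordsRequiredForBits(254, 32) == 8
# 32-bit limbs and wordsRequiredForBits(254, 64) == 4 64-bit limbs.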
func fromHex[T](a: var BigNum[T], s: string) =
var bytes = newSeq[byte](a.bits.byteLen())
hexToPaddedByteArray(s, bytes, bigEndian)
# 2. Convert canonical uint to BigNum
const wordBitwidth = sizeof(T) * 8
a.limbs.unmarshal(bytes, wordBitwidth, bigEndian)
func fromHex[T](BN: type BigNum[T], bits: uint32, s: string): BN =
const wordBitwidth = sizeof(T) * 8
let numWords = wordsRequiredForBits(bits, wordBitwidth)
result.bits = bits
result.limbs.setLen(numWords)
result.fromHex(s)
func toHex[T](a: BigNum[T]): string =
## Conversion to big-endian hex
## This is variable-time
# 1. Convert BigInt to canonical uint
const wordBitwidth = sizeof(T) * 8
var bytes = newSeq[byte](byteLen(a.bits))
bytes.marshal(a.limbs, wordBitwidth, cpuEndian)
# 2 Convert canonical uint to hex
return bytes.nativeEndianToHex(bigEndian)
# Checks
# ------------------------------------------------
func checkValidModulus(M: BigNum) =
const wordBitwidth = uint32(BigNum.T.sizeof() * 8)
let expectedMsb = M.bits-1 - wordBitwidth * (M.limbs.len.uint32 - 1)
let msb = log2_vartime(M.limbs[M.limbs.len-1])
doAssert msb == expectedMsb, "Internal Error: the modulus must use all declared bits and only those:\n" &
" Modulus '" & M.toHex() & "' is declared with " & $M.bits &
" bits but uses " & $(msb + wordBitwidth * uint32(M.limbs.len - 1)) & " bits."
# Fields metadata
# ------------------------------------------------
func negInvModWord[T](M: BigNum[T]): T =
## Returns the Montgomery domain magic constant for the input modulus:
##
## µ ≡ -1/M[0] (mod 2^WordBitWidth)
##
## M[0] is the least significant limb of M
## M must be odd and greater than 2.
##
## Assuming 64-bit words:
##
## µ ≡ -1/M[0] (mod 2^64)
checkValidModulus(M)
result = invModBitwidth(M.limbs[0])
# negate to obtain the negative inverse
result = not(result) + 1
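# Sanity-check sketch with a toy 16-bit odd modulus (not a real field constant):
# M[0] * negInvModWord(M) ≡ -1 (mod 2^32), i.e. the product wraps to 0xFFFFFFFF.
when isMainModule:
  let toyM = BigNum[uint32].fromHex(16, "0xf00d")
  doAssert toyM.limbs[0] * toyM.negInvModWord() == high(uint32)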
# ############################################################
#
# Intermediate Representation
#
# ############################################################
type
WordSize* = enum
size32
size64
Field* = enum
fp
fr
FieldConst* = object
wordTy: TypeRef
fieldTy: TypeRef
modulus*: seq[ConstValueRef]
m0ninv*: ConstValueRef
bits*: uint32
spareBits*: uint8
CurveMetadata* = object
curve*: Curve
prefix*: string
wordSize*: WordSize
fp*: FieldConst
fr*: FieldConst
Opcode* = enum
opFpAdd = "fp_add"
opFrAdd = "fr_add"
proc setFieldConst(fc: var FieldConst, ctx: ContextRef, wordSize: WordSize, modBits: uint32, modulus: string) =
let wordTy = case wordSize
of size32: ctx.int32_t()
of size64: ctx.int64_t()
let wordBitwidth = case wordSize
of size32: 32'u32
of size64: 64'u32
let numWords = wordsRequiredForBits(modBits, wordBitwidth)
fc.wordTy = wordTy
fc.fieldTy = array_t(wordTy, numWords)
case wordSize
of size32:
let m = BigNum[uint32].fromHex(modBits, modulus)
fc.modulus.setlen(m.limbs.len)
for i in 0 ..< m.limbs.len:
fc.modulus[i] = ctx.int32_t().constInt(m.limbs[i])
fc.m0ninv = ctx.int32_t().constInt(m.negInvModWord())
of size64:
let m = BigNum[uint64].fromHex(modBits, modulus)
fc.modulus.setlen(m.limbs.len)
for i in 0 ..< m.limbs.len:
fc.modulus[i] = ctx.int64_t().constInt(m.limbs[i])
fc.m0ninv = ctx.int64_t().constInt(m.negInvModWord())
debug: doAssert numWords == fc.modulus.len.uint32
fc.bits = modBits
fc.spareBits = uint8(numWords*wordBitwidth - modBits)
proc init*(
C: type CurveMetadata, ctx: ContextRef,
prefix: string, wordSize: WordSize,
fpBits: uint32, fpMod: string,
frBits: uint32, frMod: string): CurveMetadata =
result = C(prefix: prefix, wordSize: wordSize)
result.fp.setFieldConst(ctx, wordSize, fpBits, fpMod)
result.fr.setFieldConst(ctx, wordSize, frBits, frMod)
proc hash*(curveOp: tuple[cm: CurveMetadata, op: Opcode]): Hash {.inline.} =
result = hash(curveOp.cm.curve) !& int(hash(curveOp.op))
result = !$result
proc genSymbol*(cm: CurveMetadata, opcode: Opcode): string {.inline.} =
cm.prefix &
(if cm.wordSize == size32: "32b_" else: "64b_") &
$opcode
func getFieldType*(cm: CurveMetadata, field: Field): TypeRef {.inline.} =
if field == fp:
return cm.fp.fieldTy
else:
return cm.fr.fieldTy
func getNumWords*(cm: CurveMetadata, field: Field): int {.inline.} =
case field
of fp:
return cm.fp.modulus.len
of fr:
return cm.fr.modulus.len
func getModulus*(cm: CurveMetadata, field: Field): lent seq[ConstValueRef] {.inline.} =
# TODO: replace static typing, the returned type is incorrect for 64-bit
case field
of fp:
return cm.fp.modulus
of fr:
return cm.fr.modulus
func getSpareBits*(cm: CurveMetadata, field: Field): uint8 {.inline.} =
if field == fp:
return cm.fp.sparebits
else:
return cm.fr.sparebits
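# Illustrative usage sketch with toy 16-bit moduli (hypothetical placeholders,
# not real curve constants), showing how CurveMetadata is built and queried.
when isMainModule:
  let ctx = createContext()
  let cm = CurveMetadata.init(
    ctx, prefix = "toy_", wordSize = size32,
    fpBits = 16, fpMod = "0xf00d",
    frBits = 16, frMod = "0xd00d")
  doAssert cm.genSymbol(opFpAdd) == "toy_32b_fp_add"
  doAssert cm.getNumWords(fp) == 1
  ctx.dispose()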
# ############################################################
#
# Syntax Sugar
#
# ############################################################
# For array access we need to use:
#
# builder.extractValue(array, index, name)
# builder.insertValue(array, index, value, name)
#
# which is very verbose compared to array[index].
# So we wrap in syntactic sugar to improve readability, maintainability and auditability
type
Array* = object
builder: BuilderRef
p: ValueRef
arrayTy: TypeRef
elemTy: TypeRef
int32_t: TypeRef
proc asArray*(builder: BuilderRef, arrayPtr: ValueRef, arrayTy: TypeRef): Array =
Array(
builder: builder,
p: arrayPtr,
arrayTy: arrayTy,
elemTy: arrayTy.getElementType(),
int32_t: arrayTy.getContext().int32_t()
)
proc makeArray*(builder: BuilderRef, arrayTy: TypeRef): Array =
Array(
builder: builder,
p: builder.alloca(arrayTy),
arrayTy: arrayTy,
elemTy: arrayTy.getElementType(),
int32_t: arrayTy.getContext().int32_t()
)
proc makeArray*(builder: BuilderRef, elemTy: TypeRef, len: uint32): Array =
let arrayTy = array_t(elemTy, len)
Array(
builder: builder,
p: builder.alloca(arrayTy),
arrayTy: arrayTy,
elemTy: elemTy,
int32_t: arrayTy.getContext().int32_t()
)
proc `[]`*(a: Array, index: SomeInteger): ValueRef {.inline.}=
# First dereference the array pointer with 0, then access the `index`
let pelem = a.builder.getElementPtr2_InBounds(a.arrayTy, a.p, [ValueRef constInt(a.int32_t, 0), ValueRef constInt(a.int32_t, uint64 index)])
a.builder.load2(a.elemTy, pelem)
proc `[]=`*(a: Array, index: SomeInteger, val: ValueRef) {.inline.}=
let pelem = a.builder.getElementPtr2_InBounds(a.arrayTy, a.p, [ValueRef constInt(a.int32_t, 0), ValueRef constInt(a.int32_t, uint64 index)])
a.builder.store(val, pelem)
proc store*(builder: BuilderRef, dst: Array, src: Array) {.inline.}=
let v = builder.load2(src.arrayTy, src.p)
builder.store(v, dst.p)
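# Illustrative sketch of how this sugar is meant to be used when emitting
# limb-by-limb code. `fieldTy`, `paramA`, `paramB` and `numWords` are
# hypothetical values produced by the surrounding kernel-generation code.
proc emitLimbwiseAddSketch(builder: BuilderRef, fieldTy: TypeRef,
                           paramA, paramB: ValueRef, numWords: int): Array =
  let a = builder.asArray(paramA, fieldTy)   # view a pointer parameter as [n x word]
  let b = builder.asArray(paramB, fieldTy)
  result = builder.makeArray(fieldTy)        # stack slot for the result
  for i in 0 ..< numWords:
    result[i] = builder.add(a[i], b[i])      # word-wise add, carries ignored in this sketch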


@ -0,0 +1,212 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ./bindings/llvm_abi {.all.}
export llvm_abi
# ############################################################
#
# LLVM API
#
# ############################################################
# This file exposes a Nimified LLVM API.
# Most functions are reexported as-is except functions involving:
# - LLVM cstring
# - LLVM Memory buffer
# - LLVM bool
# - LLVM metadata
# The cstring and memory buffers require manual memory management on the LLVM side.
# The LLVM bool uses 32-bit representation instead of 1-bit.
# LLVM metadata is easier to use with dedicated procedures.
# ⚠ Warning: To provide full diagnostics (filename:line), we wrap some LLVM procedures in templates.
# Templates copy-paste their input parameters:
# for example, if the `module` parameter is passed `foo.launchMissiles()`
# and that parameter is used twice within the template, `foo.launchMissiles()` will be called twice.
# Module
# ------------------------------------------------------------
proc createModule*(ctx: ContextRef, name: cstring): ModuleRef {.inline.} =
createModule(name, ctx)
proc `$`*(ty: ModuleRef): string =
let s = ty.toIR_LLVMstring()
result = $cstring(s)
s.dispose()
proc toBitcode*(m: ModuleRef): seq[byte] =
## Serialize a module's IR to bitcode
let mb = m.writeBitcodeToMemoryBuffer()
let len = mb.getBufferSize()
result.newSeq(len)
copyMem(result[0].addr, mb.getBufferStart(), len)
mb.dispose()
template verify*(module: ModuleRef, failureAction: VerifierFailureAction) =
## Verify the IR code in a module
var errMsg: LLVMstring
let err = bool verify(module, failureAction, errMsg)
if err:
writeStackTrace()
stderr.write("\"verify\" for module '" & astToStr(module) & "' " & $instantiationInfo() & " exited with error: " & $cstring(errMsg) & '\n')
errMsg.dispose()
quit 1
proc getIdentifier*(module: ModuleRef): string =
var rLen: csize_t
let rStr = getIdentifier(module, rLen)
result = newString(rLen.int)
copyMem(result[0].addr, rStr, rLen.int)
# Target
# ------------------------------------------------------------
template toTarget*(triple: cstring): TargetRef =
var target: TargetRef
var errMsg: LLVMstring
let err = bool triple.getTargetFromTriple(target, errMsg)
if err:
writeStackTrace()
echo "\"toTarget\" for triple '", triple, "' " & $instantiationInfo() & " exited with error: " & $cstring(errMsg) & '\n'
errMsg.dispose()
quit 1
target
proc initializeFullNativeTarget* {.inline.} =
static: doAssert defined(amd64) or defined(i386), "Only x86 is configured at the moment"
initializeX86TargetInfo()
initializeX86Target()
initializeX86TargetMC()
# With usual `initializeNativeTarget`
# it's a separate call but it's mandatory so include it
initializeX86AsmPrinter()
proc initializeFullNVPTXTarget* {.inline.} =
initializeNVPTXTargetInfo()
initializeNVPTXTarget()
initializeNVPTXTargetMC()
initializeNVPTXAsmPrinter()
# Execution Engine
# ------------------------------------------------------------
template createJITCompilerForModule*(
engine: var ExecutionEngineRef,
module: ModuleRef,
optLevel: uint32) =
var errMsg: LLVMstring
let err = bool createJITCompilerForModule(engine, module, optLevel, errMsg)
if err:
writeStackTrace()
stderr.write("\"createJITCompilerForModule\" for module '" & astToStr(module) & "' " & $instantiationInfo() & " exited with error: " & $cstring(errMsg) & '\n')
errMsg.dispose()
quit 1
# Target Machine
# ------------------------------------------------------------
template emitToFile*(t: TargetMachineRef, m: ModuleRef,
fileName: string, codegen: CodeGenFileType) =
var errMsg: LLVMstring
let err = bool targetMachineEmitToFile(t, m, cstring(fileName), codegen, errMsg)
if err:
writeStackTrace()
stderr.write("\"emitToFile\" for module '" & astToStr(module) & "' " & $instantiationInfo() & " exited with error: " & $cstring(errMsg) & '\n')
errMsg.dispose()
quit 1
template emitToString*(t: TargetMachineRef, m: ModuleRef, codegen: CodeGenFileType): string =
## Codegen to string
var errMsg: LLVMstring
var mb: MemoryBufferRef
let err = bool targetMachineEmitToMemoryBuffer(t, m, codegen, errMsg, mb)
if err:
writeStackTrace()
stderr.write("\"emitToString\" for module '" & astToStr(module) & "' " & $instantiationInfo() & " exited with error: " & $cstring(errMsg) & '\n')
errMsg.dispose()
quit 1
let len = mb.getBufferSize()
var emitted = newString(len)
copyMem(emitted[0].addr, mb.getBufferStart(), len)
mb.dispose()
emitted
# Passes
# ------------------------------------------------------------
proc initializePasses* =
let registry = getGlobalPassRegistry()
# TODO: Some passes in llc aren't exposed
# https://github.com/llvm/llvm-project/blob/main/llvm/tools/llc/llc.cpp
registry.initializeCore()
registry.initializeTransformUtils()
registry.initializeScalarOpts()
registry.initializeObjCARCOpts()
registry.initializeVectorization()
registry.initializeInstCombine()
registry.initializeAggressiveInstCombiner()
registry.initializeIPO()
registry.initializeInstrumentation()
registry.initializeAnalysis()
registry.initializeIPA()
registry.initializeCodeGen()
registry.initializeTarget()
# Builder
# ------------------------------------------------------------
proc getContext*(builder: BuilderRef): ContextRef =
# LLVM C API does not expose IRBuilder.getContext()
# making this unnecessarily painful
# https://github.com/llvm/llvm-project/issues/59875
builder.getInsertBlock().getBasicBlockParent().getTypeOf().getContext()
# Types
# ------------------------------------------------------------
proc `$`*(ty: TypeRef): string =
let s = ty.toLLVMstring()
result = $cstring(s)
s.dispose()
proc isVoid*(ty: TypeRef): bool {.inline.} =
ty.getTypeKind == tkVoid
proc pointer_t*(elementTy: TypeRef): TypeRef {.inline.} =
pointerType(elementTy, addressSpace = 0)
proc array_t*(elemType: TypeRef, elemCount: SomeInteger): TypeRef {.inline.}=
array_t(elemType, uint32(elemCount))
proc function_t*(returnType: TypeRef, paramTypes: openArray[TypeRef]): TypeRef {.inline.} =
function_t(returnType, paramTypes, isVarArg = LlvmBool(false))
# Values
# ------------------------------------------------------------
type
ConstValueRef* = distinct ValueRef
AnyValueRef* = ValueRef or ConstValueRef
proc getName*(v: ValueRef): string =
var rLen: csize_t
let rStr = getValueName2(v, rLen)
result = newString(rLen.int)
copyMem(result[0].addr, rStr, rLen.int)
proc constInt*(ty: TypeRef, n: uint64, signExtend = false): ConstValueRef {.inline.} =
ConstValueRef constInt(ty, culonglong(n), LlvmBool(signExtend))
proc constStruct*(constantVals: openArray[ValueRef], packed = false): ConstValueRef {.inline.} =
ConstValueRef constStruct(constantVals, LlvmBool(packed))
proc getTypeOf*(v: ConstValueRef): TypeRef {.borrow.}

View File

@ -0,0 +1,270 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
./bindings/nvidia_abi {.all.},
./bindings/utils,
./llvm, ./ir,
./nvidia_inlineasm,
../primitives
export
nvidia_abi, nvidia_inlineasm,
Flag, flag
# ############################################################
#
# Nvidia GPUs API
#
# ############################################################
# Versioning and hardware support
# ------------------------------------------------------------
# GPU architectures:
# - Kepler Geforce GTX 780, 2012, Compute Capability SM3.5
# - Maxwell Geforce GTX 980, 2014, Compute Capability SM5.2
# - Pascal Geforce GTX 1080, 2016, Compute Capability SM6.1
# - Volta Tesla V100, 2017, Compute Capability SM7.0
# - Turing Geforce RTX 2080, 2018, Compute Capability SM7.5
# - Ampere Geforce RTX 3080, 2020, Compute Capability SM8.6
# - Ada Geforce RTX 4080, 2022, Compute Capability SM8.9
# We likely want to use unified memory in the future to avoid having to copy back and from device explicitly
# - https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
# - https://developer.nvidia.com/blog/unified-memory-in-cuda-6/
#
# Unified memory is fully supported starting from Pascal GPUs (GTX 1080, 2016, Compute Capability SM6.0)
# and requires Kepler at minimum.
#
# Cuda 9 exposes the current explicit synchronization primitives (cooperative groups) and deprecates the old ones.
# Those primitives are particularly suitable for Volta and later GPUs
# (e.g. Turing Geforce RTX 2080, 2018, Compute Capability SM7.5).
#
# Furthermore, Pascal GPUs predate the high demand from deep learning and cryptocurrency mining
# and were widely available at an affordable price point.
# Also, given that it's a 7-year-old architecture,
# it is unlikely that users have an older Nvidia GPU available.
#
# Hence we can target Cuda 9 at minimum (Sept 2017): https://developer.nvidia.com/cuda-toolkit-archive
# which corresponds to PTX ISA 6.0: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes__ptx-release-history
#
# Unfortunately, there is no easy programmatic way to retrieve the supported PTX ISA version,
# only the Cuda/compiler version (https://docs.nvidia.com/cuda/ptx-compiler-api/index.html#group__versioning).
# Hence it's likely easier to ask users to update Cuda in case of ISA incompatibility.
# Cuda Driver API
# ------------------------------------------------------------
template check*(status: CUresult) =
## Check the status code of a CUDA operation
## Exit program with error if failure
let code = status # ensure that the input expression is evaluated once only
if code != CUDA_SUCCESS:
writeStackTrace()
stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n')
quit 1
func cuModuleLoadData*(module: var CUmodule, sourceCode: openArray[char]): CUresult {.inline.}=
cuModuleLoadData(module, sourceCode[0].unsafeAddr)
func cuModuleGetFunction*(kernel: var CUfunction, module: CUmodule, fnName: openArray[char]): CUresult {.inline.}=
cuModuleGetFunction(kernel, module, fnName[0].unsafeAddr)
proc cudaDeviceInit*(deviceID = 0'i32): CUdevice =
check cuInit(0) # cuInit expects a flags argument, which must currently be 0
var devCount: int32
check cuDeviceGetCount(devCount)
if devCount == 0:
echo "cudaDeviceInit error: no devices supporting CUDA"
quit 1
var cuDevice: CUdevice
check cuDeviceGet(cuDevice, deviceID)
var name = newString(128)
check cuDeviceGetName(name[0].addr, name.len.int32, cuDevice)
echo "Using CUDA Device [", deviceID, "]: ", cstring(name)
var major, minor: int32
check cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)
check cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)
echo "Compute Capability: SM ", major, ".", minor
if major < 6:
echo "Error: Device ",deviceID," is not sm_60 (Pascal generation, GTX 1080) or later"
quit 1
return cuDevice
# ############################################################
#
# LLVM IR for Nvidia GPUs
#
# ############################################################
proc tagCudaKernel(module: ModuleRef, fn: FnDef) =
## Tag a function as a Cuda Kernel, i.e. callable from host
doAssert fn.fnTy.getReturnType().isVoid(), block:
"Kernels must not return values but function returns " & $fn.fnTy.getReturnType().getTypeKind()
let ctx = module.getContext()
module.addNamedMetadataOperand(
"nvvm.annotations",
ctx.asValueRef(ctx.metadataNode([
fn.fnImpl.asMetadataRef(),
ctx.metadataNode("kernel"),
constInt(ctx.int32_t(), 1, LlvmBool(false)).asMetadataRef()
]))
)
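# After tagging, the module carries metadata of roughly this shape
# (hedged sketch following the NVVM IR spec; the kernel name is illustrative):
#   !nvvm.annotations = !{!0}
#   !0 = !{void ()* @myKernel, !"kernel", i32 1}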
proc setCallableCudaKernel*(module: ModuleRef, fn: FnDef) =
## Create a public wrapper of a cuda device function
##
## e.g. the wrapper for a device function named `addmod` is retrieved by appending `_public`:
## check cuModuleGetFunction(fnPointer, cuModule, "addmod_public")
let pubName = fn.fnImpl.getName() & "_public"
let pubFn = module.addFunction(cstring(pubName), fn.fnTy)
let ctx = module.getContext()
let builder = ctx.createBuilder()
defer: builder.dispose()
let blck = ctx.appendBasicBlock(pubFn, "publicKernelBody")
builder.positionAtEnd(blck)
var args = newSeq[ValueRef](fn.fnTy.countParamTypes())
for i, arg in mpairs(args):
arg = pubFn.getParam(i.uint32)
discard builder.call2(fn.fnTy, fn.fnImpl, args)
# A public kernel must return void
builder.retVoid()
module.tagCudaKernel((fn.fnTy, pubFn))
# ############################################################
#
# Code generation
#
# ############################################################
proc codegenNvidiaPTX*(asy: Assembler_LLVM, sm: tuple[major, minor: int32]): string =
## Generate Nvidia PTX via LLVM
## SM corresponds to the target GPU architecture Compute Capability
## - https://developer.nvidia.com/cuda-gpus
## - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
##
## This requires the following function to be called beforehand:
## - initializePasses()
## - initializeFullNVPTXTarget()
debug: doAssert asy.backend == bkNvidiaPTX
asy.module.verify(AbortProcessAction)
const triple = "nvptx64-nvidia-cuda"
let machine = createTargetMachine(
target = toTarget(triple),
triple = triple,
cpu = cstring("sm_" & $sm.major & $sm.minor),
features = "",
level = CodeGenLevelAggressive,
reloc = RelocDefault,
codeModel = CodeModelDefault
)
# https://www.llvm.org/docs/Passes.html
let pm = createPassManager()
machine.addAnalysisPasses(pm)
pm.addDeduceFunctionAttributesPass()
pm.addMemCpyOptPass()
pm.addScalarReplacementOfAggregatesPass()
pm.addPromoteMemoryToRegisterPass()
pm.addGlobalValueNumberingPass()
pm.addDeadStoreEliminationPass()
pm.addInstructionCombiningPass()
pm.addFunctionInliningPass()
pm.addAggressiveDeadCodeEliminationPass()
when false:
# As most (all?) of our code is straight-line, unoptimizable inline assembly, with no loops and no branches,
# most optimizations, even at -O3, are not applicable
let pmb = createPassManagerBuilder()
pmb.setOptLevel(3)
pmb.populateModulePassManager(pm)
pmb.dispose()
pm.run(asy.module)
pm.dispose()
return machine.emitToString(asy.module, AssemblyFile)
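# Example (hedged sketch; `asy` is an Assembler_LLVM holding previously generated
# kernels, the SM version is illustrative):
#
#   initializePasses()
#   initializeFullNVPTXTarget()
#   let ptx = asy.codegenNvidiaPTX(sm = (major: 8'i32, minor: 6'i32))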
# ############################################################
#
# Code execution
#
# ############################################################
proc getCudaKernel*(cuMod: CUmodule, cm: CurveMetadata, opcode: Opcode): CUfunction =
# Public kernels are appended _public
let fnName = cm.genSymbol(opcode) & "_public"
check cuModuleGetFunction(result, cuMod, fnName)
proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) =
## Execute a binary operation in the form r <- op(a, b)
## on Nvidia GPU
# The execution wrappers provided are mostly for testing and debugging low-level kernels
# that serve as building blocks, like field addition or multiplication.
# They aren't parallelizable so we are not concerned with the grid and block size.
# We also aren't concerned with the cuda stream when testing.
#
# This is not the case for production kernels (multi-scalar-multiplication, FFT)
# as we want to execute kernels asynchronously then merge results which might require multiple streams.
static: doAssert cpuEndian == littleEndian, block:
# From https://developer.nvidia.com/cuda-downloads?target_os=Linux
# Supported architectures for Cuda are:
# x86-64, PowerPC 64 little-endian, ARM64 (aarch64)
# which are all little-endian at word-level.
#
# Due to limbs being also stored in little-endian, on little-endian host
# the CPU and GPU will have the same binary representation
# whether we use 32-bit or 64-bit words, so naive memcpy can be used for parameter passing.
"Most CPUs (x86-64, ARM) are little-endian, as are Nvidia GPUs, which allows naive copying of parameters.\n" &
"Your architecture '" & $hostCPU & "' is big-endian and GPU offloading is unsupported on it."
# We assume that all arguments are passed by reference in the Cuda kernel, hence the need for GPU alloc.
var rGPU, aGPU, bGPU: CUdeviceptr
check cuMemAlloc(rGPU, csize_t sizeof(r))
check cuMemAlloc(aGPU, csize_t sizeof(a))
check cuMemAlloc(bGPU, csize_t sizeof(b))
check cuMemcpyHtoD(aGPU, a.unsafeAddr, csize_t sizeof(a))
check cuMemcpyHtoD(bGPU, b.unsafeAddr, csize_t sizeof(b))
let params = [pointer(rGPU.addr), pointer(aGPU.addr), pointer(bGPU.addr)]
check cuLaunchKernel(
jitFn,
1, 1, 1, # grid(x, y, z)
1, 1, 1, # block(x, y, z)
sharedMemBytes = 0,
CUstream(nil),
params[0].unsafeAddr, nil)
check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
check cuMemFree(rGPU)
check cuMemFree(aGPU)
check cuMemFree(bGPU)
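# Example (hedged sketch, mirroring tests/gpu/t_nvidia_fp.nim):
#
#   var rGPU: Fp[BLS12_381]
#   let fpAdd = cuMod.getCudaKernel(cm, opFpAdd)
#   fpAdd.exec(rGPU, a, b)   # rGPU <- a + b, computed on the GPU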

View File

@ -0,0 +1,377 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
std/[macros, strutils],
./llvm
# ############################################################
#
# Nvidia Inline ASM
#
# ############################################################
# We want to generate procedures related to the following
# instructions -> inline assembly -> argument mapping
# Inline assembly looks like this:
#
# C: asm volatile ("add.cc.u64 %0, %1, %2;" : "=l"(c) : "l"(a), "l"(b) : "memory" );
# LLVM: call i64 asm "add.cc.u64 $0, $1, $2;", "=l,l,l,~{memory}"(i64 %1, i64 %2)
#
# So we need to do the following steps
#
# 1. Collect inline ASM opcodes definition for Nvidia PTX inline assembly
# 2. Generate u32 and u64 `getInlineAsm()` definition (that is associated with an LLVM IR ContextRef)
# 3. Create an initialization proc to be called after initializing the LLVM ContextRef
# For each instruction, return a routine with signature that mirrors LLVM builtin instructions:
#
# proc myInstr(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef =
# let numBits = lhs.getTypeOf().getIntTypeWidth()
# if numBits == 32:
# builder.call2(inlineAsmFnType, inlineAsmFn32, [arg0, arg1, ...], name)
# elif numBits == 64:
# builder.call2(inlineAsmFnType, inlineAsmFn64, [arg0, arg1, ...], name)
# else:
# doAssert false, "Unsupported int" & $numBits
#
# To create `inlineAsmFn32` and `inlineAsmFn64` we may use `getInlineAsm` just before the corresponding
# builder.call2. This allows us to define freestanding functions.
# The potential issue is the overhead of repeated definition of add/sub/mul/muladd
# and their carry-in, carry-out variations.
# LLVM internally ensures that only a single instance will be defined via a HashTable
# Though this will involve thousands of repeated hashing: https://llvm.org/doxygen/InlineAsm_8cpp_source.html#l00043
#
# Alternatively, we can cache the functions created and the design challenge is how to expose
# the routines with the same API as LLVM builder. We could use a global, a wrapper builder type,
# or a template to call at the beginning of each function that setups some boilerplate indirection.
#
# However caching only works if inlineAsmFn32 and inlineAsmFn64 are stable,
# but that is very clunky in our case as a fused multiply-add function like
#   mad.lo.u32 "%0, %1, %2, %3;" "=l,l,l,l" [lmul, rmul, addend]
# can also have an immediate operand (a modulus for example) as a constraint.
# So we don't cache and rely on LLVM own deduplication.
template selConstraint(operand: auto, append = ""): string =
when operand is ValueRef:
# PTX Assembly:
# r for 32-bit operand
# l for 64-bit operand
# n for immediates
if operand.getTypeOf().getIntTypeWidth() == 32:
"r" & append
else:
"l" & append
else: # ConstValueRef or uint32 or uint64
"n" & append
macro genInstr(body: untyped): untyped =
result = newStmtList()
body.expectKind(nnkStmtList)
for op in body:
op.expectKind(nnkCommand)
doAssert op[0].eqIdent"op"
let instrName = op[1]
# For each op, generate a builder proc
op[2][0].expectKind(nnkTupleConstr)
op[2][0][0].expectKind({nnkStrLit, nnkCurly})
op[2][0][1].expectKind(nnkStrLit)
op[2][0][2].expectKind(nnkStrLit)
op[2][0][3].expectKind(nnkBracket)
let instrBody = newStmtList()
# 1. Detect the size of registers
let numBits = ident"numBits"
let regTy = ident"regTy"
let fnTy = ident"fnTy"
let ctx = ident"ctx"
let lhs = op[2][0][3][0]
instrBody.add quote do:
let `ctx` = builder.getContext()
# lhs: ValueRef or uint32 or uint64
let `numBits` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf().getIntTypeWidth()
else: 8*sizeof(`lhs`)
let `regTy` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf()
elif `lhs` is uint32: `ctx`.int32_t()
elif `lhs` is uint64: `ctx`.int64_t()
else: {.error "Unsupported input type " & $typeof(`lhs`).}
# 2. Create the LLVM asm signature
let operands = op[2][0][3]
let arity = operands.len
let constraintString = op[2][0][2]
let constraints = ident"constraints"
let instr = op[2][0][0]
let hasTwoTypes = instr.kind == nnkCurly
if hasTwoTypes:
# For now only slct has 2 types so we special-case it
instrBody.add quote do:
let `fnTy` = function_t(`regTy`, [`regTy`, `regTy`, `ctx`.int32_t()])
elif arity == 2:
if constraintString.strVal.startsWith('='):
instrBody.add quote do:
let `fnTy` = function_t(`regTy`, [`regTy`, `regTy`])
else:
# We only support out of place "=" function.
# In-place with "+" requires alloca + load/stores in codegen
error "Unsupported constraint: " & constraintString.strVal
elif arity == 3:
if constraintString.strVal.startsWith('='):
instrBody.add quote do:
let `fnTy` = function_t(`regTy`, [`regTy`, `regTy`, `regTy`])
# We only support out of place "=" function.
# In-place with "+" requires alloca + load/stores in codegen
else:
error "Unsupported constraint: " & constraintString.strVal
else:
error "Unsupported arity: " & $arity
# 3. Create the constraints string
# We could have generic constraint string generation, but we only have 2 arities to support
# and codegen without quote do would be even more verbose and hard to read.
if arity == 2:
let op0 = operands[0]
let op1 = operands[1]
instrBody.add quote do:
let `constraints` = block:
var c: string
let constraintRegisterSymbol =
if `numBits` == 32: "r"
else: "l"
when `constraintString`.startsWith('='):
c.add "=" & constraintRegisterSymbol & ','
c.add selConstraint(`op0`,",")
c.add selConstraint(`op1`)
else:
static: doAssert false, " Constraint misconfigured"
when `constraintString`.endsWith(",~{memory}"):
c.add ",~{memory}"
c
elif arity == 3:
let op0 = operands[0]
let op1 = operands[1]
let op2 = operands[2]
instrBody.add quote do:
let `constraints` = block:
var c: string
let constraintRegisterSymbol =
if `numBits` == 32: "r"
else: "l"
when `constraintString`.startsWith('='):
c.add "=" & constraintRegisterSymbol & ','
c.add selConstraint(`op0`,",")
c.add selConstraint(`op1`, ",")
c.add selConstraint(`op2`)
else:
static: doAssert false, " Constraint misconfigured"
when `constraintString`.endsWith(",~{memory}"):
c.add ",~{memory}"
c
else:
error "Unsupported arity: " & $arity
# 4. Register the inline ASM with LLVM
let inlineASM = ident"inlineASM"
let instrParam = op[2][0][1]
let asmString = ident"asmString"
if hasTwoTypes:
# Only slct has 2 types, and it has to be s32, so there is no need to dynamically determine the type of the parameter at the moment
let mnemo = instr[0]
let type2 = instr[1]
instrBody.add quote do:
let `asmString` = static(`mnemo` & ".u") & $`numBits` & static(`type2` & " " & `instrParam`)
else:
instrBody.add quote do:
let `asmString` = static(`instr` & ".u") & $`numBits` & static(" " & `instrParam`)
instrBody.add quote do:
# Chapter 6 of https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf
# inteldialect is not supported (but the NVPTX dialect is akin to intel dialect)
let `inlineASM` = getInlineAsm(
ty = `fnTy`,
asmString = `asmString`,
constraints = `constraints`,
# All carry instructions have a side effect on the carry flag and can't be reordered with each other.
# However, function calls can't be reordered anyway, and
# by default on NVPTX loads/stores, comparisons and arithmetic operations don't affect the carry
# flag, so it's fine for the compiler to intersperse them.
hasSideEffects = LlvmBool(0),
isAlignStack = LlvmBool(0),
dialect = InlineAsmDialectATT,
canThrow = LlvmBool(0)
)
# 5. Call it
let opArray = nnkBracket.newTree()
for op in operands:
# when op is ValueRef: op
# else: constInt(uint64(op))
opArray.add newCall(
bindSym"ValueRef",
nnkWhenStmt.newTree(
nnkElifBranch.newTree(nnkInfix.newTree(ident"is", op, bindSym"AnyValueRef"), op),
nnkElse.newTree(newCall(ident"constInt", regTy, newCall(ident"uint64", op)))
)
)
# builder.call2(ty, inlineASM, [lhs, rhs], name)
instrBody.add newCall(
ident"call2", ident"builder", fnTy,
inlineASM, opArray, ident"name"
)
# 6. Create the function signature
var opDefs: seq[NimNode]
opDefs.add ident"ValueRef" # Return type
opDefs.add newIdentDefs(ident"builder", bindSym"BuilderRef")
block:
var i = 0
for constraint in constraintString.strVal.split(','):
if constraint.startsWith('=') or constraint.startsWith("~{memory}"):
# Don't increment i
continue
elif constraint == "rl":
opDefs.add newIdentDefs(operands[i], ident"ValueRef")
elif constraint == "rln":
opDefs.add newIdentDefs(
operands[i],
nnkInfix.newTree(ident"or",
nnkInfix.newTree(ident"or", ident"AnyValueRef", ident"uint32"),
ident"uint64")
)
elif constraint == "rn":
opDefs.add newIdentDefs(
operands[i],
nnkInfix.newTree(ident"or",
ident"AnyValueRef",
ident"uint32")
)
else:
error "Unsupported constraint: " & constraint
i += 1
opDefs.add newIdentDefs(ident"name", bindSym"cstring", newLit"")
result.add newProc(
name = nnkPostfix.newTree(ident"*", instrName),
params = opDefs,
procType = nnkProcDef,
body = instrBody
)
# Inline PTX assembly
# ------------------------------------------------------------
# See docs/implementation_nvidia_gpus.md for detailed implementation considerations
#
# Need Cuda 11.5.1 at least for madc.hi.u64:
# https://forums.developer.nvidia.com/t/wrong-result-returned-by-madc-hi-u64-ptx-instruction-for-specific-operands/196094
#
# The PTX compilation part is done by NVVM
# but NVVM version is not listed here: https://docs.nvidia.com/cuda/archive/11.5.1/cuda-toolkit-release-notes/index.html
# and nvvmVersion returns the IR version instead of the version of the compiler library.
# Alternatively we use LLVM NVPTX backend instead of Nvidia's NVVM.
#
# Nvidia manual
# ~~~~~~~~~~~~~
#
# https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
# There is a separate constraint letter for each PTX register type:
#
# "h" = .u16 reg
# "r" = .u32 reg
# "l" = .u64 reg
# "f" = .f32 reg
# "d" = .f64 reg
#
# The constraint "n" may be used for immediate integer operands with a known value.
#
#
# 1.2.3. Incorrect Optimization
#
# The compiler assumes that an asm() statement has no side effects except to change the output operands. To ensure that the asm is not deleted or moved during generation of PTX, you should use the volatile keyword, e.g.:
#
# asm volatile ("mov.u32 %0, %%clock;" : "=r"(x));
#
# Normally any memory that is written to will be specified as an out operand, but if there is a hidden side effect on user memory (for example, indirect access of a memory location via an operand), or if you want to stop any memory optimizations around the asm() statement performed during generation of PTX, you can add a “memory” clobbers specification after a 3rd colon, e.g.:
#
# asm volatile ("mov.u32 %0, %%clock;" : "=r"(x) :: "memory");
# asm ("st.u32 [%0], %1;" : "r"(p), "r"(x) :: "memory");
#
# Constantine implementation
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# To encode the allowed constraints we use rl to allow the r and l constraints
# and we use rln to allow r and l constraints and n immediate.
#
# The asm volatile constraint is passed via `hasSideEffects` in getInlineAsm.
#
# For the memory constraint, it is specified the following way:
#
# C: asm volatile ("add.cc.u64 %0, %1, %2;" : "=l"(c) : "l"(a), "l"(b) : "memory" );
# LLVM: call i64 asm "add.u64 $0, $1, $2;", "=l,l,l,~{memory}"(i64 %1, i64 %2)
#
# Instructions that use carries should not be reordered hence need volatile/hasSideEffect
genInstr():
# The PTX is given without a size indicator, i.e. add.cc instead of add.cc.u32
# Both versions (u32 and u64) will be generated.
#
# op name: ("ptx", "args;", "constraints", [params])
# r <- a+b
op add_co: ("add.cc", "$0, $1, $2;", "=rl,rln,rln", [lhs, rhs])
op add_ci: ("addc", "$0, $1, $2;", "=rl,rln,rln", [lhs, rhs])
op add_cio: ("addc.cc", "$0, $1, $2;", "=rl,rln,rln", [lhs, rhs])
# r <- a-b
op sub_bo: ("sub.cc", "$0, $1, $2;", "=rl,rln,rln", [lhs, rhs])
op sub_bi: ("subc", "$0, $1, $2;", "=rl,rln,rln", [lhs, rhs])
op sub_bio: ("subc.cc", "$0, $1, $2;", "=rl,rln,rln", [lhs, rhs])
# r <- a * b >> 32
op mulhi: ("mul.hi", "$0, $1, $2;", "=rl,rln,rln", [lhs, rhs])
# r <- a * b + c
op mulloadd: ("mad.lo", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
op mulloadd_co: ("mad.lo.cc", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
op mulloadd_cio: ("madc.lo.cc", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
# r <- (a * b) >> 32 + c
# r <- (a * b) >> 64 + c
op mulhiadd: ("mad.hi", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
op mulhiadd_co: ("mad.hi.cc", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
op mulhiadd_cio: ("madc.hi.cc", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
# Conditional mov / select
# slct r, a, b, c;
# r <- (c >= 0) ? a : b;
op slct: ({"slct",".s32"}, "$0, $1, $2, $3;", "=rl,rln,rln,rn", [ifPos, ifNeg, condition])
# selp is the classic select operation, however the selector `c` is of type "predicate"
# and quoting the PTX ISA doc
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#manipulating-predicates
# > There is no direct conversion between predicates and integer values, and no direct way to load or store predicate register values. However, setp can be used to generate a predicate from an integer, and the predicate-based select (selp) instruction can be used to generate an integer value based on the value of a predicate; for example:
# > selp.u32 %r1,1,0,%p; // convert predicate to 32-bit value
#
# If selp is more practical than slct in some cases, then it's likely easier to use LLVM builtin IR trunc/icmp + select
# selp r, a, b, c;
# r <- (c == 1) ? a : b;
# op selp: ("selp", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [ifTrue, ifFalse, condition])
# Alternatively, for conditional moves use-cases, we might want to use
# 'setp' to set a predicate and then '@p mov' for predicated moves
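# Example (hedged sketch): the generated builders mirror LLVM's builder API.
# A 3-limb addition with carry propagation, where `bld` is a BuilderRef and
# `r`, `a`, `b` are the Array sugar over limbs:
#
#   r[0] = bld.add_co(a[0], b[0])    # add, set carry
#   r[1] = bld.add_cio(a[1], b[1])   # add with carry-in, set carry
#   r[2] = bld.add_ci(a[2], b[2])    # add with carry-in, discard carry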

View File

@ -0,0 +1,209 @@
# Implementation on Nvidia GPUs
This documentation references useful information for implementing and optimizing for Nvidia GPUs
## Integer instruction bug
The integer fused-multiply-add instruction with carry-in may
be incorrectly compiled in PTX prior to Cuda 11.5.1:
https://forums.developer.nvidia.com/t/wrong-result-returned-by-madc-hi-u64-ptx-instruction-for-specific-operands/196094
Test case from: https://github.com/tickinbuaa/CudaTest/blob/master/main.cu
```C
#include <cuda_runtime.h>
#include <memory>
__device__
inline void mac_with_carry(uint64_t &lo, uint64_t &hi, const uint64_t &a, const uint64_t &b, const uint64_t &c) {
if (blockIdx.x == 0 && threadIdx.x == 0) {
printf("GPU calculation input: a = %lx b = %lx c = %lx\n", a, b, c);
}
asm("mad.lo.cc.u64 %0, %2, %3, %4;\n\t"
"madc.hi.u64 %1, %2, %3, 0;\n\t"
:"=l"(lo), "=l"(hi): "l"(a), "l"(b), "l"(c));
if (blockIdx.x == 0 && threadIdx.x == 0) {
printf("GPU calculation result: hi = %lx low = %lx\n", hi, lo);
}
}
__global__
void test(uint64_t *out, uint32_t num){
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= num) {
return;
}
uint64_t a = 0x42737a020c0d6393UL;
uint64_t b = 0xffffffff00000001UL;
uint64_t c = 0xc999e990f3f29c6dUL;
mac_with_carry(out[tid << 1], out[(tid << 1) + 1], a, b, c);
}
int main() {
uint64_t *d_out;
uint32_t num = 1;
cudaMalloc(&d_out, num * 2 * sizeof(uint64_t));
const uint32_t BLOCK_SIZE = 256;
uint32_t block_num = (num + BLOCK_SIZE - 1) / BLOCK_SIZE;
test<<<block_num, BLOCK_SIZE>>>(d_out, num);
cudaDeviceSynchronize();
unsigned __int128 a = 0x42737a020c0d6393UL;
unsigned __int128 b = 0xffffffff00000001UL;
unsigned __int128 c = 0xc999e990f3f29c6dUL;
unsigned __int128 result = a * b + c;
printf("Cpu result: hi:%lx low:%lx\n", (uint64_t)((result >> 64) & 0xffffffffffffffffUL), (uint64_t)(result & 0xffffffffffffffffUL));
}
```
## The hidden XMAD instruction
There is a "hidden" instruction called xmad on Nvidia GPUs described in
- Optimizing Modular Multiplication for NVIDIA's Maxwell GPUs\
Niall Emmart, Justin Luitjens, Charles Weems and Cliff Woolley\
https://ieeexplore.ieee.org/abstract/document/7563271
On Maxwell and Pascal GPUs (SM 5.x and 6.x), there was no native 32-bit integer multiplication, probably due to die size constraints.
So 32-bit mul was based on 16-bit muladd (XMAD), with some PTX->SASS compiler pattern matching to detect optimal XMAD
scheduling.
Starting from Volta (SM 7.0) and Turing (RTX 20xx, SM 7.5), there is hardware integer multiply again
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions
Code to generate the proper XMAD is available in:
- https://github.com/NVlabs/xmp/blob/0052dbb/src/include/ptx/PTXInliner_impl.h#L371-L384
- https://github.com/NVlabs/CGBN/blob/e8b9d26/include/cgbn/arith/asm.cu#L131-L142
## Double-precision floating point arithmetic
There are some recent papers exploring use of the 52-bit mantissa of a float64 to accelerate elliptic curve cryptography.
This is similar to the AVX approaches on CPU.
- Faster Modular Exponentiation Using Double Precision Floating Point Arithmetic on the GPU\
Niall Emmart, Fangyu Zheng, Charles Weems\
https://ieeexplore.ieee.org/document/8464792
- DPF-ECC: Accelerating Elliptic Curve Cryptography with Floating-Point Computing Power of GPUs\
Lili Gao, Fangyu Zheng, Niall Emmart, Jiankuo Dong, Jingqiang Lin, C. Weems\
https://ieeexplore.ieee.org/document/9139772
Unfortunately float64 arithmetic is extremely slow on Nvidia GPUs except for Tesla-class GPUs, due to market segmentation.
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#architecture-8-x
SM 8.0 corresponds to a Tesla A100, and SM 8.6 to an RTX 30x0 or Quadro Ax000.
> A Streaming Multiprocessor (SM) consists of:
> - 64 FP32 cores for single-precision arithmetic operations in devices of compute capability 8.0\
> and 128 FP32 cores in devices of compute capability 8.6, 8.7 and 8.9,
> - 32 FP64 cores for double-precision arithmetic operations in devices of compute capability 8.0\
> and **2 FP64 cores** in devices of compute capability 8.6, 8.7 and 8.9
> - **64 INT32 cores** for integer math
Hence Nvidia chose to replace 30 FP64 cores with 64 FP32 cores on consumer GPUs. This is an understandable business decision since graphics and machine learning run on and are benchmarked with FP32, while FP64 is used mostly in scientific and simulation workloads. However, for blockchain it's important for decentralization that as much as possible can run on consumer hardware; Tesla cards are $10k, so we want to optimize for consumer GPUs despite their 1/32 FP64-to-INT32 throughput ratio.
So, assuming 1 cycle per instruction on the matching core, we can do 64 INT32 instructions in the time we do 2 FP64 instructions, hence the 1/32 throughput ratio.
Concretely to emulate 64x64->128 extended precision multiplication we need 4 32-bit multiplications (and fused additions):
```
a₁a₀
* b₁b₀
---------------------------
a₀b₀
a₁b₀
a₀b₁
a₁b₁
```
Assuming we need only 2 FP64 instructions for a 64x64->128 integer mul (the mul.lo and mul.hi equivalents), the throughput ratio would be:
`1/32 (base throughput) * 4 (mul int32 instr) * 1/2 (mul fp64) = 1/16`
In reality:
- we use a 52-bit mantissa so we would have computed only 104 bits
- there is extra addition/subtraction, shifting and masking involved
- this significantly increases the chances of mistakes, and formal verification or fuzzing on GPUs isn't easy
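For reference, the schoolbook decomposition above corresponds to four 32x32->64-bit multiply-adds. A minimal CPU-side sketch in Nim (no GPU specifics; the helper name is illustrative):
```Nim
func mul64x64to128(a, b: uint64): tuple[hi, lo: uint64] =
  ## 64x64 -> 128-bit product built from four 32x32 -> 64-bit partial products,
  ## mirroring the a₁a₀ × b₁b₀ layout above.
  let
    a0 = a and 0xFFFF_FFFF'u64
    a1 = a shr 32
    b0 = b and 0xFFFF_FFFF'u64
    b1 = b shr 32
    t0 = a0 * b0                              # a₀b₀
    t1 = a1 * b0 + (t0 shr 32)                # a₁b₀ + carry, cannot overflow 64 bits
    t2 = a0 * b1 + (t1 and 0xFFFF_FFFF'u64)   # a₀b₁ + low half of t1
  result.lo = (t2 shl 32) or (t0 and 0xFFFF_FFFF'u64)
  result.hi = a1 * b1 + (t1 shr 32) + (t2 shr 32)
```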
## Code generation considerations
### Parameter passing:
- https://reviews.llvm.org/D118084
> The motivation for this change is to allow SROA to eliminate local copies in more cases. Local copies that make it to the generated PTX present a substantial performance hit, as we end up with all threads on the GPU rushing to access their own chunk of very high-latency memory.
Direct parameter passing is easier to analyze but not worthwhile
for large aggregates.
### Important optimization passes:
- https://www.llvm.org/docs/Passes.html
- gvn, global value numbering to remove redundant loads
- mem2reg, promotes memory into registers; memory is expensive on GPUs
> This file promotes memory references to be register references. It promotes alloca instructions which only have loads and stores as uses. An alloca is transformed by using dominator frontiers to place phi nodes, then traversing the function in depth-first order to rewrite loads and stores as appropriate. This is just the standard SSA construction algorithm to construct “pruned” SSA form.
https://stackoverflow.com/a/66082008
- SROA, Scalar Replacement of Aggregates, to remove local copies and alloca. Static index accesses help,
as mentioned in https://discourse.llvm.org/t/nvptx-calling-convention-for-aggregate-arguments-passed-by-value/
https://github.com/llvm/llvm-project/issues/51734#issuecomment-981047833
> Local loads/stores on GPU are expensive enough to be worth quite a few extra instructions.
- https://github.com/apc-llc/nvcc-llvm-ir
Note: the dead code/instruction elimination passes might remove ASM that is not marked sideeffect/volatile
Ordering GVN before SROA: https://reviews.llvm.org/D111471
If we use "normal" instructions instead of inline assembly, this thread links to many LLVM internal discussions
on the passes that optimize to add-with-carry: https://github.com/llvm/llvm-project/issues/31102
We have:
- InstCombine, for instruction combining (see also: https://reviews.llvm.org/D8889, https://reviews.llvm.org/D124698, https://github.com/llvm/llvm-project/issues/39832)
- CodegenPrepare, for ISA specific codegen
### LLVM NVPTX or Nvidia libNVVM
https://docs.nvidia.com/cuda/libnvvm-api/index.html
https://docs.nvidia.com/pdf/libNVVM_API.pdf
https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html
https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf
⚠ NVVM IR is based on LLVM 7.0.1 IR, which dates from December 2018.
There are a couple of caveats:
- LLVM 7.0.1 is usually not available in repo, making installation difficult
- There was an ABI-breaking bug making the 7.0.1 and 7.1.0 versions messy (https://www.phoronix.com/news/LLVM-7.0.1-Released)
- LLVM 7.0.1 does not have LLVMBuildCall2 and relies on the deprecated LLVMBuildCall, meaning
that supporting both it and the latest LLVM (for AMDGPU and SPIR-V backends) will likely have heavy costs
- When generating an add-with-carry kernel with inline ASM calls from LLVM-14,
if the LLVM IR is passed as bitcode,
the kernel content is silently discarded; this does not happen with the built-in add.
It is unclear whether it's call2 or an inline ASM incompatibility that causes the issue.
- When generating an add-with-carry kernel with inline ASM calls from LLVM-14,
if the LLVM IR is passed as textual IR, the code is refused with NVVM_ERROR_INVALID_IR
Hence, using LLVM NVPTX backend instead of libNVVM is likely the sustainable way forward
### Register pressure
See this AMD paper https://dl.acm.org/doi/pdf/10.1145/3368826.3377918
However, if we want to reduce register pressure, we need to store to local memory, which is also expensive.
## Parallel reductions
Batch elliptic point addition `r = P₀ + P₁ + ... + Pₙ` and
multi-scalar multiplication (MSM) `r = [k₀]P₀ + [k₁]P₁ + ... + [kₙ]Pₙ`
are reduction operations.
There is a wealth of resources regarding optimized implementations of those.
The baseline is provided by: [Optimizing Parallel Reduction in CUDA, Mark Harris](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf)
Then on later architectures:
- https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/
- https://www.irisa.fr/alf/downloads/collange/talks/collange_warp_synchronous_19.pdf
Other interesting resources:
- https://on-demand.gputechconf.com/gtc/2017/presentation/s7622-Kyrylo-perelygin-robust-and-scalable-cuda.pdf \
This explains in great details the cooperative group features
and examples in reduction kernels
- https://github.com/umfranzw/cuda-reduction-example \
This explains and uses overlapping streams for latency hiding
- https://vccvisualization.org/teaching/CS380/CS380_fall2020_lecture_25.pdf
SHFL instruction
- https://unum.cloud/post/2022-01-28-reduce/
- https://github.com/ashvardanian/ParallelReductionsBenchmark \
This provides an overview and benchmark code across CPU (AVX2, OpenMP, TBB), OpenCL, Cuda (Cublas, Thrust, Cub)
- https://diglib.eg.org/bitstream/handle/10.2312/egt20211037/CUDA_day2.pdf
- https://cuda-tutorial.github.io/part3_22.pdf
- https://github.com/CUDA-Tutorial/CodeSamples

View File

@ -37,7 +37,7 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n
- Representation
- [x] Montgomery Representation
- [ ] Barret Reduction
- [x] Unsaturated Representation
- [ ] Unsaturated Representation
- [ ] Mersenne Prime (2ᵏ - 1),
- [ ] Generalized Mersenne Prime (NIST Prime P256: 2^256 - 2^224 + 2^192 + 2^96 - 1)
- [ ] Pseudo-Mersenne Prime (2^m - k for example Edwards25519: 2^255 - 19)
@ -69,14 +69,14 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n
- Montgomery Squaring
- [x] Dedicated squaring functions
- [x] Fused multiply + reduce
- [ ] int128
- [ ] loop unrolling
- [x] int128
- [x] loop unrolling
- [x] x86: Full Assembly implementation
- [x] x86: MULX, ADCX, ADOX instructions
- [ ] no-carry optimization for CIOS (Coarsely Integrated Operand Scanning)
- Addition chains
- [ ] unreduced squarings/multiplications in addition chains
- [x] unreduced squarings/multiplications in addition chains
- Exponentiation
- [x] variable-time exponentiation
@ -217,4 +217,4 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n
- [ ] BLS G1: Bowe, endomorphism accelerated
- [ ] BLS G2: Bowe, endomorphism accelerated
- [x] BLS G1: Scott, endomorphism accelerated
- [x] BLS G2: Scott, endomorphism accelerated
- [x] BLS G2: Scott, endomorphism accelerated

1
nim.cfg Normal file
View File

@ -0,0 +1 @@
--hint[Name]:off

View File

@ -0,0 +1,48 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../../constantine/platforms/gpu/llvm
echo "LLVM JIT compiler Hello World"
let ctx = createContext()
let module = ctx.createModule("addition")
let i32 = ctx.int32_t()
let addType = function_t(i32, [i32, i32], isVarArg = LlvmBool(false))
let addBody = module.addFunction("add", addType)
let builder = ctx.createBuilder()
let blck = ctx.appendBasicBlock(addBody, "addBody")
builder.positionAtEnd(blck)
block:
let a = addBody.getParam(0)
let b = addBody.getParam(1)
let sum = builder.add(a, b, "sum")
builder.ret(sum)
module.verify(AbortProcessAction)
var engine: ExecutionEngineRef
block:
initializeFullNativeTarget()
createJITCompilerForModule(engine, module, optLevel = 0)
let jitAdd = cast[proc(a, b: int32): int32 {.noconv.}](
engine.getFunctionAddress("add"))
echo "jitAdd(1, 2) = ", jitAdd(1, 2)
doAssert jitAdd(1, 2) == 1 + 2
block:
# Cleanup
builder.dispose()
engine.dispose() # also destroys the module attached to it
ctx.dispose()
echo "LLVM JIT - SUCCESS"

View File

@ -0,0 +1,326 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../../constantine/platforms/gpu/[llvm, nvidia, bindings/utils]
# ############################################################
#
# NVVM
#
# ############################################################
# https://docs.nvidia.com/cuda/libnvvm-api/index.html
# https://docs.nvidia.com/pdf/libNVVM_API.pdf
# https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html
# https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf
# ⚠ NVVM IR is based on LLVM 7.0.1 IR, which dates from December 2018.
# There are a couple of caveats:
# - LLVM 7.0.1 is usually not available in repo, making installation difficult
# - There was an ABI-breaking bug making the 7.0.1 and 7.1.0 versions messy (https://www.phoronix.com/news/LLVM-7.0.1-Released)
# - LLVM 7.0.1 does not have LLVMBuildCall2 and relies on the deprecated LLVMBuildCall, meaning
# that supporting both it and the latest LLVM (for AMDGPU and SPIR-V backends) will likely have heavy costs
# - When generating an add-with-carry kernel with inline ASM calls from LLVM-14,
# if the LLVM IR is passed as bitcode,
# the kernel content is silently discarded; this does not happen with the built-in add.
# It is unclear whether it's call2 or an inline ASM incompatibility that causes the issue.
# - When generating an add-with-carry kernel with inline ASM calls from LLVM-14,
# if the LLVM IR is passed as textual IR, the code is refused with NVVM_ERROR_INVALID_IR
# Hence, using LLVM NVPTX backend instead of libNVVM is likely the sustainable way forward
static: echo "[Constantine] Using library libnvvm.so"
{.passl: "-L/opt/cuda/nvvm/lib64 -lnvvm".}
type
NvvmResult* {.size: sizeof(cint).} = enum
NVVM_SUCCESS = 0
NVVM_ERROR_OUT_OF_MEMORY = 1
NVVM_ERROR_PROGRAM_CREATION_FAILURE = 2
NVVM_ERROR_IR_VERSION_MISMATCH = 3
NVVM_ERROR_INVALID_INPUT = 4
NVVM_ERROR_INVALID_PROGRAM = 5
NVVM_ERROR_INVALID_IR = 6
NVVM_ERROR_INVALID_OPTION = 7
NVVM_ERROR_NO_MODULE_IN_PROGRAM = 8
NVVM_ERROR_COMPILATION = 9
NvvmProgram = distinct pointer
{.push cdecl, importc, dynlib: "libnvvm.so".}
proc nvvmGetErrorString*(r: NvvmResult): cstring
proc nvvmVersion*(major, minor: var int32): NvvmResult
proc nvvmIRVersion*(majorIR, minorIR, majorDbg, minorDbg: var int32): NvvmResult
proc nvvmCreateProgram*(prog: var NvvmProgram): NvvmResult
proc nvvmDestroyProgram*(prog: var NvvmProgram): NvvmResult
proc nvvmAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult {.wrapOpenArrayLenType: csize_t.}
proc nvvmLazyAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult {.wrapOpenArrayLenType: csize_t.}
proc nvvmCompileProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult
proc nvvmVerifyProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult
proc nvvmGetCompiledResultSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetCompiledResult*(prog: NvvmProgram; buffer: ptr char): NvvmResult
proc nvvmGetProgramLogSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetProgramLog*(prog: NvvmProgram; buffer: ptr char): NvvmResult
{.pop.} # {.push cdecl, importc, header: "<nvvm.h>".}
# ############################################################
#
# PTX Codegen
#
# ############################################################
template check*(status: NvvmResult) =
let code = status # Assign so execution is done once only.
if code != NVVM_SUCCESS:
stderr.write astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code
quit 1
proc getNvvmLog(prog: NvvmProgram): string {.used.} =
var logSize: csize_t
check nvvmGetProgramLogSize(prog, logSize)
if logSize > 0:
result = newString(logSize)
check nvvmGetProgramLog(prog, result[0].addr)
proc ptxCodegenViaNvidiaNvvm(module: ModuleRef, sm: tuple[major, minor: int32]): string {.used.} =
## PTX codegen via Nvidia NVVM
# ######################################
# LLVM -> NVVM handover
var prog{.noInit.}: NvvmProgram
check nvvmCreateProgram(prog)
let bitcode = module.toBitcode()
check nvvmAddModuleToProgram(prog, bitcode, cstring module.getIdentifier())
# ######################################
# GPU codegen
check nvvmVerifyProgram(prog, 0, nil)
let options = allocCStringArray(["-arch=compute_" & $sm.major & $sm.minor])
check nvvmCompileProgram(prog, 1, options)
deallocCStringArray(options)
var ptxSize: csize_t
check nvvmGetCompiledResultSize(prog, ptxSize)
result = newString(ptxSize-1) # The NVVM size includes the '\0' terminating char while Nim excludes it.
check nvvmGetCompiledResult(prog, result[0].addr)
check nvvmDestroyProgram(prog)
proc ptxCodegenViaLlvmNvptx(module: ModuleRef, sm: tuple[major, minor: int32]): string {.used.} =
## PTX codegen via LLVM NVPTX
module.verify(AbortProcessAction)
initializeFullNVPTXTarget()
const triple = "nvptx64-nvidia-cuda"
let machine = createTargetMachine(
target = toTarget(triple),
triple = triple,
cpu = cstring("sm_" & $sm.major & $sm.minor),
features = "",
level = CodeGenLevelAggressive,
reloc = RelocDefault,
codeModel = CodeModelDefault
)
machine.emitToString(module, AssemblyFile)
# ############################################################
#
# Hello world
#
# ############################################################
echo "Nvidia JIT compiler Hello World"
proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKernelName: string) =
# ######################################
# Metadata
const triple = "nvptx64-nvidia-cuda"
# Datalayout for NVVM IR 1.8 (CUDA 11.6)
const datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
# ######################################
# LLVM IR codegen
module.setTarget(triple)
module.setDataLayout(datalayout)
let i128 = ctx.int128_t()
let void_t = ctx.void_t()
let builder = ctx.createBuilder()
defer: builder.dispose()
block:
let addType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
let addKernel = module.addFunction(addKernelName, addType)
let blck = ctx.appendBasicBlock(addKernel, "addBody")
builder.positionAtEnd(blck)
let r = addKernel.getParam(0)
let a = addKernel.getParam(1)
let b = addKernel.getParam(2)
let sum = builder.add(a, b, "sum")
builder.store(sum, r)
builder.retVoid()
module.setCallableCudaKernel((addType, addKernel))
block:
let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
let mulKernel = module.addFunction(mulKernelName, mulType)
let blck = ctx.appendBasicBlock(mulKernel, "mulBody")
builder.positionAtEnd(blck)
let r = mulKernel.getParam(0)
let a = mulKernel.getParam(1)
let b = mulKernel.getParam(2)
let prod = builder.mul(a, b, "prod")
builder.store(prod, r)
builder.retVoid()
module.setCallableCudaKernel((mulType, mulKernel))
module.verify(AbortProcessAction)
block:
echo "================="
echo "LLVM IR output"
echo $module
echo "================="
func toHex*(a: uint64): string =
const hexChars = "0123456789abcdef"
const L = 2*sizeof(uint64)
result = newString(L)
var a = a
for j in countdown(result.len-1, 0):
result[j] = hexChars[a and 0xF]
a = a shr 4
func toString*(a: openArray[uint64]): string =
result = "0x"
for i in countdown(a.len-1, 0):
result.add toHex(a[i])
type
CodegenBackend = enum
PTXviaNvidiaNvvm
PTXviaLlvmNvptx
proc getCudaKernel(cuMod: CUmodule, fnName: string): CUfunction =
check cuModuleGetFunction(result, cuMod, fnName & "_public")
proc main(backend: CodegenBackend) =
#######################################
# GPU init
let cudaDevice = cudaDeviceInit()
var sm: tuple[major, minor: int32]
check cuDeviceGetAttribute(sm.major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cudaDevice)
check cuDeviceGetAttribute(sm.minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cudaDevice)
#######################################
# LLVM IR
let ctx = createContext()
let module = ctx.createModule("test_nnvm")
let addKernelName = "addKernel"
let mulKernelName = "mulKernel"
writeExampleAddMul(ctx, module, addKernelName, mulKernelName)
#######################################
# PTX codegen
let ptx = case backend
of PTXviaNvidiaNvvm:
module.ptxCodegenViaNvidiaNVVM(sm)
of PTXviaLlvmNvptx:
module.ptxCodegenViaLlvmNvptx(sm)
module.dispose()
ctx.dispose()
block:
echo "================="
echo "PTX output"
echo $ptx
echo "================="
#######################################
# GPU JIT
var cuCtx: CUcontext
var cuMod: CUmodule
check cuCtxCreate(cuCtx, 0, cudaDevice)
check cuModuleLoadData(cuMod, ptx)
let addKernel = cuMod.getCudaKernel(addKernelName)
let mulKernel = cuMod.getCudaKernel(mulKernelName)
#######################################
# Kernel launch
var r{.noInit.}, a, b: array[2, uint64]
a[1] = 0x00000000000001FF'u64; a[0] = 0xFFFFFFFFFFFFFFFF'u64
b[1] = 0x0000000000000000'u64; b[0] = 0x0010000000000000'u64
echo "r: ", r.toString()
echo "a: ", a.toString()
echo "b: ", b.toString()
var rGPU: CUdeviceptr
check cuMemAlloc(rGPU, csize_t sizeof(r))
let params = [pointer(rGPU.addr), pointer(a.addr), pointer(b.addr)]
check cuLaunchKernel(
addKernel,
1, 1, 1,
1, 1, 1,
0, CUstream(nil),
params[0].unsafeAddr, nil)
check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
echo "a+b: ", r.toString()
check cuLaunchKernel(
mulKernel,
1, 1, 1,
1, 1, 1,
0, CUstream(nil),
params[0].unsafeAddr, nil)
check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
echo "a*b: ", r.toString()
#######################################
# Cleanup
check cuMemFree(rGPU)
rGPU = CUdeviceptr(nil)
check cuModuleUnload(cuMod)
cuMod = CUmodule(nil)
check cuCtxDestroy(cuCtx)
cuCtx = CUcontext(nil)
echo "\n\nCompilation via Nvidia NVVM\n###########################\n"
main(PTXviaNvidiaNvvm)
echo "\n\nEnd: Compilation via Nvidia NVVM\n################################"
echo "\n\nCompilation via LLVM NVPTX\n##########################\n"
main(PTXviaLlvmNvptx)
echo "\n\nEnd: Compilation via LLVM NVPTX\n###############################"

134
tests/gpu/t_nvidia_fp.nim Normal file
View File

@ -0,0 +1,134 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
# Standard library
std/[unittest, times],
# Internal
../../constantine/platforms/gpu/[llvm, nvidia, ir],
../../constantine/math/config/[curves, type_bigint],
../../constantine/math/io/io_bigints,
../../constantine/math/arithmetic,
../../constantine/math_gpu/fields_nvidia,
# Test utilities
../../helpers/[prng_unsafe, static_for]
var rng: RngState
let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
rng.seed(seed)
echo "\n------------------------------------------------------\n"
echo "test_nvidia_fp xoshiro512** seed: ", seed
const Iters = 10
proc init(T: type CurveMetadata, asy: Assembler_LLVM, curve: static Curve, wordSize: WordSize): T =
CurveMetadata.init(
asy.ctx,
$curve & "_", wordSize,
fpBits = uint32 curve.getCurveBitwidth(),
fpMod = curve.Mod().toHex(),
frBits = uint32 curve.getCurveOrderBitwidth(),
frMod = curve.getCurveOrder().toHex()
)
proc genFieldAddPTX(asy: Assembler_LLVM, cm: CurveMetadata) =
let fpAdd = asy.field_add_gen(cm, fp)
asy.module.setCallableCudaKernel(fpAdd)
let frAdd = asy.field_add_gen(cm, fr)
asy.module.setCallableCudaKernel(frAdd)
# Init LLVM
# -------------------------
initializeFullNVPTXTarget()
initializePasses()
# Init GPU
# -------------------------
let cudaDevice = cudaDeviceInit()
var sm: tuple[major, minor: int32]
check cuDeviceGetAttribute(sm.major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cudaDevice)
check cuDeviceGetAttribute(sm.minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cudaDevice)
proc t_field_add(curve: static Curve) =
# Codegen
# -------------------------
let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & $curve))
let cm32 = CurveMetadata.init(asy, curve, size32)
asy.genFieldAddPTX(cm32)
let cm64 = CurveMetadata.init(asy, curve, size64)
asy.genFieldAddPTX(cm64)
let ptx = asy.codegenNvidiaPTX(sm)
# GPU exec
# -------------------------
var cuCtx: CUcontext
var cuMod: CUmodule
check cuCtxCreate(cuCtx, 0, cudaDevice)
check cuModuleLoadData(cuMod, ptx)
defer:
check cuMod.cuModuleUnload()
check cuCtx.cuCtxDestroy()
let fpAdd32 = cuMod.getCudaKernel(cm32, opFpAdd)
let fpAdd64 = cuMod.getCudaKernel(cm64, opFpAdd)
let frAdd32 = cuMod.getCudaKernel(cm32, opFrAdd)
let frAdd64 = cuMod.getCudaKernel(cm64, opFrAdd)
# Fp
for i in 0 ..< Iters:
let a = rng.random_long01Seq(Fp[curve])
let b = rng.random_long01Seq(Fp[curve])
var rCPU, rGPU_32, rGPU_64: Fp[curve]
rCPU.sum(a, b)
fpAdd32.exec(rGPU_32, a, b)
fpAdd64.exec(rGPU_64, a, b)
doAssert bool(rCPU == rGPU_32)
doAssert bool(rCPU == rGPU_64)
# Fr
for i in 0 ..< Iters:
let a = rng.random_long01Seq(Fr[curve])
let b = rng.random_long01Seq(Fr[curve])
var rCPU, rGPU_32, rGPU_64: Fr[curve]
rCPU.sum(a, b)
frAdd32.exec(rGPU_32, a, b)
frAdd64.exec(rGPU_64, a, b)
doAssert bool(rCPU == rGPU_32)
doAssert bool(rCPU == rGPU_64)
proc main() =
const curves = [
P224,
BN254_Nogami,
BN254_Snarks,
Edwards25519,
Bandersnatch,
Pallas,
Vesta,
P256,
Secp256k1,
BLS12_377,
BLS12_381,
BW6_761
]
suite "[Nvidia GPU] Field Addition":
staticFor i, 0, curves.len:
const curve = curves[i]
test "Nvidia GPU field addition (𝔽p, 𝔽r) for " & $curve:
t_field_add(curve)
main()