[Backend] Add support for Nvidia GPUs (#210)

* Add PoC of JIT exec on Nvidia GPUs [skip ci]

* Split GPU bindings into low-level (ABI) and high-level [skip ci]

* small typedef reorg [skip ci]

* refine LLVM IR/Nvidia GPU hello worlds

* [Nvidia GPU] PoC implementation of field addition [skip ci]

* prod-ready field addition + tests on Nvidia GPUs via LLVM codegen
This commit is contained in:
Mamy Ratsimbazafy 2023-01-12 01:01:57 +01:00 committed by GitHub
parent c0b30a08be
commit 1f4bb174a3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
29 changed files with 3685 additions and 183 deletions

View File

@ -1,10 +0,0 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# TODO
# export public proc

View File

@ -229,6 +229,10 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/t_ethereum_eip2333_bls12381_key_derivation.nim", false), ("tests/t_ethereum_eip2333_bls12381_key_derivation.nim", false),
] ]
const testDescNvidia: seq[string] = @[
"tests/gpu/t_nvidia_fp.nim",
]
const benchDesc = [ const benchDesc = [
"bench_fp", "bench_fp",
"bench_fp_double_precision", "bench_fp_double_precision",
@ -378,6 +382,14 @@ proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, testAS
cmdFile.testBatch(flags, td.path) cmdFile.testBatch(flags, td.path)
proc addTestSetNvidia(cmdFile: var string) =
if not dirExists "build":
mkDir "build"
echo "Found " & $testDescNvidia.len & " tests to run."
for path in testDescNvidia:
cmdFile.testBatch(flags = "", path)
proc addBenchSet(cmdFile: var string, useAsm = true) = proc addBenchSet(cmdFile: var string, useAsm = true) =
if not dirExists "build": if not dirExists "build":
mkDir "build" mkDir "build"
@ -568,6 +580,12 @@ task test_parallel_no_gmp_no_asm, "Run all tests in parallel (via GNU parallel)"
writeFile(buildParallel, cmdFile) writeFile(buildParallel, cmdFile)
exec "build/pararun " & buildParallel exec "build/pararun " & buildParallel
task test_nvidia, "Run all tests for Nvidia GPUs":
var cmdFile: string
cmdFile.addTestSetNvidia()
for cmd in cmdFile.splitLines():
exec cmd
# Finite field 𝔽p # Finite field 𝔽p
# ------------------------------------------ # ------------------------------------------

View File

@ -60,11 +60,10 @@ macro addmod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
# Addition # Addition
# u = a[0..<H] + b[0..<H], v = a[H..<N] # u = a[0..<H] + b[0..<H], v = a[H..<N]
for i in 0 ..< H: ctx.add u[0], b[0]
if i == 0: ctx.mov r[0], u[0]
ctx.add u[0], b[0] for i in 1 ..< H:
else: ctx.adc u[i], b[i]
ctx.adc u[i], b[i]
ctx.mov r[i], u[i] ctx.mov r[i], u[i]
# v = a[H..<N] + b[H..<N], a[0..<H] = u, u = v # v = a[H..<N] + b[H..<N], a[0..<H] = u, u = v
@ -78,11 +77,9 @@ macro addmod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
ctx.sbb overflowed, overflowed ctx.sbb overflowed, overflowed
# Now substract the modulus to test a < 2ⁿp # Now substract the modulus to test a < 2ⁿp
for i in 0 ..< H: ctx.sub v[0], M[0]
if i == 0: for i in 1 ..< H:
ctx.sub v[0], M[0] ctx.sbb v[i], M[i]
else:
ctx.sbb v[i], M[i]
# If it overflows here, it means that it was # If it overflows here, it means that it was
# smaller than the modulus and we don't need v # smaller than the modulus and we don't need v
@ -134,11 +131,10 @@ macro submod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
# Substraction # Substraction
# u = a[0..<H] - b[0..<H], v = a[H..<N] # u = a[0..<H] - b[0..<H], v = a[H..<N]
for i in 0 ..< H: ctx.sub u[0], b[0]
if i == 0: ctx.mov r[0], u[0]
ctx.sub u[0], b[0] for i in 1 ..< H:
else: ctx.sbb u[i], b[i]
ctx.sbb u[i], b[i]
ctx.mov r[i], u[i] ctx.mov r[i], u[i]
# v = a[H..<N] - b[H..<N], a[0..<H] = u, u = M # v = a[H..<N] - b[H..<N], a[0..<H] = u, u = M
@ -155,11 +151,10 @@ macro submod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
ctx.`and` u[i], underflowed ctx.`and` u[i], underflowed
# Add the masked modulus # Add the masked modulus
for i in 0 ..< H: ctx.add u[0], v[0]
if i == 0: ctx.mov r[H], u[0]
ctx.add u[0], v[0] for i in 1 ..< H:
else: ctx.adc u[i], v[i]
ctx.adc u[i], v[i]
ctx.mov r[i+H], u[i] ctx.mov r[i+H], u[i]
result.add ctx.generate result.add ctx.generate

View File

@ -27,23 +27,26 @@ static: doAssert UseASM_X86_32
{.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers {.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers
proc finalSubNoCarryImpl*( proc finalSubNoOverflowImpl*(
ctx: var Assembler_x86, ctx: var Assembler_x86,
r: Operand or OperandArray, r: Operand or OperandArray,
a, M, scratch: OperandArray a, M, scratch: OperandArray
) = ) =
## Reduce `a` into `r` modulo `M` ## Reduce `a` into `r` modulo `M`
## To be used when the modulus does not use the full bitwidth of the storing words
## for example a 255-bit modulus in n words of total max size 2^256
## ##
## r, a, scratch, scratchReg are mutated ## r, a, scratch, scratchReg are mutated
## M is read-only ## M is read-only
let N = M.len let N = M.len
ctx.comment "Final substraction (no carry)" ctx.comment "Final substraction (cannot overflow its limbs)"
for i in 0 ..< N:
# Substract the modulus, and test a < p with the last borrow
ctx.mov scratch[0], a[0]
ctx.sub scratch[0], M[0]
for i in 1 ..< N:
ctx.mov scratch[i], a[i] ctx.mov scratch[i], a[i]
if i == 0: ctx.sbb scratch[i], M[i]
ctx.sub scratch[i], M[i]
else:
ctx.sbb scratch[i], M[i]
# If we borrowed it means that we were smaller than # If we borrowed it means that we were smaller than
# the modulus and we don't need "scratch" # the modulus and we don't need "scratch"
@ -51,7 +54,7 @@ proc finalSubNoCarryImpl*(
ctx.cmovnc a[i], scratch[i] ctx.cmovnc a[i], scratch[i]
ctx.mov r[i], a[i] ctx.mov r[i], a[i]
proc finalSubMayCarryImpl*( proc finalSubMayOverflowImpl*(
ctx: var Assembler_x86, ctx: var Assembler_x86,
r: Operand or OperandArray, r: Operand or OperandArray,
a, M, scratch: OperandArray, a, M, scratch: OperandArray,
@ -59,24 +62,22 @@ proc finalSubMayCarryImpl*(
) = ) =
## Reduce `a` into `r` modulo `M` ## Reduce `a` into `r` modulo `M`
## To be used when the final substraction can ## To be used when the final substraction can
## also depend on the carry flag ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
## ##
## r, a, scratch, scratchReg are mutated ## r, a, scratch, scratchReg are mutated
## M is read-only ## M is read-only
let N = M.len
ctx.comment "Final substraction (may carry)" ctx.comment "Final substraction (may carry)"
# Mask: scratchReg contains 0xFFFF or 0x0000 # Mask: scratchReg contains 0xFFFF or 0x0000
ctx.sbb scratchReg, scratchReg ctx.sbb scratchReg, scratchReg
# Now substract the modulus to test a < p # Now substract the modulus, and test a < p with the last borrow
let N = M.len ctx.mov scratch[0], a[0]
for i in 0 ..< N: ctx.sub scratch[0], M[0]
for i in 1 ..< N:
ctx.mov scratch[i], a[i] ctx.mov scratch[i], a[i]
if i == 0: ctx.sbb scratch[i], M[i]
ctx.sub scratch[i], M[i]
else:
ctx.sbb scratch[i], M[i]
# If it overflows here, it means that it was # If it overflows here, it means that it was
# smaller than the modulus and we don't need `scratch` # smaller than the modulus and we don't need `scratch`
@ -92,7 +93,7 @@ macro finalSub_gen*[N: static int](
r_PIR: var array[N, SecretWord], r_PIR: var array[N, SecretWord],
a_EIR, M_PIR: array[N, SecretWord], a_EIR, M_PIR: array[N, SecretWord],
scratch_EIR: var array[N, SecretWord], scratch_EIR: var array[N, SecretWord],
mayCarry: static bool): untyped = mayOverflow: static bool): untyped =
## Returns: ## Returns:
## a-M if a > M ## a-M if a > M
## a otherwise ## a otherwise
@ -101,7 +102,7 @@ macro finalSub_gen*[N: static int](
## - a_EIR is an array of registers, mutated, ## - a_EIR is an array of registers, mutated,
## - M_PIR is a pointer to an array, read-only, ## - M_PIR is a pointer to an array, read-only,
## - scratch_EIR is an array of registers, mutated ## - scratch_EIR is an array of registers, mutated
## - mayCarry is set to true when the carry flag also needs to be read ## - mayOverflow is set to true when the carry flag also needs to be read
result = newStmtList() result = newStmtList()
var ctx = init(Assembler_x86, BaseType) var ctx = init(Assembler_x86, BaseType)
@ -113,12 +114,12 @@ macro finalSub_gen*[N: static int](
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input) M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
t = init(OperandArray, nimSymbol = scratch_EIR, N, ElemsInReg, Output_EarlyClobber) t = init(OperandArray, nimSymbol = scratch_EIR, N, ElemsInReg, Output_EarlyClobber)
if mayCarry: if mayOverflow:
ctx.finalSubMayCarryImpl( ctx.finalSubMayOverflowImpl(
r, a, M, t, rax r, a, M, t, rax
) )
else: else:
ctx.finalSubNoCarryImpl( ctx.finalSubNoOverflowImpl(
r, a, M, t r, a, M, t
) )
@ -156,18 +157,17 @@ macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], spareBits: s
`usym`[i] = `A`[i] `usym`[i] = `A`[i]
# Addition # Addition
for i in 0 ..< N: ctx.add u[0], b[0]
if i == 0: ctx.mov v[0], u[0]
ctx.add u[0], b[0] for i in 1 ..< N:
else: ctx.adc u[i], b[i]
ctx.adc u[i], b[i]
# Interleaved copy in a second buffer as well # Interleaved copy in a second buffer as well
ctx.mov v[i], u[i] ctx.mov v[i], u[i]
if spareBits >= 1: if spareBits >= 1:
ctx.finalSubNoCarryImpl(r, u, M, v) ctx.finalSubNoOverflowImpl(r, u, M, v)
else: else:
ctx.finalSubMayCarryImpl( ctx.finalSubMayOverflowImpl(
r, u, M, v, b.reuseRegister() r, u, M, v, b.reuseRegister()
) )
@ -209,11 +209,10 @@ macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
`usym`[i] = `A`[i] `usym`[i] = `A`[i]
# Substraction # Substraction
for i in 0 ..< N: ctx.sub u[0], b[0]
if i == 0: ctx.mov v[0], M[0]
ctx.sub u[0], b[0] for i in 1 ..< N:
else: ctx.sbb u[i], b[i]
ctx.sbb u[i], b[i]
# Interleaved copy the modulus to hide SBB latencies # Interleaved copy the modulus to hide SBB latencies
ctx.mov v[i], M[i] ctx.mov v[i], M[i]
@ -226,11 +225,10 @@ macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
ctx.`and` v[i], underflowed ctx.`and` v[i], underflowed
# Add the masked modulus # Add the masked modulus
for i in 0 ..< N: ctx.add u[0], v[0]
if i == 0: ctx.mov r[0], u[0]
ctx.add u[0], v[0] for i in 1 ..< N:
else: ctx.adc u[i], v[i]
ctx.adc u[i], v[i]
ctx.mov r[i], u[i] ctx.mov r[i], u[i]
result.add ctx.generate result.add ctx.generate
@ -258,12 +256,11 @@ macro negmod_gen[N: static int](R: var Limbs[N], A, m: Limbs[N]): untyped =
M = init(OperandArray, nimSymbol = m, N, PointerInReg, InputOutput) M = init(OperandArray, nimSymbol = m, N, PointerInReg, InputOutput)
# Substraction m - a # Substraction m - a
for i in 0 ..< N: ctx.mov u[0], M[0]
ctx.sub u[0], a[0]
for i in 1 ..< N:
ctx.mov u[i], M[i] ctx.mov u[i], M[i]
if i == 0: ctx.sbb u[i], a[i]
ctx.sub u[0], a[0]
else:
ctx.sbb u[i], a[i]
# Deal with a == 0 # Deal with a == 0
let isZero = M.reuseRegister() let isZero = M.reuseRegister()

View File

@ -178,7 +178,7 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
for i in 0 ..< N: for i in 0 ..< N:
ctx.mov r2[i], t[i] ctx.mov r2[i], t[i]
else: else:
ctx.finalSubNoCarryImpl( ctx.finalSubNoOverflowImpl(
r2, t, M, r2, t, M,
scratch scratch
) )
@ -393,7 +393,7 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
ctx.mov r2[i], t[i] ctx.mov r2[i], t[i]
else: else:
ctx.comment " Final substraction" ctx.comment " Final substraction"
ctx.finalSubNoCarryImpl( ctx.finalSubNoOverflowImpl(
r2, t, M, r2, t, M,
scratch scratch
) )

View File

@ -272,7 +272,7 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int](
for i in 0 ..< N: for i in 0 ..< N:
ctx.mov r[i], t[i] ctx.mov r[i], t[i]
else: else:
ctx.finalSubNoCarryImpl( ctx.finalSubNoOverflowImpl(
r, t, M, r, t, M,
scratch scratch
) )
@ -476,7 +476,7 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
ctx.mov r2[i], t[i] ctx.mov r2[i], t[i]
else: else:
ctx.comment " Final substraction" ctx.comment " Final substraction"
ctx.finalSubNoCarryImpl( ctx.finalSubNoOverflowImpl(
r2, t, M, r2, t, M,
scratch scratch
) )

View File

@ -157,9 +157,9 @@ macro redc2xMont_gen*[N: static int](
for i in 0 ..< N: for i in 0 ..< N:
ctx.mov r_temp[i], u[i] ctx.mov r_temp[i], u[i]
elif spareBits >= 1: elif spareBits >= 1:
ctx.finalSubNoCarryImpl(r, u, M, t) ctx.finalSubNoOverflowImpl(r, u, M, t)
else: else:
ctx.finalSubMayCarryImpl(r, u, M, t, rax) ctx.finalSubMayOverflowImpl(r, u, M, t, rax)
# Code generation # Code generation
result.add ctx.generate() result.add ctx.generate()
@ -282,4 +282,4 @@ func fromMont_asm*(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
block: # Map from [0, 2p) to [0, p) block: # Map from [0, 2p) to [0, p)
var workspace{.noInit.}: typeof(r) var workspace{.noInit.}: typeof(r)
r.finalSub_gen(t, M, workspace, mayCarry = false) r.finalSub_gen(t, M, workspace, mayOverflow = false)

View File

@ -134,9 +134,9 @@ macro redc2xMont_adx_gen[N: static int](
for i in 0 ..< N: for i in 0 ..< N:
ctx.mov r[i], t[i] ctx.mov r[i], t[i]
elif spareBits >= 1: elif spareBits >= 1:
ctx.finalSubNoCarryImpl(r, u, M, t) ctx.finalSubNoOverflowImpl(r, u, M, t)
else: else:
ctx.finalSubMayCarryImpl(r, u, M, t, hi) ctx.finalSubMayOverflowImpl(r, u, M, t, hi)
# Code generation # Code generation
result.add ctx.generate() result.add ctx.generate()
@ -264,4 +264,4 @@ func fromMont_asm_adx*(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
block: # Map from [0, 2p) to [0, p) block: # Map from [0, 2p) to [0, p)
var workspace{.noInit.}: typeof(r) var workspace{.noInit.}: typeof(r)
r.finalSub_gen(t, M, workspace, mayCarry = false) r.finalSub_gen(t, M, workspace, mayOverflow = false)

View File

@ -70,6 +70,9 @@ declareCurves:
curve P224: # NIST P-224 curve P224: # NIST P-224
bitwidth: 224 bitwidth: 224
modulus: "0xffffffffffffffffffffffffffffffff000000000000000000000001" modulus: "0xffffffffffffffffffffffffffffffff000000000000000000000001"
order: "0xffffffffffffffffffffffffffff16a2e0b8f03e13dd29455c5c2a3d"
orderBitwidth: 224
curve BN254_Nogami: # Integer Variable χBased Ate Pairing, 2008, Nogami et al curve BN254_Nogami: # Integer Variable χBased Ate Pairing, 2008, Nogami et al
bitwidth: 254 bitwidth: 254
modulus: "0x2523648240000001ba344d80000000086121000000000013a700000000000013" modulus: "0x2523648240000001ba344d80000000086121000000000013a700000000000013"
@ -162,7 +165,7 @@ declareCurves:
# or for use in Hisil, Wong, Carter, and Dawson extended coordinates # or for use in Hisil, Wong, Carter, and Dawson extended coordinates
# ax² + y² = 1+dx²y² with a = -1 d=-121665/121666 # ax² + y² = 1+dx²y² with a = -1 d=-121665/121666
order: "0x1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed" order: "0x1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed"
orderBItwidth: 253 orderBitwidth: 253
cofactor: 8 cofactor: 8
eq_form: TwistedEdwards eq_form: TwistedEdwards
coef_a: -1 coef_a: -1
@ -188,9 +191,13 @@ declareCurves:
curve P256: # secp256r1 / NIST P-256 curve P256: # secp256r1 / NIST P-256
bitwidth: 256 bitwidth: 256
modulus: "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff" modulus: "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff"
order: "0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551"
orderBitwidth: 256
curve Secp256k1: # Bitcoin curve curve Secp256k1: # Bitcoin curve
bitwidth: 256 bitwidth: 256
modulus: "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f" modulus: "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f"
order: "0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141"
orderBitwidth: 256
curve BLS12_377: curve BLS12_377:
# Zexe curve # Zexe curve
# (p41) https://eprint.iacr.org/2018/962.pdf # (p41) https://eprint.iacr.org/2018/962.pdf

View File

@ -257,7 +257,7 @@ func countSpareBits*(M: BigInt): int =
let msb = log2_vartime(BaseType(M.limbs[M.limbs.len-1])) let msb = log2_vartime(BaseType(M.limbs[M.limbs.len-1]))
result = WordBitWidth - 1 - msb.int result = WordBitWidth - 1 - msb.int
func invModBitwidth[T: SomeUnsignedInt](a: T): T = func invModBitwidth*[T: SomeUnsignedInt](a: T): T =
# We use BaseType for return value because static distinct type # We use BaseType for return value because static distinct type
# confuses Nim semchecks [UPSTREAM BUG] # confuses Nim semchecks [UPSTREAM BUG]
# We don't enforce compile-time evaluation here # We don't enforce compile-time evaluation here

View File

@ -23,8 +23,10 @@ export BigInt, wordsRequired
# #
# ############################################################ # ############################################################
# No exceptions for the byte API # No exceptions for the byte API.
{.push raises: [].} # In particular we don't want if-branches when indexing an array
# that contains secret data
{.push raises: [], checks: off.}
# Note: the parsing/serialization routines were initially developed # Note: the parsing/serialization routines were initially developed
# with an internal representation that used 31 bits out of a uint32 # with an internal representation that used 31 bits out of a uint32
@ -36,9 +38,11 @@ export BigInt, wordsRequired
# prototyping, research and debugging purposes, # prototyping, research and debugging purposes,
# and can use exceptions. # and can use exceptions.
func unmarshalLE( func unmarshalLE[T](
dst: var BigInt, dst: var openArray[T],
src: openarray[byte]) = src: openarray[byte],
wordBitWidth: static int
) =
## Parse an unsigned integer from its canonical ## Parse an unsigned integer from its canonical
## little-endian unsigned representation ## little-endian unsigned representation
## and store it into a BigInt ## and store it into a BigInt
@ -47,36 +51,42 @@ func unmarshalLE(
## - no leaks ## - no leaks
## ##
## Can work at compile-time ## Can work at compile-time
##
## It is possible to use a 63-bit representation out of a 64-bit words
## by setting `wordBitWidth` to something different from sizeof(T) * 8
## This might be useful for architectures with no add-with-carry instructions.
# TODO: error on destination to small # TODO: error on destination to small
var var
dst_idx = 0 dst_idx = 0
acc = Zero acc = T(0)
acc_len = 0 acc_len = 0
for src_idx in 0 ..< src.len: for src_idx in 0 ..< src.len:
let src_byte = SecretWord(src[src_idx]) let src_byte = T(src[src_idx])
# buffer reads # buffer reads
acc = acc or (src_byte shl acc_len) acc = acc or (src_byte shl acc_len)
acc_len += 8 # We count bit by bit acc_len += 8 # We count bit by bit
# if full, dump # if full, dump
if acc_len >= WordBitWidth: if acc_len >= wordBitWidth:
dst.limbs[dst_idx] = acc dst[dst_idx] = acc
inc dst_idx inc dst_idx
acc_len -= WordBitWidth acc_len -= wordBitWidth
acc = src_byte shr (8 - acc_len) acc = src_byte shr (8 - acc_len)
if dst_idx < dst.limbs.len: if dst_idx < dst.len:
dst.limbs[dst_idx] = acc dst[dst_idx] = acc
for i in dst_idx + 1 ..< dst.limbs.len: for i in dst_idx + 1 ..< dst.len:
dst.limbs[i] = Zero dst[i] = T(0)
func unmarshalBE( func unmarshalBE[T](
dst: var BigInt, dst: var openArray[T],
src: openarray[byte]) = src: openarray[byte],
wordBitWidth: static int
) =
## Parse an unsigned integer from its canonical ## Parse an unsigned integer from its canonical
## big-endian unsigned representation (octet string) ## big-endian unsigned representation (octet string)
## and store it into a BigInt. ## and store it into a BigInt.
@ -88,36 +98,61 @@ func unmarshalBE(
## - no leaks ## - no leaks
## ##
## Can work at compile-time ## Can work at compile-time
##
## It is possible to use a 63-bit representation out of a 64-bit words
## by setting `wordBitWidth` to something different from sizeof(T) * 8
## This might be useful for architectures with no add-with-carry instructions.
var var
dst_idx = 0 dst_idx = 0
acc = Zero acc = T(0)
acc_len = 0 acc_len = 0
const wordBitWidth = sizeof(T) * 8
for src_idx in countdown(src.len-1, 0): for src_idx in countdown(src.len-1, 0):
let src_byte = SecretWord(src[src_idx]) let src_byte = T(src[src_idx])
# buffer reads # buffer reads
acc = acc or (src_byte shl acc_len) acc = acc or (src_byte shl acc_len)
acc_len += 8 # We count bit by bit acc_len += 8 # We count bit by bit
# if full, dump # if full, dump
if acc_len >= WordBitWidth: if acc_len >= wordBitWidth:
dst.limbs[dst_idx] = acc dst[dst_idx] = acc
inc dst_idx inc dst_idx
acc_len -= WordBitWidth acc_len -= wordBitWidth
acc = src_byte shr (8 - acc_len) acc = src_byte shr (8 - acc_len)
if dst_idx < dst.limbs.len: if dst_idx < dst.len:
dst.limbs[dst_idx] = acc dst[dst_idx] = acc
for i in dst_idx + 1 ..< dst.limbs.len: for i in dst_idx + 1 ..< dst.len:
dst.limbs[i] = Zero dst[i] = T(0)
func unmarshal*[T](
dst: var openArray[T],
src: openarray[byte],
wordBitWidth: static int,
srcEndianness: static Endianness) {.inline.} =
## Parse an unsigned integer from its canonical
## big-endian or little-endian unsigned representation
##
## Constant-Time:
## - no leaks
##
## Can work at compile-time to embed curve moduli
## from a canonical integer representation
when srcEndianness == littleEndian:
dst.unmarshalLE(src, wordBitWidth)
else:
dst.unmarshalBE(src, wordBitWidth)
func unmarshal*( func unmarshal*(
dst: var BigInt, dst: var BigInt,
src: openarray[byte], src: openarray[byte],
srcEndianness: static Endianness) = srcEndianness: static Endianness) {.inline.}=
## Parse an unsigned integer from its canonical ## Parse an unsigned integer from its canonical
## big-endian or little-endian unsigned representation ## big-endian or little-endian unsigned representation
## And store it into a BigInt of size `bits` ## And store it into a BigInt of size `bits`
@ -127,11 +162,7 @@ func unmarshal*(
## ##
## Can work at compile-time to embed curve moduli ## Can work at compile-time to embed curve moduli
## from a canonical integer representation ## from a canonical integer representation
dst.limbs.unmarshal(src, WordBitWidth, srcEndianness)
when srcEndianness == littleEndian:
dst.unmarshalLE(src)
else:
dst.unmarshalBE(src)
func unmarshal*( func unmarshal*(
T: type BigInt, T: type BigInt,
@ -146,7 +177,7 @@ func unmarshal*(
## ##
## Can work at compile-time to embed curve moduli ## Can work at compile-time to embed curve moduli
## from a canonical integer representation ## from a canonical integer representation
result.unmarshal(src, srcEndianness) result.limbs.unmarshal(src, WordBitWidth, srcEndianness)
func fromUint*( func fromUint*(
T: type BigInt, T: type BigInt,
@ -168,53 +199,54 @@ func fromUint*(
# #
# ############################################################ # ############################################################
template blobFrom(dst: var openArray[byte], src: SomeUnsignedInt, startIdx: int, endian: static Endianness) = func marshalLE[T](
## Write an integer into a raw binary blob
## Swapping endianness if needed
## startidx is the first written array item if littleEndian is requested
## or the last if bigEndian is requested
when endian == cpuEndian:
for i in 0 ..< sizeof(src):
dst[startIdx+i] = toByte(src shr (i * 8))
else:
for i in 0 ..< sizeof(src):
dst[startIdx+sizeof(src)-1-i] = toByte(src shr (i * 8))
func marshalLE(
dst: var openarray[byte], dst: var openarray[byte],
src: BigInt) = src: openArray[T],
wordBitWidth: static int) =
## Serialize a bigint into its canonical little-endian representation ## Serialize a bigint into its canonical little-endian representation
## I.e least significant bit first ## I.e least significant bit first
##
## It is possible to use a 63-bit representation out of a 64-bit words
## by setting `wordBitWidth` to something different from sizeof(T) * 8
## This might be useful for architectures with no add-with-carry instructions.
var var
src_idx, dst_idx = 0 src_idx, dst_idx = 0
acc: BaseType = 0
acc_len = 0 acc_len = 0
when sizeof(T) == 8:
type BT = uint64
elif sizeof(T) == 4:
type BT = uint32
else:
{.error "Unsupported word size uint" & $(sizeof(T) * 8).}
var acc = BT(0)
var tail = dst.len var tail = dst.len
while tail > 0: while tail > 0:
let w = if src_idx < src.limbs.len: BaseType(src.limbs[src_idx]) let w = if src_idx < src.len: BT(src[src_idx])
else: 0 else: 0
inc src_idx inc src_idx
if acc_len == 0: if acc_len == 0:
# We need to refill the buffer to output 64-bit # We need to refill the buffer to output 64-bit
acc = w acc = w
acc_len = WordBitWidth acc_len = wordBitWidth
else: else:
when WordBitWidth == sizeof(SecretWord) * 8: when wordBitWidth == sizeof(T) * 8:
let lo = acc let lo = acc
acc = w acc = w
else: # If using 63-bit (or less) out of uint64 else: # If using 63-bit (or less) out of uint64
let lo = (w shl acc_len) or acc let lo = (w shl acc_len) or acc
dec acc_len dec acc_len
acc = w shr (WordBitWidth - acc_len) acc = w shr (wordBitWidth - acc_len)
if tail >= sizeof(SecretWord): if tail >= sizeof(T):
# Unrolled copy # Unrolled copy
dst.blobFrom(src = lo, dst_idx, littleEndian) dst.blobFrom(src = lo, dst_idx, littleEndian)
dst_idx += sizeof(SecretWord) dst_idx += sizeof(T)
tail -= sizeof(SecretWord) tail -= sizeof(T)
else: else:
# Process the tail and exit # Process the tail and exit
when cpuEndian == littleEndian: when cpuEndian == littleEndian:
@ -229,43 +261,56 @@ func marshalLE(
dst[dst_idx+i] = toByte(lo shr ((tail-i)*8)) dst[dst_idx+i] = toByte(lo shr ((tail-i)*8))
return return
func marshalBE( func marshalBE[T](
dst: var openarray[byte], dst: var openarray[byte],
src: BigInt) = src: openArray[T],
wordBitWidth: static int) =
## Serialize a bigint into its canonical big-endian representation ## Serialize a bigint into its canonical big-endian representation
## (octet string) ## (octet string)
## I.e most significant bit first ## I.e most significant bit first
## ##
## In cryptography specifications, this is often called ## In cryptography specifications, this is often called
## "Octet string to Integer" ## "Octet string to Integer"
##
## It is possible to use a 63-bit representation out of a 64-bit words
## by setting `wordBitWidth` to something different from sizeof(T) * 8
## This might be useful for architectures with no add-with-carry instructions.
var var
src_idx = 0 src_idx = 0
acc: BaseType = 0
acc_len = 0 acc_len = 0
when sizeof(T) == 8:
type BT = uint64
elif sizeof(T) == 4:
type BT = uint32
else:
{.error "Unsupported word size uint" & $(sizeof(T) * 8).}
var acc = BT(0)
var tail = dst.len var tail = dst.len
while tail > 0: while tail > 0:
let w = if src_idx < src.limbs.len: BaseType(src.limbs[src_idx]) let w = if src_idx < src.len: BaseType(src[src_idx])
else: 0 else: 0
inc src_idx inc src_idx
if acc_len == 0: if acc_len == 0:
# We need to refill the buffer to output 64-bit # We need to refill the buffer to output 64-bit
acc = w acc = w
acc_len = WordBitWidth acc_len = wordBitWidth
else: else:
when WordBitWidth == sizeof(SecretWord) * 8: when wordBitWidth == sizeof(T) * 8:
let lo = acc let lo = acc
acc = w acc = w
else: # If using 63-bit (or less) out of uint64 else: # If using 63-bit (or less) out of uint64
let lo = (w shl acc_len) or acc let lo = (w shl acc_len) or acc
dec acc_len dec acc_len
acc = w shr (WordBitWidth - acc_len) acc = w shr (wordBitWidth - acc_len)
if tail >= sizeof(SecretWord): if tail >= sizeof(T):
# Unrolled copy # Unrolled copy
tail -= sizeof(SecretWord) tail -= sizeof(T)
dst.blobFrom(src = lo, tail, bigEndian) dst.blobFrom(src = lo, tail, bigEndian)
else: else:
# Process the tail and exit # Process the tail and exit
@ -281,10 +326,26 @@ func marshalBE(
dst[tail-1-i] = toByte(lo shr ((tail-i)*8)) dst[tail-1-i] = toByte(lo shr ((tail-i)*8))
return return
func marshal*[T](
dst: var openArray[byte],
src: openArray[T],
wordBitWidth: static int,
dstEndianness: static Endianness) {.inline.} =
## Serialize a bigint into its canonical big-endian or little endian
## representation.
##
## If the buffer is bigger, output will be zero-padded left for big-endian
## or zero-padded right for little-endian.
## I.e least significant bit is aligned to buffer boundary
when dstEndianness == littleEndian:
marshalLE(dst, src, wordBitWidth)
else:
marshalBE(dst, src, wordBitWidth)
func marshal*( func marshal*(
dst: var openarray[byte], dst: var openArray[byte],
src: BigInt, src: BigInt,
dstEndianness: static Endianness) = dstEndianness: static Endianness) {.inline.} =
## Serialize a bigint into its canonical big-endian or little endian ## Serialize a bigint into its canonical big-endian or little endian
## representation. ## representation.
## A destination buffer of size "(BigInt.bits + 7) div 8" at minimum is needed, ## A destination buffer of size "(BigInt.bits + 7) div 8" at minimum is needed,
@ -293,17 +354,16 @@ func marshal*(
## If the buffer is bigger, output will be zero-padded left for big-endian ## If the buffer is bigger, output will be zero-padded left for big-endian
## or zero-padded right for little-endian. ## or zero-padded right for little-endian.
## I.e least significant bit is aligned to buffer boundary ## I.e least significant bit is aligned to buffer boundary
debug: debug:
doAssert dst.len >= (BigInt.bits + 7) div 8, "BigInt -> Raw int conversion: destination buffer is too small" doAssert dst.len >= (BigInt.bits + 7) div 8, block:
"BigInt -> Raw int conversion: destination buffer is too small\n" &
" bits: " & $BigInt.bits & "\n" &
" bytes allocated: " & $dst.len & '\n'
when BigInt.bits == 0: when BigInt.bits == 0:
zeroMem(dst, dst.len) zeroMem(dst, dst.len)
when dstEndianness == littleEndian: dst.marshal(src.limbs, WordBitWidth, dstEndianness)
marshalLE(dst, src)
else:
marshalBE(dst, src)
{.pop.} # {.push raises: [].} {.pop.} # {.push raises: [].}
@ -379,7 +439,7 @@ func hexToPaddedByteArray*(hexStr: string, output: var openArray[byte], order: s
shift = (shift + 4) and 4 shift = (shift + 4) and 4
dstIdx += shift shr 2 dstIdx += shift shr 2
func nativeEndianToHex(bytes: openarray[byte], order: static[Endianness]): string = func nativeEndianToHex*(bytes: openarray[byte], order: static[Endianness]): string =
## Convert a byte-array to its hex representation ## Convert a byte-array to its hex representation
## Output is in lowercase and not prefixed. ## Output is in lowercase and not prefixed.
## This assumes that input is in platform native endianness ## This assumes that input is in platform native endianness
@ -465,9 +525,6 @@ func appendHex*(dst: var string, big: BigInt, order: static Endianness = bigEndi
# 2 Convert canonical uint to hex # 2 Convert canonical uint to hex
dst.add bytes.nativeEndianToHex(order) dst.add bytes.nativeEndianToHex(order)
func toHex*(a: openArray[byte]): string =
nativeEndianToHex(a, system.cpuEndian)
func toHex*(big: BigInt, order: static Endianness = bigEndian): string = func toHex*(big: BigInt, order: static Endianness = bigEndian): string =
## Stringify an int to hex. ## Stringify an int to hex.
## Note. Leading zeros are not removed. ## Note. Leading zeros are not removed.

View File

@ -0,0 +1,117 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
../platforms/gpu/[llvm, nvidia, ir]
# ############################################################
#
# Field arithmetic on Nvidia GPU
#
# ############################################################
# Loads from global (kernel params) take over 100 cycles
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#operand-costs
proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) =
## If a >= Modulus: r <- a-M
## else: r <- a
##
## This is constant-time straightline code.
## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU.
##
## To be used when the final substraction can
## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
let bld = asy.builder
let fieldTy = cm.getFieldType(field)
let scratch = bld.makeArray(fieldTy)
let M = cm.getModulus(field)
let N = M.len
# Contains 0x0001 (if overflowed limbs) or 0x0000
let overflowedLimbs = bld.add_ci(0'u32, 0'u32)
# Now substract the modulus, and test a < M with the last borrow
scratch[0] = bld.sub_bo(a[0], M[0])
for i in 1 ..< N:
scratch[i] = bld.sub_bio(a[i], M[i])
# 1. if `overflowedLimbs`, underflowedModulus >= 0
# 2. if a >= M, underflowedModulus >= 0
# if underflowedModulus >= 0: a-M else: a
let underflowedModulus = bld.sub_bi(overflowedLimbs, 0'u32)
for i in 0 ..< N:
r[i] = bld.slct(scratch[i], a[i], underflowedModulus)
proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) =
## If a >= Modulus: r <- a-M
## else: r <- a
##
## This is constant-time straightline code.
## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU.
##
## To be used when the modulus does not use the full bitwidth of the storing words
## (say using 255 bits for the modulus out of 256 available in words)
let bld = asy.builder
let fieldTy = cm.getFieldType(field)
let scratch = bld.makeArray(fieldTy)
let M = cm.getModulus(field)
let N = M.len
# Now substract the modulus, and test a < M with the last borrow
scratch[0] = bld.sub_bo(a[0], M[0])
for i in 1 ..< N:
scratch[i] = bld.sub_bio(a[i], M[i])
# If it underflows here a was smaller than the modulus, which is what we want
let underflowedModulus = bld.sub_bi(0'u32, 0'u32)
for i in 0 ..< N:
r[i] = bld.slct(scratch[i], a[i], underflowedModulus)
proc field_add_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef =
## Generate an optimized modular addition kernel
## with parameters `a, b, modulus: Limbs -> Limbs`
let procName = cm.genSymbol(block:
case field
of fp: opFpAdd
of fr: opFrAdd)
let fieldTy = cm.getFieldType(field)
let pFieldTy = pointer_t(fieldTy)
let addModTy = function_t(asy.void_t, [pFieldTy, pFieldTy, pFieldTy])
let addModKernel = asy.module.addFunction(cstring procName, addModTy)
let blck = asy.ctx.appendBasicBlock(addModKernel, "addModBody")
asy.builder.positionAtEnd(blck)
let bld = asy.builder
let r = bld.asArray(addModKernel.getParam(0), fieldTy)
let a = bld.asArray(addModKernel.getParam(1), fieldTy)
let b = bld.asArray(addModKernel.getParam(2), fieldTy)
let t = bld.makeArray(fieldTy)
let N = cm.getNumWords(field)
t[0] = bld.add_co(a[0], b[0])
for i in 1 ..< N:
t[i] = bld.add_cio(a[i], b[i])
if cm.getSpareBits(field) >= 1:
asy.finalSubNoOverflow(cm, field, t, t)
else:
asy.finalSubMayOverflow(cm, field, t, t)
bld.store(r, t)
bld.retVoid()
return (addModTy, addModKernel)

View File

@ -6,6 +6,8 @@
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms. # at your option. This file may not be copied, modified, or distributed except according to those terms.
import ./compilers/bitops
# ############################################################ # ############################################################
# #
# Bit hacks # Bit hacks
@ -14,19 +16,19 @@
# Bithacks # Bithacks
# ------------------------------------------------------------ # ------------------------------------------------------------
# TODO: Nim std/bitops is unsatisfactory # Nim std/bitops is unsatisfactory
# in particular the "noUndefined" flag # in particular the "noUndefined" flag
# for countLeadingZeroBits/countTrailingZeroBits # for countLeadingZeroBits/countTrailingZeroBits
# is returning zero instead of the integer bitwidth # is returning zero instead of the integer bitwidth
# #
# Furthermore it is not guaranteed constant-time # Furthermore it is not guaranteed constant-time
# And lastly, even compiler builtin may be slightly inefficient # And lastly, even compiler builtin may be slightly inefficient
# for example when doing fastLog2 # for example when doing fastLog2
# which is "31 - builtin_clz" we get # which is "31 - builtin_clz" we get
# `bsr + xor (from clz) + sub` # `bsr + xor (from clz) + sub`
# instead of plain `bsr` # instead of plain `bsr`
# #
# At the moment we don't need them to operate on secret data # At the moment we don't need them to operate on secret data
# #
# See: https://www.chessprogramming.org/BitScan # See: https://www.chessprogramming.org/BitScan
# https://www.chessprogramming.org/General_Setwise_Operations # https://www.chessprogramming.org/General_Setwise_Operations
@ -80,11 +82,20 @@ func log2impl_vartime(x: uint64): uint64 {.inline.} =
func log2_vartime*[T: SomeUnsignedInt](n: T): T {.inline.} = func log2_vartime*[T: SomeUnsignedInt](n: T): T {.inline.} =
## Find the log base 2 of an integer ## Find the log base 2 of an integer
when sizeof(T) == sizeof(uint64): ##
T(log2impl_vartime(uint64(n))) ## ⚠ With GCC and Clang compilers on x86, if n is zero, result is undefined.
when nimvm:
when sizeof(T) == sizeof(uint64):
T(log2impl_vartime(uint64(n)))
else:
static: doAssert sizeof(T) <= sizeof(uint32)
T(log2impl_vartime(uint32(n)))
else: else:
static: doAssert sizeof(T) <= sizeof(uint32) when sizeof(T) == sizeof(uint64):
T(log2impl_vartime(uint32(n))) T(log2_c_compiler_vartime(uint64(n)))
else:
static: doAssert sizeof(T) <= sizeof(uint32)
T(log2_c_compiler_vartime(uint32(n)))
func hammingWeight*(x: uint32): uint {.inline.} = func hammingWeight*(x: uint32): uint {.inline.} =
## Counts the set bits in integer. ## Counts the set bits in integer.

View File

@ -0,0 +1,87 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../constant_time/ct_types
when GCC_Compatible:
func builtin_clz(n: uint32): cint {.importc: "__builtin_clz", nodecl.}
## Count the number of leading zeros
## undefined if n is zero
func builtin_clzll(n: uint64): cint {.importc: "__builtin_clzll", nodecl.}
## Count the number of leading zeros
## undefined if n is zero
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
cast[int](31 - cast[cuint](builtin_clz(n.uint32)))
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
cast[int](63 - cast[cuint](builtin_clzll(n)))
elif defined(icc):
func bitScanReverse(r: var uint32, n: uint32): uint8 {.importc: "_BitScanReverse", header: "<immintrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
func bitScanReverse64(r: var uint32, n: uint64): uint8 {.importc: "_BitScanReverse64", header: "<immintrin.h>".}
## Returns 0 if n is zero and non-zero otherwise
## Returns the position of the first set bit in `r`
template bitscan(fnc: untyped; v: untyped): int {.inline.} =
var index: uint32
if fnc(index.addr, v) == 0:
return 0
return index.int
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse, c.uint32)
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse64, n)
elif defined(vcc):
func bitScanReverse(p: ptr uint32, b: uint32): uint8 {.importc: "_BitScanReverse", header: "<intrin.h>".}
## Returns 0 if n s no set bit and non-zero otherwise
## Returns the position of the first set bit in `r`
func bitScanReverse64(p: ptr uint32, b: uint64): uint8 {.importc: "_BitScanReverse64", header: "<intrin.h>".}
## Returns 0 if n s no set bit and non-zero otherwise
## Returns the position of the first set bit in `r`
template bitscan(fnc: untyped; v: untyped): int =
var index: uint32
if fnc(index.addr, v) == 0:
return 0
return index.int
func log2_c_compiler_vartime*(n: uint8|uint16|uint32): int {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse, c.uint32)
func log2_c_compiler_vartime*(n: uint64): int {.inline.} =
## Compute the log2 of n using compiler builtin
## ⚠ Depending on the compiler:
## - It is undefined if n == 0
## - It is not constant-time as a zero input is checked
bitscan(bitScanReverse64, n)
else:
{. error: "Unsupported compiler".}

View File

@ -22,6 +22,18 @@ template toByte*(x: SomeUnsignedInt): byte =
else: else:
byte(x) byte(x)
template blobFrom*(dst: var openArray[byte], src: SomeUnsignedInt, startIdx: int, endian: static Endianness) =
## Write an integer into a raw binary blob
## Swapping endianness if needed
## startidx is the first written array item if littleEndian is requested
## or the last if bigEndian is requested
when endian == cpuEndian:
for i in 0 ..< sizeof(src):
dst[startIdx+i] = toByte(src shr (i * 8))
else:
for i in 0 ..< sizeof(src):
dst[startIdx+sizeof(src)-1-i] = toByte(src shr (i * 8))
func parseFromBlob*[T: byte|char]( func parseFromBlob*[T: byte|char](
dst: var SomeUnsignedInt, dst: var SomeUnsignedInt,
src: openArray[T], src: openArray[T],

View File

@ -0,0 +1,16 @@
# GPU compilation targets
For CPUs, Constantine JIT-compiles the cryptographic kernels via LLVM.
This allows targeting several devices with a single frontend, the LLVM IR.
Current use-cases are large scale aggregations, large-scale multi-scalar-multiplications and large-scale FFTs.
Those are important primitives for:
- aggregation of public keys or signatures from a large number of nodes
- protocols based on polynomial commitments
- zero-knowledge proof systems
Potential future use-cases include erasure coding and lattice-based cryptography acceleration.
⚠️ GPU usage is not constant-time and requires allocation of dynamic memory. It MUST NOT be used for secret data.

View File

@ -0,0 +1,581 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ./utils
{.passc: gorge("llvm-config --cflags").}
{.passl: gorge("llvm-config --libs").}
const libLLVM = gorge("llvm-config --libnames")
static: echo "[Constantine] Using library " & libLLVM
# ############################################################
#
# Bindings to LLVM JIT
#
# ############################################################
# https://llvm.org/doxygen/group__LLVMC.html
# Constantine is a library. It is possible that applications relying on Constantine
# also link to libLLVM, for example if they implement a virtual machine (for the EVM, for Snarks/zero-knowledge, ...).
# Hence Constantine should always use LLVM context to "namespace" its own codegen and avoid collisions in the global context.
{.push used, cdecl, dynlib: libLLVM.}
# ############################################################
#
# LLVM
#
# ############################################################
type
LlvmBool = distinct int32
ErrorRef = distinct pointer
MemoryBufferRef = distinct pointer
ContextRef* = distinct pointer
ModuleRef* = distinct pointer
TargetRef* = distinct pointer
ExecutionEngineRef* = distinct pointer
TargetMachineRef* = distinct pointer
PassManagerRef* = distinct pointer
PassManagerBuilderRef* = distinct pointer
PassBuilderOptionsRef* = distinct pointer
PassRegistryRef* = distinct pointer
TypeRef* = distinct pointer
ValueRef* = distinct pointer
MetadataRef = distinct pointer
LLVMstring = distinct cstring
ErrorMessageString = distinct cstring
## A string with a buffer owned by LLVM
# <llvm-c/Core.h>
proc createContext*(): ContextRef {.importc: "LLVMContextCreate".}
proc dispose*(ctx: ContextRef) {.importc: "LLVMContextDispose".}
proc dispose(msg: LLVMstring) {.importc: "LLVMDisposeMessage".}
## cstring in LLVM are owned by LLVM and must be destroyed with a specific function
proc dispose(buf: MemoryBufferRef){.importc: "LLVMDisposeMemoryBuffer".}
proc getBufferStart(buf: MemoryBufferRef): ptr byte {.importc: "LLVMGetBufferStart".}
proc getBufferSize(buf: MemoryBufferRef): csize_t {.importc: "LLVMGetBufferSize".}
proc dispose(msg: ErrorMessageString) {.importc: "LLVMDisposeErrorMessage".}
proc getErrorMessage(err: ErrorRef): ErrorMessageString {.importc: "LLVMGetErrorMessage".}
# ############################################################
#
# Module
#
# ############################################################
# {.push header: "<llvm-c/Core.h>".}
proc createModule(name: cstring, ctx: ContextRef): ModuleRef {.importc: "LLVMModuleCreateWithNameInContext".}
proc dispose*(m: ModuleRef) {.importc: "LLVMDisposeModule".}
## Destroys a module
## Note: destroying an Execution Engine will also destroy modules attached to it
proc toIR_LLVMstring(m: ModuleRef): LLVMstring {.used, importc: "LLVMPrintModuleToString".}
## Print a module IR to textual IR string. The string must be disposed with LLVM "dispose" or memory will leak.
proc getContext*(m: ModuleRef): ContextRef {.importc: "LLVMGetModuleContext".}
proc getIdentifier*(m: ModuleRef, rLen: var csize_t): cstring {.used, importc: "LLVMGetModuleIdentifier".}
proc addNamedMetadataOperand*(m: ModuleRef, name: cstring, val: ValueRef) {.importc: "LLVMAddNamedMetadataOperand".}
proc metadataNode*(ctx: ContextRef, metadataNodes: openArray[MetadataRef]): MetadataRef {.wrapOpenArrayLenType: csize_t, importc: "LLVMMDNodeInContext2".}
proc metadataNode*(ctx: ContextRef, str: openArray[char]): MetadataRef {.wrapOpenArrayLenType: csize_t, importc: "LLVMMDStringInContext2".}
proc asMetadataRef*(val: ValueRef): MetadataRef {.importc: "LLVMValueAsMetadata".}
proc asValueRef*(ctx: ContextRef, md: MetadataRef): ValueRef {.importc: "LLVMMetadataAsValue".}
# <llvm-c/BitWriter.h>
proc writeBitcodeToFile*(m: ModuleRef, path: cstring) {.importc: "LLVMWriteBitcodeToFile".}
proc writeBitcodeToMemoryBuffer(m: ModuleRef): MemoryBufferRef {.used, importc: "LLVMWriteBitcodeToMemoryBuffer".}
## Write bitcode to a memory buffer
## The MemoryBuffer must be disposed appropriately or memory will leak
type VerifierFailureAction* {.size: sizeof(cint).} = enum
AbortProcessAction # verifier will print to stderr and abort()
PrintMessageAction # verifier will print to stderr and return 1
ReturnStatusAction # verifier will just return 1
# {.push header: "<llvm-c/Analysis.h>".}
proc verify(module: ModuleRef, failureAction: VerifierFailureAction, msg: var LLVMstring): LlvmBool {.used, importc: "LLVMVerifyModule".}
# ############################################################
#
# Target
#
# ############################################################
# "<llvm-c/Target.h>"
# The following procedures:
# - initializeNativeTarget()
# - initializeNativeAsmPrinter()
# are implemented in the development header macros and aren't in the LLVM library
# We want to only depend on the runtime for installation ease and size.
#
# We can emulate the calls based on:
# - /usr/include/llvm-c/Target.h
# - /usr/include/llvm/Config/llvm-config-64.h
# proc initializeNativeTarget*(): LlvmBool {.discardable, importc: "LLVMInitializeNativeTarget".}
# proc initializeNativeAsmPrinter*(): LlvmBool {.discardable, importc: "LLVMInitializeNativeAsmPrinter".}
proc initializeX86AsmPrinter() {.importc: "LLVMInitializeX86AsmPrinter".}
proc initializeX86Target() {.importc: "LLVMInitializeX86Target".}
proc initializeX86TargetInfo() {.importc: "LLVMInitializeX86TargetInfo".}
proc initializeX86TargetMC() {.importc: "LLVMInitializeX86TargetMC".}
proc initializeNVPTXAsmPrinter() {.importc: "LLVMInitializeNVPTXAsmPrinter".}
proc initializeNVPTXTarget() {.importc: "LLVMInitializeNVPTXTarget".}
proc initializeNVPTXTargetInfo() {.importc: "LLVMInitializeNVPTXTargetInfo".}
proc initializeNVPTXTargetMC() {.importc: "LLVMInitializeNVPTXTargetMC".}
proc getTargetFromName*(name: cstring): TargetRef {.importc: "LLVMGetTargetFromName".}
proc getTargetFromTriple*(triple: cstring, target: var TargetRef, errorMessage: var LLVMstring
): LLVMBool {.importc: "LLVMGetTargetFromTriple".}
proc getTargetDescription*(t: TargetRef): cstring {.importc: "LLVMGetTargetDescription".}
proc hasJIT*(t: TargetRef): LLVMBool {.importc: "LLVMTargetHasJIT".}
proc hasTargetMachine*(t: TargetRef): LLVMBool {.importc: "LLVMTargetHasTargetMachine".}
proc hasAsmBackend*(t: TargetRef): LLVMBool {.importc: "LLVMTargetHasAsmBackend".}
# {.push header: "<llvm-c/Core.h>".}
proc setTarget*(module: ModuleRef, triple: cstring) {.importc: "LLVMSetTarget".}
proc setDataLayout*(module: ModuleRef, layout: cstring) {.importc: "LLVMSetDataLayout".}
# ############################################################
#
# Execution Engine
#
# ############################################################
# "<llvm-c/ExecutionEngine.h>"
proc createJITCompilerForModule(
engine: var ExecutionEngineRef,
module: ModuleRef,
optLevel: uint32,
err: var LLVMstring): LlvmBool {.used, importc: "LLVMCreateJITCompilerForModule".}
proc dispose*(engine: ExecutionEngineRef) {.importc: "LLVMDisposeExecutionEngine".}
## Destroys an execution engine
## Note: destroying an Execution Engine will also destroy modules attached to it
proc getFunctionAddress*(engine: ExecutionEngineRef, name: cstring): distinct pointer {.importc: "LLVMGetFunctionAddress".}
# ############################################################
#
# Target Machine
#
# ############################################################
type
CodeGenOptLevel* {.size: sizeof(cint).} = enum
CodeGenLevelNone, CodeGenLevelLess, CodeGenLevelDefault, CodeGenLevelAggressive
RelocMode* {.size: sizeof(cint).} = enum
RelocDefault, RelocStatic, RelocPIC, RelocDynamicNoPic, RelocROPI, RelocRWPI,
RelocROPI_RWPI
CodeModel* {.size: sizeof(cint).} = enum
CodeModelDefault, CodeModelJITDefault, CodeModelTiny, CodeModelSmall,
CodeModelKernel, CodeModelMedium, CodeModelLarge
CodeGenFileType* {.size: sizeof(cint).} = enum
AssemblyFile, ObjectFile
TargetDataRef* = distinct pointer
TargetLibraryInfoRef* = distinct pointer
# "<llvm-c/TargetMachine.h>"
proc createTargetMachine*(
target: TargetRef, triple, cpu, features: cstring,
level: CodeGenOptLevel, reloc: RelocMode, codeModel: CodeModel): TargetMachineRef {.importc: "LLVMCreateTargetMachine".}
proc dispose*(m: TargetMachineRef) {.importc: "LLVMDisposeTargetMachine".}
proc createTargetDataLayout*(t: TargetMachineRef): TargetDataRef {.importc: "LLVMCreateTargetDataLayout".}
proc dispose*(m: TargetDataRef) {.importc: "LLVMDisposeTargetData".}
proc setDataLayout*(module: ModuleRef, dataLayout: TargetDataRef) {.importc: "LLVMSetModuleDataLayout".}
proc targetMachineEmitToFile*(t: TargetMachineRef, m: ModuleRef, fileName: cstring,
codegen: CodeGenFileType, errorMessage: var LLVMstring): LLVMBool {.importc: "LLVMTargetMachineEmitToFile".}
proc targetMachineEmitToMemoryBuffer*(t: TargetMachineRef, m: ModuleRef,
codegen: CodeGenFileType,
errorMessage: var LLVMstring,
outMemBuf: var MemoryBufferRef): LLVMBool {.importc: "LLVMTargetMachineEmitToMemoryBuffer".}
# ############################################################
#
# Passes and transforms
#
# ############################################################
# - https://blog.llvm.org/posts/2021-03-26-the-new-pass-manager/
# - https://llvm.org/docs/NewPassManager.html
# https://llvm.org/doxygen/group__LLVMCCorePassManagers.html
# # header: "<llvm-c/Core.h>"
proc createPassManager*(): PassManagerRef {.importc: "LLVMCreatePassManager".}
proc dispose*(pm: PassManagerRef) {.importc: "LLVMDisposePassManager".}
proc run*(pm: PassManagerRef, module: ModuleRef) {. importc: "LLVMRunPassManager".}
# https://llvm.org/doxygen/group__LLVMCTransformsPassManagerBuilder.html
# header: "<llvm-c/Transforms/PassManagerBuilder.h>"
proc createPassManagerBuilder*(): PassManagerBuilderRef {.importc: "LLVMPassManagerBuilderCreate".}
proc dispose*(pmb: PassManagerBuilderRef) {.importc: "LLVMPassManagerBuilderDispose".}
proc setOptLevel*(pmb: PassManagerBuilderRef, level: uint32) {.importc: "LLVMPassManagerBuilderSetOptLevel".}
proc setSizeLevel*(pmb: PassManagerBuilderRef, level: uint32) {.importc: "LLVMPassManagerBuilderSetSizeLevel".}
proc populateModulePassManager*(pmb: PassManagerBuilderRef, legacyPM: PassManagerRef) {. importc: "LLVMPassManagerBuilderPopulateModulePassManager".}
# https://llvm.org/doxygen/group__LLVMCCoreNewPM.html
# header: "<llvm-c/Transforms/PassBuilder.h>"
proc createPassBuilderOptions*(): PassBuilderOptionsRef {.importc: "LLVMCreatePassBuilderOptions".}
proc dispose*(pbo: PassBuilderOptionsRef) {.importc: "LLVMDisposePassBuilderOptions".}
proc runPasses(module: ModuleRef, passes: cstring, machine: TargetMachineRef, pbo: PassBuilderOptionsRef): ErrorRef {.importc: "LLVMRunPasses".}
# https://llvm.org/docs/doxygen/group__LLVMCInitialization.html
# header: "<llvm-c/Initialization.h>"
proc getGlobalPassRegistry(): PassRegistryRef {.importc: "LLVMGetGlobalPassRegistry".}
proc initializeCore(registry: PassRegistryRef) {.importc: "LLVMInitializeCore".}
proc initializeTransformUtils(registry: PassRegistryRef) {.importc: "LLVMInitializeTransformUtils".}
proc initializeScalarOpts(registry: PassRegistryRef) {.importc: "LLVMInitializeScalarOpts".}
proc initializeObjCARCOpts(registry: PassRegistryRef) {.importc: "LLVMInitializeObjCARCOpts".}
proc initializeVectorization(registry: PassRegistryRef) {.importc: "LLVMInitializeVectorization".}
proc initializeInstCombine(registry: PassRegistryRef) {.importc: "LLVMInitializeInstCombine".}
proc initializeAggressiveInstCombiner(registry: PassRegistryRef) {.importc: "LLVMInitializeAggressiveInstCombiner".}
proc initializeIPO(registry: PassRegistryRef) {.importc: "LLVMInitializeIPO".}
proc initializeInstrumentation(registry: PassRegistryRef) {.importc: "LLVMInitializeInstrumentation".}
proc initializeAnalysis(registry: PassRegistryRef) {.importc: "LLVMInitializeAnalysis".}
proc initializeIPA(registry: PassRegistryRef) {.importc: "LLVMInitializeIPA".}
proc initializeCodeGen(registry: PassRegistryRef) {.importc: "LLVMInitializeCodeGen".}
proc initializeTarget(registry: PassRegistryRef) {.importc: "LLVMInitializeTarget".}
# https://llvm.org/doxygen/group__LLVMCTarget.html
proc addTargetLibraryInfo*(tli: TargetLibraryInfoRef, pm: PassManagerRef) {.importc: "LLVMAddTargetLibraryInfo".}
# There doesn't seem to be a way to instantiate TargetLibraryInfoRef :/
proc addAnalysisPasses*(machine: TargetMachineRef, pm: PassManagerRef) {.importc: "LLVMAddAnalysisPasses".}
# https://www.llvm.org/docs/Passes.html
# -------------------------------------
# https://llvm.org/doxygen/group__LLVMCTransformsInstCombine.html
proc addInstructionCombiningPass*(pm: PassManagerRef) {.importc: "LLVMAddInstructionCombiningPass".}
# https://llvm.org/doxygen/group__LLVMCTransformsUtils.html
proc addPromoteMemoryToRegisterPass*(pm: PassManagerRef) {.importc: "LLVMAddPromoteMemoryToRegisterPass".}
# https://llvm.org/doxygen/group__LLVMCTransformsScalar.html
proc addAggressiveDeadCodeEliminationPass*(pm: PassManagerRef) {.importc: "LLVMAddAggressiveDCEPass".}
proc addDeadStoreEliminationPass*(pm: PassManagerRef) {.importc: "LLVMAddDeadStoreEliminationPass".}
proc addGlobalValueNumberingPass*(pm: PassManagerRef) {.importc: "LLVMAddNewGVNPass".}
proc addMemCpyOptPass*(pm: PassManagerRef) {.importc: "LLVMAddMemCpyOptPass".}
proc addScalarReplacementOfAggregatesPass*(pm: PassManagerRef) {.importc: "LLVMAddScalarReplAggregatesPass".}
# https://llvm.org/doxygen/group__LLVMCTransformsIPO.html
proc addDeduceFunctionAttributesPass*(pm: PassManagerRef) {.importc: "LLVMAddFunctionAttrsPass".}
proc addFunctionInliningPass*(pm: PassManagerRef) {.importc: "LLVMAddFunctionInliningPass".}
# ############################################################
#
# Types
#
# ############################################################
# https://llvm.org/doxygen/group__LLVMCCoreType.html
type
TypeKind* {.size: sizeof(cint).} = enum
tkVoid, ## type with no size
tkHalf, ## 16 bit floating point type
tkFloat, ## 32 bit floating point type
tkDouble, ## 64 bit floating point type
tkX86_FP80, ## 80 bit floating point type (X87)
tkFP128, ## 128 bit floating point type (112-bit mantissa)
tkPPC_FP128, ## 128 bit floating point type (two 64-bits)
tkLabel, ## Labels
tkInteger, ## Arbitrary bit width integers
tkFunction, ## Functions
tkStruct, ## Structures
tkArray, ## Arrays
tkPointer, ## Pointers
tkVector, ## Fixed width SIMD vector type
tkMetadata, ## Metadata
tkX86_MMX, ## X86 MMX
tkToken, ## Tokens
tkScalableVector, ## Scalable SIMD vector type
tkBFloat, ## 16 bit brain floating point type
tkX86_AMX ## X86 AMX
# header: "<llvm-c/Core.h>"
proc getContext*(ty: TypeRef): ContextRef {.importc: "LLVMGetTypeContext".}
proc getTypeKind*(ty: TypeRef): TypeKind {.importc: "LLVMGetTypeKind".}
proc dumpType*(ty: TypeRef) {.sideeffect, importc: "LLVMDumpType".}
proc toLLVMstring(ty: TypeRef): LLVMstring {.used, importc: "LLVMPrintTypeToString".}
proc void_t*(ctx: ContextRef): TypeRef {.importc: "LLVMVoidTypeInContext".}
# Integers
# ------------------------------------------------------------
proc int1_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt1TypeInContext".}
proc int8_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt8TypeInContext".}
proc int16_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt16TypeInContext".}
proc int32_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt32TypeInContext".}
proc int64_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt64TypeInContext".}
proc int128_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt128TypeInContext".}
proc int_t*(ctx: ContextRef, numBits: uint32): TypeRef {.importc: "LLVMIntTypeInContext".}
proc getIntTypeWidth*(ty: TypeRef): uint32 {.importc: "LLVMGetIntTypeWidth".}
# Composite
# ------------------------------------------------------------
proc struct_t*(
ctx: ContextRef,
elemTypes: openArray[TypeRef],
packed: LlvmBool): TypeRef {.wrapOpenArrayLenType: cuint, importc: "LLVMStructTypeInContext".}
proc array_t*(elemType: TypeRef, elemCount: uint32): TypeRef {.importc: "LLVMArrayType".}
proc pointerType(elementType: TypeRef; addressSpace: cuint): TypeRef {.used, importc: "LLVMPointerType".}
proc getElementType*(arrayOrVectorTy: TypeRef): TypeRef {.importc: "LLVMGetElementType".}
# Functions
# ------------------------------------------------------------
proc function_t*(
returnType: TypeRef,
paramTypes: openArray[TypeRef],
isVarArg: LlvmBool): TypeRef {.wrapOpenArrayLenType: cuint, importc: "LLVMFunctionType".}
proc addFunction*(m: ModuleRef, name: cstring, ty: TypeRef): ValueRef {.importc: "LLVMAddFunction".}
## Declare a function `name` in a module.
## Returns a handle to specify its instructions
# TODO: Function and Parameter attributes:
# - https://www.llvm.org/docs/LangRef.html?highlight=attribute#function-attributes
# - https://www.llvm.org/docs/LangRef.html?highlight=attribute#parameter-attributes
#
# We can use attributes to specify additional guarantees of Constantine code, for instance:
# - "pure" function with: nounwind, readonly
# - pointer particularities: readonly, writeonly, noalias, inalloca, byval
proc getReturnType*(functionTy: TypeRef): TypeRef {.importc: "LLVMGetReturnType".}
proc countParamTypes*(functionTy: TypeRef): uint32 {.importc: "LLVMCountParamTypes".}
# ############################################################
#
# Values
#
# ############################################################
# {.push header: "<llvm-c/Core.h>".}
proc getTypeOf*(v: ValueRef): TypeRef {.importc: "LLVMTypeOf".}
proc getValueName2(v: ValueRef, rLen: var csize_t): cstring {.used, importc: "LLVMGetValueName2".}
## Returns the name of a valeu if it exists.
## `rLen` stores the returned string length
##
## This is not free, it requires internal hash table access
## The return value does not have to be freed and is a pointer an internal LLVM data structure
proc dumpValue*(v: ValueRef) {.sideeffect, importc: "LLVMDumpValue".}
## Print the value to stderr
proc toLLVMstring(v: ValueRef): LLVMstring {.used, importc: "LLVMPrintValueToString".}
# Constants
# ------------------------------------------------------------
# https://llvm.org/doxygen/group__LLVMCCoreValueConstant.html
proc constInt(ty: TypeRef, n: culonglong, signExtend: LlvmBool): ValueRef {.importc: "LLVMConstInt".}
proc constReal*(ty: TypeRef, n: cdouble): ValueRef {.importc: "LLVMConstReal".}
proc constNull*(ty: TypeRef): ValueRef {.importc: "LLVMConstNull".}
proc constAllOnes*(ty: TypeRef): ValueRef {.importc: "LLVMConstAllOnes".}
proc constStruct(
constantVals: openArray[ValueRef],
packed: LlvmBool): ValueRef {.wrapOpenArrayLenType: cuint, importc: "LLVMConstStruct".}
proc constArray*(
ty: TypeRef,
constantVals: openArray[ValueRef],
): ValueRef {.wrapOpenArrayLenType: cuint, importc: "LLVMConstArray".}
# ############################################################
#
# IR builder
#
# ############################################################
# https://llvm.org/doxygen/group__LLVMCCoreInstructionBuilder.html
type
BasicBlockRef* = distinct pointer
BuilderRef* = distinct pointer
## An instruction builder represents a point within a basic block and is
## the exclusive means of building instructions using the C interface.
IntPredicate* {.size: sizeof(cint).} = enum
IntEQ = 32 ## equal
IntNE ## not equal
IntUGT ## unsigned greater than
IntUGE ## unsigned greater or equal
IntULT ## unsigned less than
IntULE ## unsigned less or equal
IntSGT ## signed greater than
IntSGE ## signed greater or equal
IntSLT ## signed less than
IntSLE ## signed less or equal
InlineAsmDialect* {.size: sizeof(cint).} = enum
InlineAsmDialectATT
InlineAsmDialectIntel
# "<llvm-c/Core.h>"
# Instantiation
# ------------------------------------------------------------
proc appendBasicBlock*(ctx: ContextRef, fn: ValueRef, name: cstring): BasicBlockRef {.importc: "LLVMAppendBasicBlockInContext".}
## Append a basic block to the end of a function
proc createBuilder*(ctx: ContextRef): BuilderRef {.importc: "LLVMCreateBuilderInContext".}
proc dispose*(builder: BuilderRef) {.importc: "LLVMDisposeBuilder".}
# Functions
# ------------------------------------------------------------
proc getParam*(fn: ValueRef, index: uint32): ValueRef {.importc: "LLVMGetParam".}
proc retVoid*(builder: BuilderRef): ValueRef {.discardable, importc: "LLVMBuildRetVoid".}
proc ret*(builder: BuilderRef, returnVal: ValueRef): ValueRef {.discardable, importc: "LLVMBuildRet".}
# Positioning
# ------------------------------------------------------------
proc position*(builder: BuilderRef, blck: BasicBlockRef, instr: ValueRef) {.importc: "LLVMPositionBuilder".}
proc positionBefore*(builder: BuilderRef, instr: ValueRef) {.importc: "LLVMPositionBuilderBefore".}
proc positionAtEnd*(builder: BuilderRef, blck: BasicBlockRef) {.importc: "LLVMPositionBuilderAtEnd".}
proc getInsertBlock(builder: BuilderRef): BasicBlockRef {.importc: "LLVMGetInsertBlock".}
## This function is not documented and probably for special use
## However due to https://github.com/llvm/llvm-project/issues/59875
## it's our workaround to get the context of a Builder
proc getBasicBlockParent*(blck: BasicBlockRef): ValueRef {.importc: "LLVMGetBasicBlockParent".}
## Obtains the function to which a basic block belongs
# Inline Assembly
# ------------------------------------------------------------
proc getInlineAsm*(
ty: TypeRef,
asmString: openArray[char],
constraints: openArray[char],
hasSideEffects, isAlignStack: LlvmBool,
dialect: InlineAsmDialect, canThrow: LlvmBool
): ValueRef {.importc: "LLVMGetInlineAsm"}
# Intermediate Representation
# ------------------------------------------------------------
#
# - NSW: no signed wrap, signed value cannot over- or underflow.
# - NUW: no unsigned wrap, unsigned value cannot over- or underflow.
proc call2*(
builder: BuilderRef,
ty: TypeRef,
fn: ValueRef,
args: openArray[ValueRef],
name: cstring = ""): ValueRef {.wrapOpenArrayLenType: cuint, importc: "LLVMBuildCall2".}
proc add*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildAdd".}
proc addNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWAdd".}
proc addNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWAdd".}
proc sub*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSub".}
proc subNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWSub".}
proc subNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWSub".}
proc neg*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNeg".}
proc negNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWNeg".}
proc negNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWNeg".}
proc mul*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildMul".}
proc mulNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWMul".}
proc mulNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWMul".}
proc divU*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildUDiv".}
proc divU_exact*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildExactUDiv".}
proc divS*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSDiv".}
proc divS_exact*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildExactSDiv".}
proc remU*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildURem".}
proc remS*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSRem".}
proc lshl*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildShl".}
proc lshr*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildLShr".}
proc ashr*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildAShr".}
proc `and`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildAnd".}
proc `or`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildOr".}
proc `xor`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildXor".}
proc `not`*(builder: BuilderRef, val: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNot".}
proc select*(builder: BuilderRef, condition, then, otherwise: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSelect".}
proc icmp*(builder: BuilderRef, op: IntPredicate, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildICmp".}
proc bitcast*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildBitcast".}
proc trunc*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildTrunc".}
proc zext*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildZExt".}
## Zero-extend
proc sext*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSExt".}
## Sign-extend
proc malloc*(builder: BuilderRef, ty: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildMalloc".}
proc mallocArray*(builder: BuilderRef, ty: TypeRef, length: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildArrayMalloc".}
proc free*(builder: BuilderRef, ty: TypeRef, `ptr`: ValueRef): ValueRef {.importc: "LLVMBuildFree".}
proc alloca*(builder: BuilderRef, ty: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildAlloca".}
proc allocaArray*(builder: BuilderRef, ty: TypeRef, length: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildArrayAlloca".}
proc extractValue*(builder: BuilderRef, aggVal: ValueRef, index: uint32, name: cstring = ""): ValueRef {.importc: "LLVMBuildExtractValue".}
proc insertValue*(builder: BuilderRef, aggVal: ValueRef, eltVal: ValueRef, index: uint32, name: cstring = ""): ValueRef {.discardable, importc: "LLVMBuildInsertValue".}
proc getElementPtr2*(
builder: BuilderRef,
ty: TypeRef,
`ptr`: ValueRef,
indices: openArray[ValueRef],
name: cstring = ""
): ValueRef {.wrapOpenArrayLenType: cuint, importc: "LLVMBuildGEP2".}
## https://www.llvm.org/docs/GetElementPtr.html
proc getElementPtr2_InBounds*(
builder: BuilderRef,
ty: TypeRef,
`ptr`: ValueRef,
indices: openArray[ValueRef],
name: cstring = ""
): ValueRef {.wrapOpenArrayLenType: cuint, importc: "LLVMBuildInBoundsGEP2".}
## https://www.llvm.org/docs/GetElementPtr.html
## If the GEP lacks the inbounds keyword, the value is the result from evaluating the implied twos complement integer computation.
## However, since theres no guarantee of where an object will be allocated in the address space, such values have limited meaning.
proc getElementPtr2_Struct*(
builder: BuilderRef,
ty: TypeRef,
`ptr`: ValueRef,
idx: uint32,
name: cstring = ""
): ValueRef {.importc: "LLVMBuildStructGEP2".}
## https://www.llvm.org/docs/GetElementPtr.html
## If the GEP lacks the inbounds keyword, the value is the result from evaluating the implied twos complement integer computation.
## However, since theres no guarantee of where an object will be allocated in the address space, such values have limited meaning.
proc load2*(builder: BuilderRef, ty: TypeRef, `ptr`: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildLoad2".}
proc store*(builder: BuilderRef, val, `ptr`: ValueRef): ValueRef {.discardable, importc: "LLVMBuildStore".}
proc memset*(builder: BuilderRef, `ptr`, val, len: ValueRef, align: uint32) {.importc: "LLVMBuildMemset".}
proc memcpy*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemcpy".}
proc memmove*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemmove".}
{.pop.} # {.used, hint[Name]: off, cdecl, dynlib: libLLVM.}

View File

@ -0,0 +1,518 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# ############################################################
#
# Bindings to Nvidia GPUs libraries
#
# ############################################################
import ./utils
# ############################################################
#
# Cuda
#
# ############################################################
static: echo "[Constantine] Using library libcuda.so"
{.passl: "-L/opt/cuda/lib64 -lcuda".}
# Cuda offers 2 APIs:
# - cuda.h the driver API
# - cuda_runtime.h the runtime API
#
# https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html
#
# We need to use the lower-level driver API for JIT modules loading and reloading
type
CUresult* {.size: sizeof(cint).} = enum
## The API call returned with no errors. In the case of query calls, this
## also means that the operation being queried is complete (see
## ::cuEventQuery() and ::cuStreamQuery()).
CUDA_SUCCESS = 0
## This indicates that one or more of the parameters passed to the API call
## is not within an acceptable range of values.
CUDA_ERROR_INVALID_VALUE = 1
## The API call failed because it was unable to allocate enough memory to
## perform the requested operation.
CUDA_ERROR_OUT_OF_MEMORY = 2
## This indicates that the CUDA driver has not been initialized with
## ::cuInit() or that initialization has failed.
CUDA_ERROR_NOT_INITIALIZED = 3
## This indicates that the CUDA driver is in the process of shutting down.
CUDA_ERROR_DEINITIALIZED = 4
## This indicates profiler is not initialized for this run. This can
## happen when the application is running with external profiling tools
## like visual profiler.
CUDA_ERROR_PROFILER_DISABLED = 5
## to attempt to enable/disable the profiling via ::cuProfilerStart or
## ::cuProfilerStop without initialization.
CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6
## to call cuProfilerStart() when profiling is already enabled.
CUDA_ERROR_PROFILER_ALREADY_STARTED = 7
## to call cuProfilerStop() when profiling is already disabled.
CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8
## This indicates that the CUDA driver that the application has loaded is a
## stub library. Applications that run with the stub rather than a real
## driver loaded will result in CUDA API returning this error.
CUDA_ERROR_STUB_LIBRARY = 34
## This indicates that requested CUDA device is unavailable at the current
## time. Devices are often unavailable due to use of
## ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED.
CUDA_ERROR_DEVICE_UNAVAILABLE = 46
## This indicates that no CUDA-capable devices were detected by the installed
## CUDA driver.
CUDA_ERROR_NO_DEVICE = 100
## This indicates that the device ordinal supplied by the user does not
## correspond to a valid CUDA device or that the action requested is
## invalid for the specified device.
CUDA_ERROR_INVALID_DEVICE = 101
## This error indicates that the Grid license is not applied.
CUDA_ERROR_DEVICE_NOT_LICENSED = 102
## This indicates that the device kernel image is invalid. This can also
## indicate an invalid CUDA module.
CUDA_ERROR_INVALID_IMAGE = 200
## This most frequently indicates that there is no context bound to the
## current thread. This can also be returned if the context passed to an
## API call is not a valid handle (such as a context that has had
## ::cuCtxDestroy() invoked on it). This can also be returned if a user
## mixes different API versions (i.e. 3010 context with 3020 API calls).
## See ::cuCtxGetApiVersion() for more details.
CUDA_ERROR_INVALID_CONTEXT = 201
## This indicated that the context being supplied as a parameter to the
## API call was already the active context.
## error to attempt to push the active context via ::cuCtxPushCurrent().
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202
## This indicates that a map or register operation has failed.
CUDA_ERROR_MAP_FAILED = 205
## This indicates that an unmap or unregister operation has failed.
CUDA_ERROR_UNMAP_FAILED = 206
## This indicates that the specified array is currently mapped and thus
## cannot be destroyed.
CUDA_ERROR_ARRAY_IS_MAPPED = 207
## This indicates that the resource is already mapped.
CUDA_ERROR_ALREADY_MAPPED = 208
## This indicates that there is no kernel image available that is suitable
## for the device. This can occur when a user specifies code generation
## options for a particular CUDA source file that do not include the
## corresponding device configuration.
CUDA_ERROR_NO_BINARY_FOR_GPU = 209
## This indicates that a resource has already been acquired.
CUDA_ERROR_ALREADY_ACQUIRED = 210
## This indicates that a resource is not mapped.
CUDA_ERROR_NOT_MAPPED = 211
## This indicates that a mapped resource is not available for access as an
## array.
CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212
## This indicates that a mapped resource is not available for access as a
## pointer.
CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213
## This indicates that an uncorrectable ECC error was detected during
## execution.
CUDA_ERROR_ECC_UNCORRECTABLE = 214
## This indicates that the ::CUlimit passed to the API call is not
## supported by the active device.
CUDA_ERROR_UNSUPPORTED_LIMIT = 215
## This indicates that the ::CUcontext passed to the API call can
## only be bound to a single CPU thread at a time but is already
## bound to a CPU thread.
CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216
## This indicates that peer access is not supported across the given
## devices.
CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217
## This indicates that a PTX JIT compilation failed.
CUDA_ERROR_INVALID_PTX = 218
## This indicates an error with OpenGL or DirectX context.
CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219
## This indicates that an uncorrectable NVLink error was detected during the
## execution.
CUDA_ERROR_NVLINK_UNCORRECTABLE = 220
## This indicates that the PTX JIT compiler library was not found.
CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221
## This indicates that the provided PTX was compiled with an unsupported toolchain.
CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222
## This indicates that the PTX JIT compilation was disabled.
CUDA_ERROR_JIT_COMPILATION_DISABLED = 223
## This indicates that the ::CUexecAffinityType passed to the API call is not
## supported by the active device.
CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224
## This indicates that the device kernel source is invalid. This includes
## compilation/linker errors encountered in device code or user error.
CUDA_ERROR_INVALID_SOURCE = 300
## This indicates that the file specified was not found.
CUDA_ERROR_FILE_NOT_FOUND = 301
## This indicates that a link to a shared object failed to resolve.
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302
## This indicates that initialization of a shared object failed.
CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303
## This indicates that an OS call failed.
CUDA_ERROR_OPERATING_SYSTEM = 304
## This indicates that a resource handle passed to the API call was not
## valid. Resource handles are opaque types like ::CUstream and ::CUevent.
CUDA_ERROR_INVALID_HANDLE = 400
## This indicates that a resource required by the API call is not in a
## valid state to perform the requested operation.
CUDA_ERROR_ILLEGAL_STATE = 401
## This indicates that a named symbol was not found. Examples of symbols
## are global/constant variable names, driver function names, texture names,
## and surface names.
CUDA_ERROR_NOT_FOUND = 500
## This indicates that asynchronous operations issued previously have not
## completed yet. This result is not actually an error, but must be indicated
## differently than ::CUDA_SUCCESS (which indicates completion). Calls that
## may return this value include ::cuEventQuery() and ::cuStreamQuery().
CUDA_ERROR_NOT_READY = 600
## While executing a kernel, the device encountered a
## load or store instruction on an invalid memory address.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_ILLEGAL_ADDRESS = 700
## This indicates that a launch did not occur because it did not have
## appropriate resources. This error usually indicates that the user has
## attempted to pass too many arguments to the device kernel, or the
## kernel launch specifies too many threads for the kernel's register
## count. Passing arguments of the wrong size (i.e. a 64-bit pointer
## when a 32-bit int is expected) is equivalent to passing too many
## arguments and can also result in this error.
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
## This indicates that the device kernel took too long to execute. This can
## only occur if timeouts are enabled - see the device attribute
## ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_LAUNCH_TIMEOUT = 702
## This error indicates a kernel launch that uses an incompatible texturing
## mode.
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
## This error indicates that a call to ::cuCtxEnablePeerAccess() is
## trying to re-enable peer access to a context which has already
## had peer access to it enabled.
CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704
## This error indicates that ::cuCtxDisablePeerAccess() is
## trying to disable peer access which has not been enabled yet
## via ::cuCtxEnablePeerAccess().
CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705
## This error indicates that the primary context for the specified device
## has already been initialized.
CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708
## This error indicates that the context current to the calling thread
## has been destroyed using ::cuCtxDestroy, or is a primary context which
## has not yet been initialized.
CUDA_ERROR_CONTEXT_IS_DESTROYED = 709
## A device-side assert triggered during kernel execution. The context
## cannot be used anymore, and must be destroyed. All existing device
## memory allocations from this context are invalid and must be
## reconstructed if the program is to continue using CUDA.
CUDA_ERROR_ASSERT = 710
## This error indicates that the hardware resources required to enable
## peer access have been exhausted for one or more of the devices
## passed to ::cuCtxEnablePeerAccess().
CUDA_ERROR_TOO_MANY_PEERS = 711
## This error indicates that the memory range passed to ::cuMemHostRegister()
## has already been registered.
CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712
## This error indicates that the pointer passed to ::cuMemHostUnregister()
## does not correspond to any currently registered memory region.
CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713
## While executing a kernel, the device encountered a stack error.
## This can be due to stack corruption or exceeding the stack size limit.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_HARDWARE_STACK_ERROR = 714
## While executing a kernel, the device encountered an illegal instruction.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_ILLEGAL_INSTRUCTION = 715
## While executing a kernel, the device encountered a load or store instruction
## on a memory address which is not aligned.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_MISALIGNED_ADDRESS = 716
## While executing a kernel, the device encountered an instruction
## which can only operate on memory locations in certain address spaces
## (global, shared, or local), but was supplied a memory address not
## belonging to an allowed address space.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_INVALID_ADDRESS_SPACE = 717
## While executing a kernel, the device program counter wrapped its address space.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_INVALID_PC = 718
## An exception occurred on the device while executing a kernel. Common
## causes include dereferencing an invalid device pointer and accessing
## out of bounds shared memory. Less common cases can be system specific - more
## information about these cases can be found in the system specific user guide.
## This leaves the process in an inconsistent state and any further CUDA work
## will return the same error. To continue using CUDA, the process must be terminated
## and relaunched.
CUDA_ERROR_LAUNCH_FAILED = 719
## This error indicates that the number of blocks launched per grid for a kernel that was
## launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice
## exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor
## or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
## as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720
## This error indicates that the attempted operation is not permitted.
CUDA_ERROR_NOT_PERMITTED = 800
## This error indicates that the attempted operation is not supported
## on the current system or device.
CUDA_ERROR_NOT_SUPPORTED = 801
## This error indicates that the system is not yet ready to start any CUDA
## work. To continue using CUDA, verify the system configuration is in a
## valid state and all required driver daemons are actively running.
## More information about this error can be found in the system specific
## user guide.
CUDA_ERROR_SYSTEM_NOT_READY = 802
## This error indicates that there is a mismatch between the versions of
## the display driver and the CUDA driver. Refer to the compatibility documentation
## for supported versions.
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803
## This error indicates that the system was upgraded to run with forward compatibility
## but the visible hardware detected by CUDA does not support this configuration.
## Refer to the compatibility documentation for the supported hardware matrix or ensure
## that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
## environment variable.
CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804
## This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
CUDA_ERROR_MPS_CONNECTION_FAILED = 805
## This error indicates that the remote procedural call between the MPS server and the MPS client failed.
CUDA_ERROR_MPS_RPC_FAILURE = 806
## This error indicates that the MPS server is not ready to accept new MPS client requests.
## This error can be returned when the MPS server is in the process of recovering from a fatal failure.
CUDA_ERROR_MPS_SERVER_NOT_READY = 807
## This error indicates that the hardware resources required to create MPS client have been exhausted.
CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808
## This error indicates the the hardware resources required to support device connections have been exhausted.
CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809
## This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.
CUDA_ERROR_MPS_CLIENT_TERMINATED = 810
## This error indicates that the operation is not permitted when
## the stream is capturing.
CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900
## This error indicates that the current capture sequence on the stream
## has been invalidated due to a previous error.
CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901
## This error indicates that the operation would have resulted in a merge
## of two independent capture sequences.
CUDA_ERROR_STREAM_CAPTURE_MERGE = 902
## This error indicates that the capture was not initiated in this stream.
CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903
## This error indicates that the capture sequence contains a fork that was
## not joined to the primary stream.
CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904
## This error indicates that a dependency would have been created which
## crosses the capture sequence boundary. Only implicit in-stream ordering
## dependencies are allowed to cross the boundary.
CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905
## This error indicates a disallowed implicit dependency on a current capture
## sequence from cudaStreamLegacy.
CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906
## This error indicates that the operation is not permitted on an event which
## was last recorded in a capturing stream.
CUDA_ERROR_CAPTURED_EVENT = 907
## A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED
## argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a
## different thread.
CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908
## This error indicates that the timeout specified for the wait operation has lapsed.
CUDA_ERROR_TIMEOUT = 909
## This error indicates that the graph update was not performed because it included
## changes which violated constraints specific to instantiated graph update.
CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910
## This indicates that an async error has occurred in a device outside of CUDA.
## If CUDA was waiting for an external device's signal before consuming shared data,
## the external device signaled an error indicating that the data is not valid for
## consumption. This leaves the process in an inconsistent state and any further CUDA
## work will return the same error. To continue using CUDA, the process must be
## terminated and relaunched.
CUDA_ERROR_EXTERNAL_DEVICE = 911
## Indicates a kernel launch error due to cluster misconfiguration.
CUDA_ERROR_INVALID_CLUSTER_SIZE = 912
## This indicates that an unknown internal error has occurred.
CUDA_ERROR_UNKNOWN = 999
CUdevice_attribute* {.size: sizeof(cint).} = enum
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, ## Maximum number of threads per block */
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, ## Maximum block dimension X */
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, ## Maximum block dimension Y */
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, ## Maximum block dimension Z */
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, ## Maximum grid dimension X */
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, ## Maximum grid dimension Y */
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, ## Maximum grid dimension Z */
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, ## Maximum shared memory available per block in bytes */
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, ## Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, ## Warp size in threads */
CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, ## Maximum pitch in bytes allowed by memory copies */
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, ## Maximum number of 32-bit registers available per block */
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, ## Typical clock frequency in kilohertz */
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, ## Alignment requirement for textures */
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, ## Number of multiprocessors on device */
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, ## Specifies whether there is a run time limit on kernels */
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, ## Device is integrated with host memory */
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, ## Device can map host memory into CUDA address space */
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, ## Compute mode (See ::CUcomputemode for details) */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, ## Maximum 1D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, ## Maximum 2D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, ## Maximum 2D texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, ## Maximum 3D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, ## Maximum 3D texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, ## Maximum 3D texture depth */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, ## Maximum 2D layered texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, ## Maximum 2D layered texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, ## Maximum layers in a 2D layered texture */
CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, ## Alignment requirement for surfaces */
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, ## Device can possibly execute multiple kernels concurrently */
CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, ## Device has ECC support enabled */
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, ## PCI bus ID of the device */
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, ## PCI device ID of the device */
CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, ## Device is using TCC driver model */
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, ## Peak memory clock frequency in kilohertz */
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, ## Global memory bus width in bits */
CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, ## Size of L2 cache in bytes */
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, ## Maximum resident threads per multiprocessor */
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, ## Number of asynchronous engines */
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, ## Device shares a unified address space with the host */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, ## Maximum 1D layered texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, ## Maximum layers in a 1D layered texture */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, ## Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, ## Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, ## Alternate maximum 3D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, ## Alternate maximum 3D texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, ## Alternate maximum 3D texture depth */
CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, ## PCI domain ID of the device */
CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, ## Pitch alignment requirement for textures */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, ## Maximum cubemap texture width/height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, ## Maximum cubemap layered texture width/height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, ## Maximum layers in a cubemap layered texture */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, ## Maximum 1D surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, ## Maximum 2D surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, ## Maximum 2D surface height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, ## Maximum 3D surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, ## Maximum 3D surface height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, ## Maximum 3D surface depth */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, ## Maximum 1D layered surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, ## Maximum layers in a 1D layered surface */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, ## Maximum 2D layered surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, ## Maximum 2D layered surface height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, ## Maximum layers in a 2D layered surface */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, ## Maximum cubemap surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, ## Maximum cubemap layered surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, ## Maximum layers in a cubemap layered surface */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, ## Maximum 2D linear texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, ## Maximum 2D linear texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, ## Maximum 2D linear texture pitch in bytes */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, ## Maximum mipmapped 2D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, ## Maximum mipmapped 2D texture height */
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, ## Major compute capability version number */
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, ## Minor compute capability version number */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, ## Maximum mipmapped 1D texture width */
CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, ## Device supports stream priorities */
CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, ## Device supports caching globals in L1 */
CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, ## Device supports caching locals in L1 */
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, ## Maximum shared memory available per multiprocessor in bytes */
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, ## Maximum number of 32-bit registers available per multiprocessor */
CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, ## Device can allocate managed memory on this system */
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, ## Device is on a multi-GPU board */
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, ## Unique id for a group of devices on the same multi-GPU board */
CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, ## Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/
CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, ## Ratio of single precision performance (in floating-point operations per second) to double precision performance */
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, ## Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, ## Device can coherently access managed memory concurrently with the CPU */
CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, ## Device supports compute preemption. */
CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, ## Device can access host registered memory at the same virtual address as the CPU */
CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, ## ::cuStreamBatchMemOp and related APIs are supported. */
CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, ## 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */
CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, ## ::CU_STREAM_WAIT_VALUE_NOR is supported. */
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, ## Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, ## Maximum optin shared memory per block */
CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, ## The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */
CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, ## Device supports host memory registration via ::cudaHostRegister. */
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, ## Device accesses pageable memory via the host's page tables. */
CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, ## The host can directly access managed memory on the device without migration. */
CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, ## Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, ## Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, ## Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, ## Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, ## Maximum number of blocks per multiprocessor */
CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, ## Device supports compression of memory */
CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, ## Maximum L2 persisting lines capacity setting in bytes. */
CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, ## Maximum value of CUaccessPolicyWindow::num_bytes. */
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, ## Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */
CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, ## Shared memory reserved by CUDA driver per block in bytes */
CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, ## Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, ## Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */
CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, ## External timeline semaphore interop is supported on the device */
CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, ## Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, ## Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, ## The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, ## GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. */
CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, ## Handle types supported with mempool based IPC */
CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH = 120, ## Indicates device supports cluster launch */
CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121, ## Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2 = 122, ## 64-bit operations are supported in ::cuStreamBatchMemOp_v2 and related v2 MemOp APIs. */
CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2 = 123, ## ::CU_STREAM_WAIT_VALUE_NOR is supported by v2 MemOp APIs. */
CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124, ## Device supports buffer sharing with dma_buf mechanism. */
CU_DEVICE_ATTRIBUTE_MAX
CUmemAttach_flags* = enum
CU_MEM_ATTACH_GLOBAL = 0x1, ## Memory can be accessed by any stream on any device
CU_MEM_ATTACH_HOST = 0x2, ## Memory cannot be accessed by any stream on any device
CU_MEM_ATTACH_SINGLE = 0x4
CUdevice* = distinct int32
## Compute Device handle
CUcontext* = distinct pointer
CUmodule* = distinct pointer
CUfunction* = distinct pointer
CUstream* = distinct pointer
CUdeviceptr* = distinct pointer
{.push cdecl, importc, dynlib: "libcuda.so".}
proc cuInit*(flags: uint32): CUresult
proc cuDeviceGetCount*(count: var int32): CUresult
proc cuDeviceGet*(device: var CUdevice, ordinal: int32): CUresult
proc cuDeviceGetName*(name: ptr char, len: int32, dev: CUdevice): CUresult
proc cuDeviceGetAttribute*(r: var int32, attrib: CUdevice_attribute, dev: CUdevice): CUresult
proc cuCtxCreate*(pctx: var CUcontext, flags: uint32, dev: CUdevice): CUresult
proc cuCtxDestroy*(ctx: CUcontext): CUresult
proc cuCtxSynchronize*(ctx: CUcontext): CUresult
proc cuModuleLoadData(module: var CUmodule, sourceCode: ptr char): CUresult {.used.}
proc cuModuleUnload*(module: CUmodule): CUresult
proc cuModuleGetFunction(kernel: var CUfunction, module: CUmodule, fnName: ptr char): CUresult {.used.}
proc cuLaunchKernel(
kernel: CUfunction,
gridDimX, gridDimY, gridDimZ: uint32,
blockDimX, blockDimY, blockDimZ: uint32,
sharedMemBytes: uint32,
stream: CUstream,
kernelParams: ptr pointer,
extra: ptr pointer
): CUresult {.used.}
proc cuMemAlloc*(devptr: var CUdeviceptr, size: csize_t): CUresult
proc cuMemAllocManaged*(devptr: var CUdeviceptr, size: csize_t, flags: Flag[CUmemAttach_flags]): CUresult
proc cuMemFree*(devptr: CUdeviceptr): CUresult
proc cuMemcpyHtoD*(dst: CUdeviceptr, src: pointer, size: csize_t): CUresult
proc cuMemcpyDtoH*(dst: pointer, src: CUdeviceptr, size: csize_t): CUresult
{.pop.} # {.push cdecl, importc, dynlib: "libcuda.so".}

View File

@ -0,0 +1,174 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import std/macros
# ############################################################
#
# Binding utilities
#
# ############################################################
# Flag parameters
# ------------------------------------------------------------
type Flag*[E: enum] = distinct cint
func flag*[E: enum](e: varargs[E]): Flag[E] {.inline.} =
## Enum should only have power of 2 fields
# static:
# for val in E:
# assert (ord(val) and (ord(val) - 1)) == 0, "Enum values should all be power of 2, found " &
# $val & " with value " & $ord(val) & "."
var flags = 0
for val in e:
flags = flags or ord(val)
result = Flag[E](flags)
# Macros
# ------------------------------------------------------------
proc replaceSymsByIdents*(ast: NimNode): NimNode =
proc inspect(node: NimNode): NimNode =
case node.kind:
of {nnkIdent, nnkSym}:
return ident($node)
of nnkEmpty:
return node
of nnkLiterals:
return node
of nnkHiddenStdConv:
if node[1].kind == nnkIntLit:
return node[1]
else:
expectKind(node[1], nnkSym)
return ident($node[1])
of nnkConv: # type conversion needs to be replaced by a function call in untyped AST
var rTree = nnkCall.newTree()
for child in node:
rTree.add inspect(child)
return rTree
else:
var rTree = node.kind.newTree()
for child in node:
rTree.add inspect(child)
return rTree
result = inspect(ast)
macro replacePragmasByInline(procAst: typed): untyped =
## Replace pragmas by the inline pragma
## We need a separate "typed" macro
## so that it is executed after the {.push mypragma.} calls
var params: seq[NimNode]
for i in 0 ..< procAst.params.len:
params.add procAst.params[i]
result = newStmtList()
# The push cdecl is applied multiple times :/, so fight push with push
result.add nnkPragma.newTree(ident"push", ident"nimcall", ident"inline")
result.add newProc(
name = procAst.name,
params = params,
body = procAst.body.replaceSymsByIdents(),
procType = nnkProcDef,
pragmas = nnkPragma.newTree(ident"inline", ident"nimcall")
)
result.add nnkPragma.newTree(ident"pop")
macro wrapOpenArrayLenType*(ty: typedesc, procAst: untyped): untyped =
## Wraps pointer+len library calls in properly typed and converted openArray calls
##
## ```
## {.push cdecl.}
## proc foo*(r: int, a: openArray[CustomType], b: int) {.wrapOpenArrayLenType: uint32, importc: "foo", dynlib: "libfoo.so".}
## {.pop.}
## ```
##
## is transformed into
##
## ```
## proc foo(r: int, a: ptr CustomType, aLen: uint32, b: int) {.cdecl, importc: "foo", dynlib: "libfoo.so".}
##
## proc foo*(r: int, a: openArray[CustomType], b: int) {.inline.} =
## foo(r, a[0].unsafeAddr, a.len.uint32, b)
## ```
procAst.expectKind(nnkProcDef)
var
wrappeeParams = @[procAst.params[0]]
wrapperParams = @[procAst.params[0]]
wrapperBody = newCall(ident($procAst.name))
for i in 1 ..< procAst.params.len:
if procAst.params[i][^2].kind == nnkBracketExpr and procAst.params[i][^2][0].eqident"openarray":
procAst.params[i].expectLen(3) # prevent `proc foo(a, b: openArray[int])`
wrappeeParams.add newIdentDefs(
ident($procAst.params[i][0] & "Ptr"),
nnkPtrTy.newTree(procAst.params[i][^2][1]),
newEmptyNode()
)
wrappeeParams.add newIdentDefs(
ident($procAst.params[i][0] & "Len"),
ty,
newEmptyNode()
)
wrapperParams.add procAst.params[i]
wrapperBody.add nnkIfExpr.newTree(
nnkElifExpr.newTree(
nnkInfix.newTree(
ident"==",
nnkDotExpr.newTree(ident($procAst.params[i][0]), bindSym"len"),
newLit 0
),
newNilLit()
),
nnkElseExpr.newTree(
newCall(
ident"unsafeAddr",
nnkBracketExpr.newTree(
ident($procAst.params[i][0]),
newLit 0
))
)
)
wrapperBody.add newCall(ty, nnkDotExpr.newTree(ident($procAst.params[i][0]), bindSym"len"))
else:
wrappeeParams.add procAst.params[i]
wrapperParams.add procAst.params[i]
# Handle "a, b: int"
for j in 0 ..< procAst.params[i].len - 2:
wrapperBody.add ident($procAst.params[i][j])
let wrappee = newProc(
name = ident($procAst.name), # Remove export marker if any
params = wrappeeParams,
body = procAst.body.copyNimTree(),
procType = nnkProcDef,
pragmas = procAst.pragma
)
let wrapper = newProc(
name = procAst[0], # keep export marker if any
params = wrapperParams,
body = newStmtList(procAst.body.copyNimTree(), wrapperBody), # original procAst body can contain comments that we copy
procType = nnkProcDef,
pragmas = nnkPragma.newTree(bindSym"replacePragmasByInline") # pragmas are for the wrappee
)
result = newStmtList(wrappee, wrapper)
when isMainModule:
expandMacros:
{.push cdecl.}
proc foo(x: int, a: openArray[uint32], name: cstring) {.wrapOpenArrayLenType: cuint.} =
discard
{.pop.}

View File

@ -0,0 +1,345 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
../../math/config/[curves, precompute],
../../math/io/io_bigints,
../primitives, ../bithacks, ../endians,
./llvm,
std/hashes
# ############################################################
#
# Assembler
#
# ############################################################
type
Assembler_LLVM* = ref object
# LLVM
ctx*: ContextRef
module*: ModuleRef
builder*: BuilderRef
i1_t*, i32_t*, i64_t*, void_t*: TypeRef
backend*: Backend
Backend* = enum
bkNvidiaPTX
FnDef* = tuple[fnTy: TypeRef, fnImpl: ValueRef]
# calling getTypeOf on a ValueRef function
# loses type information like return value type or arity
proc finalizeAssemblerLLVM(asy: Assembler_LLVM) =
if not asy.isNil:
asy.builder.dispose()
asy.module.dispose()
asy.ctx.dispose()
proc new*(T: type Assembler_LLVM, backend: Backend, moduleName: cstring): Assembler_LLVM =
new result, finalizeAssemblerLLVM
result.ctx = createContext()
result.module = result.ctx.createModule(moduleName)
case backend
of bkNvidiaPTX:
result.module.setTarget("nvptx64-nvidia-cuda")
# Datalayout for NVVM IR 1.8 (CUDA 11.6)
result.module.setDataLayout("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64")
result.builder = result.ctx.createBuilder()
result.i1_t = result.ctx.int1_t()
result.i32_t = result.ctx.int32_t()
result.i64_t = result.ctx.int32_t()
result.void_t = result.ctx.void_t()
result.backend = backend
# ############################################################
#
# Metadata precomputation
#
# ############################################################
# Constantine on CPU is configured at compile-time for several properties that need to be runtime configuration GPUs:
# - word size (32-bit or 64-bit)
# - curve properties access like modulus bitsize or -1/M[0] a.k.a. m0ninv
# - constants are stored in freestanding `const`
#
# This is because it's not possible to store a BigInt[254] and a BigInt[384]
# in a generic way in the same structure, especially without using heap allocation.
# And with Nim's dead code elimination, unused curves are not compiled in.
#
# As there would be no easy way to dynamically retrieve (via an array or a table)
# const BLS12_381_modulus = ...
# const BN254_Snarks_modulus = ...
#
# - We would need a macro to properly access each constant.
# - We would need to create a 32-bit and a 64-bit version.
# - Unused curves would be compiled in the program.
#
# Note: on GPU we don't manipulate secrets hence branches and dynamic memory allocations are allowed.
#
# As GPU is a niche usage, instead we recreate the relevant `precompute` and IO procedures
# with dynamic wordsize support.
type
DynWord = uint32 or uint64
BigNum[T: DynWord] = object
bits: uint32
limbs: seq[T]
# Serialization
# ------------------------------------------------
func byteLen(bits: SomeInteger): SomeInteger {.inline.} =
## Length in bytes to serialize BigNum
(bits + 7) shr 3 # (bits + 8 - 1) div 8
func wordsRequiredForBits(bits, wordBitwidth: SomeInteger): SomeInteger {.inline.} =
## Compute the number of limbs required
## from the announced bit length
debug: doAssert wordBitwidth == 32 or wordBitwidth == 64 # Power of 2
(bits + wordBitwidth - 1) shr log2_vartime(uint32 wordBitwidth) # 5x to 55x faster than dividing by wordBitwidth
func fromHex[T](a: var BigNum[T], s: string) =
var bytes = newSeq[byte](a.bits.byteLen())
hexToPaddedByteArray(s, bytes, bigEndian)
# 2. Convert canonical uint to BigNum
const wordBitwidth = sizeof(T) * 8
a.limbs.unmarshal(bytes, wordBitwidth, bigEndian)
func fromHex[T](BN: type BigNum[T], bits: uint32, s: string): BN =
const wordBitwidth = sizeof(T) * 8
let numWords = wordsRequiredForBits(bits, wordBitwidth)
result.bits = bits
result.limbs.setLen(numWords)
result.fromHex(s)
func toHex[T](a: BigNum[T]): string =
## Conversion to big-endian hex
## This is variable-time
# 1. Convert BigInt to canonical uint
const wordBitwidth = sizeof(T) * 8
var bytes = newSeq[byte](byteLen(a.bits))
bytes.marshal(a.limbs, wordBitwidth, cpuEndian)
# 2 Convert canonical uint to hex
return bytes.nativeEndianToHex(bigEndian)
# Checks
# ------------------------------------------------
func checkValidModulus(M: BigNum) =
const wordBitwidth = uint32(BigNum.T.sizeof() * 8)
let expectedMsb = M.bits-1 - wordBitwidth * (M.limbs.len.uint32 - 1)
let msb = log2_vartime(M.limbs[M.limbs.len-1])
doAssert msb == expectedMsb, "Internal Error: the modulus must use all declared bits and only those:\n" &
" Modulus '" & M.toHex() & "' is declared with " & $M.bits &
" bits but uses " & $(msb + wordBitwidth * uint32(M.limbs.len - 1)) & " bits."
# Fields metadata
# ------------------------------------------------
func negInvModWord[T](M: BigNum[T]): T =
## Returns the Montgomery domain magic constant for the input modulus:
##
## µ ≡ -1/M[0] (mod SecretWord)
##
## M[0] is the least significant limb of M
## M must be odd and greater than 2.
##
## Assuming 64-bit words:
##
## µ ≡ -1/M[0] (mod 2^64)
checkValidModulus(M)
result = invModBitwidth(M.limbs[0])
# negate to obtain the negative inverse
result = not(result) + 1
# ############################################################
#
# Intermediate Representation
#
# ############################################################
type
WordSize* = enum
size32
size64
Field* = enum
fp
fr
FieldConst* = object
wordTy: TypeRef
fieldTy: TypeRef
modulus*: seq[ConstValueRef]
m0ninv*: ConstValueRef
bits*: uint32
spareBits*: uint8
CurveMetadata* = object
curve*: Curve
prefix*: string
wordSize*: WordSize
fp*: FieldConst
fr*: FieldConst
Opcode* = enum
opFpAdd = "fp_add"
opFrAdd = "fr_add"
proc setFieldConst(fc: var FieldConst, ctx: ContextRef, wordSize: WordSize, modBits: uint32, modulus: string) =
let wordTy = case wordSize
of size32: ctx.int32_t()
of size64: ctx.int64_t()
let wordBitwidth = case wordSize
of size32: 32'u32
of size64: 64'u32
let numWords = wordsRequiredForBits(modBits, wordBitwidth)
fc.wordTy = wordTy
fc.fieldTy = array_t(wordTy, numWords)
case wordSize
of size32:
let m = BigNum[uint32].fromHex(modBits, modulus)
fc.modulus.setlen(m.limbs.len)
for i in 0 ..< m.limbs.len:
fc.modulus[i] = ctx.int32_t().constInt(m.limbs[i])
fc.m0ninv = ctx.int32_t().constInt(m.negInvModWord())
of size64:
let m = BigNum[uint64].fromHex(modBits, modulus)
fc.modulus.setlen(m.limbs.len)
for i in 0 ..< m.limbs.len:
fc.modulus[i] = ctx.int64_t().constInt(m.limbs[i])
fc.m0ninv = ctx.int64_t().constInt(m.negInvModWord())
debug: doAssert numWords == fc.modulus.len.uint32
fc.bits = modBits
fc.spareBits = uint8(numWords*wordBitwidth - modBits)
proc init*(
C: type CurveMetadata, ctx: ContextRef,
prefix: string, wordSize: WordSize,
fpBits: uint32, fpMod: string,
frBits: uint32, frMod: string): CurveMetadata =
result = C(prefix: prefix, wordSize: wordSize)
result.fp.setFieldConst(ctx, wordSize, fpBits, fpMod)
result.fr.setFieldConst(ctx, wordSize, frBits, frMod)
proc hash*(curveOp: tuple[cm: CurveMetadata, op: Opcode]): Hash {.inline.} =
result = hash(curveOp.cm.curve) !& int(hash(curveOp.op))
result = !$result
proc genSymbol*(cm: CurveMetadata, opcode: Opcode): string {.inline.} =
cm.prefix &
(if cm.wordSize == size32: "32b_" else: "64b_") &
$opcode
func getFieldType*(cm: CurveMetadata, field: Field): TypeRef {.inline.} =
if field == fp:
return cm.fp.fieldTy
else:
return cm.fr.fieldTy
func getNumWords*(cm: CurveMetadata, field: Field): int {.inline.} =
case field
of fp:
return cm.fp.modulus.len
of fr:
return cm.fr.modulus.len
func getModulus*(cm: CurveMetadata, field: Field): lent seq[ConstValueRef] {.inline.} =
# TODO: replace static typing, the returned type is incorrect for 64-bit
case field
of fp:
return cm.fp.modulus
of fr:
return cm.fr.modulus
func getSpareBits*(cm: CurveMetadata, field: Field): uint8 {.inline.} =
if field == fp:
return cm.fp.sparebits
else:
return cm.fr.sparebits
# ############################################################
#
# Syntax Sugar
#
# ############################################################
# For array access we need to use:
#
# builder.extractValue(array, index, name)
# builder.insertValue(array, index, value, name)
#
# which is very verbose compared to array[index].
# So we wrap in syntactic sugar to improve readability, maintainability and auditability
type
Array* = object
builder: BuilderRef
p: ValueRef
arrayTy: TypeRef
elemTy: TypeRef
int32_t: TypeRef
proc asArray*(builder: BuilderRef, arrayPtr: ValueRef, arrayTy: TypeRef): Array =
Array(
builder: builder,
p: arrayPtr,
arrayTy: arrayTy,
elemTy: arrayTy.getElementType(),
int32_t: arrayTy.getContext().int32_t()
)
proc makeArray*(builder: BuilderRef, arrayTy: TypeRef): Array =
Array(
builder: builder,
p: builder.alloca(arrayTy),
arrayTy: arrayTy,
elemTy: arrayTy.getElementType(),
int32_t: arrayTy.getContext().int32_t()
)
proc makeArray*(builder: BuilderRef, elemTy: TypeRef, len: uint32): Array =
let arrayTy = array_t(elemTy, len)
Array(
builder: builder,
p: builder.alloca(arrayTy),
arrayTy: arrayTy,
elemTy: elemTy,
int32_t: arrayTy.getContext().int32_t()
)
proc `[]`*(a: Array, index: SomeInteger): ValueRef {.inline.}=
# First dereference the array pointer with 0, then access the `index`
let pelem = a.builder.getElementPtr2_InBounds(a.arrayTy, a.p, [ValueRef constInt(a.int32_t, 0), ValueRef constInt(a.int32_t, uint64 index)])
a.builder.load2(a.elemTy, pelem)
proc `[]=`*(a: Array, index: SomeInteger, val: ValueRef) {.inline.}=
let pelem = a.builder.getElementPtr2_InBounds(a.arrayTy, a.p, [ValueRef constInt(a.int32_t, 0), ValueRef constInt(a.int32_t, uint64 index)])
a.builder.store(val, pelem)
proc store*(builder: BuilderRef, dst: Array, src: Array) {.inline.}=
let v = builder.load2(src.arrayTy, src.p)
builder.store(v, dst.p)

View File

@ -0,0 +1,212 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ./bindings/llvm_abi {.all.}
export llvm_abi
# ############################################################
#
# LLVM API
#
# ############################################################
# This file exposes a Nimified LLVM API.
# Most functions are reexported as-is except functions involving:
# - LLVM cstring
# - LLVM Memory buffer
# - LLVM bool
# - LLVM metadata
# The cstring and memory buffers require manual memory management on the LLVM side.
# The LLVM bool uses 32-bit representation instead of 1-bit.
# LLVM metadata is easier to use with dedicated procedures.
# ⚠ Warning: To provide full diagnostic (filename:line), we wrap some LLVM procedures in template
# Templates copy-paste their inputs parameters.
# For example if `module` parameter is passed `foo.launchMissiles()`
# and that parameter is used twice within the template, `foo.launchMissiles()` will be called twice.
# Module
# ------------------------------------------------------------
proc createModule*(ctx: ContextRef, name: cstring): ModuleRef {.inline.} =
createModule(name, ctx)
proc `$`*(ty: ModuleRef): string =
let s = ty.toIR_LLVMstring()
result = $cstring(s)
s.dispose()
proc toBitcode*(m: ModuleRef): seq[byte] =
## Print a module IR to bitcode
let mb = m.writeBitcodeToMemoryBuffer()
let len = mb.getBufferSize()
result.newSeq(len)
copyMem(result[0].addr, mb.getBufferStart(), len)
mb.dispose()
template verify*(module: ModuleRef, failureAction: VerifierFailureAction) =
## Verify the IR code in a module
var errMsg: LLVMstring
let err = bool verify(module, failureAction, errMsg)
if err:
writeStackTrace()
stderr.write("\"verify\" for module '" & astToStr(module) & "' " & $instantiationInfo() & " exited with error: " & $cstring(errMsg) & '\n')
errMsg.dispose()
quit 1
proc getIdentifier*(module: ModuleRef): string =
var rLen: csize_t
let rStr = getIdentifier(module, rLen)
result = newString(rLen.int)
copyMem(result[0].addr, rStr, rLen.int)
# Target
# ------------------------------------------------------------
template toTarget*(triple: cstring): TargetRef =
var target: TargetRef
var errMsg: LLVMstring
let err = bool triple.getTargetFromTriple(target, errMsg)
if err:
writeStackTrace()
echo "\"toTarget\" for triple '", triple, "' " & $instantiationInfo() & " exited with error: " & $cstring(errMsg) & '\n'
errMsg.dispose()
quit 1
target
proc initializeFullNativeTarget* {.inline.} =
static: doAssert defined(amd64) or defined(i386), "Only x86 is configured at the moment"
initializeX86TargetInfo()
initializeX86Target()
initializeX86TargetMC()
# With usual `initializeNativeTarget`
# it's a separate call but it's mandatory so include it
initializeX86AsmPrinter()
proc initializeFullNVPTXTarget* {.inline.} =
initializeNVPTXTargetInfo()
initializeNVPTXTarget()
initializeNVPTXTargetMC()
initializeNVPTXAsmPrinter()
# Execution Engine
# ------------------------------------------------------------
template createJITCompilerForModule*(
engine: var ExecutionEngineRef,
module: ModuleRef,
optLevel: uint32) =
var errMsg: LLVMstring
let err = bool createJITCompilerForModule(engine, module, optLevel, errMsg)
if err:
writeStackTrace()
stderr.write("\"createJITCompilerForModule\" for module '" & astToStr(module) & "' " & $instantiationInfo() & " exited with error: " & $cstring(errMsg) & '\n')
errMsg.dispose()
quit 1
# Target Machine
# ------------------------------------------------------------
template emitToFile*(t: TargetMachineRef, m: ModuleRef,
fileName: string, codegen: CodeGenFileType) =
var errMsg: LLVMstring
let err = bool targetMachineEmitToFile(t, m, cstring(fileName), codegen, errMsg)
if err:
writeStackTrace()
stderr.write("\"emitToFile\" for module '" & astToStr(module) & "' " & $instantiationInfo() & " exited with error: " & $cstring(errMsg) & '\n')
errMsg.dispose()
quit 1
template emitToString*(t: TargetMachineRef, m: ModuleRef, codegen: CodeGenFileType): string =
## Codegen to string
var errMsg: LLVMstring
var mb: MemoryBufferRef
let err = bool targetMachineEmitToMemoryBuffer(t, m, codegen, errMsg, mb)
if err:
writeStackTrace()
stderr.write("\"emitToString\" for module '" & astToStr(module) & "' " & $instantiationInfo() & " exited with error: " & $cstring(errMsg) & '\n')
errMsg.dispose()
quit 1
let len = mb.getBufferSize()
var emitted = newString(len)
copyMem(emitted[0].addr, mb.getBufferStart(), len)
mb.dispose()
emitted
# Target Machine
# ------------------------------------------------------------
proc initializePasses* =
let registry = getGlobalPassRegistry()
# TODO: Some passes in llc aren't exposed
# https://github.com/llvm/llvm-project/blob/main/llvm/tools/llc/llc.cpp
registry.initializeCore()
registry.initializeTransformUtils()
registry.initializeScalarOpts()
registry.initializeObjCARCOpts()
registry.initializeVectorization()
registry.initializeInstCombine()
registry.initializeAggressiveInstCombiner()
registry.initializeIPO()
registry.initializeInstrumentation()
registry.initializeAnalysis()
registry.initializeIPA()
registry.initializeCodeGen()
registry.initializeTarget()
# Builder
# ------------------------------------------------------------
proc getContext*(builder: BuilderRef): ContextRef =
# LLVM C API does not expose IRBuilder.getContext()
# making this unnecessary painful
# https://github.com/llvm/llvm-project/issues/59875
builder.getInsertBlock().getBasicBlockParent().getTypeOf().getContext()
# Types
# ------------------------------------------------------------
proc `$`*(ty: TypeRef): string =
let s = ty.toLLVMstring()
result = $cstring(s)
s.dispose()
proc isVoid*(ty: TypeRef): bool {.inline.} =
ty.getTypeKind == tkVoid
proc pointer_t*(elementTy: TypeRef): TypeRef {.inline.} =
pointerType(elementTy, addressSpace = 0)
proc array_t*(elemType: TypeRef, elemCount: SomeInteger): TypeRef {.inline.}=
array_t(elemType, uint32(elemCount))
proc function_t*(returnType: TypeRef, paramTypes: openArray[TypeRef]): TypeRef {.inline.} =
function_t(returnType, paramTypes, isVarArg = LlvmBool(false))
# Values
# ------------------------------------------------------------
type
ConstValueRef* = distinct ValueRef
AnyValueRef* = ValueRef or ConstValueRef
proc getName*(v: ValueRef): string =
var rLen: csize_t
let rStr = getValueName2(v, rLen)
result = newString(rLen.int)
copyMem(result[0].addr, rStr, rLen.int)
proc constInt*(ty: TypeRef, n: uint64, signExtend = false): ConstValueRef {.inline.} =
ConstValueRef constInt(ty, culonglong(n), LlvmBool(signExtend))
proc constStruct*(constantVals: openArray[ValueRef], packed = false): ConstValueRef {.inline.} =
ConstValueRef constStruct(constantVals, LlvmBool(packed))
proc getTypeOf*(v: ConstValueRef): TypeRef {.borrow.}

View File

@ -0,0 +1,270 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
./bindings/nvidia_abi {.all.},
./bindings/utils,
./llvm, ./ir,
./nvidia_inlineasm,
../primitives
export
nvidia_abi, nvidia_inlineasm,
Flag, flag
# ############################################################
#
# Nvidia GPUs API
#
# ############################################################
# Versioning and hardware support
# ------------------------------------------------------------
# GPU architectures:
# - Kepler Geforce GTX 780, 2012, Compute Capability SM3.5
# - Maxwell Geforce GTX 980, 2014, Compute Capability SM5.2
# - Pascal Geforce GTX 1080, 2016, Compute Capability SM6.1
# - Volta Tesla V100, 2017, Compute Capability SM7.0
# - Turing Geforce RTX 2080, 2018, Compute Capability SM7.5
# - Ampere Geforce RTX 3080, 2020, Compute Capability SM8.6
# - Ada Geforce RTX 4080, 2022, Compute Capability SM8.9
# We likely want to use unified memory in the future to avoid having to copy back and from device explicitly
# - https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
# - https://developer.nvidia.com/blog/unified-memory-in-cuda-6/
#
# Unified memory is fully supported starting from Pascal GPU (GTX 1080, 2016, Compute Capability SM6.0)
# and require Kepler at minimum.
#
# Cuda 9 exposes the current explicit synchronization primitives (cooperative groups) and deprecated the old ones
# Those primitives are particularly suitable for Volta GPUs (GTX 2080, 2018, Compute Capability SM7.5)
# and requiring.
#
# Furthermore Pascal GPUs predates the high demand for deep learning and cryptocurrency mining
# and were widely available at an affordable price point.
# Also given iven that it's a 7 years old architecture,
# it is unlikely that users have an older Nvidia GPU available.
#
# Hence we can target Cuda 9 at minimum (Sept 2017): https://developer.nvidia.com/cuda-toolkit-archive
# which corresponds to PTX ISA 6.0: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes__ptx-release-history
#
# Unfortunately, there is no easy programmatic way to retrieve the PTX ISA version supported
# only the Cuda/Compiler version (https://docs.nvidia.com/cuda/ptx-compiler-api/index.html#group__versioning)
# Hence it's likely easier to ask users to update Cuda in case of ISA incompatibility.
# Cuda Driver API
# ------------------------------------------------------------
template check*(status: CUresult) =
## Check the status code of a CUDA operation
## Exit program with error if failure
let code = status # ensure that the input expression is evaluated once only
if code != CUDA_SUCCESS:
writeStackTrace()
stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n')
quit 1
func cuModuleLoadData*(module: var CUmodule, sourceCode: openArray[char]): CUresult {.inline.}=
cuModuleLoadData(module, sourceCode[0].unsafeAddr)
func cuModuleGetFunction*(kernel: var CUfunction, module: CUmodule, fnName: openArray[char]): CUresult {.inline.}=
cuModuleGetFunction(kernel, module, fnName[0].unsafeAddr)
proc cudaDeviceInit*(deviceID = 0'i32): CUdevice =
check cuInit(deviceID.uint32)
var devCount: int32
check cuDeviceGetCount(devCount)
if devCount == 0:
echo "cudaDeviceInit error: no devices supporting CUDA"
quit 1
var cuDevice: CUdevice
check cuDeviceGet(cuDevice, deviceID)
var name = newString(128)
check cuDeviceGetName(name[0].addr, name.len.int32, cuDevice)
echo "Using CUDA Device [", deviceID, "]: ", cstring(name)
var major, minor: int32
check cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)
check cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)
echo "Compute Capability: SM ", major, ".", minor
if major < 6:
echo "Error: Device ",deviceID," is not sm_60 (Pascal generation, GTX 1080) or later"
quit 1
return cuDevice
# ############################################################
#
# LLVM IR for Nvidia GPUs
#
# ############################################################
proc tagCudaKernel(module: ModuleRef, fn: FnDef) =
## Tag a function as a Cuda Kernel, i.e. callable from host
doAssert fn.fnTy.getReturnType().isVoid(), block:
"Kernels must not return values but function returns " & $fn.fnTy.getReturnType().getTypeKind()
let ctx = module.getContext()
module.addNamedMetadataOperand(
"nvvm.annotations",
ctx.asValueRef(ctx.metadataNode([
fn.fnImpl.asMetadataRef(),
ctx.metadataNode("kernel"),
constInt(ctx.int32_t(), 1, LlvmBool(false)).asMetadataRef()
]))
)
proc setCallableCudaKernel*(module: ModuleRef, fn: FnDef) =
## Create a public wrapper of a cuda device function
##
## A function named `addmod` can be found by appending _public
## check cuModuleGetFunction(fnPointer, cuModule, "addmod_public")
let pubName = fn.fnImpl.getName() & "_public"
let pubFn = module.addFunction(cstring(pubName), fn.fnTy)
let ctx = module.getContext()
let builder = ctx.createBuilder()
defer: builder.dispose()
let blck = ctx.appendBasicBlock(pubFn, "publicKernelBody")
builder.positionAtEnd(blck)
var args = newSeq[ValueRef](fn.fnTy.countParamTypes())
for i, arg in mpairs(args):
arg = pubFn.getParam(i.uint32)
discard builder.call2(fn.fnTy, fn.fnImpl, args)
# A public kernel must return void
builder.retVoid()
module.tagCudaKernel((fn.fnTy, pubFn))
# ############################################################
#
# Code generation
#
# ############################################################
proc codegenNvidiaPTX*(asy: Assembler_LLVM, sm: tuple[major, minor: int32]): string =
## Generate Nvidia PTX via LLVM
## SM corresponds to the target GPU architecture Compute Capability
## - https://developer.nvidia.com/cuda-gpus
## - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
##
## This requires the following function to be called beforehand:
## - initializePasses()
## - initializeFullNVPTXTarget()
debug: doAssert asy.backend == bkNvidiaPTX
asy.module.verify(AbortProcessAction)
const triple = "nvptx64-nvidia-cuda"
let machine = createTargetMachine(
target = toTarget(triple),
triple = triple,
cpu = cstring("sm_" & $sm.major & $sm.minor),
features = "",
level = CodeGenLevelAggressive,
reloc = RelocDefault,
codeModel = CodeModelDefault
)
# https://www.llvm.org/docs/Passes.html
let pm = createPassManager()
machine.addAnalysisPasses(pm)
pm.addDeduceFunctionAttributesPass()
pm.addMemCpyOptPass()
pm.addScalarReplacementOfAggregatesPass()
pm.addPromoteMemoryToRegisterPass()
pm.addGlobalValueNumberingPass()
pm.addDeadStoreEliminationPass()
pm.addInstructionCombiningPass()
pm.addFunctionInliningPass()
pm.addAggressiveDeadCodeEliminationPass()
when false:
# As most (all?) of our code is straightline, unoptimizable inline assembly, no loop and no branches
# most optimizations, even at -O3, are not applicable
let pmb = createPassManagerBuilder()
pmb.setOptLevel(3)
pmb.populateModulePassManager(pm)
pmb.dispose()
pm.run(asy.module)
pm.dispose()
return machine.emitToString(asy.module, AssemblyFile)
# ############################################################
#
# Code execution
#
# ############################################################
proc getCudaKernel*(cuMod: CUmodule, cm: CurveMetadata, opcode: Opcode): CUfunction =
# Public kernels are appended _public
let fnName = cm.genSymbol(opcode) & "_public"
check cuModuleGetFunction(result, cuMod, fnName)
proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) =
## Execute a binary operation in the form r <- op(a, b)
## on Nvidia GPU
# The execution wrapper provided are mostly for testing and debugging low-level kernels
# that serve as building blocks, like field addition or multiplication.
# They aren't parallelizable so we are not concern about the grid and block size.
# We also aren't concerned about the cuda stream when testing.
#
# This is not the case for production kernels (multi-scalar-multiplication, FFT)
# as we want to execute kernels asynchronously then merge results which might require multiple streams.
static: doAssert cpuEndian == littleEndian, block:
# From https://developer.nvidia.com/cuda-downloads?target_os=Linux
# Supported architectures for Cuda are:
# x86-64, PowerPC 64 little-endian, ARM64 (aarch64)
# which are all little-endian at word-level.
#
# Due to limbs being also stored in little-endian, on little-endian host
# the CPU and GPU will have the same binary representation
# whether we use 32-bit or 64-bit words, so naive memcpy can be used for parameter passing.
"Most CPUs (x86-64, ARM) are little-endian, as are Nvidia GPUs, which allows naive copying of parameters.\n" &
"Your architecture '" & $hostCPU & "' is big-endian and GPU offloading is unsupported on it."
# We assume that all arguments are passed by reference in the Cuda kernel, hence the need for GPU alloc.
var rGPU, aGPU, bGPU: CUdeviceptr
check cuMemAlloc(rGPU, csize_t sizeof(r))
check cuMemAlloc(aGPU, csize_t sizeof(a))
check cuMemAlloc(bGPU, csize_t sizeof(b))
check cuMemcpyHtoD(aGPU, a.unsafeAddr, csize_t sizeof(a))
check cuMemcpyHtoD(bGPU, b.unsafeAddr, csize_t sizeof(b))
let params = [pointer(rGPU.addr), pointer(aGPU.addr), pointer(bGPU.addr)]
check cuLaunchKernel(
jitFn,
1, 1, 1, # grid(x, y, z)
1, 1, 1, # block(x, y, z)
sharedMemBytes = 0,
CUstream(nil),
params[0].unsafeAddr, nil)
check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
check cuMemFree(rGPU)
check cuMemFree(aGPU)
check cuMemFree(bGPU)

View File

@ -0,0 +1,377 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
std/[macros, strutils],
./llvm
# ############################################################
#
# Nvidia Inline ASM
#
# ############################################################
# We want to generate procedures related to the following
# instructions -> inline assembly -> argument mapping
# Inline assembly looks like this:
#
# C: asm volatile ("add.cc.u64 %0, %1, %2;" : "=l"(c) : "l"(a), "l"(b) : "memory" );
# LLVM: call i64 asm "add.cc.u64 $0, $1, $2;", "=l,l,l,~{memory}"(i64 %1, i64 %2)
#
# So we need to do the following steps
#
# 1. Collect inline ASM opcodes definition for Nvidia PTX inline assembly
# 2. Generate u32 and u64 `getInlineAsm()` definition (that is associated with an LLVM IR ContextRef)
# 3. Create an initialization proc to be called after initializing the LLVM ContextRef
# For each instruction, return a routine with signature that mirrors LLVM builtin instructions:
#
# proc myInstr(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef =
# let numBits = lhs.getTypeOf().getIntTypeWidth()
# if numBits == 32:
# builder.call2(inlineAsmFnType, inlineAsmFn32, [arg0, arg1, ...], name)
# elif numBits == 64:
# builder.call2(inlineAsmFnType, inlineAsmFn64, [arg0, arg1, ...], name)
# else:
# doAssert false, "Unsupported int" & $numBits
#
# To create `inlineAsmFn32` and `inlineAsmFn64` we may use `getInlineAsm` just before the corresponding
# builder.call2. This allows us to define freestanding functions.
# The potential issue is the overhead of repeated definition of add/sub/mul/muladd
# and their carry-in, carry-out variations.
# LLVM internally ensures that only a single instance will be defined via a HashTable
# Though this will involve thousands of repeated hashing: https://llvm.org/doxygen/InlineAsm_8cpp_source.html#l00043
#
# Alternatively, we can cache the functions created and the design challenge is how to expose
# the routines with the same API as LLVM builder. We could use a global, a wrapper builder type,
# or a template to call at the beginning of each function that setups some boilerplate indirection.
#
# However caching only works if inlineAsmFn32 and inlineAsmFn64 are stable
# but it's very clunky in our case as a fused multiply-addfunction like
# mad.lo.u32 "%0, %1, %2, %3;" "=l,l,l,l" [lmul, rmul, addend]
# can also have immediate operand (a modulus for example) as constraint.
# So we don't cache and rely on LLVM own deduplication.
template selConstraint(operand: auto, append = ""): string =
when operand is ValueRef:
# PTX Assembly:
# r for 32-bit operand
# l for 64-bit operand
# n for immediates
if operand.getTypeOf().getIntTypeWidth() == 32:
"r" & append
else:
"l" & append
else: # ConstValueRef or uint32 or uint64
"n" & append
macro genInstr(body: untyped): untyped =
result = newStmtList()
body.expectKind(nnkStmtList)
for op in body:
op.expectKind(nnkCommand)
doAssert op[0].eqIdent"op"
let instrName = op[1]
# For each op, generate a builder proc
op[2][0].expectKind(nnkTupleConstr)
op[2][0][0].expectKind({nnkStrLit, nnkCurly})
op[2][0][1].expectKind(nnkStrLit)
op[2][0][2].expectKind(nnkStrLit)
op[2][0][3].expectKind(nnkBracket)
let instrBody = newStmtList()
# 1. Detect the size of registers
let numBits = ident"numBits"
let regTy = ident"regTy"
let fnTy = ident"fnTy"
let ctx = ident"ctx"
let lhs = op[2][0][3][0]
instrBody.add quote do:
let `ctx` = builder.getContext()
# lhs: ValueRef or uint32 or uint64
let `numBits` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf().getIntTypeWidth()
else: 8*sizeof(`lhs`)
let `regTy` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf()
elif `lhs` is uint32: `ctx`.int32_t()
elif `lhs` is uint64: `ctx`.int64_t()
else: {.error "Unsupported input type " & $typeof(`lhs`).}
# 2. Create the LLVM asm signature
let operands = op[2][0][3]
let arity = operands.len
let constraintString = op[2][0][2]
let constraints = ident"constraints"
let instr = op[2][0][0]
let hasTwoTypes = instr.kind == nnkCurly
if hasTwoTypes:
# For now only slct has 2 types so we special case to it
instrBody.add quote do:
let `fnTy` = function_t(`regTy`, [`regTy`, `regTy`, `ctx`.int32_t()])
elif arity == 2:
if constraintString.strVal.startsWith('='):
instrBody.add quote do:
let `fnTy` = function_t(`regTy`, [`regTy`, `regTy`])
else:
# We only support out of place "=" function.
# In-place with "+" requires alloca + load/stores in codegen
error "Unsupported constraint: " & constraintString.strVal
elif arity == 3:
if constraintString.strVal.startsWith('='):
instrBody.add quote do:
let `fnTy` = function_t(`regTy`, [`regTy`, `regTy`, `regTy`])
# We only support out of place "=" function.
# In-place with "+" requires alloca + load/stores in codegen
else:
error "Unsupported constraint: " & constraintString.strVal
else:
error "Unsupported arity: " & $arity
# 3. Create the constraints string
# We could have generic constraint string generation, but we only have 2 arities to support
# and codegen without quote do would be even more verbose and hard to read.
if arity == 2:
let op0 = operands[0]
let op1 = operands[1]
instrBody.add quote do:
let `constraints` = block:
var c: string
let constraintRegisterSymbol =
if `numBits` == 32: "r"
else: "l"
when `constraintString`.startsWith('='):
c.add "=" & constraintRegisterSymbol & ','
c.add selConstraint(`op0`,",")
c.add selConstraint(`op1`)
else:
static: doAssert false, " Constraint misconfigured"
when `constraintString`.endsWith(",~{memory}"):
c.add ",~{memory}"
c
elif arity == 3:
let op0 = operands[0]
let op1 = operands[1]
let op2 = operands[2]
instrBody.add quote do:
let `constraints` = block:
var c: string
let constraintRegisterSymbol =
if `numBits` == 32: "r"
else: "l"
when `constraintString`.startsWith('='):
c.add "=" & constraintRegisterSymbol & ','
c.add selConstraint(`op0`,",")
c.add selConstraint(`op1`, ",")
c.add selConstraint(`op2`)
else:
static: doAssert false, " Constraint misconfigured"
when `constraintString`.endsWith(",~{memory}"):
c.add ",~{memory}"
c
else:
error "Unsupported arity: " & $arity
# 4. Register the inline ASM with LLVM
let inlineASM = ident"inlineASM"
let instrParam = op[2][0][1]
let asmString = ident"asmString"
if hasTwoTypes:
# Only slct has 2 types, and it has to be s32, so there is no need to dynamically the type of the parameter at the moment
let mnemo = instr[0]
let type2 = instr[1]
instrBody.add quote do:
let `asmString` = static(`mnemo` & ".u") & $`numBits` & static(`type2` & " " & `instrParam`)
else:
instrBody.add quote do:
let `asmString` = static(`instr` & ".u") & $`numBits` & static(" " & `instrParam`)
instrBody.add quote do:
# Chapter 6 of https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf
# inteldialect is not supported (but the NVPTX dialect is akin to intel dialect)
let `inlineASM` = getInlineAsm(
ty = `fnTy`,
asmString = `asmString`,
constraints = `constraints`,
# All carry instructions have sideffect on carry flag and can't be reordered
# However, function calls can't be reordered and
# by default on NVPTX load/stores, comparisons and arithmetic operations don't affect carry
# flags so it's fine for the compiler to intersperse them.
hasSideEffects = LlvmBool(0),
isAlignStack = LlvmBool(0),
dialect = InlineAsmDialectATT,
canThrow = LlvmBool(0)
)
# 5. Call it
let opArray = nnkBracket.newTree()
for op in operands:
# when op is ValueRef: op
# else: constInt(uint64(op))
opArray.add newCall(
bindSym"ValueRef",
nnkWhenStmt.newTree(
nnkElifBranch.newTree(nnkInfix.newTree(ident"is", op, bindSym"AnyValueRef"), op),
nnkElse.newTree(newCall(ident"constInt", regTy, newCall(ident"uint64", op)))
)
)
# builder.call2(ty, inlineASM, [lhs, rhs], name)
instrBody.add newCall(
ident"call2", ident"builder", fnTy,
inlineASM, opArray, ident"name"
)
# 6. Create the function signature
var opDefs: seq[NimNode]
opDefs.add ident"ValueRef" # Return type
opDefs.add newIdentDefs(ident"builder", bindSym"BuilderRef")
block:
var i = 0
for constraint in constraintString.strVal.split(','):
if constraint.startsWith('=') or constraint.startsWith("~{memory}"):
# Don't increment i
continue
elif constraint == "rl":
opDefs.add newIdentDefs(operands[i], ident"ValueRef")
elif constraint == "rln":
opDefs.add newIdentDefs(
operands[i],
nnkInfix.newTree(ident"or",
nnkInfix.newTree(ident"or", ident"AnyValueRef", ident"uint32"),
ident"uint64")
)
elif constraint == "rn":
opDefs.add newIdentDefs(
operands[i],
nnkInfix.newTree(ident"or",
ident"AnyValueRef",
ident"uint32")
)
else:
error "Unsupported constraint: " & constraint
i += 1
opDefs.add newIdentDefs(ident"name", bindSym"cstring", newLit"")
result.add newProc(
name = nnkPostfix.newTree(ident"*", instrName),
params = opDefs,
procType = nnkProcDef,
body = instrBody
)
# Inline PTX assembly
# ------------------------------------------------------------
# See docs/implementation_nvidia_gpus.md for detailed implementation considerations
#
# Need Cuda 11.5.1 at least for madc.hi.u64:
# https://forums.developer.nvidia.com/t/wrong-result-returned-by-madc-hi-u64-ptx-instruction-for-specific-operands/196094
#
# The PTX compilation part is done by NVVM
# but NVVM version is not listed here: https://docs.nvidia.com/cuda/archive/11.5.1/cuda-toolkit-release-notes/index.html
# and nvvmVersion returns the IR version instead of the version of the compiler library.
# Alternatively we use LLVM NVPTX backend instead of Nvidia's NVVM.
#
# Nvidia manual
# ~~~~~~~~~~~~~
#
# https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
# There is a separate constraint letter for each PTX register type:
#
# "h" = .u16 reg
# "r" = .u32 reg
# "l" = .u64 reg
# "f" = .f32 reg
# "d" = .f64 reg
#
# The constraint "n" may be used for immediate integer operands with a known value.
#
#
# 1.2.3. Incorrect Optimization
#
# The compiler assumes that an asm() statement has no side effects except to change the output operands. To ensure that the asm is not deleted or moved during generation of PTX, you should use the volatile keyword, e.g.:
#
# asm volatile ("mov.u32 %0, %%clock;" : "=r"(x));
#
# Normally any memory that is written to will be specified as an out operand, but if there is a hidden side effect on user memory (for example, indirect access of a memory location via an operand), or if you want to stop any memory optimizations around the asm() statement performed during generation of PTX, you can add a “memory” clobbers specification after a 3rd colon, e.g.:
#
# asm volatile ("mov.u32 %0, %%clock;" : "=r"(x) :: "memory");
# asm ("st.u32 [%0], %1;" : "r"(p), "r"(x) :: "memory");
#
# Constantine implementation
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# To encode the allowed constraints we use rl to allow the r and l constraints
# and we use rln to allow r and l constraints and n immediate.
#
# The asm volatile constraint is passed via `hasSideEffects` in getInlineAsm.
#
# For the memory constraint, it is specified the following way:
#
# C: asm volatile ("add.cc.u64 %0, %1, %2;" : "=l"(c) : "l"(a), "l"(b) : "memory" );
# LLVM: call i64 asm "add.u64 $0, $1, $2;", "=l,l,l,~{memory}"(i64 %1, i64 %2)
#
# Instructions that use carries should not be reordered hence need volatile/hasSideEffect
genInstr():
# The PTX is without size indicator i.e. add.cc instead of add.cc.u32
# Both version will be generated.
#
# op name: ("ptx", "args;", "constraints", [params])
# r <- a+b
op add_co: ("add.cc", "$0, $1, $2;", "=rl,rln,rln", [lhs, rhs])
op add_ci: ("addc", "$0, $1, $2;", "=rl,rln,rln", [lhs, rhs])
op add_cio: ("addc.cc", "$0, $1, $2;", "=rl,rln,rln", [lhs, rhs])
# r <- a-b
op sub_bo: ("sub.cc", "$0, $1, $2;", "=rl,rln,rln", [lhs, rhs])
op sub_bi: ("subc", "$0, $1, $2;", "=rl,rln,rln", [lhs, rhs])
op sub_bio: ("subc.cc", "$0, $1, $2;", "=rl,rln,rln", [lhs, rhs])
# r <- a * b >> 32
op mulhi: ("mul.hi", "$0, $1, $2;", "=rl,rln,rln", [lhs, rhs])
# r <- a * b + c
op mulloadd: ("mad.lo", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
op mulloadd_co: ("mad.lo.cc", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
op mulloadd_cio: ("madc.lo.cc", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
# r <- (a * b) >> 32 + c
# r <- (a * b) >> 64 + c
op mulhiadd: ("mad.hi", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
op mulhiadd_co: ("mad.hi.cc", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
op mulhiadd_cio: ("madc.hi.cc", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
# Conditional mov / select
# slct r, a, b, c;
# r <- (c >= 0) ? a : b;
op slct: ({"slct",".s32"}, "$0, $1, $2, $3;", "=rl,rln,rln,rn", [ifPos, ifNeg, condition])
# selp is the classic select operation, however the selector `c` is of type "predicate"
# and quoting the PTX ISA doc
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#manipulating-predicates
# > There is no direct conversion between predicates and integer values, and no direct way to load or store predicate register values. However, setp can be used to generate a predicate from an integer, and the predicate-based select (selp) instruction can be used to generate an integer value based on the value of a predicate; for example:
# > selp.u32 %r1,1,0,%p; // convert predicate to 32-bit value
#
# If selp is more practical than slct in some cases, then it's likely easier to use LLVM builtin IR trunc/icmp + select
# selp r, a, b, c;
# r <- (c == 1) ? a : b;
# op selp: ("selp", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [ifTrue, ifFalse, condition])
# Alternatively, for conditional moves use-cases, we might want to use
# 'setp' to set a predicate and then '@p mov' for predicated moves

View File

@ -0,0 +1,209 @@
# Implementation on Nvidia GPUs
This documentation references useful information for implementing and optimizing for Nvidia GPUs
## Integer instruction bug
The instruction integer fused-multiply-ad with carry-in may
be incorrectly compiled in PTX prior to Cuda 11.5.1:
https://forums.developer.nvidia.com/t/wrong-result-returned-by-madc-hi-u64-ptx-instruction-for-specific-operands/196094
Test case from: https://github.com/tickinbuaa/CudaTest/blob/master/main.cu
```C
#include <cuda_runtime.h>
#include <memory>
__device__
inline void mac_with_carry(uint64_t &lo, uint64_t &hi, const uint64_t &a, const uint64_t &b, const uint64_t &c) {
if (blockIdx.x == 0 && threadIdx.x == 0) {
printf("GPU calculation input: a = %lx b = %lx c = %lx\n", a, b, c);
}
asm("mad.lo.cc.u64 %0, %2, %3, %4;\n\t"
"madc.hi.u64 %1, %2, %3, 0;\n\t"
:"=l"(lo), "=l"(hi): "l"(a), "l"(b), "l"(c));
if (blockIdx.x == 0 && threadIdx.x == 0) {
printf("GPU calculation result: hi = %lx low = %lx\n", hi, lo);
}
}
__global__
void test(uint64_t *out, uint32_t num){
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= num) {
return;
}
uint64_t a = 0x42737a020c0d6393UL;
uint64_t b = 0xffffffff00000001UL;
uint64_t c = 0xc999e990f3f29c6dUL;
mac_with_carry(out[tid << 1], out[(tid << 1) + 1], a, b, c);
}
int main() {
uint64_t *d_out;
uint32_t num = 1;
cudaMalloc(&d_out, num * 2 * sizeof(uint64_t));
const uint32_t BLOCK_SIZE = 256;
uint32_t block_num = (num + BLOCK_SIZE - 1) / BLOCK_SIZE;
test<<<block_num, BLOCK_SIZE>>>(d_out, num);
cudaDeviceSynchronize();
unsigned __int128 a = 0x42737a020c0d6393UL;
unsigned __int128 b = 0xffffffff00000001UL;
unsigned __int128 c = 0xc999e990f3f29c6dUL;
unsigned __int128 result = a * b + c;
printf("Cpu result: hi:%lx low:%lx\n", (uint64_t)((result >> 64) & 0xffffffffffffffffUL), (uint64_t)(result & 0xffffffffffffffffUL));
}
```
## The hidden XMAD instruction
There is a "hidden" instruction called xmad on Nvidia GPUs described in
- Optimizing Modular Multiplication for NVIDIAs Maxwell GPUs\
Niall Emmart , Justin Luitjens , Charles Weems and Cliff Woolley\
https://ieeexplore.ieee.org/abstract/document/7563271
On Maxwell and Pascal GPUs (SM 5.3), there was no native 32-bit integer multiplication, probably due to die size constraint.
So 32-bit mul was based on 16-bit muladd (XMAD) with some PTX->SASS compiler pattern matching to detect optimal XMAD
scheduling.
Starting from Volta (SM 7.0 / RTX 2XXX), there is now an hardware integer multiply again
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions
Code to generate the proper XMAD is available in:
- https://github.com/NVlabs/xmp/blob/0052dbb/src/include/ptx/PTXInliner_impl.h#L371-L384
- https://github.com/NVlabs/CGBN/blob/e8b9d26/include/cgbn/arith/asm.cu#L131-L142
## Double-precision floating point arithmetic
On double-precision floating point arithmetic.
There are some recent papers exploring using the 52-bit mantissa of a float64 to accelerate elliptic curve cryptography.
This is similar to the AVX approaches on CPU.
- Faster Modular Exponentiation Using Double Precision Floating Point Arithmetic on the GPU\
Niall Emmart, Fangyu Zheng, Charles Weems\
https://ieeexplore.ieee.org/document/8464792
- DPF-ECC: Accelerating Elliptic Curve Cryptography with Floating-Point Computing Power of GPUs
Lili Gao, Fangyu Zheng, Niall Emmart, Jiankuo Dong, Jingqiang Lin, C. Weems\
https://ieeexplore.ieee.org/document/9139772
Unfortunately float64 arithmetic is extremely slow on Nvidia GPUs except for Tesla-class GPU due to market segmentation.
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#architecture-8-x
SM 8.0 corresponds to a Tesla A100, and SM 8.6 to RTX 30X0 or Quadro AX000
> A Streaming Multiprocessor (SM) consists of:
> - 64 FP32 cores for single-precision arithmetic operations in devices of compute capability 8.0\
> and 128 FP32 cores in devices of compute capability 8.6, 8.7 and 8.9,
> - 32 FP64 cores for double-precision arithmetic operations in devices of compute capability 8.0\
> and **2 FP64 cores** in devices of compute capability 8.6, 8.7 and 8.9
> - **64 INT32 cores** for integer math
Hence Nvidia choose to replace 30 FP64 cores with 64 FP32 cores on consumer GPU. An understandable business decision since graphics and machine learning use and are benchmarked on FP32 with FP64 being used mostly in scientific and simulation workloads. Hozever for blockchain, it's important for decentralization that as much as possible can run on consumer hardware, Tesla cards are $10k so we want to optimize for consumer GPUs with 1/32 INT32/FP64 throughput ratio.
So assuming 1 cycle per instruction on the matching core, we can do 64 INT32 instructions while we do 2 FP64 instructions, hence 1/32 throughput ratio.
Concretely to emulate 64x64->128 extended precision multiplication we need 4 32-bit multiplications (and fused additions):
```
a₁a₀
* b₁b₀
---------------------------
a₀b₀
a₁b₀
a₀b₁
a₁b₁
```
Assuming we need only 2 FP64 instructions for 64x64->128 integer mul (mul.lo and mul.hi) the throughput ratio would be:
`1/32 (base throughput) * 4 (mul int32 instr) * 1/2 (mul fp64) = 1/16`
In reality:
- we use 52-bit mantissa so we would have calculated only 104 bit
- there is extra addition/substraction, shifting and masking involved
- this significantly increase the chances of mistakes. Furthermore formal verification or fuzzing on GPUs isn't the easiest
## Code generation considerations
### Parameter passing:
- https://reviews.llvm.org/D118084
> The motivation for this change is to allow SROA to eliminate local copies in more cases. Local copies that make it to the generated PTX present a substantial performance hit, as we end up with all threads on the GPU rushing to access their own chunk of very high-latency memory.
Direct parameter passing is easier to analyze but not worthwhile
for large aggregate
### Important optimization passes:
- https://www.llvm.org/docs/Passes.html
- gvn, global value numbering to remove redundant loads
- mem2reg, will promote memory into regisster, memory is expensive in GPUs
> This file promotes memory references to be register references. It promotes alloca instructions which only have loads and stores as uses. An alloca is transformed by using dominator frontiers to place phi nodes, then traversing the function in depth-first order to rewrite loads and stores as appropriate. This is just the standard SSA construction algorithm to construct “pruned” SSA form.
https://stackoverflow.com/a/66082008
- SROA, Scalar Replacement of Aggregates, to remove local copies and alloca. Static indices access help.
as mentioned in https://discourse.llvm.org/t/nvptx-calling-convention-for-aggregate-arguments-passed-by-value/
https://github.com/llvm/llvm-project/issues/51734#issuecomment-981047833
> Local loads/stores on GPU are expensive enough to be worth quite a few extra instructions.
- https://github.com/apc-llc/nvcc-llvm-ir
Note: The dead code/instructions elimination passes might remove the ASM not marked sideeffect/volatile
Ordering GVN before SROA: https://reviews.llvm.org/D111471
If we use "normal" instructions instead of inline assembly, this thread links to many LLVM internal discussions
on the passes that optimize to add-with-carry: https://github.com/llvm/llvm-project/issues/31102
We have:
- InstCombine, for instruction combining (see also: https://reviews.llvm.org/D8889, https://reviews.llvm.org/D124698, https://github.com/llvm/llvm-project/issues/39832)
- CodegenPrepare, for ISA specific codegen
### LLVM NVPTX or Nvidia libNVVM
https://docs.nvidia.com/cuda/libnvvm-api/index.html
https://docs.nvidia.com/pdf/libNVVM_API.pdf
https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html
https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf
⚠ NVVM IR is based on LLVM 7.0.1 IR which dates from december 2018.
There are a couple of caveats:
- LLVM 7.0.1 is usually not available in repo, making installation difficult
- There was a ABI breaking bug making the 7.0.1 and 7.1.0 versions messy (https://www.phoronix.com/news/LLVM-7.0.1-Released)
- LLVM 7.0.1 does not have LLVMBuildCall2 and relies on the deprecated LLVMBuildCall meaning
supporting that and latest LLVM (for AMDGPU and SPIR-V backends) will likely have heavy costs
- When generating a add-with-carry kernel with inline ASM calls from LLVM-14,
if the LLVM IR is passed as bitcode,
the kernel content is silently discarded, this does not happen with built-in add.
It is unsure if it's call2 or inline ASM incompatibility that causes the issues
- When generating a add-with-carry kernel with inline ASM calls from LLVM-14,
if the LLVM IR is passed as testual IR, the code is refused with NVVM_ERROR_INVALID_IR
Hence, using LLVM NVPTX backend instead of libNVVM is likely the sustainable way forward
### Register pressure
See this AMD paper https://dl.acm.org/doi/pdf/10.1145/3368826.3377918
However if we want to reduce register pressure we need to store to local memory which is also expensive.
## Parallel reductions
Batch elliptic point addition `r = P₀ + P₁ + ... + Pₙ` and
multi-scalar multiplication (MSM) `r = [k₀]P₀ + [k₁]P₁ + ... + [kₙ]Pₙ`
are reduction operations.
There is a wealth of resources regarding optimized implementations of those.
The baseline is provided by: [Optimizing Parallel Reduction in CUDA, Mark harris](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf)
Then on later architectures:
- https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/
- https://www.irisa.fr/alf/downloads/collange/talks/collange_warp_synchronous_19.pdf
Other interesting resources:
- https://on-demand.gputechconf.com/gtc/2017/presentation/s7622-Kyrylo-perelygin-robust-and-scalable-cuda.pdf \
This explains in great details the cooperative group features
and examples in reduction kernels
- https://github.com/umfranzw/cuda-reduction-example \
This explains and uses overlapping streams for latency hiding
- https://vccvisualization.org/teaching/CS380/CS380_fall2020_lecture_25.pdf
SHFL instruction
- https://unum.cloud/post/2022-01-28-reduce/
- https://github.com/ashvardanian/ParallelReductionsBenchmark \
This provides an overview and benchmark code across CPU (AVX2, OpenMP, TBB), OpenCL, Cuda (Cublas, Thrust, Cub)
- https://diglib.eg.org/bitstream/handle/10.2312/egt20211037/CUDA_day2.pdf
- https://cuda-tutorial.github.io/part3_22.pdf
- https://github.com/CUDA-Tutorial/CodeSamples

View File

@ -37,7 +37,7 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n
- Representation - Representation
- [x] Montgomery Representation - [x] Montgomery Representation
- [ ] Barret Reduction - [ ] Barret Reduction
- [x] Unsaturated Representation - [ ] Unsaturated Representation
- [ ] Mersenne Prime (2ᵏ - 1), - [ ] Mersenne Prime (2ᵏ - 1),
- [ ] Generalized Mersenne Prime (NIST Prime P256: 2^256 - 2^224 + 2^192 + 2^96 - 1) - [ ] Generalized Mersenne Prime (NIST Prime P256: 2^256 - 2^224 + 2^192 + 2^96 - 1)
- [ ] Pseudo-Mersenne Prime (2^m - k for example Edwards25519: 2^255 - 19) - [ ] Pseudo-Mersenne Prime (2^m - k for example Edwards25519: 2^255 - 19)
@ -69,14 +69,14 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n
- Montgomery Squaring - Montgomery Squaring
- [x] Dedicated squaring functions - [x] Dedicated squaring functions
- [x] Fused multiply + reduce - [x] Fused multiply + reduce
- [ ] int128 - [x] int128
- [ ] loop unrolling - [x] loop unrolling
- [x] x86: Full Assembly implementation - [x] x86: Full Assembly implementation
- [x] x86: MULX, ADCX, ADOX instructions - [x] x86: MULX, ADCX, ADOX instructions
- [ ] no-carry optimization for CIOS (Coarsely Integrated Operand Scanning) - [ ] no-carry optimization for CIOS (Coarsely Integrated Operand Scanning)
- Addition chains - Addition chains
- [ ] unreduced squarings/multiplications in addition chains - [x] unreduced squarings/multiplications in addition chains
- Exponentiation - Exponentiation
- [x] variable-time exponentiation - [x] variable-time exponentiation
@ -217,4 +217,4 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n
- [ ] BLS G1: Bowe, endomorphism accelerated - [ ] BLS G1: Bowe, endomorphism accelerated
- [ ] BLS G2: Bowe, endomorphism accelerated - [ ] BLS G2: Bowe, endomorphism accelerated
- [x] BLS G1: Scott, endomorphism accelerated - [x] BLS G1: Scott, endomorphism accelerated
- [x] BLS G2: Scott, endomorphism accelerated - [x] BLS G2: Scott, endomorphism accelerated

1
nim.cfg Normal file
View File

@ -0,0 +1 @@
--hint[Name]:off

View File

@ -0,0 +1,48 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../../constantine/platforms/gpu/llvm
echo "LLVM JIT compiler Hello World"
let ctx = createContext()
let module = ctx.createModule("addition")
let i32 = ctx.int32_t()
let addType = function_t(i32, [i32, i32], isVarArg = LlvmBool(false))
let addBody = module.addFunction("add", addType)
let builder = ctx.createBuilder()
let blck = ctx.appendBasicBlock(addBody, "addBody")
builder.positionAtEnd(blck)
block:
let a = addBody.getParam(0)
let b = addBody.getParam(1)
let sum = builder.add(a, b, "sum")
builder.ret(sum)
module.verify(AbortProcessAction)
var engine: ExecutionEngineRef
block:
initializeFullNativeTarget()
createJITCompilerForModule(engine, module, optLevel = 0)
let jitAdd = cast[proc(a, b: int32): int32 {.noconv.}](
engine.getFunctionAddress("add"))
echo "jitAdd(1, 2) = ", jitAdd(1, 2)
doAssert jitAdd(1, 2) == 1 + 2
block:
# Cleanup
builder.dispose()
engine.dispose() # also destroys the module attached to it
ctx.dispose()
echo "LLVM JIT - SUCCESS"

View File

@ -0,0 +1,326 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../../constantine/platforms/gpu/[llvm, nvidia, bindings/utils]
# ############################################################
#
# NVVM
#
# ############################################################
# https://docs.nvidia.com/cuda/libnvvm-api/index.html
# https://docs.nvidia.com/pdf/libNVVM_API.pdf
# https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html
# https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf
# ⚠ NVVM IR is based on LLVM 7.0.1 IR which dates from december 2018.
# There are a couple of caveats:
# - LLVM 7.0.1 is usually not available in repo, making installation difficult
# - There was a ABI breaking bug making the 7.0.1 and 7.1.0 versions messy (https://www.phoronix.com/news/LLVM-7.0.1-Released)
# - LLVM 7.0.1 does not have LLVMBuildCall2 and relies on the deprecated LLVMBuildCall meaning
# supporting that and latest LLVM (for AMDGPU and SPIR-V backends) will likely have heavy costs
# - When generating a add-with-carry kernel with inline ASM calls from LLVM-14,
# if the LLVM IR is passed as bitcode,
# the kernel content is silently discarded, this does not happen with built-in add.
# It is unsure if it's call2 or inline ASM incompatibility that causes the issues
# - When generating a add-with-carry kernel with inline ASM calls from LLVM-14,
# if the LLVM IR is passed as testual IR, the code is refused with NVVM_ERROR_INVALID_IR
# Hence, using LLVM NVPTX backend instead of libNVVM is likely the sustainable way forward
static: echo "[Constantine] Using library libnvvm.so"
{.passl: "-L/opt/cuda/nvvm/lib64 -lnvvm".}
type
NvvmResult* {.size: sizeof(cint).} = enum
NVVM_SUCCESS = 0
NVVM_ERROR_OUT_OF_MEMORY = 1
NVVM_ERROR_PROGRAM_CREATION_FAILURE = 2
NVVM_ERROR_IR_VERSION_MISMATCH = 3
NVVM_ERROR_INVALID_INPUT = 4
NVVM_ERROR_INVALID_PROGRAM = 5
NVVM_ERROR_INVALID_IR = 6
NVVM_ERROR_INVALID_OPTION = 7
NVVM_ERROR_NO_MODULE_IN_PROGRAM = 8
NVVM_ERROR_COMPILATION = 9
NvvmProgram = distinct pointer
{.push cdecl, importc, dynlib: "libnvvm.so".}
proc nvvmGetErrorString*(r: NvvmResult): cstring
proc nvvmVersion*(major, minor: var int32): NvvmResult
proc nvvmIRVersion*(majorIR, minorIR, majorDbg, minorDbg: var int32): NvvmResult
proc nvvmCreateProgram*(prog: var NvvmProgram): NvvmResult
proc nvvmDestroyProgram*(prog: var NvvmProgram): NvvmResult
proc nvvmAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult {.wrapOpenArrayLenType: csize_t.}
proc nvvmLazyAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult {.wrapOpenArrayLenType: csize_t.}
proc nvvmCompileProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult
proc nvvmVerifyProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult
proc nvvmGetCompiledResultSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetCompiledResult*(prog: NvvmProgram; buffer: ptr char): NvvmResult
proc nvvmGetProgramLogSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetProgramLog*(prog: NvvmProgram; buffer: ptr char): NvvmResult
{.pop.} # {.push cdecl, importc, header: "<nvvm.h>".}
# ############################################################
#
# PTX Codegen
#
# ############################################################
template check*(status: NvvmResult) =
let code = status # Assign so execution is done once only.
if code != NVVM_SUCCESS:
stderr.write astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code
quit 1
proc getNvvmLog(prog: NvvmProgram): string {.used.} =
var logSize: csize_t
check nvvmGetProgramLogSize(prog, logSize)
if logSize > 0:
result = newString(logSize)
check nvvmGetProgramLog(prog, result[0].addr)
proc ptxCodegenViaNvidiaNvvm(module: ModuleRef, sm: tuple[major, minor: int32]): string {.used.} =
## PTX codegen via Nvidia NVVM
# ######################################
# LLVM -> NNVM handover
var prog{.noInit.}: NvvmProgram
check nvvmCreateProgram(prog)
let bitcode = module.toBitcode()
check nvvmAddModuleToProgram(prog, bitcode, cstring module.getIdentifier())
# ######################################
# GPU codegen
check nvvmVerifyProgram(prog, 0, nil)
let options = allocCStringArray(["-arch=compute_" & $sm.major & $sm.minor])
check nvvmCompileProgram(prog, 1, options)
deallocCStringArray(options)
var ptxSize: csize_t
check nvvmGetCompiledResultSize(prog, ptxSize)
result = newString(ptxSize-1) # The NNVM size includes '\0' ending char while Nim excludes it.
check nvvmGetCompiledResult(prog, result[0].addr)
check nvvmDestroyProgram(prog)
proc ptxCodegenViaLlvmNvptx(module: ModuleRef, sm: tuple[major, minor: int32]): string {.used.} =
## PTX codegen via LLVM NVPTX
module.verify(AbortProcessAction)
initializeFullNVPTXTarget()
const triple = "nvptx64-nvidia-cuda"
let machine = createTargetMachine(
target = toTarget(triple),
triple = triple,
cpu = cstring("sm_" & $sm.major & $sm.minor),
features = "",
level = CodeGenLevelAggressive,
reloc = RelocDefault,
codeModel = CodeModelDefault
)
machine.emitToString(module, AssemblyFile)
# ############################################################
#
# Hello world
#
# ############################################################
echo "Nvidia JIT compiler Hello World"
proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKernelName: string) =
# ######################################
# Metadata
const triple = "nvptx64-nvidia-cuda"
# Datalayout for NVVM IR 1.8 (CUDA 11.6)
const datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
# ######################################
# LLVM IR codegen
module.setTarget(triple)
module.setDataLayout(datalayout)
let i128 = ctx.int128_t()
let void_t = ctx.void_t()
let builder = ctx.createBuilder()
defer: builder.dispose()
block:
let addType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
let addKernel = module.addFunction(addKernelName, addType)
let blck = ctx.appendBasicBlock(addKernel, "addBody")
builder.positionAtEnd(blck)
let r = addKernel.getParam(0)
let a = addKernel.getParam(1)
let b = addKernel.getParam(2)
let sum = builder.add(a, b, "sum")
builder.store(sum, r)
builder.retVoid()
module.setCallableCudaKernel((addType, addKernel))
block:
let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
let mulKernel = module.addFunction(mulKernelName, mulType)
let blck = ctx.appendBasicBlock(mulKernel, "mulBody")
builder.positionAtEnd(blck)
let r = mulKernel.getParam(0)
let a = mulKernel.getParam(1)
let b = mulKernel.getParam(2)
let prod = builder.mul(a, b, "prod")
builder.store(prod, r)
builder.retVoid()
module.setCallableCudaKernel((mulType, mulKernel))
module.verify(AbortProcessAction)
block:
echo "================="
echo "LLVM IR output"
echo $module
echo "================="
func toHex*(a: uint64): string =
const hexChars = "0123456789abcdef"
const L = 2*sizeof(uint64)
result = newString(L)
var a = a
for j in countdown(result.len-1, 0):
result[j] = hexChars[a and 0xF]
a = a shr 4
func toString*(a: openArray[uint64]): string =
result = "0x"
for i in countdown(a.len-1, 0):
result.add toHex(a[i])
type
CodegenBackend = enum
PTXviaNvidiaNvvm
PTXviaLlvmNvptx
proc getCudaKernel(cuMod: CUmodule, fnName: string): CUfunction =
check cuModuleGetFunction(result, cuMod, fnName & "_public")
proc main(backend: CodegenBackend) =
#######################################
# GPU init
let cudaDevice = cudaDeviceInit()
var sm: tuple[major, minor: int32]
check cuDeviceGetAttribute(sm.major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cudaDevice)
check cuDeviceGetAttribute(sm.minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cudaDevice)
#######################################
# LLVM IR
let ctx = createContext()
let module = ctx.createModule("test_nnvm")
let addKernelName = "addKernel"
let mulKernelName = "mulKernel"
writeExampleAddMul(ctx, module, addKernelName, mulKernelName)
#######################################
# PTX codegen
let ptx = case backend
of PTXviaNvidiaNvvm:
module.ptxCodegenViaNvidiaNVVM(sm)
of PTXviaLlvmNvptx:
module.ptxCodegenViaLlvmNvptx(sm)
module.dispose()
ctx.dispose()
block:
echo "================="
echo "PTX output"
echo $ptx
echo "================="
#######################################
# GPU JIT
var cuCtx: CUcontext
var cuMod: CUmodule
check cuCtxCreate(cuCtx, 0, cudaDevice)
check cuModuleLoadData(cuMod, ptx)
let addKernel = cuMod.getCudaKernel(addKernelName)
let mulKernel = cuMod.getCudaKernel(mulKernelName)
#######################################
# Kernel launch
var r{.noInit.}, a, b: array[2, uint64]
a[1] = 0x00000000000001FF'u64; a[0] = 0xFFFFFFFFFFFFFFFF'u64
b[1] = 0x0000000000000000'u64; b[0] = 0x0010000000000000'u64
echo "r: ", r.toString()
echo "a: ", a.toString()
echo "b: ", b.toString()
var rGPU: CUdeviceptr
check cuMemAlloc(rGPU, csize_t sizeof(r))
let params = [pointer(rGPU.addr), pointer(a.addr), pointer(b.addr)]
check cuLaunchKernel(
addKernel,
1, 1, 1,
1, 1, 1,
0, CUstream(nil),
params[0].unsafeAddr, nil)
check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
echo "a+b: ", r.toString()
check cuLaunchKernel(
mulKernel,
1, 1, 1,
1, 1, 1,
0, CUstream(nil),
params[0].unsafeAddr, nil)
check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
echo "a*b: ", r.toString()
#######################################
# Cleanup
check cuMemFree(rGPU)
rGPU = CUdeviceptr(nil)
check cuModuleUnload(cuMod)
cuMod = CUmodule(nil)
check cuCtxDestroy(cuCtx)
cuCtx = CUcontext(nil)
echo "\n\nCompilation via Nvidia NVVM\n###########################\n"
main(PTXviaNvidiaNvvm)
echo "\n\nEnd: Compilation via Nvidia NVVM\n################################"
echo "\n\nCompilation via LLVM NVPTX\n##########################\n"
main(PTXviaLlvmNvptx)
echo "\n\nEnd: Compilation via LLVM NVPTX\n###############################"

134
tests/gpu/t_nvidia_fp.nim Normal file
View File

@ -0,0 +1,134 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
# Standard library
std/[unittest, times],
# Internal
../../constantine/platforms/gpu/[llvm, nvidia, ir],
../../constantine/math/config/[curves, type_bigint],
../../constantine/math/io/io_bigints,
../../constantine/math/arithmetic,
../../constantine/math_gpu/fields_nvidia,
# Test utilities
../../helpers/[prng_unsafe, static_for]
var rng: RngState
let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
rng.seed(seed)
echo "\n------------------------------------------------------\n"
echo "test_nvidia_fp xoshiro512** seed: ", seed
const Iters = 10
proc init(T: type CurveMetadata, asy: Assembler_LLVM, curve: static Curve, wordSize: WordSize): T =
CurveMetadata.init(
asy.ctx,
$curve & "_", wordSize,
fpBits = uint32 curve.getCurveBitwidth(),
fpMod = curve.Mod().toHex(),
frBits = uint32 curve.getCurveOrderBitwidth(),
frMod = curve.getCurveOrder().toHex()
)
proc genFieldAddPTX(asy: Assembler_LLVM, cm: CurveMetadata) =
let fpAdd = asy.field_add_gen(cm, fp)
asy.module.setCallableCudaKernel(fpAdd)
let frAdd = asy.field_add_gen(cm, fr)
asy.module.setCallableCudaKernel(frAdd)
# Init LLVM
# -------------------------
initializeFullNVPTXTarget()
initializePasses()
# Init GPU
# -------------------------
let cudaDevice = cudaDeviceInit()
var sm: tuple[major, minor: int32]
check cuDeviceGetAttribute(sm.major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cudaDevice)
check cuDeviceGetAttribute(sm.minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cudaDevice)
proc t_field_add(curve: static Curve) =
# Codegen
# -------------------------
let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & $curve))
let cm32 = CurveMetadata.init(asy, curve, size32)
asy.genFieldAddPTX(cm32)
let cm64 = CurveMetadata.init(asy, curve, size64)
asy.genFieldAddPTX(cm64)
let ptx = asy.codegenNvidiaPTX(sm)
# GPU exec
# -------------------------
var cuCtx: CUcontext
var cuMod: CUmodule
check cuCtxCreate(cuCtx, 0, cudaDevice)
check cuModuleLoadData(cuMod, ptx)
defer:
check cuMod.cuModuleUnload()
check cuCtx.cuCtxDestroy()
let fpAdd32 = cuMod.getCudaKernel(cm32, opFpAdd)
let fpAdd64 = cuMod.getCudaKernel(cm64, opFpAdd)
let frAdd32 = cuMod.getCudaKernel(cm32, opFrAdd)
let frAdd64 = cuMod.getCudaKernel(cm64, opFrAdd)
# Fp
for i in 0 ..< Iters:
let a = rng.random_long01Seq(Fp[curve])
let b = rng.random_long01Seq(Fp[curve])
var rCPU, rGPU_32, rGPU_64: Fp[curve]
rCPU.sum(a, b)
fpAdd32.exec(rGPU_32, a, b)
fpAdd64.exec(rGPU_64, a, b)
doAssert bool(rCPU == rGPU_32)
doAssert bool(rCPU == rGPU_64)
# Fr
for i in 0 ..< Iters:
let a = rng.random_long01Seq(Fr[curve])
let b = rng.random_long01Seq(Fr[curve])
var rCPU, rGPU_32, rGPU_64: Fr[curve]
rCPU.sum(a, b)
frAdd32.exec(rGPU_32, a, b)
frAdd64.exec(rGPU_64, a, b)
doAssert bool(rCPU == rGPU_32)
doAssert bool(rCPU == rGPU_64)
proc main() =
const curves = [
P224,
BN254_Nogami,
BN254_Snarks,
Edwards25519,
Bandersnatch,
Pallas,
Vesta,
P256,
Secp256k1,
BLS12_377,
BLS12_381,
BW6_761
]
suite "[Nvidia GPU] Field Addition":
staticFor i, 0, curves.len:
const curve = curves[i]
test "Nvidia GPU field addition (𝔽p, 𝔽r) for " & $curve:
t_field_add(curve)
main()