[Research] x86 code generator (#234)

* rename compilers -> intrinsics, math_gpu -> math_codegen

* stash x86 codegen in research
This commit is contained in:
Mamy Ratsimbazafy 2023-04-27 21:52:51 +02:00 committed by GitHub
parent c6d9a213f2
commit 33c3a2e8c4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
23 changed files with 343 additions and 38 deletions

View File

@ -7,7 +7,7 @@
# at your option. This file may not be copied, modified, or distributed except according to those terms. # at your option. This file may not be copied, modified, or distributed except according to those terms.
import import
../platforms/gpu/[llvm, nvidia, ir] ../platforms/code_generator/[llvm, nvidia, ir]
# ############################################################ # ############################################################
# #
@ -21,10 +21,10 @@ import
proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) =
## If a >= Modulus: r <- a-M ## If a >= Modulus: r <- a-M
## else: r <- a ## else: r <- a
## ##
## This is constant-time straightline code. ## This is constant-time straightline code.
## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU. ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU.
## ##
## To be used when the final substraction can ## To be used when the final substraction can
## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256) ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
@ -48,15 +48,15 @@ proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field,
let underflowedModulus = bld.sub_bi(overflowedLimbs, 0'u32) let underflowedModulus = bld.sub_bi(overflowedLimbs, 0'u32)
for i in 0 ..< N: for i in 0 ..< N:
r[i] = bld.slct(scratch[i], a[i], underflowedModulus) r[i] = bld.slct(scratch[i], a[i], underflowedModulus)
proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) =
## If a >= Modulus: r <- a-M ## If a >= Modulus: r <- a-M
## else: r <- a ## else: r <- a
## ##
## This is constant-time straightline code. ## This is constant-time straightline code.
## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU. ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU.
## ##
## To be used when the modulus does not use the full bitwidth of the storing words ## To be used when the modulus does not use the full bitwidth of the storing words
## (say using 255 bits for the modulus out of 256 available in words) ## (say using 255 bits for the modulus out of 256 available in words)
@ -65,7 +65,7 @@ proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r
let scratch = bld.makeArray(fieldTy) let scratch = bld.makeArray(fieldTy)
let M = cm.getModulus(field) let M = cm.getModulus(field)
let N = M.len let N = M.len
# Now substract the modulus, and test a < M with the last borrow # Now substract the modulus, and test a < M with the last borrow
scratch[0] = bld.sub_bo(a[0], M[0]) scratch[0] = bld.sub_bo(a[0], M[0])
for i in 1 ..< N: for i in 1 ..< N:
@ -80,7 +80,7 @@ proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r
proc field_add_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef = proc field_add_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef =
## Generate an optimized modular addition kernel ## Generate an optimized modular addition kernel
## with parameters `a, b, modulus: Limbs -> Limbs` ## with parameters `a, b, modulus: Limbs -> Limbs`
let procName = cm.genSymbol(block: let procName = cm.genSymbol(block:
case field case field
of fp: opFpAdd of fp: opFpAdd
@ -94,14 +94,14 @@ proc field_add_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef
asy.builder.positionAtEnd(blck) asy.builder.positionAtEnd(blck)
let bld = asy.builder let bld = asy.builder
let r = bld.asArray(addModKernel.getParam(0), fieldTy) let r = bld.asArray(addModKernel.getParam(0), fieldTy)
let a = bld.asArray(addModKernel.getParam(1), fieldTy) let a = bld.asArray(addModKernel.getParam(1), fieldTy)
let b = bld.asArray(addModKernel.getParam(2), fieldTy) let b = bld.asArray(addModKernel.getParam(2), fieldTy)
let t = bld.makeArray(fieldTy) let t = bld.makeArray(fieldTy)
let N = cm.getNumWords(field) let N = cm.getNumWords(field)
t[0] = bld.add_co(a[0], b[0]) t[0] = bld.add_co(a[0], b[0])
for i in 1 ..< N: for i in 1 ..< N:
t[i] = bld.add_cio(a[i], b[i]) t[i] = bld.add_cio(a[i], b[i])

View File

@ -6,7 +6,7 @@
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms. # at your option. This file may not be copied, modified, or distributed except according to those terms.
import ./compilers/bitops import ./intrinsics/bitops
# ############################################################ # ############################################################
# #

View File

@ -24,7 +24,7 @@ type
ctx*: ContextRef ctx*: ContextRef
module*: ModuleRef module*: ModuleRef
builder*: BuilderRef builder*: BuilderRef
i1_t*, i32_t*, i64_t*, void_t*: TypeRef i1_t*, i32_t*, i64_t*, i128_t*, void_t*: TypeRef
backend*: Backend backend*: Backend
Backend* = enum Backend* = enum
@ -54,7 +54,8 @@ proc new*(T: type Assembler_LLVM, backend: Backend, moduleName: cstring): Assemb
result.builder = result.ctx.createBuilder() result.builder = result.ctx.createBuilder()
result.i1_t = result.ctx.int1_t() result.i1_t = result.ctx.int1_t()
result.i32_t = result.ctx.int32_t() result.i32_t = result.ctx.int32_t()
result.i64_t = result.ctx.int32_t() result.i64_t = result.ctx.int64_t()
result.i128_t = result.ctx.int128_t()
result.void_t = result.ctx.void_t() result.void_t = result.ctx.void_t()
result.backend = backend result.backend = backend

View File

@ -20,7 +20,7 @@ import
# instructions -> inline assembly -> argument mapping # instructions -> inline assembly -> argument mapping
# Inline assembly looks like this: # Inline assembly looks like this:
# #
# C: asm volatile ("add.cc.u64 %0, %1, %2;" : "=l"(c) : "l"(a), "l"(b) : "memory" ); # C: asm volatile ("add.cc.u64 %0, %1, %2;" : "=l"(c) : "l"(a), "l"(b) : "memory" );
# LLVM: call i64 asm "add.cc.u64 $0, $1, $2;", "=l,l,l,~{memory}"(i64 %1, i64 %2) # LLVM: call i64 asm "add.cc.u64 $0, $1, $2;", "=l,l,l,~{memory}"(i64 %1, i64 %2)
# #
@ -30,16 +30,16 @@ import
# 2. Generate u32 and u64 `getInlineAsm()` definition (that is associated with an LLVM IR ContextRef) # 2. Generate u32 and u64 `getInlineAsm()` definition (that is associated with an LLVM IR ContextRef)
# 3. Create an initialization proc to be called after initializing the LLVM ContextRef # 3. Create an initialization proc to be called after initializing the LLVM ContextRef
# For each instruction, return a routine with signature that mirrors LLVM builtin instructions: # For each instruction, return a routine with signature that mirrors LLVM builtin instructions:
# #
# proc myInstr(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef = # proc myInstr(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef =
# let numBits = lhs.getTypeOf().getIntTypeWidth() # let numBits = lhs.getTypeOf().getIntTypeWidth()
# if numBits == 32: # if numBits == 32:
# builder.call2(inlineAsmFnType, inlineAsmFn32, [arg0, arg1, ...], name) # builder.call2(inlineAsmFnType, inlineAsmFn32, [arg0, arg1, ...], name)
# elif numBits == 64: # elif numBits == 64:
# builder.call2(inlineAsmFnType, inlineAsmFn64, [arg0, arg1, ...], name) # builder.call2(inlineAsmFnType, inlineAsmFn64, [arg0, arg1, ...], name)
# else: # else:
# doAssert false, "Unsupported int" & $numBits # doAssert false, "Unsupported int" & $numBits
# #
# To create `inlineAsmFn32` and `inlineAsmFn64` we may use `getInlineAsm` just before the corresponding # To create `inlineAsmFn32` and `inlineAsmFn64` we may use `getInlineAsm` just before the corresponding
# builder.call2. This allows us to define freestanding functions. # builder.call2. This allows us to define freestanding functions.
# The potential issue is the overhead of repeated definition of add/sub/mul/muladd # The potential issue is the overhead of repeated definition of add/sub/mul/muladd
@ -94,7 +94,7 @@ macro genInstr(body: untyped): untyped =
let fnTy = ident"fnTy" let fnTy = ident"fnTy"
let ctx = ident"ctx" let ctx = ident"ctx"
let lhs = op[2][0][3][0] let lhs = op[2][0][3][0]
instrBody.add quote do: instrBody.add quote do:
let `ctx` = builder.getContext() let `ctx` = builder.getContext()
# lhs: ValueRef or uint32 or uint64 # lhs: ValueRef or uint32 or uint64
@ -143,6 +143,8 @@ macro genInstr(body: untyped): untyped =
# We could have generic constraint string generation, but we only have 2 arities to support # We could have generic constraint string generation, but we only have 2 arities to support
# and codegen without quote do would be even more verbose and hard to read. # and codegen without quote do would be even more verbose and hard to read.
# TODO: commutative inputs
if arity == 2: if arity == 2:
let op0 = operands[0] let op0 = operands[0]
let op1 = operands[1] let op1 = operands[1]
@ -201,7 +203,7 @@ macro genInstr(body: untyped): untyped =
else: else:
instrBody.add quote do: instrBody.add quote do:
let `asmString` = static(`instr` & ".u") & $`numBits` & static(" " & `instrParam`) let `asmString` = static(`instr` & ".u") & $`numBits` & static(" " & `instrParam`)
instrBody.add quote do: instrBody.add quote do:
# Chapter 6 of https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf # Chapter 6 of https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf
# inteldialect is not supported (but the NVPTX dialect is akin to intel dialect) # inteldialect is not supported (but the NVPTX dialect is akin to intel dialect)
@ -217,8 +219,7 @@ macro genInstr(body: untyped): untyped =
hasSideEffects = LlvmBool(0), hasSideEffects = LlvmBool(0),
isAlignStack = LlvmBool(0), isAlignStack = LlvmBool(0),
dialect = InlineAsmDialectATT, dialect = InlineAsmDialectATT,
canThrow = LlvmBool(0) canThrow = LlvmBool(0))
)
# 5. Call it # 5. Call it
let opArray = nnkBracket.newTree() let opArray = nnkBracket.newTree()
@ -235,8 +236,7 @@ macro genInstr(body: untyped): untyped =
# builder.call2(ty, inlineASM, [lhs, rhs], name) # builder.call2(ty, inlineASM, [lhs, rhs], name)
instrBody.add newCall( instrBody.add newCall(
ident"call2", ident"builder", fnTy, ident"call2", ident"builder", fnTy,
inlineASM, opArray, ident"name" inlineASM, opArray, ident"name")
)
# 6. Create the function signature # 6. Create the function signature
var opDefs: seq[NimNode] var opDefs: seq[NimNode]
@ -273,8 +273,7 @@ macro genInstr(body: untyped): untyped =
name = nnkPostfix.newTree(ident"*", instrName), name = nnkPostfix.newTree(ident"*", instrName),
params = opDefs, params = opDefs,
procType = nnkProcDef, procType = nnkProcDef,
body = instrBody body = instrBody)
)
# Inline PTX assembly # Inline PTX assembly
# ------------------------------------------------------------ # ------------------------------------------------------------
@ -293,7 +292,7 @@ macro genInstr(body: untyped): untyped =
# #
# https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints # https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
# There is a separate constraint letter for each PTX register type: # There is a separate constraint letter for each PTX register type:
# #
# "h" = .u16 reg # "h" = .u16 reg
# "r" = .u32 reg # "r" = .u32 reg
# "l" = .u64 reg # "l" = .u64 reg
@ -304,13 +303,13 @@ macro genInstr(body: untyped): untyped =
# #
# #
# 1.2.3. Incorrect Optimization # 1.2.3. Incorrect Optimization
# #
# The compiler assumes that an asm() statement has no side effects except to change the output operands. To ensure that the asm is not deleted or moved during generation of PTX, you should use the volatile keyword, e.g.: # The compiler assumes that an asm() statement has no side effects except to change the output operands. To ensure that the asm is not deleted or moved during generation of PTX, you should use the volatile keyword, e.g.:
# #
# asm volatile ("mov.u32 %0, %%clock;" : "=r"(x)); # asm volatile ("mov.u32 %0, %%clock;" : "=r"(x));
# #
# Normally any memory that is written to will be specified as an out operand, but if there is a hidden side effect on user memory (for example, indirect access of a memory location via an operand), or if you want to stop any memory optimizations around the asm() statement performed during generation of PTX, you can add a “memory” clobbers specification after a 3rd colon, e.g.: # Normally any memory that is written to will be specified as an out operand, but if there is a hidden side effect on user memory (for example, indirect access of a memory location via an operand), or if you want to stop any memory optimizations around the asm() statement performed during generation of PTX, you can add a “memory” clobbers specification after a 3rd colon, e.g.:
# #
# asm volatile ("mov.u32 %0, %%clock;" : "=r"(x) :: "memory"); # asm volatile ("mov.u32 %0, %%clock;" : "=r"(x) :: "memory");
# asm ("st.u32 [%0], %1;" : "r"(p), "r"(x) :: "memory"); # asm ("st.u32 [%0], %1;" : "r"(p), "r"(x) :: "memory");
# #
@ -331,7 +330,7 @@ macro genInstr(body: untyped): untyped =
genInstr(): genInstr():
# The PTX is without size indicator i.e. add.cc instead of add.cc.u32 # The PTX is without size indicator i.e. add.cc instead of add.cc.u32
# Both version will be generated. # Both version will be generated.
# #
# op name: ("ptx", "args;", "constraints", [params]) # op name: ("ptx", "args;", "constraints", [params])
@ -356,7 +355,7 @@ genInstr():
op mulhiadd_cio: ("madc.hi.cc", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend]) op mulhiadd_cio: ("madc.hi.cc", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
# Conditional mov / select # Conditional mov / select
# slct r, a, b, c; # slct r, a, b, c;
# r <- (c >= 0) ? a : b; # r <- (c >= 0) ? a : b;
op slct: ({"slct",".s32"}, "$0, $1, $2, $3;", "=rl,rln,rln,rn", [ifPos, ifNeg, condition]) op slct: ({"slct",".s32"}, "$0, $1, $2, $3;", "=rl,rln,rln,rn", [ifPos, ifNeg, condition])

View File

@ -13,7 +13,7 @@ import
multiplexers, multiplexers,
ct_division ct_division
], ],
compilers/[ intrinsics/[
addcarry_subborrow, addcarry_subborrow,
extended_precision, extended_precision,
compiler_optim_hints compiler_optim_hints

1
research/codegen/nim.cfg Normal file
View File

@ -0,0 +1 @@
--path:../../constantine/platforms/code_generator

95
research/codegen/x86.nim Normal file
View File

@ -0,0 +1,95 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
./bindings/c_abi,
./llvm, ./ir,
./x86_inlineasm,
../primitives
export x86_inlineasm
# ############################################################
#
# x86 API
#
# ############################################################
proc defMulExt*(asy: Assembler_LLVM, wordSize: int): FnDef =
  ## Add an extended-precision multiplication function to `asy.module`.
  ##
  ## When `wordSize` is 64 the function is named `hw_mulExt64` and has type
  ## `(i64, i64) -> i128`; for any other `wordSize` it is named `hw_mulExt32`
  ## and has type `(i32, i32) -> i64`.
  ## Both inputs are zero-extended to the double-width type and multiplied.
  ##
  ## Returns the function type together with the function value.
  ## The builder of `asy` is left positioned at the end of the new body.
  let (symbol, word_t, wide_t) =
    if wordSize == 64: (cstring"hw_mulExt64", asy.i64_t, asy.i128_t)
    else: (cstring"hw_mulExt32", asy.i32_t, asy.i64_t)

  let kernelTy = function_t(wide_t, [word_t, word_t])
  let kernel = asy.module.addFunction(symbol, kernelTy)

  let entry = asy.ctx.appendBasicBlock(kernel, "mulExtBody")
  asy.builder.positionAtEnd(entry)
  let bld = asy.builder

  # Widen first so that the product cannot be truncated.
  let lhsWide = bld.zext(kernel.getParam(0), wide_t)
  let rhsWide = bld.zext(kernel.getParam(1), wide_t)
  let product = bld.mul(lhsWide, rhsWide)
  bld.ret product

  return (kernelTy, kernel)
proc defHi*(asy: Assembler_LLVM, wordSize: int): FnDef =
  ## Add a function extracting the high word of a double-width integer
  ## to `asy.module`.
  ##
  ## When `wordSize` is 64 the function is named `hw_hi64` and has type
  ## `i128 -> i64`; for any other `wordSize` it is named `hw_hi32` and has
  ## type `i64 -> i32`.
  ## The input is logically shifted right by `wordSize` bits, then truncated.
  ##
  ## Returns the function type together with the function value.
  ## The builder of `asy` is left positioned at the end of the new body.
  let (symbol, word_t, wide_t) =
    if wordSize == 64: (cstring"hw_hi64", asy.i64_t, asy.i128_t)
    else: (cstring"hw_hi32", asy.i32_t, asy.i64_t)

  let kernelTy = function_t(word_t, [wide_t])
  let kernel = asy.module.addFunction(symbol, kernelTy)

  let entry = asy.ctx.appendBasicBlock(kernel, "hiBody")
  asy.builder.positionAtEnd(entry)
  let bld = asy.builder

  # %1 = zext i32 <wordSize> to <double-width type>
  let shiftNarrow = constInt(asy.i32_t, culonglong(wordSize), signExtend = LlvmBool(0))
  let shiftWide = bld.zext(shiftNarrow, wide_t)
  # %hiLarge = lshr <double-width type> %input, %1
  let shifted = bld.lshr(kernel.getParam(0), shiftWide)
  # %hi = trunc <double-width type> %hiLarge to <word type>
  let highWord = bld.trunc(shifted, word_t)
  bld.ret highWord

  return (kernelTy, kernel)
proc defLo*(asy: Assembler_LLVM, wordSize: int): FnDef =
  ## Add a function extracting the low word of a double-width integer
  ## to `asy.module`.
  ##
  ## When `wordSize` is 64 the function is named `hw_lo64` and has type
  ## `i128 -> i64`; for any other `wordSize` it is named `hw_lo32` and has
  ## type `i64 -> i32`.
  ## The input is simply truncated to the word type.
  ##
  ## Returns the function type together with the function value.
  ## The builder of `asy` is left positioned at the end of the new body.
  let (symbol, word_t, wide_t) =
    if wordSize == 64: (cstring"hw_lo64", asy.i64_t, asy.i128_t)
    else: (cstring"hw_lo32", asy.i32_t, asy.i64_t)

  let kernelTy = function_t(word_t, [wide_t])
  let kernel = asy.module.addFunction(symbol, kernelTy)

  let entry = asy.ctx.appendBasicBlock(kernel, "loBody")
  asy.builder.positionAtEnd(entry)
  let bld = asy.builder

  # %lo = trunc <double-width type> %input to <word type>
  let lowWord = bld.trunc(kernel.getParam(0), word_t)
  bld.ret lowWord

  return (kernelTy, kernel)

View File

@ -0,0 +1,209 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
std/[macros, strutils],
./llvm
# ############################################################
#
# x86 Inline ASM
#
# ############################################################
macro genInstr(body: untyped): untyped =
  ## Generate one exported LLVM-builder proc per `op` declaration in `body`.
  ##
  ## Each declaration must have the shape
  ##
  ##   op opName: ("mnemonic", "operand template", "constraints", [params])
  ##
  ## and expands to
  ##
  ##   proc opName*(builder: BuilderRef, <params>, name: cstring = ""): ValueRef
  ##
  ## The generated proc registers the instruction as inline assembly with
  ## `getInlineAsm` and emits a call to it with `call2`.
  ## The mnemonic gets the suffix "q" when the first parameter is 64 bits wide
  ## and "l" otherwise.
  ##
  ## Only 2-parameter instructions whose constraint string starts with an
  ## output ("=") constraint are supported; anything else is a compile-time error.
  result = newStmtList()

  body.expectKind(nnkStmtList)
  for op in body:
    op.expectKind(nnkCommand)
    doAssert op[0].eqIdent"op"

    let instrName = op[1]
    # For each op, generate a builder proc
    op[2][0].expectKind(nnkTupleConstr)
    op[2][0][0].expectKind(nnkStrLit)
    op[2][0][1].expectKind(nnkStrLit)
    op[2][0][2].expectKind(nnkStrLit)
    op[2][0][3].expectKind(nnkBracket)

    let instrBody = newStmtList()

    # 1. Detect the size of registers
    # These identifiers are injected as-is in the generated proc
    # so that the successive `quote do` fragments can refer to each other.
    let numBits = ident"numBits"
    let regTy = ident"regTy"
    let fnTy = ident"fnTy"
    let ctx = ident"ctx"
    let lhs = op[2][0][3][0]

    instrBody.add quote do:
      let `ctx` = builder.getContext()
      # lhs: ValueRef or uint32 or uint64
      let `numBits` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf().getIntTypeWidth()
                      else: 8*sizeof(`lhs`)
      let `regTy` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf()
                    elif `lhs` is uint32: `ctx`.int32_t()
                    elif `lhs` is uint64: `ctx`.int64_t()
                    else: {.error: "Unsupported input type " & $typeof(`lhs`).}

    # 2. Create the LLVM asm signature
    let operands = op[2][0][3]
    let arity = operands.len

    let constraintString = op[2][0][2]
    let instr = op[2][0][0]

    if arity == 2:
      if constraintString.strVal.startsWith('='):
        if constraintString.strVal.endsWith('r'):
          # Last operand lives in a register
          instrBody.add quote do:
            let `fnTy` = function_t(`regTy`, [`regTy`, `regTy`])
        else:
          # Last operand is passed as a pointer
          # NOTE(review): LLVM may expect an indirect "*m" constraint
          #               for pointer operands - TODO confirm.
          instrBody.add quote do:
            let `fnTy` = function_t(`regTy`, [`regTy`, pointer_t(`regTy`)])
      else:
        # We only support out of place "=" function.
        # In-place with "+" requires alloca + load/stores in codegen
        # in-place functions can be rewritten to be out-place with "matching constraints"
        error "Unsupported constraint: " & constraintString.strVal
    else:
      error "Unsupported arity: " & $arity

    # 3. Nothing, we can use the constraint string as is on x86

    # 4. Register the inline ASM with LLVM
    let inlineASM = ident"inlineASM"
    let instrParam = op[2][0][1]
    let asmString = ident"asmString"

    # AT&T operand-size suffix: "q" for 64-bit, "l" otherwise
    instrBody.add quote do:
      let `asmString` = if `numBits` == 64: static(`instr` & "q") & static(" " & `instrParam`)
                        else: static(`instr` & "l") & static(" " & `instrParam`)

    instrBody.add quote do:
      let `inlineASM` = getInlineAsm(
        ty = `fnTy`,
        asmString = `asmString`,
        constraints = `constraintString`,
        # All carry/overflow instructions have side effects on the carry flag and can't be reordered
        # However, function calls can't be reordered.
        # Relevant operations that affects flags are:
        # - MUL, if the compiler decides not to use MULX
        # - XOR, for zeroing a register
        # NOTE(review): hasSideEffects is 0 and the constraint strings passed in
        #               do not list the flags as clobbered - confirm this is
        #               enough to keep carry chains intact.
        hasSideEffects = LlvmBool(0),
        isAlignStack = LlvmBool(0),
        dialect = InlineAsmDialectATT,
        canThrow = LlvmBool(0))

    # 5. Call it
    let opArray = nnkBracket.newTree()
    for operand in operands:
      # when operand is ValueRef: operand
      # else: constInt(uint64(operand))
      opArray.add newCall(
        bindSym"ValueRef",
        nnkWhenStmt.newTree(
          nnkElifBranch.newTree(nnkInfix.newTree(ident"is", operand, bindSym"AnyValueRef"), operand),
          nnkElse.newTree(newCall(ident"constInt", regTy, newCall(ident"uint64", operand)))
        )
      )
    # builder.call2(ty, inlineASM, [lhs, rhs], name)
    instrBody.add newCall(
      ident"call2", ident"builder", fnTy,
      inlineASM, opArray, ident"name")

    # 6. Create the function signature
    var opDefs: seq[NimNode]
    opDefs.add ident"ValueRef" # Return type
    opDefs.add newIdentDefs(ident"builder", bindSym"BuilderRef")
    block:
      # `i` indexes the input parameters; outputs and clobbers
      # in the constraint string have no matching parameter.
      var i = 0
      for constraint in constraintString.strVal.split(','):
        if constraint.startsWith('=') or constraint.startsWith("~{memory}"):
          # Don't increment i
          continue
        elif constraint == "m":
          opDefs.add newIdentDefs(operands[i], ident"ValueRef")
        elif constraint.endsWith('r') or constraint.endsWith('0'):
          opDefs.add newIdentDefs(
            operands[i],
            nnkInfix.newTree(ident"or",
              nnkInfix.newTree(ident"or", ident"AnyValueRef", ident"uint32"),
              ident"uint64")
          )
        else:
          error "Unsupported constraint: " & constraint
        i += 1
    opDefs.add newIdentDefs(ident"name", bindSym"cstring", newLit"")

    result.add newProc(
      name = nnkPostfix.newTree(ident"*", instrName),
      params = opDefs,
      procType = nnkProcDef,
      body = instrBody)
# Inline x86 assembly
# ------------------------------------------------------------
#
# We can generate add with carry via
# call { i8, i64 } @llvm.x86.addcarry.64(i8 %carryIn, i64 %a, i64 %b)
#
# We can generate multi-precision mul and mulx via
#
# define {i64, i64} @mul(i64 %x, i64 %y) #0 {
#
# %1 = zext i64 %x to i128
# %2 = zext i64 %y to i128
# %r = mul i128 %1, %2
# %3 = zext i32 64 to i128
# %4 = lshr i128 %r, %3
# %hi = trunc i128 %4 to i64
# %lo = trunc i128 %r to i64
#
# %res_tmp = insertvalue {i64, i64} undef, i64 %hi, 0
# %res = insertvalue {i64, i64} %res_tmp, i64 %lo, 1
#
# ret {i64, i64} %res
# }
#
# attributes #0 = {"target-features"="+bmi2"}
#
# mul:
# mov rax, rdi
# mul rsi
# mov rcx, rax
# mov rax, rdx
# mov rdx, rcx
# ret
#
# mul_bmi2:
# mov rdx, rdi
# mulx rax, rdx, rsi
# ret
#
# Note that mul(hi: var rdx, lo: var rax, a: reg/mem64, b: rax)
# - clobbers carry (and many other) flags
# - has fixed output to rdx:rax registers
# while mulx(hi: var reg64, lo: var reg64, a: reg/mem64, b: rdx)
# - does not clobber flags
# - has flexible register outputs
genInstr():
  # We are only concerned about the ADCX/ADOX instructions
  # which do not have intrinsics or cannot be generated through instruction combining
  # unlike llvm.x86.addcarry.u64 that can generate adc
  #
  # op name: ("mnemonic", "operand template", "constraints", [params])
  #
  # The mnemonic is given without size suffix, i.e. adcx instead of adcxq.
  # Operands are referenced with `$N` in the template: this string is handed
  # to LLVM as-is, and LLVM IR inline assembly uses `$0, $1, ...` where
  # C inline assembly would use `%0, %1, ...`.
  # In the constraint strings, a leading `%` marks a commutative input
  # and `0` ties the input to output operand 0.

  # (cf/of, r) <- a+b+(cf/of)
  op adcx_rr: ("adcx", "$2, $0;", "=r,%0,r", [lhs, rhs])
  op adcx_rm: ("adcx", "$2, $0;", "=r,0,m", [lhs, rhs])
  op adox_rr: ("adox", "$2, $0;", "=r,%0,r", [lhs, rhs])
  op adox_rm: ("adox", "$2, $0;", "=r,0,m", [lhs, rhs])

View File

@ -6,7 +6,7 @@
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms. # at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../../constantine/platforms/gpu/llvm import ../../constantine/platforms/code_generator/llvm
echo "LLVM JIT compiler Hello World" echo "LLVM JIT compiler Hello World"

View File

@ -6,7 +6,7 @@
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms. # at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../../constantine/platforms/gpu/[llvm, nvidia, bindings/c_abi] import ../../constantine/platforms/code_generator/[llvm, nvidia, bindings/c_abi]
# ############################################################ # ############################################################
# #

View File

@ -11,12 +11,12 @@ import
# Standard library # Standard library
std/[unittest, times], std/[unittest, times],
# Internal # Internal
../../constantine/platforms/gpu/[llvm, nvidia, ir], ../../constantine/platforms/code_generator/[llvm, nvidia, ir],
../../constantine/platforms/static_for, ../../constantine/platforms/static_for,
../../constantine/math/config/curves, ../../constantine/math/config/curves,
../../constantine/math/io/io_bigints, ../../constantine/math/io/io_bigints,
../../constantine/math/arithmetic, ../../constantine/math/arithmetic,
../../constantine/math_gpu/fields_nvidia, ../../constantine/math_codegen/fields_nvidia,
# Test utilities # Test utilities
../../helpers/prng_unsafe ../../helpers/prng_unsafe