[Research] x86 code generator (#234)

* rename compilers -> intrinsics, math_gpu -> math_codegen

* stash x86 codegen in research
Mamy Ratsimbazafy 2023-04-27 21:52:51 +02:00 committed by GitHub
parent c6d9a213f2
commit 33c3a2e8c4
23 changed files with 343 additions and 38 deletions


@@ -7,7 +7,7 @@
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
../platforms/gpu/[llvm, nvidia, ir]
../platforms/code_generator/[llvm, nvidia, ir]
# ############################################################
#
@@ -21,10 +21,10 @@ import
proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) =
## If a >= Modulus: r <- a-M
## else: r <- a
##
## This is constant-time straight-line code.
## Due to warp divergence, the overhead of doing a comparison with short-circuiting might not be worth it on GPU.
##
## To be used when the final subtraction can
## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
@@ -48,15 +48,15 @@ proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field,
let underflowedModulus = bld.sub_bi(overflowedLimbs, 0'u32)
for i in 0 ..< N:
r[i] = bld.slct(scratch[i], a[i], underflowedModulus)
proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) =
## If a >= Modulus: r <- a-M
## else: r <- a
##
## This is constant-time straight-line code.
## Due to warp divergence, the overhead of doing a comparison with short-circuiting might not be worth it on GPU.
##
## To be used when the modulus does not use the full bitwidth of the storing words
## (say using 255 bits for the modulus out of 256 available in words)
@@ -65,7 +65,7 @@ proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r
let scratch = bld.makeArray(fieldTy)
let M = cm.getModulus(field)
let N = M.len
# Now subtract the modulus, and test a < M with the last borrow
scratch[0] = bld.sub_bo(a[0], M[0])
for i in 1 ..< N:
@@ -80,7 +80,7 @@ proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r
proc field_add_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef =
## Generate an optimized modular addition kernel
## with parameters `a, b, modulus: Limbs -> Limbs`
let procName = cm.genSymbol(block:
case field
of fp: opFpAdd
@@ -94,14 +94,14 @@ proc field_add_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef
asy.builder.positionAtEnd(blck)
let bld = asy.builder
let r = bld.asArray(addModKernel.getParam(0), fieldTy)
let a = bld.asArray(addModKernel.getParam(1), fieldTy)
let b = bld.asArray(addModKernel.getParam(2), fieldTy)
let t = bld.makeArray(fieldTy)
let N = cm.getNumWords(field)
t[0] = bld.add_co(a[0], b[0])
for i in 1 ..< N:
t[i] = bld.add_cio(a[i], b[i])
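
For reference, here is a minimal sketch of the LLVM IR this carry chain is intended to lower to on Nvidia GPUs, assuming N = 4 limbs of 64 bits (value names are illustrative; the exact instruction -> inline-assembly mapping is defined in the nvidia_inlineasm section below):

  %t0 = call i64 asm "add.cc.u64 $0, $1, $2;", "=l,l,l"(i64 %a0, i64 %b0)
  %t1 = call i64 asm "addc.cc.u64 $0, $1, $2;", "=l,l,l"(i64 %a1, i64 %b1)
  %t2 = call i64 asm "addc.cc.u64 $0, $1, $2;", "=l,l,l"(i64 %a2, i64 %b2)
  %t3 = call i64 asm "addc.cc.u64 $0, $1, $2;", "=l,l,l"(i64 %a3, i64 %b3)

add_co maps to PTX add.cc (sets the carry flag), while add_cio maps to addc.cc (consumes and propagates it).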


@@ -6,7 +6,7 @@
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ./compilers/bitops
import ./intrinsics/bitops
# ############################################################
#


@@ -24,7 +24,7 @@ type
ctx*: ContextRef
module*: ModuleRef
builder*: BuilderRef
i1_t*, i32_t*, i64_t*, void_t*: TypeRef
i1_t*, i32_t*, i64_t*, i128_t*, void_t*: TypeRef
backend*: Backend
Backend* = enum
@@ -54,7 +54,8 @@ proc new*(T: type Assembler_LLVM, backend: Backend, moduleName: cstring): Assemb
result.builder = result.ctx.createBuilder()
result.i1_t = result.ctx.int1_t()
result.i32_t = result.ctx.int32_t()
result.i64_t = result.ctx.int32_t()
result.i64_t = result.ctx.int64_t()
result.i128_t = result.ctx.int128_t()
result.void_t = result.ctx.void_t()
result.backend = backend


@@ -20,7 +20,7 @@ import
# instructions -> inline assembly -> argument mapping
# Inline assembly looks like this:
#
# C: asm volatile ("add.cc.u64 %0, %1, %2;" : "=l"(c) : "l"(a), "l"(b) : "memory" );
# LLVM: call i64 asm "add.cc.u64 $0, $1, $2;", "=l,l,l,~{memory}"(i64 %1, i64 %2)
#
@@ -30,16 +30,16 @@ import
# 2. Generate u32 and u64 `getInlineAsm()` definition (that is associated with an LLVM IR ContextRef)
# 3. Create an initialization proc to be called after initializing the LLVM ContextRef
# For each instruction, return a routine with signature that mirrors LLVM builtin instructions:
#
# proc myInstr(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef =
# let numBits = lhs.getTypeOf().getIntTypeWidth()
# if numBits == 32:
# builder.call2(inlineAsmFnType, inlineAsmFn32, [arg0, arg1, ...], name)
# elif numBits == 64:
# builder.call2(inlineAsmFnType, inlineAsmFn64, [arg0, arg1, ...], name)
# else:
# doAssert false, "Unsupported int" & $numBits
#
# To create `inlineAsmFn32` and `inlineAsmFn64` we may use `getInlineAsm` just before the corresponding
# builder.call2. This allows us to define freestanding functions.
# The potential issue is the overhead of repeated definition of add/sub/mul/muladd
@@ -94,7 +94,7 @@ macro genInstr(body: untyped): untyped =
let fnTy = ident"fnTy"
let ctx = ident"ctx"
let lhs = op[2][0][3][0]
instrBody.add quote do:
let `ctx` = builder.getContext()
# lhs: ValueRef or uint32 or uint64
@@ -143,6 +143,8 @@ macro genInstr(body: untyped): untyped =
# We could have generic constraint string generation, but we only have 2 arities to support
# and codegen without quote do would be even more verbose and hard to read.
# TODO: commutative inputs
if arity == 2:
let op0 = operands[0]
let op1 = operands[1]
@@ -201,7 +203,7 @@ macro genInstr(body: untyped): untyped =
else:
instrBody.add quote do:
let `asmString` = static(`instr` & ".u") & $`numBits` & static(" " & `instrParam`)
instrBody.add quote do:
# Chapter 6 of https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf
# inteldialect is not supported (but the NVPTX dialect is akin to the Intel dialect)
@@ -217,8 +219,7 @@ macro genInstr(body: untyped): untyped =
hasSideEffects = LlvmBool(0),
isAlignStack = LlvmBool(0),
dialect = InlineAsmDialectATT,
canThrow = LlvmBool(0)
)
canThrow = LlvmBool(0))
# 5. Call it
let opArray = nnkBracket.newTree()
@@ -235,8 +236,7 @@ macro genInstr(body: untyped): untyped =
# builder.call2(ty, inlineASM, [lhs, rhs], name)
instrBody.add newCall(
ident"call2", ident"builder", fnTy,
inlineASM, opArray, ident"name"
)
inlineASM, opArray, ident"name")
# 6. Create the function signature
var opDefs: seq[NimNode]
@@ -273,8 +273,7 @@ macro genInstr(body: untyped): untyped =
name = nnkPostfix.newTree(ident"*", instrName),
params = opDefs,
procType = nnkProcDef,
body = instrBody
)
body = instrBody)
# Inline PTX assembly
# ------------------------------------------------------------
@@ -293,7 +292,7 @@ macro genInstr(body: untyped): untyped =
#
# https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
# There is a separate constraint letter for each PTX register type:
#
# "h" = .u16 reg
# "r" = .u32 reg
# "l" = .u64 reg
@@ -304,13 +303,13 @@ macro genInstr(body: untyped): untyped =
#
#
# 1.2.3. Incorrect Optimization
#
# The compiler assumes that an asm() statement has no side effects except to change the output operands. To ensure that the asm is not deleted or moved during generation of PTX, you should use the volatile keyword, e.g.:
#
# asm volatile ("mov.u32 %0, %%clock;" : "=r"(x));
#
# Normally any memory that is written to will be specified as an out operand, but if there is a hidden side effect on user memory (for example, indirect access of a memory location via an operand), or if you want to stop any memory optimizations around the asm() statement performed during generation of PTX, you can add a “memory” clobbers specification after a 3rd colon, e.g.:
#
# asm volatile ("mov.u32 %0, %%clock;" : "=r"(x) :: "memory");
# asm ("st.u32 [%0], %1;" : "r"(p), "r"(x) :: "memory");
#
@@ -331,7 +330,7 @@ macro genInstr(body: untyped): untyped =
genInstr():
# The PTX is without size indicator i.e. add.cc instead of add.cc.u32
# Both versions will be generated.
#
# op name: ("ptx", "args;", "constraints", [params])
@@ -356,7 +355,7 @@ genInstr():
op mulhiadd_cio: ("madc.hi.cc", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
# Conditional mov / select
# slct r, a, b, c;
# r <- (c >= 0) ? a : b;
op slct: ({"slct",".s32"}, "$0, $1, $2, $3;", "=rl,rln,rln,rn", [ifPos, ifNeg, condition])
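
As an illustration of the mapping above, calling the generated slct builder on 32-bit operands is expected to emit IR along these lines (a sketch: the rl constraints resolve to r for u32 and l for u64, and value names are illustrative):

  %r = call i32 asm "slct.s32 $0, $1, $2, $3;", "=r,r,r,r"(i32 %ifPos, i32 %ifNeg, i32 %cond)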


@@ -13,7 +13,7 @@ import
multiplexers,
ct_division
],
compilers/[
intrinsics/[
addcarry_subborrow,
extended_precision,
compiler_optim_hints

research/codegen/nim.cfg (new file, 1 line)

@@ -0,0 +1 @@
--path:../../constantine/platforms/code_generator

research/codegen/x86.nim (new file, 95 lines)

@@ -0,0 +1,95 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
./bindings/c_abi,
./llvm, ./ir,
./x86_inlineasm,
../primitives
export x86_inlineasm
# ############################################################
#
# x86 API
#
# ############################################################
proc defMulExt*(asy: Assembler_LLVM, wordSize: int): FnDef =
let procName = if wordSize == 64: cstring"hw_mulExt64"
else: cstring"hw_mulExt32"
let doublePrec_t = if wordSize == 64: asy.i128_t
else: asy.i64_t
let mulExtTy = if wordSize == 64: function_t(doublePrec_t, [asy.i64_t, asy.i64_t])
else: function_t(doublePrec_t, [asy.i32_t, asy.i32_t])
let mulExtKernel = asy.module.addFunction(procName, mulExtTy)
let blck = asy.ctx.appendBasicBlock(mulExtKernel, "mulExtBody")
asy.builder.positionAtEnd(blck)
let bld = asy.builder
let a = bld.zext(mulExtKernel.getParam(0), doublePrec_t)
let b = bld.zext(mulExtKernel.getParam(1), doublePrec_t)
let r = bld.mul(a, b)
bld.ret r
return (mulExtTy, mulExtKernel)
proc defHi*(asy: Assembler_LLVM, wordSize: int): FnDef =
let procName = if wordSize == 64: cstring"hw_hi64"
else: cstring"hw_hi32"
let doublePrec_t = if wordSize == 64: asy.i128_t
else: asy.i64_t
let singlePrec_t = if wordSize == 64: asy.i64_t
else: asy.i32_t
let hiTy = function_t(singlePrec_t, [doublePrec_t])
let hiKernel = asy.module.addFunction(procName, hiTy)
let blck = asy.ctx.appendBasicBlock(hiKernel, "hiBody")
asy.builder.positionAtEnd(blck)
let bld = asy.builder
# %1 = zext i32 64 to i128
let shift = bld.zext(constInt(asy.i32_t, culonglong wordSize, signExtend = LlvmBool(0)), doublePrec_t)
# %hiLarge = lshr i128 %input, %1
let hiLarge = bld.lshr(hiKernel.getParam(0), shift)
# %hi = trunc i128 %hiLarge to i64
let hi = bld.trunc(hiLarge, singlePrec_t)
bld.ret hi
return (hiTy, hiKernel)
proc defLo*(asy: Assembler_LLVM, wordSize: int): FnDef =
let procName = if wordSize == 64: cstring"hw_lo64"
else: cstring"hw_lo32"
let doublePrec_t = if wordSize == 64: asy.i128_t
else: asy.i64_t
let singlePrec_t = if wordSize == 64: asy.i64_t
else: asy.i32_t
let loTy = function_t(singlePrec_t, [doublePrec_t])
let loKernel = asy.module.addFunction(procName, loTy)
let blck = asy.ctx.appendBasicBlock(loKernel, "loBody")
asy.builder.positionAtEnd(blck)
let bld = asy.builder
# %lo = trunc i128 %input to i64
let lo = bld.trunc(loKernel.getParam(0), singlePrec_t)
bld.ret lo
return (loTy, loKernel)
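
Taken together, these three builders at wordSize = 64 should produce IR roughly equivalent to the following sketch (reconstructed from the builder calls above; SSA value names are illustrative):

  define i128 @hw_mulExt64(i64 %x, i64 %y) {
  mulExtBody:
    %a = zext i64 %x to i128
    %b = zext i64 %y to i128
    %r = mul i128 %a, %b
    ret i128 %r
  }

  define i64 @hw_hi64(i128 %input) {
  hiBody:
    %shift = zext i32 64 to i128
    %hiLarge = lshr i128 %input, %shift
    %hi = trunc i128 %hiLarge to i64
    ret i64 %hi
  }

  define i64 @hw_lo64(i128 %input) {
  loBody:
    %lo = trunc i128 %input to i64
    ret i64 %lo
  }

LLVM is then expected to legalize the i128 multiply into x86 mul (or mulx with the +bmi2 target feature), as outlined in the x86_inlineasm notes below.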


@@ -0,0 +1,209 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
std/[macros, strutils],
./llvm
# ############################################################
#
# x86 Inline ASM
#
# ############################################################
macro genInstr(body: untyped): untyped =
result = newStmtList()
body.expectKind(nnkStmtList)
for op in body:
op.expectKind(nnkCommand)
doAssert op[0].eqIdent"op"
let instrName = op[1]
# For each op, generate a builder proc
op[2][0].expectKind(nnkTupleConstr)
op[2][0][0].expectKind(nnkStrLit)
op[2][0][1].expectKind(nnkStrLit)
op[2][0][2].expectKind(nnkStrLit)
op[2][0][3].expectKind(nnkBracket)
let instrBody = newStmtList()
# 1. Detect the size of registers
let numBits = ident"numBits"
let regTy = ident"regTy"
let fnTy = ident"fnTy"
let ctx = ident"ctx"
let lhs = op[2][0][3][0]
instrBody.add quote do:
let `ctx` = builder.getContext()
# lhs: ValueRef or uint32 or uint64
let `numBits` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf().getIntTypeWidth()
else: 8*sizeof(`lhs`)
let `regTy` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf()
elif `lhs` is uint32: `ctx`.int32_t()
elif `lhs` is uint64: `ctx`.int64_t()
else: {.error: "Unsupported input type " & $typeof(`lhs`).}
# 2. Create the LLVM asm signature
let operands = op[2][0][3]
let arity = operands.len
let constraintString = op[2][0][2]
let constraints = ident"constraints"
let instr = op[2][0][0]
if arity == 2:
if constraintString.strVal.startsWith('='):
if constraintString.strVal.endsWith('r'):
instrBody.add quote do:
let `fnTy` = function_t(`regTy`, [`regTy`, `regTy`])
else:
instrBody.add quote do:
let `fnTy` = function_t(`regTy`, [`regTy`, pointer_t(`regTy`)])
else:
# We only support out-of-place "=" functions.
# In-place with "+" requires alloca + load/stores in codegen.
# In-place functions can be rewritten to be out-of-place with "matching constraints".
error "Unsupported constraint: " & constraintString.strVal
else:
error "Unsupported arity: " & $arity
# 3. Nothing, we can use the constraint string as is on x86
# 4. Register the inline ASM with LLVM
let inlineASM = ident"inlineASM"
let instrParam = op[2][0][1]
let asmString = ident"asmString"
instrBody.add quote do:
let `asmString` = if numBits == 64: static(`instr` & "q") & static(" " & `instrParam`)
else: static(`instr` & "l") & static(" " & `instrParam`)
instrBody.add quote do:
let `inlineASM` = getInlineAsm(
ty = `fnTy`,
asmString = `asmString`,
constraints = `constraintString`,
# All carry/overflow instructions have a side effect on the carry flag and can't be reordered.
# However, function calls can't be reordered anyway.
# Relevant operations that affect flags are:
# - MUL, if the compiler decides not to use MULX
# - XOR, for zeroing a register
hasSideEffects = LlvmBool(0),
isAlignStack = LlvmBool(0),
dialect = InlineAsmDialectATT,
canThrow = LlvmBool(0))
# 5. Call it
let opArray = nnkBracket.newTree()
for op in operands:
# when op is ValueRef: op
# else: constInt(uint64(op))
opArray.add newCall(
bindSym"ValueRef",
nnkWhenStmt.newTree(
nnkElifBranch.newTree(nnkInfix.newTree(ident"is", op, bindSym"AnyValueRef"), op),
nnkElse.newTree(newCall(ident"constInt", regTy, newCall(ident"uint64", op)))
)
)
# builder.call2(ty, inlineASM, [lhs, rhs], name)
instrBody.add newCall(
ident"call2", ident"builder", fnTy,
inlineASM, opArray, ident"name")
# 6. Create the function signature
var opDefs: seq[NimNode]
opDefs.add ident"ValueRef" # Return type
opDefs.add newIdentDefs(ident"builder", bindSym"BuilderRef")
block:
var i = 0
for constraint in constraintString.strVal.split(','):
if constraint.startsWith('=') or constraint.startsWith("~{memory}"):
# Don't increment i
continue
elif constraint == "m":
opDefs.add newIdentDefs(operands[i], ident"ValueRef")
elif constraint.endsWith('r') or constraint.endsWith('0'):
opDefs.add newIdentDefs(
operands[i],
nnkInfix.newTree(ident"or",
nnkInfix.newTree(ident"or", ident"AnyValueRef", ident"uint32"),
ident"uint64")
)
else:
error "Unsupported constraint: " & constraint
i += 1
opDefs.add newIdentDefs(ident"name", bindSym"cstring", newLit"")
result.add newProc(
name = nnkPostfix.newTree(ident"*", instrName),
params = opDefs,
procType = nnkProcDef,
body = instrBody)
# Inline x86 assembly
# ------------------------------------------------------------
#
# We can generate add with carry via
# call { i8, i64 } @llvm.x86.addcarry.64(i8 %carryIn, i64 %a, i64 %b)
#
# We can generate multi-precision mul and mulx via
#
# define {i64, i64} @mul(i64 %x, i64 %y) #0 {
#
# %1 = zext i64 %x to i128
# %2 = zext i64 %y to i128
# %r = mul i128 %1, %2
# %3 = zext i32 64 to i128
# %4 = lshr i128 %r, %3
# %hi = trunc i128 %4 to i64
# %lo = trunc i128 %r to i64
#
# %res_tmp = insertvalue {i64, i64} undef, i64 %hi, 0
# %res = insertvalue {i64, i64} %res_tmp, i64 %lo, 1
#
# ret {i64, i64} %res
# }
#
# attributes #0 = {"target-features"="+bmi2"}
#
# mul:
# mov rax, rdi
# mul rsi
# mov rcx, rax
# mov rax, rdx
# mov rdx, rcx
# ret
#
# mul_bmi2:
# mov rdx, rdi
# mulx rax, rdx, rsi
# ret
#
# Note that mul(hi: var rdx, lo: var rax, a: reg/mem64, b: rax)
# - clobbers carry (and many other) flags
# - has fixed output to rdx:rax registers
# while mulx(hi: var reg64, lo: var reg64, a: reg/mem64, b: rdx)
# - does not clobber flags
# - has flexible register outputs
genInstr():
# We are only concerned about the ADCX/ADOX instructions
# which have no intrinsics and cannot be generated through instruction combining,
# unlike llvm.x86.addcarry.64 which can generate adc
# (cf/of, r) <- a+b+(cf/of)
op adcx_rr: ("adcx", "%2, %0;", "=r,%0,r", [lhs, rhs])
op adcx_rm: ("adcx", "%2, %0;", "=r,0,m", [lhs, rhs])
op adox_rr: ("adox", "%2, %0;", "=r,%0,r", [lhs, rhs])
op adox_rm: ("adox", "%2, %0;", "=r,0,m", [lhs, rhs])
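
For example, invoking the generated adcx_rr builder on two 64-bit values should produce a call like the following (a sketch; the "q" suffix comes from the numBits == 64 branch of asmString above, and value names are illustrative):

  %acc = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %a, i64 %b)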


@@ -6,7 +6,7 @@
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../../constantine/platforms/gpu/llvm
import ../../constantine/platforms/code_generator/llvm
echo "LLVM JIT compiler Hello World"


@@ -6,7 +6,7 @@
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../../constantine/platforms/gpu/[llvm, nvidia, bindings/c_abi]
import ../../constantine/platforms/code_generator/[llvm, nvidia, bindings/c_abi]
# ############################################################
#


@@ -11,12 +11,12 @@ import
# Standard library
std/[unittest, times],
# Internal
../../constantine/platforms/gpu/[llvm, nvidia, ir],
../../constantine/platforms/code_generator/[llvm, nvidia, ir],
../../constantine/platforms/static_for,
../../constantine/math/config/curves,
../../constantine/math/io/io_bigints,
../../constantine/math/arithmetic,
../../constantine/math_gpu/fields_nvidia,
../../constantine/math_codegen/fields_nvidia,
# Test utilities
../../helpers/prng_unsafe