# Constantine # Copyright (c) 2018-2019 Status Research & Development GmbH # Copyright (c) 2020-Present Mamy André-Ratsimbazafy # Licensed and distributed under either of # * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. import ../../constantine/platforms/gpu/[llvm, nvidia, bindings/utils] # ############################################################ # # NVVM # # ############################################################ # https://docs.nvidia.com/cuda/libnvvm-api/index.html # https://docs.nvidia.com/pdf/libNVVM_API.pdf # https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html # https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf # ⚠ NVVM IR is based on LLVM 7.0.1 IR which dates from december 2018. # There are a couple of caveats: # - LLVM 7.0.1 is usually not available in repo, making installation difficult # - There was a ABI breaking bug making the 7.0.1 and 7.1.0 versions messy (https://www.phoronix.com/news/LLVM-7.0.1-Released) # - LLVM 7.0.1 does not have LLVMBuildCall2 and relies on the deprecated LLVMBuildCall meaning # supporting that and latest LLVM (for AMDGPU and SPIR-V backends) will likely have heavy costs # - When generating a add-with-carry kernel with inline ASM calls from LLVM-14, # if the LLVM IR is passed as bitcode, # the kernel content is silently discarded, this does not happen with built-in add. # It is unsure if it's call2 or inline ASM incompatibility that causes the issues # - When generating a add-with-carry kernel with inline ASM calls from LLVM-14, # if the LLVM IR is passed as testual IR, the code is refused with NVVM_ERROR_INVALID_IR # Hence, using LLVM NVPTX backend instead of libNVVM is likely the sustainable way forward static: echo "[Constantine] Using library libnvvm.so" {.passl: "-L/opt/cuda/nvvm/lib64 -lnvvm".} type NvvmResult* {.size: sizeof(cint).} = enum NVVM_SUCCESS = 0 NVVM_ERROR_OUT_OF_MEMORY = 1 NVVM_ERROR_PROGRAM_CREATION_FAILURE = 2 NVVM_ERROR_IR_VERSION_MISMATCH = 3 NVVM_ERROR_INVALID_INPUT = 4 NVVM_ERROR_INVALID_PROGRAM = 5 NVVM_ERROR_INVALID_IR = 6 NVVM_ERROR_INVALID_OPTION = 7 NVVM_ERROR_NO_MODULE_IN_PROGRAM = 8 NVVM_ERROR_COMPILATION = 9 NvvmProgram = distinct pointer {.push cdecl, importc, dynlib: "libnvvm.so".} proc nvvmGetErrorString*(r: NvvmResult): cstring proc nvvmVersion*(major, minor: var int32): NvvmResult proc nvvmIRVersion*(majorIR, minorIR, majorDbg, minorDbg: var int32): NvvmResult proc nvvmCreateProgram*(prog: var NvvmProgram): NvvmResult proc nvvmDestroyProgram*(prog: var NvvmProgram): NvvmResult proc nvvmAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult {.wrapOpenArrayLenType: csize_t.} proc nvvmLazyAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult {.wrapOpenArrayLenType: csize_t.} proc nvvmCompileProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult proc nvvmVerifyProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult proc nvvmGetCompiledResultSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult proc nvvmGetCompiledResult*(prog: NvvmProgram; buffer: ptr char): NvvmResult proc nvvmGetProgramLogSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult proc nvvmGetProgramLog*(prog: NvvmProgram; buffer: ptr char): NvvmResult {.pop.} # {.push cdecl, importc, header: "".} # ############################################################ # # PTX Codegen # # ############################################################ template check*(status: NvvmResult) = let code = status # Assign so execution is done once only. if code != NVVM_SUCCESS: stderr.write astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code quit 1 proc getNvvmLog(prog: NvvmProgram): string {.used.} = var logSize: csize_t check nvvmGetProgramLogSize(prog, logSize) if logSize > 0: result = newString(logSize) check nvvmGetProgramLog(prog, result[0].addr) proc ptxCodegenViaNvidiaNvvm(module: ModuleRef, sm: tuple[major, minor: int32]): string {.used.} = ## PTX codegen via Nvidia NVVM # ###################################### # LLVM -> NNVM handover var prog{.noInit.}: NvvmProgram check nvvmCreateProgram(prog) let bitcode = module.toBitcode() check nvvmAddModuleToProgram(prog, bitcode, cstring module.getIdentifier()) # ###################################### # GPU codegen check nvvmVerifyProgram(prog, 0, nil) let options = allocCStringArray(["-arch=compute_" & $sm.major & $sm.minor]) check nvvmCompileProgram(prog, 1, options) deallocCStringArray(options) var ptxSize: csize_t check nvvmGetCompiledResultSize(prog, ptxSize) result = newString(ptxSize-1) # The NNVM size includes '\0' ending char while Nim excludes it. check nvvmGetCompiledResult(prog, result[0].addr) check nvvmDestroyProgram(prog) proc ptxCodegenViaLlvmNvptx(module: ModuleRef, sm: tuple[major, minor: int32]): string {.used.} = ## PTX codegen via LLVM NVPTX module.verify(AbortProcessAction) initializeFullNVPTXTarget() const triple = "nvptx64-nvidia-cuda" let machine = createTargetMachine( target = toTarget(triple), triple = triple, cpu = cstring("sm_" & $sm.major & $sm.minor), features = "", level = CodeGenLevelAggressive, reloc = RelocDefault, codeModel = CodeModelDefault ) machine.emitToString(module, AssemblyFile) # ############################################################ # # Hello world # # ############################################################ echo "Nvidia JIT compiler Hello World" proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKernelName: string) = # ###################################### # Metadata const triple = "nvptx64-nvidia-cuda" # Datalayout for NVVM IR 1.8 (CUDA 11.6) const datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" # ###################################### # LLVM IR codegen module.setTarget(triple) module.setDataLayout(datalayout) let i128 = ctx.int128_t() let void_t = ctx.void_t() let builder = ctx.createBuilder() defer: builder.dispose() block: let addType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false)) let addKernel = module.addFunction(addKernelName, addType) let blck = ctx.appendBasicBlock(addKernel, "addBody") builder.positionAtEnd(blck) let r = addKernel.getParam(0) let a = addKernel.getParam(1) let b = addKernel.getParam(2) let sum = builder.add(a, b, "sum") builder.store(sum, r) builder.retVoid() module.setCallableCudaKernel((addType, addKernel)) block: let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false)) let mulKernel = module.addFunction(mulKernelName, mulType) let blck = ctx.appendBasicBlock(mulKernel, "mulBody") builder.positionAtEnd(blck) let r = mulKernel.getParam(0) let a = mulKernel.getParam(1) let b = mulKernel.getParam(2) let prod = builder.mul(a, b, "prod") builder.store(prod, r) builder.retVoid() module.setCallableCudaKernel((mulType, mulKernel)) module.verify(AbortProcessAction) block: echo "=================" echo "LLVM IR output" echo $module echo "=================" func toHex*(a: uint64): string = const hexChars = "0123456789abcdef" const L = 2*sizeof(uint64) result = newString(L) var a = a for j in countdown(result.len-1, 0): result[j] = hexChars[a and 0xF] a = a shr 4 func toString*(a: openArray[uint64]): string = result = "0x" for i in countdown(a.len-1, 0): result.add toHex(a[i]) type CodegenBackend = enum PTXviaNvidiaNvvm PTXviaLlvmNvptx proc getCudaKernel(cuMod: CUmodule, fnName: string): CUfunction = check cuModuleGetFunction(result, cuMod, fnName & "_public") proc main(backend: CodegenBackend) = ####################################### # GPU init let cudaDevice = cudaDeviceInit() var sm: tuple[major, minor: int32] check cuDeviceGetAttribute(sm.major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cudaDevice) check cuDeviceGetAttribute(sm.minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cudaDevice) ####################################### # LLVM IR let ctx = createContext() let module = ctx.createModule("test_nnvm") let addKernelName = "addKernel" let mulKernelName = "mulKernel" writeExampleAddMul(ctx, module, addKernelName, mulKernelName) ####################################### # PTX codegen let ptx = case backend of PTXviaNvidiaNvvm: module.ptxCodegenViaNvidiaNVVM(sm) of PTXviaLlvmNvptx: module.ptxCodegenViaLlvmNvptx(sm) module.dispose() ctx.dispose() block: echo "=================" echo "PTX output" echo $ptx echo "=================" ####################################### # GPU JIT var cuCtx: CUcontext var cuMod: CUmodule check cuCtxCreate(cuCtx, 0, cudaDevice) check cuModuleLoadData(cuMod, ptx) let addKernel = cuMod.getCudaKernel(addKernelName) let mulKernel = cuMod.getCudaKernel(mulKernelName) ####################################### # Kernel launch var r{.noInit.}, a, b: array[2, uint64] a[1] = 0x00000000000001FF'u64; a[0] = 0xFFFFFFFFFFFFFFFF'u64 b[1] = 0x0000000000000000'u64; b[0] = 0x0010000000000000'u64 echo "r: ", r.toString() echo "a: ", a.toString() echo "b: ", b.toString() var rGPU: CUdeviceptr check cuMemAlloc(rGPU, csize_t sizeof(r)) let params = [pointer(rGPU.addr), pointer(a.addr), pointer(b.addr)] check cuLaunchKernel( addKernel, 1, 1, 1, 1, 1, 1, 0, CUstream(nil), params[0].unsafeAddr, nil) check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r)) echo "a+b: ", r.toString() check cuLaunchKernel( mulKernel, 1, 1, 1, 1, 1, 1, 0, CUstream(nil), params[0].unsafeAddr, nil) check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r)) echo "a*b: ", r.toString() ####################################### # Cleanup check cuMemFree(rGPU) rGPU = CUdeviceptr(nil) check cuModuleUnload(cuMod) cuMod = CUmodule(nil) check cuCtxDestroy(cuCtx) cuCtx = CUcontext(nil) echo "\n\nCompilation via Nvidia NVVM\n###########################\n" main(PTXviaNvidiaNvvm) echo "\n\nEnd: Compilation via Nvidia NVVM\n################################" echo "\n\nCompilation via LLVM NVPTX\n##########################\n" main(PTXviaLlvmNvptx) echo "\n\nEnd: Compilation via LLVM NVPTX\n###############################"