Mamy Ratsimbazafy c6d9a213f2
Rework assembly to be compatible with LTO (#231)
* rework assembler register/mem and constraint declarations

* Introduce constraint UnmutatedPointerToWriteMem

* Create individual memory cell operands

* [Assembly] fully support indirect memory addressing

* fix calling convention for exported procs

* Prepare for switch to intel syntax to avoid clang constant propagation asm symbol name interfering OR pointer+offset addressing

* use modifiers to prevent bad string mixing from the assembler to the linker of propagated consts

* Assembly: switch to intel syntax

* with working memory operand - now works with LTO on both GCC and clang and constant folding

* use memory operand in more places

* remove some inline now that we have lto

* cleanup compiler config and benches

* tracer shouldn't force dependencies when unused

* fix cc on linux

* nimble fixes

* update README [skip CI]

* update MacOS CI with Homebrew Clang

* oops nimble bindings disappeared

* more nimble fixes

* fix sha256 exported symbol

* improve constraints on modular addition

* Add extra constraint to force reloading of pointer in reg inputs

* Fix LLVM gold linker running out of registers

* workaround MinGW64 GCC 12.2 bad codegen in t_pairing_cyclotomic_subgroup with LTO
2023-04-26 06:58:31 +02:00

519 lines
37 KiB
Nim

# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# ############################################################
#
# Bindings to Nvidia GPUs libraries
#
# ############################################################
import ./c_abi
# ############################################################
#
# Cuda
#
# ############################################################
# Compile-time notice naming the CUDA driver library these bindings target.
static:
  echo "[Constantine] Using library libcuda.so"

# Linker flags: add /opt/cuda/lib64 to the library search path and link
# against libcuda.
# NOTE(review): the search path is hard-coded; confirm it matches the CUDA
# install location on the build machines.
{.passl: "-L/opt/cuda/lib64 -lcuda".}
# Cuda offers 2 APIs:
# - cuda.h the driver API
# - cuda_runtime.h the runtime API
#
# https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html
#
# We need to use the lower-level driver API for JIT modules loading and reloading
type
  CUresult* {.size: sizeof(cint).} = enum
    ## Status codes returned by the CUDA driver API (`CUresult` in `cuda.h`).
    ## The enum is sized as a C `int`.
    ##
    ## Each description is written *after* the member it documents, because
    ## Nim attaches a `##` comment to the preceding entity.
    CUDA_SUCCESS = 0
      ## The API call returned with no errors. In the case of query calls,
      ## this also means that the operation being queried is complete (see
      ## ::cuEventQuery() and ::cuStreamQuery()).
    CUDA_ERROR_INVALID_VALUE = 1
      ## This indicates that one or more of the parameters passed to the API
      ## call is not within an acceptable range of values.
    CUDA_ERROR_OUT_OF_MEMORY = 2
      ## The API call failed because it was unable to allocate enough memory
      ## to perform the requested operation.
    CUDA_ERROR_NOT_INITIALIZED = 3
      ## This indicates that the CUDA driver has not been initialized with
      ## ::cuInit() or that initialization has failed.
    CUDA_ERROR_DEINITIALIZED = 4
      ## This indicates that the CUDA driver is in the process of shutting
      ## down.
    CUDA_ERROR_PROFILER_DISABLED = 5
      ## This indicates profiler is not initialized for this run. This can
      ## happen when the application is running with external profiling tools
      ## like visual profiler.
    CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6
      ## Deprecated: this error return is deprecated as of CUDA 5.0. It is no
      ## longer an error to attempt to enable/disable the profiling via
      ## ::cuProfilerStart or ::cuProfilerStop without initialization.
    CUDA_ERROR_PROFILER_ALREADY_STARTED = 7
      ## Deprecated: this error return is deprecated as of CUDA 5.0. It is no
      ## longer an error to call cuProfilerStart() when profiling is already
      ## enabled.
    CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8
      ## Deprecated: this error return is deprecated as of CUDA 5.0. It is no
      ## longer an error to call cuProfilerStop() when profiling is already
      ## disabled.
    CUDA_ERROR_STUB_LIBRARY = 34
      ## This indicates that the CUDA driver that the application has loaded
      ## is a stub library. Applications that run with the stub rather than a
      ## real driver loaded will result in CUDA API returning this error.
    CUDA_ERROR_DEVICE_UNAVAILABLE = 46
      ## This indicates that requested CUDA device is unavailable at the
      ## current time. Devices are often unavailable due to use of
      ## ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED.
    CUDA_ERROR_NO_DEVICE = 100
      ## This indicates that no CUDA-capable devices were detected by the
      ## installed CUDA driver.
    CUDA_ERROR_INVALID_DEVICE = 101
      ## This indicates that the device ordinal supplied by the user does not
      ## correspond to a valid CUDA device or that the action requested is
      ## invalid for the specified device.
    CUDA_ERROR_DEVICE_NOT_LICENSED = 102
      ## This error indicates that the Grid license is not applied.
    CUDA_ERROR_INVALID_IMAGE = 200
      ## This indicates that the device kernel image is invalid. This can also
      ## indicate an invalid CUDA module.
    CUDA_ERROR_INVALID_CONTEXT = 201
      ## This most frequently indicates that there is no context bound to the
      ## current thread. This can also be returned if the context passed to an
      ## API call is not a valid handle (such as a context that has had
      ## ::cuCtxDestroy() invoked on it). This can also be returned if a user
      ## mixes different API versions (i.e. 3010 context with 3020 API calls).
      ## See ::cuCtxGetApiVersion() for more details.
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202
      ## This indicated that the context being supplied as a parameter to the
      ## API call was already the active context.
      ## Deprecated: this error return is deprecated as of CUDA 3.2. It is no
      ## longer an error to attempt to push the active context via
      ## ::cuCtxPushCurrent().
    CUDA_ERROR_MAP_FAILED = 205
      ## This indicates that a map or register operation has failed.
    CUDA_ERROR_UNMAP_FAILED = 206
      ## This indicates that an unmap or unregister operation has failed.
    CUDA_ERROR_ARRAY_IS_MAPPED = 207
      ## This indicates that the specified array is currently mapped and thus
      ## cannot be destroyed.
    CUDA_ERROR_ALREADY_MAPPED = 208
      ## This indicates that the resource is already mapped.
    CUDA_ERROR_NO_BINARY_FOR_GPU = 209
      ## This indicates that there is no kernel image available that is
      ## suitable for the device. This can occur when a user specifies code
      ## generation options for a particular CUDA source file that do not
      ## include the corresponding device configuration.
    CUDA_ERROR_ALREADY_ACQUIRED = 210
      ## This indicates that a resource has already been acquired.
    CUDA_ERROR_NOT_MAPPED = 211
      ## This indicates that a resource is not mapped.
    CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212
      ## This indicates that a mapped resource is not available for access as
      ## an array.
    CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213
      ## This indicates that a mapped resource is not available for access as
      ## a pointer.
    CUDA_ERROR_ECC_UNCORRECTABLE = 214
      ## This indicates that an uncorrectable ECC error was detected during
      ## execution.
    CUDA_ERROR_UNSUPPORTED_LIMIT = 215
      ## This indicates that the ::CUlimit passed to the API call is not
      ## supported by the active device.
    CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216
      ## This indicates that the ::CUcontext passed to the API call can
      ## only be bound to a single CPU thread at a time but is already
      ## bound to a CPU thread.
    CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217
      ## This indicates that peer access is not supported across the given
      ## devices.
    CUDA_ERROR_INVALID_PTX = 218
      ## This indicates that a PTX JIT compilation failed.
    CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219
      ## This indicates an error with OpenGL or DirectX context.
    CUDA_ERROR_NVLINK_UNCORRECTABLE = 220
      ## This indicates that an uncorrectable NVLink error was detected during
      ## the execution.
    CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221
      ## This indicates that the PTX JIT compiler library was not found.
    CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222
      ## This indicates that the provided PTX was compiled with an unsupported
      ## toolchain.
    CUDA_ERROR_JIT_COMPILATION_DISABLED = 223
      ## This indicates that the PTX JIT compilation was disabled.
    CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224
      ## This indicates that the ::CUexecAffinityType passed to the API call
      ## is not supported by the active device.
    CUDA_ERROR_INVALID_SOURCE = 300
      ## This indicates that the device kernel source is invalid. This
      ## includes compilation/linker errors encountered in device code or user
      ## error.
    CUDA_ERROR_FILE_NOT_FOUND = 301
      ## This indicates that the file specified was not found.
    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302
      ## This indicates that a link to a shared object failed to resolve.
    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303
      ## This indicates that initialization of a shared object failed.
    CUDA_ERROR_OPERATING_SYSTEM = 304
      ## This indicates that an OS call failed.
    CUDA_ERROR_INVALID_HANDLE = 400
      ## This indicates that a resource handle passed to the API call was not
      ## valid. Resource handles are opaque types like ::CUstream and
      ## ::CUevent.
    CUDA_ERROR_ILLEGAL_STATE = 401
      ## This indicates that a resource required by the API call is not in a
      ## valid state to perform the requested operation.
    CUDA_ERROR_NOT_FOUND = 500
      ## This indicates that a named symbol was not found. Examples of symbols
      ## are global/constant variable names, driver function names, texture
      ## names, and surface names.
    CUDA_ERROR_NOT_READY = 600
      ## This indicates that asynchronous operations issued previously have
      ## not completed yet. This result is not actually an error, but must be
      ## indicated differently than ::CUDA_SUCCESS (which indicates
      ## completion). Calls that may return this value include
      ## ::cuEventQuery() and ::cuStreamQuery().
    CUDA_ERROR_ILLEGAL_ADDRESS = 700
      ## While executing a kernel, the device encountered a
      ## load or store instruction on an invalid memory address.
      ## This leaves the process in an inconsistent state and any further CUDA
      ## work will return the same error. To continue using CUDA, the process
      ## must be terminated and relaunched.
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
      ## This indicates that a launch did not occur because it did not have
      ## appropriate resources. This error usually indicates that the user has
      ## attempted to pass too many arguments to the device kernel, or the
      ## kernel launch specifies too many threads for the kernel's register
      ## count. Passing arguments of the wrong size (i.e. a 64-bit pointer
      ## when a 32-bit int is expected) is equivalent to passing too many
      ## arguments and can also result in this error.
    CUDA_ERROR_LAUNCH_TIMEOUT = 702
      ## This indicates that the device kernel took too long to execute. This
      ## can only occur if timeouts are enabled - see the device attribute
      ## ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information.
      ## This leaves the process in an inconsistent state and any further CUDA
      ## work will return the same error. To continue using CUDA, the process
      ## must be terminated and relaunched.
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
      ## This error indicates a kernel launch that uses an incompatible
      ## texturing mode.
    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704
      ## This error indicates that a call to ::cuCtxEnablePeerAccess() is
      ## trying to re-enable peer access to a context which has already
      ## had peer access to it enabled.
    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705
      ## This error indicates that ::cuCtxDisablePeerAccess() is
      ## trying to disable peer access which has not been enabled yet
      ## via ::cuCtxEnablePeerAccess().
    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708
      ## This error indicates that the primary context for the specified
      ## device has already been initialized.
    CUDA_ERROR_CONTEXT_IS_DESTROYED = 709
      ## This error indicates that the context current to the calling thread
      ## has been destroyed using ::cuCtxDestroy, or is a primary context
      ## which has not yet been initialized.
    CUDA_ERROR_ASSERT = 710
      ## A device-side assert triggered during kernel execution. The context
      ## cannot be used anymore, and must be destroyed. All existing device
      ## memory allocations from this context are invalid and must be
      ## reconstructed if the program is to continue using CUDA.
    CUDA_ERROR_TOO_MANY_PEERS = 711
      ## This error indicates that the hardware resources required to enable
      ## peer access have been exhausted for one or more of the devices
      ## passed to ::cuCtxEnablePeerAccess().
    CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712
      ## This error indicates that the memory range passed to
      ## ::cuMemHostRegister() has already been registered.
    CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713
      ## This error indicates that the pointer passed to
      ## ::cuMemHostUnregister() does not correspond to any currently
      ## registered memory region.
    CUDA_ERROR_HARDWARE_STACK_ERROR = 714
      ## While executing a kernel, the device encountered a stack error.
      ## This can be due to stack corruption or exceeding the stack size
      ## limit.
      ## This leaves the process in an inconsistent state and any further CUDA
      ## work will return the same error. To continue using CUDA, the process
      ## must be terminated and relaunched.
    CUDA_ERROR_ILLEGAL_INSTRUCTION = 715
      ## While executing a kernel, the device encountered an illegal
      ## instruction.
      ## This leaves the process in an inconsistent state and any further CUDA
      ## work will return the same error. To continue using CUDA, the process
      ## must be terminated and relaunched.
    CUDA_ERROR_MISALIGNED_ADDRESS = 716
      ## While executing a kernel, the device encountered a load or store
      ## instruction on a memory address which is not aligned.
      ## This leaves the process in an inconsistent state and any further CUDA
      ## work will return the same error. To continue using CUDA, the process
      ## must be terminated and relaunched.
    CUDA_ERROR_INVALID_ADDRESS_SPACE = 717
      ## While executing a kernel, the device encountered an instruction
      ## which can only operate on memory locations in certain address spaces
      ## (global, shared, or local), but was supplied a memory address not
      ## belonging to an allowed address space.
      ## This leaves the process in an inconsistent state and any further CUDA
      ## work will return the same error. To continue using CUDA, the process
      ## must be terminated and relaunched.
    CUDA_ERROR_INVALID_PC = 718
      ## While executing a kernel, the device program counter wrapped its
      ## address space.
      ## This leaves the process in an inconsistent state and any further CUDA
      ## work will return the same error. To continue using CUDA, the process
      ## must be terminated and relaunched.
    CUDA_ERROR_LAUNCH_FAILED = 719
      ## An exception occurred on the device while executing a kernel. Common
      ## causes include dereferencing an invalid device pointer and accessing
      ## out of bounds shared memory. Less common cases can be system
      ## specific - more information about these cases can be found in the
      ## system specific user guide.
      ## This leaves the process in an inconsistent state and any further CUDA
      ## work will return the same error. To continue using CUDA, the process
      ## must be terminated and relaunched.
    CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720
      ## This error indicates that the number of blocks launched per grid for
      ## a kernel that was launched via either ::cuLaunchCooperativeKernel or
      ## ::cuLaunchCooperativeKernelMultiDevice exceeds the maximum number of
      ## blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor
      ## or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the
      ## number of multiprocessors as specified by the device attribute
      ## ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
    CUDA_ERROR_NOT_PERMITTED = 800
      ## This error indicates that the attempted operation is not permitted.
    CUDA_ERROR_NOT_SUPPORTED = 801
      ## This error indicates that the attempted operation is not supported
      ## on the current system or device.
    CUDA_ERROR_SYSTEM_NOT_READY = 802
      ## This error indicates that the system is not yet ready to start any
      ## CUDA work. To continue using CUDA, verify the system configuration is
      ## in a valid state and all required driver daemons are actively
      ## running. More information about this error can be found in the
      ## system specific user guide.
    CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803
      ## This error indicates that there is a mismatch between the versions of
      ## the display driver and the CUDA driver. Refer to the compatibility
      ## documentation for supported versions.
    CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804
      ## This error indicates that the system was upgraded to run with forward
      ## compatibility but the visible hardware detected by CUDA does not
      ## support this configuration. Refer to the compatibility documentation
      ## for the supported hardware matrix or ensure that only supported
      ## hardware is visible during initialization via the
      ## CUDA_VISIBLE_DEVICES environment variable.
    CUDA_ERROR_MPS_CONNECTION_FAILED = 805
      ## This error indicates that the MPS client failed to connect to the MPS
      ## control daemon or the MPS server.
    CUDA_ERROR_MPS_RPC_FAILURE = 806
      ## This error indicates that the remote procedural call between the MPS
      ## server and the MPS client failed.
    CUDA_ERROR_MPS_SERVER_NOT_READY = 807
      ## This error indicates that the MPS server is not ready to accept new
      ## MPS client requests. This error can be returned when the MPS server
      ## is in the process of recovering from a fatal failure.
    CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808
      ## This error indicates that the hardware resources required to create
      ## MPS client have been exhausted.
    CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809
      ## This error indicates that the hardware resources required to support
      ## device connections have been exhausted.
    CUDA_ERROR_MPS_CLIENT_TERMINATED = 810
      ## This error indicates that the MPS client has been terminated by the
      ## server. To continue using CUDA, the process must be terminated and
      ## relaunched.
    CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900
      ## This error indicates that the operation is not permitted when
      ## the stream is capturing.
    CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901
      ## This error indicates that the current capture sequence on the stream
      ## has been invalidated due to a previous error.
    CUDA_ERROR_STREAM_CAPTURE_MERGE = 902
      ## This error indicates that the operation would have resulted in a
      ## merge of two independent capture sequences.
    CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903
      ## This error indicates that the capture was not initiated in this
      ## stream.
    CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904
      ## This error indicates that the capture sequence contains a fork that
      ## was not joined to the primary stream.
    CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905
      ## This error indicates that a dependency would have been created which
      ## crosses the capture sequence boundary. Only implicit in-stream
      ## ordering dependencies are allowed to cross the boundary.
    CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906
      ## This error indicates a disallowed implicit dependency on a current
      ## capture sequence from cudaStreamLegacy.
    CUDA_ERROR_CAPTURED_EVENT = 907
      ## This error indicates that the operation is not permitted on an event
      ## which was last recorded in a capturing stream.
    CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908
      ## A stream capture sequence not initiated with the
      ## ::CU_STREAM_CAPTURE_MODE_RELAXED argument to ::cuStreamBeginCapture
      ## was passed to ::cuStreamEndCapture in a different thread.
    CUDA_ERROR_TIMEOUT = 909
      ## This error indicates that the timeout specified for the wait
      ## operation has lapsed.
    CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910
      ## This error indicates that the graph update was not performed because
      ## it included changes which violated constraints specific to
      ## instantiated graph update.
    CUDA_ERROR_EXTERNAL_DEVICE = 911
      ## This indicates that an async error has occurred in a device outside
      ## of CUDA. If CUDA was waiting for an external device's signal before
      ## consuming shared data, the external device signaled an error
      ## indicating that the data is not valid for consumption. This leaves
      ## the process in an inconsistent state and any further CUDA work will
      ## return the same error. To continue using CUDA, the process must be
      ## terminated and relaunched.
    CUDA_ERROR_INVALID_CLUSTER_SIZE = 912
      ## Indicates a kernel launch error due to cluster misconfiguration.
    CUDA_ERROR_UNKNOWN = 999
      ## This indicates that an unknown internal error has occurred.
type
  CUdevice_attribute* {.size: sizeof(cint).} = enum
    ## Device properties that can be queried with `cuDeviceGetAttribute`.
    ## The enum is sized as a C `int`.
    ##
    ## Values 15, 44, 69 and 96 are not declared here.
    ## NOTE(review): presumably these are the deprecated attributes of
    ## `cuda.h` - confirm before relying on it.
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1 ## Maximum number of threads per block
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2 ## Maximum block dimension X
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3 ## Maximum block dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4 ## Maximum block dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5 ## Maximum grid dimension X
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6 ## Maximum grid dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7 ## Maximum grid dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8 ## Maximum shared memory available per block in bytes
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9 ## Memory available on device for __constant__ variables in a CUDA C kernel in bytes
    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10 ## Warp size in threads
    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11 ## Maximum pitch in bytes allowed by memory copies
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12 ## Maximum number of 32-bit registers available per block
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13 ## Typical clock frequency in kilohertz
    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14 ## Alignment requirement for textures
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16 ## Number of multiprocessors on device
    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17 ## Specifies whether there is a run time limit on kernels
    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18 ## Device is integrated with host memory
    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19 ## Device can map host memory into CUDA address space
    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20 ## Compute mode (See ::CUcomputemode for details)
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21 ## Maximum 1D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22 ## Maximum 2D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23 ## Maximum 2D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24 ## Maximum 3D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25 ## Maximum 3D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26 ## Maximum 3D texture depth
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27 ## Maximum 2D layered texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28 ## Maximum 2D layered texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29 ## Maximum layers in a 2D layered texture
    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30 ## Alignment requirement for surfaces
    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31 ## Device can possibly execute multiple kernels concurrently
    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32 ## Device has ECC support enabled
    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33 ## PCI bus ID of the device
    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34 ## PCI device ID of the device
    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35 ## Device is using TCC driver model
    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36 ## Peak memory clock frequency in kilohertz
    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37 ## Global memory bus width in bits
    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38 ## Size of L2 cache in bytes
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39 ## Maximum resident threads per multiprocessor
    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40 ## Number of asynchronous engines
    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41 ## Device shares a unified address space with the host
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42 ## Maximum 1D layered texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43 ## Maximum layers in a 1D layered texture
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45 ## Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46 ## Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47 ## Alternate maximum 3D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48 ## Alternate maximum 3D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49 ## Alternate maximum 3D texture depth
    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50 ## PCI domain ID of the device
    CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51 ## Pitch alignment requirement for textures
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52 ## Maximum cubemap texture width/height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53 ## Maximum cubemap layered texture width/height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54 ## Maximum layers in a cubemap layered texture
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55 ## Maximum 1D surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56 ## Maximum 2D surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57 ## Maximum 2D surface height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58 ## Maximum 3D surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59 ## Maximum 3D surface height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60 ## Maximum 3D surface depth
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61 ## Maximum 1D layered surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62 ## Maximum layers in a 1D layered surface
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63 ## Maximum 2D layered surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64 ## Maximum 2D layered surface height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65 ## Maximum layers in a 2D layered surface
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66 ## Maximum cubemap surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67 ## Maximum cubemap layered surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68 ## Maximum layers in a cubemap layered surface
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70 ## Maximum 2D linear texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71 ## Maximum 2D linear texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72 ## Maximum 2D linear texture pitch in bytes
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73 ## Maximum mipmapped 2D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74 ## Maximum mipmapped 2D texture height
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75 ## Major compute capability version number
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76 ## Minor compute capability version number
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77 ## Maximum mipmapped 1D texture width
    CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78 ## Device supports stream priorities
    CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79 ## Device supports caching globals in L1
    CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80 ## Device supports caching locals in L1
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81 ## Maximum shared memory available per multiprocessor in bytes
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 ## Maximum number of 32-bit registers available per multiprocessor
    CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83 ## Device can allocate managed memory on this system
    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84 ## Device is on a multi-GPU board
    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85 ## Unique id for a group of devices on the same multi-GPU board
    CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86 ## Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)
    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87 ## Ratio of single precision performance (in floating-point operations per second) to double precision performance
    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88 ## Device supports coherently accessing pageable memory without calling cudaHostRegister on it
    CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89 ## Device can coherently access managed memory concurrently with the CPU
    CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90 ## Device supports compute preemption.
    CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91 ## Device can access host registered memory at the same virtual address as the CPU
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92 ## ::cuStreamBatchMemOp and related APIs are supported.
    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93 ## 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs.
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94 ## ::CU_STREAM_WAIT_VALUE_NOR is supported.
    CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95 ## Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97 ## Maximum optin shared memory per block
    CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98 ## The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details.
    CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99 ## Device supports host memory registration via ::cudaHostRegister.
    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100 ## Device accesses pageable memory via the host's page tables.
    CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101 ## The host can directly access managed memory on the device without migration.
    CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102 ## Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs
    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103 ## Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104 ## Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105 ## Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106 ## Maximum number of blocks per multiprocessor
    CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107 ## Device supports compression of memory
    CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108 ## Maximum L2 persisting lines capacity setting in bytes.
    CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109 ## Maximum value of CUaccessPolicyWindow::num_bytes.
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110 ## Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111 ## Shared memory reserved by CUDA driver per block in bytes
    CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112 ## Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays
    CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113 ## Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU
    CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114 ## External timeline semaphore interop is supported on the device
    CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115 ## Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116 ## Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information)
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117 ## The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118 ## GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here.
    CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119 ## Handle types supported with mempool based IPC
    CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH = 120 ## Indicates device supports cluster launch
    CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121 ## Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays
    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2 = 122 ## 64-bit operations are supported in ::cuStreamBatchMemOp_v2 and related v2 MemOp APIs.
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2 = 123 ## ::CU_STREAM_WAIT_VALUE_NOR is supported by v2 MemOp APIs.
    CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124 ## Device supports buffer sharing with dma_buf mechanism.
    CU_DEVICE_ATTRIBUTE_MAX ## End marker: no explicit value, so it is one past the last attribute above (125)
type
  CUmemAttach_flags* {.size: sizeof(cint).} = enum
    ## Attachment flags accepted by `cuMemAllocManaged`. The values are
    ## distinct bits. Sized as a C `int`, like the other C enums in this file.
    CU_MEM_ATTACH_GLOBAL = 0x1
      ## Memory can be accessed by any stream on any device
    CU_MEM_ATTACH_HOST = 0x2
      ## Memory cannot be accessed by any stream on any device
    CU_MEM_ATTACH_SINGLE = 0x4
      ## Memory can only be accessed by a single stream on the associated
      ## device (description taken from cuda.h)

  # Driver handles. They are declared `distinct` so that one kind of handle
  # cannot be passed where another is expected.
  CUdevice* = distinct int32
    ## Compute Device handle
  CUcontext* = distinct pointer
    ## Context handle, produced by `cuCtxCreate`
  CUmodule* = distinct pointer
    ## Module handle, produced by `cuModuleLoadData`
  CUfunction* = distinct pointer
    ## Kernel handle, produced by `cuModuleGetFunction`
  CUstream* = distinct pointer
    ## Stream handle, passed to `cuLaunchKernel`
  CUdeviceptr* = distinct pointer
    ## Device memory address, produced by `cuMemAlloc`/`cuMemAllocManaged`.
    ## NOTE(review): cuda.h declares this as an unsigned integer rather than
    ## a pointer; confirm the sizes agree on every targeted platform.
# Import pragmas for the CUDA driver entry points (loaded from libcuda.so).
#
# - `cudaApi`   binds a proc to the C symbol with the same name.
# - `cudaApiV2` binds a proc to the C symbol `<name>_v2`.
#
# cuda.h redirects several entry points to versioned symbols with macros
# (e.g. `#define cuMemAlloc cuMemAlloc_v2`). The signatures declared below
# (`csize_t` sizes, pointer-wide `CUdeviceptr`) are the `_v2` ones, so those
# procs must be bound to the `_v2` symbols; the un-suffixed symbols are the
# legacy ABI kept by the driver for old binaries.
{.pragma: cudaApi, noconv, importc, dynlib: "libcuda.so".}
{.pragma: cudaApiV2, noconv, importc: "$1_v2", dynlib: "libcuda.so".}

# --- Initialization and device queries ---
proc cuInit*(flags: uint32): CUresult {.cudaApi.}
proc cuDeviceGetCount*(count: var int32): CUresult {.cudaApi.}
proc cuDeviceGet*(device: var CUdevice, ordinal: int32): CUresult {.cudaApi.}
proc cuDeviceGetName*(name: ptr char, len: int32, dev: CUdevice): CUresult {.cudaApi.}
proc cuDeviceGetAttribute*(r: var int32, attrib: CUdevice_attribute, dev: CUdevice): CUresult {.cudaApi.}

# --- Contexts ---
proc cuCtxCreate*(pctx: var CUcontext, flags: uint32, dev: CUdevice): CUresult {.cudaApiV2.}
proc cuCtxDestroy*(ctx: CUcontext): CUresult {.cudaApiV2.}
# NOTE(review): the C API is `cuCtxSynchronize(void)`; the `ctx` parameter is
# not part of the C signature. It is kept so existing callers still compile.
proc cuCtxSynchronize*(ctx: CUcontext): CUresult {.cudaApi.}

# --- Modules and kernel launch ---
# The non-exported procs are marked `used` to silence unused-symbol warnings.
proc cuModuleLoadData(module: var CUmodule, sourceCode: ptr char): CUresult {.cudaApi, used.}
proc cuModuleUnload*(module: CUmodule): CUresult {.cudaApi.}
proc cuModuleGetFunction(kernel: var CUfunction, module: CUmodule, fnName: ptr char): CUresult {.cudaApi, used.}
proc cuLaunchKernel(
       kernel: CUfunction,
       gridDimX, gridDimY, gridDimZ: uint32,
       blockDimX, blockDimY, blockDimZ: uint32,
       sharedMemBytes: uint32,
       stream: CUstream,
       kernelParams: ptr pointer,
       extra: ptr pointer
     ): CUresult {.cudaApi, used.}

# --- Device memory ---
proc cuMemAlloc*(devptr: var CUdeviceptr, size: csize_t): CUresult {.cudaApiV2.}
proc cuMemAllocManaged*(devptr: var CUdeviceptr, size: csize_t, flags: Flag[CUmemAttach_flags]): CUresult {.cudaApi.}
proc cuMemFree*(devptr: CUdeviceptr): CUresult {.cudaApiV2.}
proc cuMemcpyHtoD*(dst: CUdeviceptr, src: pointer, size: csize_t): CUresult {.cudaApiV2.}
proc cuMemcpyDtoH*(dst: pointer, src: CUdeviceptr, size: csize_t): CUresult {.cudaApiV2.}