mirror of
https://github.com/logos-storage/constantine.git
synced 2026-01-02 13:13:07 +00:00
* rework assembler register/mem and constraint declarations * Introduce constraint UnmutatedPointerToWriteMem * Create invidual memory cell operands * [Assembly] fully support indirect memory addressing * fix calling convention for exported procs * Prepare for switch to intel syntax to avoid clang constant propagation asm symbol name interfering OR pointer+offset addressing * use modifiers to prevent bad string mixin fo assembler to linker of propagated consts * Assembly: switch to intel syntax * with working memory operand - now works with LTO on both GCC and clang and constant folding * use memory operand in more places * remove some inline now that we have lto * cleanup compiler config and benches * tracer shouldn't force dependencies when unused * fix cc on linux * nimble fixes * update README [skip CI] * update MacOS CI with Homebrew Clang * oops nimble bindings disappeared * more nimble fixes * fix sha256 exported symbol * improve constraints on modular addition * Add extra constraint to force reloading of pointer in reg inputs * Fix LLVM gold linker running out of registers * workaround MinGW64 GCC 12.2 bad codegen in t_pairing_cyclotomic_subgroup with LTO
519 lines
37 KiB
Nim
519 lines
37 KiB
Nim
# Constantine
|
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
|
# Licensed and distributed under either of
|
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
|
|
|
# ############################################################
|
|
#
|
|
# Bindings to Nvidia GPUs libraries
|
|
#
|
|
# ############################################################
|
|
|
|
import ./c_abi
|
|
|
|
# ############################################################
|
|
#
|
|
# Cuda
|
|
#
|
|
# ############################################################
|
|
|
|
static: echo "[Constantine] Using library libcuda.so"
|
|
{.passl: "-L/opt/cuda/lib64 -lcuda".}
|
|
|
|
# Cuda offers 2 APIs:
|
|
# - cuda.h the driver API
|
|
# - cuda_runtime.h the runtime API
|
|
#
|
|
# https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html
|
|
#
|
|
# We need to use the lower-level driver API for JIT modules loading and reloading
|
|
|
|
type
|
|
CUresult* {.size: sizeof(cint).} = enum
|
|
## The API call returned with no errors. In the case of query calls, this
|
|
## also means that the operation being queried is complete (see
|
|
## ::cuEventQuery() and ::cuStreamQuery()).
|
|
CUDA_SUCCESS = 0
|
|
## This indicates that one or more of the parameters passed to the API call
|
|
## is not within an acceptable range of values.
|
|
CUDA_ERROR_INVALID_VALUE = 1
|
|
## The API call failed because it was unable to allocate enough memory to
|
|
## perform the requested operation.
|
|
CUDA_ERROR_OUT_OF_MEMORY = 2
|
|
## This indicates that the CUDA driver has not been initialized with
|
|
## ::cuInit() or that initialization has failed.
|
|
CUDA_ERROR_NOT_INITIALIZED = 3
|
|
## This indicates that the CUDA driver is in the process of shutting down.
|
|
CUDA_ERROR_DEINITIALIZED = 4
|
|
## This indicates profiler is not initialized for this run. This can
|
|
## happen when the application is running with external profiling tools
|
|
## like visual profiler.
|
|
CUDA_ERROR_PROFILER_DISABLED = 5
|
|
## to attempt to enable/disable the profiling via ::cuProfilerStart or
|
|
## ::cuProfilerStop without initialization.
|
|
CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6
|
|
## to call cuProfilerStart() when profiling is already enabled.
|
|
CUDA_ERROR_PROFILER_ALREADY_STARTED = 7
|
|
## to call cuProfilerStop() when profiling is already disabled.
|
|
CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8
|
|
## This indicates that the CUDA driver that the application has loaded is a
|
|
## stub library. Applications that run with the stub rather than a real
|
|
## driver loaded will result in CUDA API returning this error.
|
|
CUDA_ERROR_STUB_LIBRARY = 34
|
|
## This indicates that requested CUDA device is unavailable at the current
|
|
## time. Devices are often unavailable due to use of
|
|
## ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED.
|
|
CUDA_ERROR_DEVICE_UNAVAILABLE = 46
|
|
## This indicates that no CUDA-capable devices were detected by the installed
|
|
## CUDA driver.
|
|
CUDA_ERROR_NO_DEVICE = 100
|
|
## This indicates that the device ordinal supplied by the user does not
|
|
## correspond to a valid CUDA device or that the action requested is
|
|
## invalid for the specified device.
|
|
CUDA_ERROR_INVALID_DEVICE = 101
|
|
## This error indicates that the Grid license is not applied.
|
|
CUDA_ERROR_DEVICE_NOT_LICENSED = 102
|
|
## This indicates that the device kernel image is invalid. This can also
|
|
## indicate an invalid CUDA module.
|
|
CUDA_ERROR_INVALID_IMAGE = 200
|
|
## This most frequently indicates that there is no context bound to the
|
|
## current thread. This can also be returned if the context passed to an
|
|
## API call is not a valid handle (such as a context that has had
|
|
## ::cuCtxDestroy() invoked on it). This can also be returned if a user
|
|
## mixes different API versions (i.e. 3010 context with 3020 API calls).
|
|
## See ::cuCtxGetApiVersion() for more details.
|
|
CUDA_ERROR_INVALID_CONTEXT = 201
|
|
## This indicated that the context being supplied as a parameter to the
|
|
## API call was already the active context.
|
|
## error to attempt to push the active context via ::cuCtxPushCurrent().
|
|
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202
|
|
## This indicates that a map or register operation has failed.
|
|
CUDA_ERROR_MAP_FAILED = 205
|
|
## This indicates that an unmap or unregister operation has failed.
|
|
CUDA_ERROR_UNMAP_FAILED = 206
|
|
## This indicates that the specified array is currently mapped and thus
|
|
## cannot be destroyed.
|
|
CUDA_ERROR_ARRAY_IS_MAPPED = 207
|
|
## This indicates that the resource is already mapped.
|
|
CUDA_ERROR_ALREADY_MAPPED = 208
|
|
## This indicates that there is no kernel image available that is suitable
|
|
## for the device. This can occur when a user specifies code generation
|
|
## options for a particular CUDA source file that do not include the
|
|
## corresponding device configuration.
|
|
CUDA_ERROR_NO_BINARY_FOR_GPU = 209
|
|
## This indicates that a resource has already been acquired.
|
|
CUDA_ERROR_ALREADY_ACQUIRED = 210
|
|
## This indicates that a resource is not mapped.
|
|
CUDA_ERROR_NOT_MAPPED = 211
|
|
## This indicates that a mapped resource is not available for access as an
|
|
## array.
|
|
CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212
|
|
## This indicates that a mapped resource is not available for access as a
|
|
## pointer.
|
|
CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213
|
|
## This indicates that an uncorrectable ECC error was detected during
|
|
## execution.
|
|
CUDA_ERROR_ECC_UNCORRECTABLE = 214
|
|
## This indicates that the ::CUlimit passed to the API call is not
|
|
## supported by the active device.
|
|
CUDA_ERROR_UNSUPPORTED_LIMIT = 215
|
|
## This indicates that the ::CUcontext passed to the API call can
|
|
## only be bound to a single CPU thread at a time but is already
|
|
## bound to a CPU thread.
|
|
CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216
|
|
## This indicates that peer access is not supported across the given
|
|
## devices.
|
|
CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217
|
|
## This indicates that a PTX JIT compilation failed.
|
|
CUDA_ERROR_INVALID_PTX = 218
|
|
## This indicates an error with OpenGL or DirectX context.
|
|
CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219
|
|
## This indicates that an uncorrectable NVLink error was detected during the
|
|
## execution.
|
|
CUDA_ERROR_NVLINK_UNCORRECTABLE = 220
|
|
## This indicates that the PTX JIT compiler library was not found.
|
|
CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221
|
|
## This indicates that the provided PTX was compiled with an unsupported toolchain.
|
|
CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222
|
|
## This indicates that the PTX JIT compilation was disabled.
|
|
CUDA_ERROR_JIT_COMPILATION_DISABLED = 223
|
|
## This indicates that the ::CUexecAffinityType passed to the API call is not
|
|
## supported by the active device.
|
|
CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224
|
|
## This indicates that the device kernel source is invalid. This includes
|
|
## compilation/linker errors encountered in device code or user error.
|
|
CUDA_ERROR_INVALID_SOURCE = 300
|
|
## This indicates that the file specified was not found.
|
|
CUDA_ERROR_FILE_NOT_FOUND = 301
|
|
## This indicates that a link to a shared object failed to resolve.
|
|
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302
|
|
## This indicates that initialization of a shared object failed.
|
|
CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303
|
|
## This indicates that an OS call failed.
|
|
CUDA_ERROR_OPERATING_SYSTEM = 304
|
|
## This indicates that a resource handle passed to the API call was not
|
|
## valid. Resource handles are opaque types like ::CUstream and ::CUevent.
|
|
CUDA_ERROR_INVALID_HANDLE = 400
|
|
## This indicates that a resource required by the API call is not in a
|
|
## valid state to perform the requested operation.
|
|
CUDA_ERROR_ILLEGAL_STATE = 401
|
|
## This indicates that a named symbol was not found. Examples of symbols
|
|
## are global/constant variable names, driver function names, texture names,
|
|
## and surface names.
|
|
CUDA_ERROR_NOT_FOUND = 500
|
|
## This indicates that asynchronous operations issued previously have not
|
|
## completed yet. This result is not actually an error, but must be indicated
|
|
## differently than ::CUDA_SUCCESS (which indicates completion). Calls that
|
|
## may return this value include ::cuEventQuery() and ::cuStreamQuery().
|
|
CUDA_ERROR_NOT_READY = 600
|
|
## While executing a kernel, the device encountered a
|
|
## load or store instruction on an invalid memory address.
|
|
## This leaves the process in an inconsistent state and any further CUDA work
|
|
## will return the same error. To continue using CUDA, the process must be terminated
|
|
## and relaunched.
|
|
CUDA_ERROR_ILLEGAL_ADDRESS = 700
|
|
## This indicates that a launch did not occur because it did not have
|
|
## appropriate resources. This error usually indicates that the user has
|
|
## attempted to pass too many arguments to the device kernel, or the
|
|
## kernel launch specifies too many threads for the kernel's register
|
|
## count. Passing arguments of the wrong size (i.e. a 64-bit pointer
|
|
## when a 32-bit int is expected) is equivalent to passing too many
|
|
## arguments and can also result in this error.
|
|
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
|
|
## This indicates that the device kernel took too long to execute. This can
|
|
## only occur if timeouts are enabled - see the device attribute
|
|
## ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information.
|
|
## This leaves the process in an inconsistent state and any further CUDA work
|
|
## will return the same error. To continue using CUDA, the process must be terminated
|
|
## and relaunched.
|
|
CUDA_ERROR_LAUNCH_TIMEOUT = 702
|
|
## This error indicates a kernel launch that uses an incompatible texturing
|
|
## mode.
|
|
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
|
|
## This error indicates that a call to ::cuCtxEnablePeerAccess() is
|
|
## trying to re-enable peer access to a context which has already
|
|
## had peer access to it enabled.
|
|
CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704
|
|
## This error indicates that ::cuCtxDisablePeerAccess() is
|
|
## trying to disable peer access which has not been enabled yet
|
|
## via ::cuCtxEnablePeerAccess().
|
|
CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705
|
|
## This error indicates that the primary context for the specified device
|
|
## has already been initialized.
|
|
CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708
|
|
## This error indicates that the context current to the calling thread
|
|
## has been destroyed using ::cuCtxDestroy, or is a primary context which
|
|
## has not yet been initialized.
|
|
CUDA_ERROR_CONTEXT_IS_DESTROYED = 709
|
|
## A device-side assert triggered during kernel execution. The context
|
|
## cannot be used anymore, and must be destroyed. All existing device
|
|
## memory allocations from this context are invalid and must be
|
|
## reconstructed if the program is to continue using CUDA.
|
|
CUDA_ERROR_ASSERT = 710
|
|
## This error indicates that the hardware resources required to enable
|
|
## peer access have been exhausted for one or more of the devices
|
|
## passed to ::cuCtxEnablePeerAccess().
|
|
CUDA_ERROR_TOO_MANY_PEERS = 711
|
|
## This error indicates that the memory range passed to ::cuMemHostRegister()
|
|
## has already been registered.
|
|
CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712
|
|
## This error indicates that the pointer passed to ::cuMemHostUnregister()
|
|
## does not correspond to any currently registered memory region.
|
|
CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713
|
|
## While executing a kernel, the device encountered a stack error.
|
|
## This can be due to stack corruption or exceeding the stack size limit.
|
|
## This leaves the process in an inconsistent state and any further CUDA work
|
|
## will return the same error. To continue using CUDA, the process must be terminated
|
|
## and relaunched.
|
|
CUDA_ERROR_HARDWARE_STACK_ERROR = 714
|
|
## While executing a kernel, the device encountered an illegal instruction.
|
|
## This leaves the process in an inconsistent state and any further CUDA work
|
|
## will return the same error. To continue using CUDA, the process must be terminated
|
|
## and relaunched.
|
|
CUDA_ERROR_ILLEGAL_INSTRUCTION = 715
|
|
## While executing a kernel, the device encountered a load or store instruction
|
|
## on a memory address which is not aligned.
|
|
## This leaves the process in an inconsistent state and any further CUDA work
|
|
## will return the same error. To continue using CUDA, the process must be terminated
|
|
## and relaunched.
|
|
CUDA_ERROR_MISALIGNED_ADDRESS = 716
|
|
## While executing a kernel, the device encountered an instruction
|
|
## which can only operate on memory locations in certain address spaces
|
|
## (global, shared, or local), but was supplied a memory address not
|
|
## belonging to an allowed address space.
|
|
## This leaves the process in an inconsistent state and any further CUDA work
|
|
## will return the same error. To continue using CUDA, the process must be terminated
|
|
## and relaunched.
|
|
CUDA_ERROR_INVALID_ADDRESS_SPACE = 717
|
|
## While executing a kernel, the device program counter wrapped its address space.
|
|
## This leaves the process in an inconsistent state and any further CUDA work
|
|
## will return the same error. To continue using CUDA, the process must be terminated
|
|
## and relaunched.
|
|
CUDA_ERROR_INVALID_PC = 718
|
|
## An exception occurred on the device while executing a kernel. Common
|
|
## causes include dereferencing an invalid device pointer and accessing
|
|
## out of bounds shared memory. Less common cases can be system specific - more
|
|
## information about these cases can be found in the system specific user guide.
|
|
## This leaves the process in an inconsistent state and any further CUDA work
|
|
## will return the same error. To continue using CUDA, the process must be terminated
|
|
## and relaunched.
|
|
CUDA_ERROR_LAUNCH_FAILED = 719
|
|
## This error indicates that the number of blocks launched per grid for a kernel that was
|
|
## launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice
|
|
## exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor
|
|
## or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
|
|
## as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
|
|
CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720
|
|
## This error indicates that the attempted operation is not permitted.
|
|
CUDA_ERROR_NOT_PERMITTED = 800
|
|
## This error indicates that the attempted operation is not supported
|
|
## on the current system or device.
|
|
CUDA_ERROR_NOT_SUPPORTED = 801
|
|
## This error indicates that the system is not yet ready to start any CUDA
|
|
## work. To continue using CUDA, verify the system configuration is in a
|
|
## valid state and all required driver daemons are actively running.
|
|
## More information about this error can be found in the system specific
|
|
## user guide.
|
|
CUDA_ERROR_SYSTEM_NOT_READY = 802
|
|
## This error indicates that there is a mismatch between the versions of
|
|
## the display driver and the CUDA driver. Refer to the compatibility documentation
|
|
## for supported versions.
|
|
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803
|
|
## This error indicates that the system was upgraded to run with forward compatibility
|
|
## but the visible hardware detected by CUDA does not support this configuration.
|
|
## Refer to the compatibility documentation for the supported hardware matrix or ensure
|
|
## that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
|
|
## environment variable.
|
|
CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804
|
|
## This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
|
|
CUDA_ERROR_MPS_CONNECTION_FAILED = 805
|
|
## This error indicates that the remote procedural call between the MPS server and the MPS client failed.
|
|
CUDA_ERROR_MPS_RPC_FAILURE = 806
|
|
## This error indicates that the MPS server is not ready to accept new MPS client requests.
|
|
## This error can be returned when the MPS server is in the process of recovering from a fatal failure.
|
|
CUDA_ERROR_MPS_SERVER_NOT_READY = 807
|
|
## This error indicates that the hardware resources required to create MPS client have been exhausted.
|
|
CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808
|
|
## This error indicates the the hardware resources required to support device connections have been exhausted.
|
|
CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809
|
|
## This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.
|
|
CUDA_ERROR_MPS_CLIENT_TERMINATED = 810
|
|
## This error indicates that the operation is not permitted when
|
|
## the stream is capturing.
|
|
CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900
|
|
## This error indicates that the current capture sequence on the stream
|
|
## has been invalidated due to a previous error.
|
|
CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901
|
|
## This error indicates that the operation would have resulted in a merge
|
|
## of two independent capture sequences.
|
|
CUDA_ERROR_STREAM_CAPTURE_MERGE = 902
|
|
## This error indicates that the capture was not initiated in this stream.
|
|
CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903
|
|
## This error indicates that the capture sequence contains a fork that was
|
|
## not joined to the primary stream.
|
|
CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904
|
|
## This error indicates that a dependency would have been created which
|
|
## crosses the capture sequence boundary. Only implicit in-stream ordering
|
|
## dependencies are allowed to cross the boundary.
|
|
CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905
|
|
## This error indicates a disallowed implicit dependency on a current capture
|
|
## sequence from cudaStreamLegacy.
|
|
CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906
|
|
## This error indicates that the operation is not permitted on an event which
|
|
## was last recorded in a capturing stream.
|
|
CUDA_ERROR_CAPTURED_EVENT = 907
|
|
## A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED
|
|
## argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a
|
|
## different thread.
|
|
CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908
|
|
## This error indicates that the timeout specified for the wait operation has lapsed.
|
|
CUDA_ERROR_TIMEOUT = 909
|
|
## This error indicates that the graph update was not performed because it included
|
|
## changes which violated constraints specific to instantiated graph update.
|
|
CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910
|
|
## This indicates that an async error has occurred in a device outside of CUDA.
|
|
## If CUDA was waiting for an external device's signal before consuming shared data,
|
|
## the external device signaled an error indicating that the data is not valid for
|
|
## consumption. This leaves the process in an inconsistent state and any further CUDA
|
|
## work will return the same error. To continue using CUDA, the process must be
|
|
## terminated and relaunched.
|
|
CUDA_ERROR_EXTERNAL_DEVICE = 911
|
|
## Indicates a kernel launch error due to cluster misconfiguration.
|
|
CUDA_ERROR_INVALID_CLUSTER_SIZE = 912
|
|
## This indicates that an unknown internal error has occurred.
|
|
CUDA_ERROR_UNKNOWN = 999
|
|
|
|
CUdevice_attribute* {.size: sizeof(cint).} = enum
|
|
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, ## Maximum number of threads per block */
|
|
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, ## Maximum block dimension X */
|
|
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, ## Maximum block dimension Y */
|
|
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, ## Maximum block dimension Z */
|
|
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, ## Maximum grid dimension X */
|
|
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, ## Maximum grid dimension Y */
|
|
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, ## Maximum grid dimension Z */
|
|
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, ## Maximum shared memory available per block in bytes */
|
|
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, ## Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
|
|
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, ## Warp size in threads */
|
|
CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, ## Maximum pitch in bytes allowed by memory copies */
|
|
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, ## Maximum number of 32-bit registers available per block */
|
|
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, ## Typical clock frequency in kilohertz */
|
|
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, ## Alignment requirement for textures */
|
|
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, ## Number of multiprocessors on device */
|
|
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, ## Specifies whether there is a run time limit on kernels */
|
|
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, ## Device is integrated with host memory */
|
|
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, ## Device can map host memory into CUDA address space */
|
|
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, ## Compute mode (See ::CUcomputemode for details) */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, ## Maximum 1D texture width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, ## Maximum 2D texture width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, ## Maximum 2D texture height */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, ## Maximum 3D texture width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, ## Maximum 3D texture height */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, ## Maximum 3D texture depth */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, ## Maximum 2D layered texture width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, ## Maximum 2D layered texture height */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, ## Maximum layers in a 2D layered texture */
|
|
CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, ## Alignment requirement for surfaces */
|
|
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, ## Device can possibly execute multiple kernels concurrently */
|
|
CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, ## Device has ECC support enabled */
|
|
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, ## PCI bus ID of the device */
|
|
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, ## PCI device ID of the device */
|
|
CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, ## Device is using TCC driver model */
|
|
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, ## Peak memory clock frequency in kilohertz */
|
|
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, ## Global memory bus width in bits */
|
|
CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, ## Size of L2 cache in bytes */
|
|
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, ## Maximum resident threads per multiprocessor */
|
|
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, ## Number of asynchronous engines */
|
|
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, ## Device shares a unified address space with the host */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, ## Maximum 1D layered texture width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, ## Maximum layers in a 1D layered texture */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, ## Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, ## Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, ## Alternate maximum 3D texture width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, ## Alternate maximum 3D texture height */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, ## Alternate maximum 3D texture depth */
|
|
CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, ## PCI domain ID of the device */
|
|
CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, ## Pitch alignment requirement for textures */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, ## Maximum cubemap texture width/height */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, ## Maximum cubemap layered texture width/height */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, ## Maximum layers in a cubemap layered texture */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, ## Maximum 1D surface width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, ## Maximum 2D surface width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, ## Maximum 2D surface height */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, ## Maximum 3D surface width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, ## Maximum 3D surface height */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, ## Maximum 3D surface depth */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, ## Maximum 1D layered surface width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, ## Maximum layers in a 1D layered surface */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, ## Maximum 2D layered surface width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, ## Maximum 2D layered surface height */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, ## Maximum layers in a 2D layered surface */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, ## Maximum cubemap surface width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, ## Maximum cubemap layered surface width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, ## Maximum layers in a cubemap layered surface */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, ## Maximum 2D linear texture width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, ## Maximum 2D linear texture height */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, ## Maximum 2D linear texture pitch in bytes */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, ## Maximum mipmapped 2D texture width */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, ## Maximum mipmapped 2D texture height */
|
|
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, ## Major compute capability version number */
|
|
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, ## Minor compute capability version number */
|
|
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, ## Maximum mipmapped 1D texture width */
|
|
CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, ## Device supports stream priorities */
|
|
CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, ## Device supports caching globals in L1 */
|
|
CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, ## Device supports caching locals in L1 */
|
|
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, ## Maximum shared memory available per multiprocessor in bytes */
|
|
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, ## Maximum number of 32-bit registers available per multiprocessor */
|
|
CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, ## Device can allocate managed memory on this system */
|
|
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, ## Device is on a multi-GPU board */
|
|
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, ## Unique id for a group of devices on the same multi-GPU board */
|
|
CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, ## Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/
|
|
CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, ## Ratio of single precision performance (in floating-point operations per second) to double precision performance */
|
|
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, ## Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
|
|
CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, ## Device can coherently access managed memory concurrently with the CPU */
|
|
CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, ## Device supports compute preemption. */
|
|
CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, ## Device can access host registered memory at the same virtual address as the CPU */
|
|
CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, ## ::cuStreamBatchMemOp and related APIs are supported. */
|
|
CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, ## 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */
|
|
CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, ## ::CU_STREAM_WAIT_VALUE_NOR is supported. */
|
|
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, ## Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */
|
|
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, ## Maximum optin shared memory per block */
|
|
CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, ## The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */
|
|
CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, ## Device supports host memory registration via ::cudaHostRegister. */
|
|
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, ## Device accesses pageable memory via the host's page tables. */
|
|
CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, ## The host can directly access managed memory on the device without migration. */
|
|
CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, ## Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */
|
|
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, ## Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
|
|
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, ## Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
|
|
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, ## Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
|
|
CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, ## Maximum number of blocks per multiprocessor */
|
|
CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, ## Device supports compression of memory */
|
|
CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, ## Maximum L2 persisting lines capacity setting in bytes. */
|
|
CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, ## Maximum value of CUaccessPolicyWindow::num_bytes. */
|
|
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, ## Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */
|
|
CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, ## Shared memory reserved by CUDA driver per block in bytes */
|
|
CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, ## Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
|
|
CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, ## Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */
|
|
CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, ## External timeline semaphore interop is supported on the device */
|
|
CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, ## Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */
|
|
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, ## Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
|
|
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, ## The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */
|
|
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, ## GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. */
|
|
CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, ## Handle types supported with mempool based IPC */
|
|
CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH = 120, ## Indicates device supports cluster launch */
|
|
CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121, ## Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
|
|
CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2 = 122, ## 64-bit operations are supported in ::cuStreamBatchMemOp_v2 and related v2 MemOp APIs. */
|
|
CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2 = 123, ## ::CU_STREAM_WAIT_VALUE_NOR is supported by v2 MemOp APIs. */
|
|
CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124, ## Device supports buffer sharing with dma_buf mechanism. */
|
|
CU_DEVICE_ATTRIBUTE_MAX
|
|
|
|
CUmemAttach_flags* = enum
|
|
CU_MEM_ATTACH_GLOBAL = 0x1, ## Memory can be accessed by any stream on any device
|
|
CU_MEM_ATTACH_HOST = 0x2, ## Memory cannot be accessed by any stream on any device
|
|
CU_MEM_ATTACH_SINGLE = 0x4
|
|
|
|
CUdevice* = distinct int32
|
|
## Compute Device handle
|
|
CUcontext* = distinct pointer
|
|
CUmodule* = distinct pointer
|
|
CUfunction* = distinct pointer
|
|
CUstream* = distinct pointer
|
|
CUdeviceptr* = distinct pointer
|
|
|
|
{.push noconv, importc, dynlib: "libcuda.so".}
|
|
|
|
proc cuInit*(flags: uint32): CUresult
|
|
|
|
proc cuDeviceGetCount*(count: var int32): CUresult
|
|
proc cuDeviceGet*(device: var CUdevice, ordinal: int32): CUresult
|
|
proc cuDeviceGetName*(name: ptr char, len: int32, dev: CUdevice): CUresult
|
|
proc cuDeviceGetAttribute*(r: var int32, attrib: CUdevice_attribute, dev: CUdevice): CUresult
|
|
|
|
proc cuCtxCreate*(pctx: var CUcontext, flags: uint32, dev: CUdevice): CUresult
|
|
proc cuCtxDestroy*(ctx: CUcontext): CUresult
|
|
proc cuCtxSynchronize*(ctx: CUcontext): CUresult
|
|
|
|
proc cuModuleLoadData(module: var CUmodule, sourceCode: ptr char): CUresult {.used.}
|
|
proc cuModuleUnload*(module: CUmodule): CUresult
|
|
proc cuModuleGetFunction(kernel: var CUfunction, module: CUmodule, fnName: ptr char): CUresult {.used.}
|
|
|
|
proc cuLaunchKernel(
|
|
kernel: CUfunction,
|
|
gridDimX, gridDimY, gridDimZ: uint32,
|
|
blockDimX, blockDimY, blockDimZ: uint32,
|
|
sharedMemBytes: uint32,
|
|
stream: CUstream,
|
|
kernelParams: ptr pointer,
|
|
extra: ptr pointer
|
|
): CUresult {.used.}
|
|
|
|
proc cuMemAlloc*(devptr: var CUdeviceptr, size: csize_t): CUresult
|
|
proc cuMemAllocManaged*(devptr: var CUdeviceptr, size: csize_t, flags: Flag[CUmemAttach_flags]): CUresult
|
|
proc cuMemFree*(devptr: CUdeviceptr): CUresult
|
|
proc cuMemcpyHtoD*(dst: CUdeviceptr, src: pointer, size: csize_t): CUresult
|
|
proc cuMemcpyDtoH*(dst: pointer, src: CUdeviceptr, size: csize_t): CUresult
|
|
|
|
{.pop.} # {.push noconv, importc, dynlib: "libcuda.so".}
|