constantine/tests/gpu/t_nvidia_fp.nim

# Constantine
# Copyright (c) 2018-2019    Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.


import
  # Standard library
  std/[unittest, times],
  # Internal
  ../../constantine/platforms/code_generator/[llvm, nvidia, ir],
  ../../constantine/platforms/static_for,
  ../../constantine/math/config/curves,
  ../../constantine/math/io/io_bigints,
  ../../constantine/math/arithmetic,
  ../../constantine/math_codegen/fields_nvidia,
  # Test utilities
  ../../helpers/prng_unsafe

# Test-wide PRNG setup.
# The seed is the low 32 bits of the current Unix time; it is echoed so that
# a run can be reproduced by reusing the printed value.
let seed = uint32(getTime().toUnix() and 0xFFFF_FFFF'i64) # unixTime mod 2^32
var rng: RngState
rng.seed(seed)
echo "\n------------------------------------------------------\n"
echo "test_nvidia_fp xoshiro512** seed: ", seed

# Number of random (a, b) pairs checked per field in `t_field_add`.
const Iters = 10

proc init(T: type CurveMetadata, asy: Assembler_LLVM, curve: static Curve, wordSize: WordSize): T =
  ## Test helper that fills in a `CurveMetadata` from a compile-time `curve`.
  ##
  ## - `asy`: assembler whose `ctx` is forwarded to the underlying constructor.
  ## - `curve`: curve whose bit widths and moduli are extracted.
  ## - `wordSize`: forwarded unchanged.
  ##
  ## The curve name followed by "_" is passed as the second argument
  ## (presumably a name prefix for generated symbols - confirm against
  ## `CurveMetadata.init`). Moduli are passed as hex strings and bit widths
  ## as `uint32`.
  let
    curveTag = $curve & "_"
    fieldBits = uint32(curve.getCurveBitwidth())
    fieldModulusHex = curve.Mod().toHex()
    orderBits = uint32(curve.getCurveOrderBitwidth())
    orderHex = curve.getCurveOrder().toHex()

  result = CurveMetadata.init(
    asy.ctx,
    curveTag, wordSize,
    fpBits = fieldBits,
    fpMod = fieldModulusHex,
    frBits = orderBits,
    frMod = orderHex)

proc genFieldAddPTX(asy: Assembler_LLVM, cm: CurveMetadata) =
  ## Generates the field-addition functions for `fp` and then `fr` under the
  ## metadata `cm`, and registers each one in `asy.module` as a callable
  ## CUDA kernel.
  let fpKernel = field_add_gen(asy, cm, fp)
  setCallableCudaKernel(asy.module, fpKernel)

  let frKernel = field_add_gen(asy, cm, fr)
  setCallableCudaKernel(asy.module, frKernel)

# Init LLVM (NVPTX target first, then passes)
# -------------------------
initializeFullNVPTXTarget()
initializePasses()

# Init GPU
# -------------------------
# `sm` holds the device compute capability (major, minor); it is consumed
# by `codegenNvidiaPTX` in `t_field_add`.
let cudaDevice = cudaDeviceInit()
var sm = (major: 0'i32, minor: 0'i32)
check cuDeviceGetAttribute(sm.major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cudaDevice)
check cuDeviceGetAttribute(sm.minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cudaDevice)

proc t_field_add(curve: static Curve) =
  ## Checks that the JIT-compiled GPU field-addition kernels agree with the
  ## CPU implementation for `curve`.
  ##
  ## Kernels for Fp and Fr are generated with both 32-bit and 64-bit word
  ## sizes, compiled to PTX for the compute capability stored in `sm`, and
  ## loaded into a fresh CUDA context on `cudaDevice`. Each kernel is then
  ## compared against `sum` on `Iters` random input pairs.
  ##
  ## A mismatch fails a `doAssert` whose message names the curve, the field
  ## and the word size.

  # Codegen
  # -------------------------
  let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & $curve))
  let cm32 = CurveMetadata.init(asy, curve, size32)
  asy.genFieldAddPTX(cm32)
  let cm64 = CurveMetadata.init(asy, curve, size64)
  asy.genFieldAddPTX(cm64)

  let ptx = asy.codegenNvidiaPTX(sm)

  # GPU exec
  # -------------------------
  var cuCtx: CUcontext
  var cuMod: CUmodule
  check cuCtxCreate(cuCtx, 0, cudaDevice)
  check cuModuleLoadData(cuMod, ptx)
  defer:
    # Released in reverse order of acquisition: module first, then context.
    check cuMod.cuModuleUnload()
    check cuCtx.cuCtxDestroy()

  # All four kernels are looked up before any of them is executed.
  let fpAdd32 = cuMod.getCudaKernel(cm32, opFpAdd)
  let fpAdd64 = cuMod.getCudaKernel(cm64, opFpAdd)
  let frAdd32 = cuMod.getCudaKernel(cm32, opFrAdd)
  let frAdd64 = cuMod.getCudaKernel(cm64, opFrAdd)

  template checkAddAgainstCpu(F: untyped, fieldName: string, add32, add64: untyped) =
    ## Compares the CPU sum with the 32-bit and 64-bit GPU kernels on
    ## `Iters` random pairs of elements of type `F`.
    for _ in 0 ..< Iters:
      let a = rng.random_long01Seq(F)
      let b = rng.random_long01Seq(F)

      var rCPU, rGPU_32, rGPU_64: F

      rCPU.sum(a, b)
      add32.exec(rGPU_32, a, b)
      add64.exec(rGPU_64, a, b)

      doAssert bool(rCPU == rGPU_32),
        "GPU 32-bit " & fieldName & " addition differs from CPU on " & $curve
      doAssert bool(rCPU == rGPU_64),
        "GPU 64-bit " & fieldName & " addition differs from CPU on " & $curve

  # Fp first, then Fr, so the sequence of PRNG draws is unchanged.
  checkAddAgainstCpu(Fp[curve], "Fp", fpAdd32, fpAdd64)
  checkAddAgainstCpu(Fr[curve], "Fr", frAdd32, frAdd64)

proc main() =
  ## Runs `t_field_add` once per listed curve, each as its own `unittest`
  ## test case inside a single suite.

  # The list is a `const` and is walked with `staticFor` because
  # `t_field_add` takes the curve as a `static` parameter.
  const testedCurves = [
    P224, BN254_Nogami, BN254_Snarks, Edwards25519,
    Bandersnatch, Pallas, Vesta, P256,
    Secp256k1, BLS12_377, BLS12_381, BW6_761
  ]

  suite "[Nvidia GPU] Field Addition":
    staticFor idx, 0, testedCurves.len:
      const curve = testedCurves[idx]
      test "Nvidia GPU field addition (𝔽p, 𝔽r) for " & $curve:
        t_field_add(curve)

main()
-												[Backend] Add support for Nvidia GPUs  (#210)

* Add PoC of JIT exec on Nvidia GPUs [skip ci]

* Split GPU bindings into low-level (ABI) and high-level [skip ci]

* small typedef reorg [skip ci]

* refine LLVM IR/Nvidia GPU hello worlds

* [Nvidia GPU] PoC implementation of field addition [skip ci]

* prod-ready field addition + tests on Nvidia GPUs via LLVM codegen
											
										
										
											2023-01-12 01:01:57 +01:00
+								# Constantine
 								# Copyright (c) 2018-2019    Status Research & Development GmbH
 								# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 								# Licensed and distributed under either of
 								#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 								#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 								# at your option. This file may not be copied, modified, or distributed except according to those terms.
-												move staticFor to the inner repo, not helpers/ for unblocking nimble install (#216)


											
										
										
											2023-02-07 13:11:44 +01:00
+								import
-												[Backend] Add support for Nvidia GPUs  (#210)

* Add PoC of JIT exec on Nvidia GPUs [skip ci]

* Split GPU bindings into low-level (ABI) and high-level [skip ci]

* small typedef reorg [skip ci]

* refine LLVM IR/Nvidia GPU hello worlds

* [Nvidia GPU] PoC implementation of field addition [skip ci]

* prod-ready field addition + tests on Nvidia GPUs via LLVM codegen
											
										
										
											2023-01-12 01:01:57 +01:00
+								  # Standard library
 								  std/[unittest, times],
 								  # Internal
-												[Research] x86 code generator (#234)

* rename compilers -> intrinsics, math_gpu -> math_codegen

* stash x86 codegen in research
											
										
										
											2023-04-27 21:52:51 +02:00
+								  ../../constantine/platforms/code_generator/[llvm, nvidia, ir],
-												Parallel for (#222)

* introduce reserve threads to minimize latency and maximize throughput when awaiting a future

* introduce a ceilDiv proc

* threadpool: implement parallel-for loops

* 10x perf improvement by not waking reserveBackoff on syncAll

* bench overhead: new reserve system might introduce too much wakeup latency, 2x slower, for fine-grained parallelism

* add parallelForStrided

* Threadpool: Implement parallel reductions

* refactor parallel loop codegen: introduce descriptor, parsing and codegen stages

* parallel strided, test transpose bench

* tight loop is faster when backoff is not inline

* no POSIX stuff on windows, larger types for histogram bench

* fix tests

* max RSS overflow?

* missed an undefined var

* exit histogram on 32-bit

* forgot to return early for 32-bit
											
										
										
											2023-02-24 09:47:36 +01:00
+								  ../../constantine/platforms/static_for,
 								  ../../constantine/math/config/curves,
-												[Backend] Add support for Nvidia GPUs  (#210)

* Add PoC of JIT exec on Nvidia GPUs [skip ci]

* Split GPU bindings into low-level (ABI) and high-level [skip ci]

* small typedef reorg [skip ci]

* refine LLVM IR/Nvidia GPU hello worlds

* [Nvidia GPU] PoC implementation of field addition [skip ci]

* prod-ready field addition + tests on Nvidia GPUs via LLVM codegen
											
										
										
											2023-01-12 01:01:57 +01:00
+								  ../../constantine/math/io/io_bigints,
 								  ../../constantine/math/arithmetic,
-												[Research] x86 code generator (#234)

* rename compilers -> intrinsics, math_gpu -> math_codegen

* stash x86 codegen in research
											
										
										
											2023-04-27 21:52:51 +02:00
+								  ../../constantine/math_codegen/fields_nvidia,
-												[Backend] Add support for Nvidia GPUs  (#210)

* Add PoC of JIT exec on Nvidia GPUs [skip ci]

* Split GPU bindings into low-level (ABI) and high-level [skip ci]

* small typedef reorg [skip ci]

* refine LLVM IR/Nvidia GPU hello worlds

* [Nvidia GPU] PoC implementation of field addition [skip ci]

* prod-ready field addition + tests on Nvidia GPUs via LLVM codegen
											
										
										
											2023-01-12 01:01:57 +01:00
+								  # Test utilities
-												move staticFor to the inner repo, not helpers/ for unblocking nimble install (#216)


											
										
										
											2023-02-07 13:11:44 +01:00
+								  ../../helpers/prng_unsafe
-												[Backend] Add support for Nvidia GPUs  (#210)

* Add PoC of JIT exec on Nvidia GPUs [skip ci]

* Split GPU bindings into low-level (ABI) and high-level [skip ci]

* small typedef reorg [skip ci]

* refine LLVM IR/Nvidia GPU hello worlds

* [Nvidia GPU] PoC implementation of field addition [skip ci]

* prod-ready field addition + tests on Nvidia GPUs via LLVM codegen
											
										
										
											2023-01-12 01:01:57 +01:00
 								var rng: RngState
 								let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
 								rng.seed(seed)
 								echo "\n------------------------------------------------------\n"
 								echo "test_nvidia_fp xoshiro512** seed: ", seed
 								const Iters = 10
 								proc init(T: type CurveMetadata, asy: Assembler_LLVM, curve: static Curve, wordSize: WordSize): T =
 								  CurveMetadata.init(
 								      asy.ctx,
 								      $curve & "_", wordSize,
 								      fpBits = uint32 curve.getCurveBitwidth(),
 								      fpMod = curve.Mod().toHex(),
 								      frBits = uint32 curve.getCurveOrderBitwidth(),
-												Parallel for (#222)

* introduce reserve threads to minimize latency and maximize throughput when awaiting a future

* introduce a ceilDiv proc

* threadpool: implement parallel-for loops

* 10x perf improvement by not waking reserveBackoff on syncAll

* bench overhead: new reserve system might introduce too much wakeup latency, 2x slower, for fine-grained parallelism

* add parallelForStrided

* Threadpool: Implement parallel reductions

* refactor parallel loop codegen: introduce descriptor, parsing and codegen stages

* parallel strided, test transpose bench

* tight loop is faster when backoff is not inline

* no POSIX stuff on windows, larger types for histogram bench

* fix tests

* max RSS overflow?

* missed an undefined var

* exit histogram on 32-bit

* forgot to return early for 32-bit
											
										
										
											2023-02-24 09:47:36 +01:00
+								      frMod = curve.getCurveOrder().toHex())
-												[Backend] Add support for Nvidia GPUs  (#210)

* Add PoC of JIT exec on Nvidia GPUs [skip ci]

* Split GPU bindings into low-level (ABI) and high-level [skip ci]

* small typedef reorg [skip ci]

* refine LLVM IR/Nvidia GPU hello worlds

* [Nvidia GPU] PoC implementation of field addition [skip ci]

* prod-ready field addition + tests on Nvidia GPUs via LLVM codegen
											
										
										
											2023-01-12 01:01:57 +01:00
 								proc genFieldAddPTX(asy: Assembler_LLVM, cm: CurveMetadata) =
 								  let fpAdd = asy.field_add_gen(cm, fp)
 								  asy.module.setCallableCudaKernel(fpAdd)
 								  let frAdd = asy.field_add_gen(cm, fr)
 								  asy.module.setCallableCudaKernel(frAdd)
 								# Init LLVM
 								# -------------------------
 								initializeFullNVPTXTarget()
 								initializePasses()
 								# Init GPU
 								# -------------------------
 								let cudaDevice = cudaDeviceInit()
 								var sm: tuple[major, minor: int32]
 								check cuDeviceGetAttribute(sm.major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cudaDevice)
 								check cuDeviceGetAttribute(sm.minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cudaDevice)
 								proc t_field_add(curve: static Curve) =
 								  # Codegen
 								  # -------------------------
 								  let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & $curve))
 								  let cm32 = CurveMetadata.init(asy, curve, size32)
 								  asy.genFieldAddPTX(cm32)
 								  let cm64 = CurveMetadata.init(asy, curve, size64)
 								  asy.genFieldAddPTX(cm64)
 								  let ptx = asy.codegenNvidiaPTX(sm)
 								  # GPU exec
 								  # -------------------------
 								  var cuCtx: CUcontext
 								  var cuMod: CUmodule
 								  check cuCtxCreate(cuCtx, 0, cudaDevice)
 								  check cuModuleLoadData(cuMod, ptx)
 								  defer:
 								    check cuMod.cuModuleUnload()
 								    check cuCtx.cuCtxDestroy()
 								  let fpAdd32 = cuMod.getCudaKernel(cm32, opFpAdd)
 								  let fpAdd64 = cuMod.getCudaKernel(cm64, opFpAdd)
 								  let frAdd32 = cuMod.getCudaKernel(cm32, opFrAdd)
 								  let frAdd64 = cuMod.getCudaKernel(cm64, opFrAdd)
 								  # Fp
 								  for i in 0 ..< Iters:
 								    let a = rng.random_long01Seq(Fp[curve])
 								    let b = rng.random_long01Seq(Fp[curve])
 								    var rCPU, rGPU_32, rGPU_64: Fp[curve]
-												move staticFor to the inner repo, not helpers/ for unblocking nimble install (#216)


											
										
										
											2023-02-07 13:11:44 +01:00
-												[Backend] Add support for Nvidia GPUs  (#210)

* Add PoC of JIT exec on Nvidia GPUs [skip ci]

* Split GPU bindings into low-level (ABI) and high-level [skip ci]

* small typedef reorg [skip ci]

* refine LLVM IR/Nvidia GPU hello worlds

* [Nvidia GPU] PoC implementation of field addition [skip ci]

* prod-ready field addition + tests on Nvidia GPUs via LLVM codegen
											
										
										
											2023-01-12 01:01:57 +01:00
+								    rCPU.sum(a, b)
 								    fpAdd32.exec(rGPU_32, a, b)
 								    fpAdd64.exec(rGPU_64, a, b)
 								    doAssert bool(rCPU == rGPU_32)
 								    doAssert bool(rCPU == rGPU_64)
 								  # Fr
 								  for i in 0 ..< Iters:
 								    let a = rng.random_long01Seq(Fr[curve])
 								    let b = rng.random_long01Seq(Fr[curve])
 								    var rCPU, rGPU_32, rGPU_64: Fr[curve]
-												move staticFor to the inner repo, not helpers/ for unblocking nimble install (#216)


											
										
										
											2023-02-07 13:11:44 +01:00
-												[Backend] Add support for Nvidia GPUs  (#210)

* Add PoC of JIT exec on Nvidia GPUs [skip ci]

* Split GPU bindings into low-level (ABI) and high-level [skip ci]

* small typedef reorg [skip ci]

* refine LLVM IR/Nvidia GPU hello worlds

* [Nvidia GPU] PoC implementation of field addition [skip ci]

* prod-ready field addition + tests on Nvidia GPUs via LLVM codegen
											
										
										
											2023-01-12 01:01:57 +01:00
+								    rCPU.sum(a, b)
 								    frAdd32.exec(rGPU_32, a, b)
 								    frAdd64.exec(rGPU_64, a, b)
 								    doAssert bool(rCPU == rGPU_32)
 								    doAssert bool(rCPU == rGPU_64)
 								proc main() =
 								  const curves = [
 								    P224,
 								    BN254_Nogami,
 								    BN254_Snarks,
 								    Edwards25519,
 								    Bandersnatch,
 								    Pallas,
 								    Vesta,
 								    P256,
 								    Secp256k1,
 								    BLS12_377,
 								    BLS12_381,
 								    BW6_761
 								  ]
 								  suite "[Nvidia GPU] Field Addition":
 								    staticFor i, 0, curves.len:
 								      const curve = curves[i]
 								      test "Nvidia GPU field addition (𝔽p, 𝔽r) for " & $curve:
 								        t_field_add(curve)
 								main()