[Research] x86 code generator (#234)

* rename compilers -> intrinsics, math_gpu -> math_codegen

* stash x86 codegen in research
Mamy Ratsimbazafy 2023-04-27 21:52:51 +02:00 committed by GitHub
parent c6d9a213f2
commit 33c3a2e8c4
23 changed files with 343 additions and 38 deletions


@@ -7,7 +7,7 @@
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
../platforms/gpu/[llvm, nvidia, ir]
../platforms/code_generator/[llvm, nvidia, ir]
# ############################################################
#
@@ -21,10 +21,10 @@ import
proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) =
## If a >= Modulus: r <- a-M
## else: r <- a
##
## This is constant-time straight-line code.
## Due to warp divergence, the overhead of doing a comparison with short-circuiting might not be worth it on GPU.
##
## To be used when the final subtraction can
## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
@@ -48,15 +48,15 @@ proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field,
let underflowedModulus = bld.sub_bi(overflowedLimbs, 0'u32)
for i in 0 ..< N:
r[i] = bld.slct(scratch[i], a[i], underflowedModulus)
proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) =
## If a >= Modulus: r <- a-M
## else: r <- a
##
## This is constant-time straight-line code.
## Due to warp divergence, the overhead of doing a comparison with short-circuiting might not be worth it on GPU.
##
## To be used when the modulus does not use the full bitwidth of the storing words
## (say using 255 bits for the modulus out of 256 available in words)
@@ -65,7 +65,7 @@ proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r
let scratch = bld.makeArray(fieldTy)
let M = cm.getModulus(field)
let N = M.len
# Now subtract the modulus, and test a < M with the last borrow
scratch[0] = bld.sub_bo(a[0], M[0])
for i in 1 ..< N:
@@ -80,7 +80,7 @@ proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r
proc field_add_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef =
## Generate an optimized modular addition kernel
## with parameters `a, b, modulus: Limbs -> Limbs`
let procName = cm.genSymbol(block:
case field
of fp: opFpAdd
@@ -94,14 +94,14 @@ proc field_add_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef
asy.builder.positionAtEnd(blck)
let bld = asy.builder
let r = bld.asArray(addModKernel.getParam(0), fieldTy)
let a = bld.asArray(addModKernel.getParam(1), fieldTy)
let b = bld.asArray(addModKernel.getParam(2), fieldTy)
let t = bld.makeArray(fieldTy)
let N = cm.getNumWords(field)
t[0] = bld.add_co(a[0], b[0])
for i in 1 ..< N:
t[i] = bld.add_cio(a[i], b[i])
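
For reference, here is a minimal sketch of the LLVM IR this carry chain is intended to lower to on Nvidia GPUs, assuming N = 4 limbs of 64 bits (value names are illustrative; the exact instruction -> inline-assembly mapping is defined in the nvidia_inlineasm section below):

  %t0 = call i64 asm "add.cc.u64 $0, $1, $2;", "=l,l,l"(i64 %a0, i64 %b0)
  %t1 = call i64 asm "addc.cc.u64 $0, $1, $2;", "=l,l,l"(i64 %a1, i64 %b1)
  %t2 = call i64 asm "addc.cc.u64 $0, $1, $2;", "=l,l,l"(i64 %a2, i64 %b2)
  %t3 = call i64 asm "addc.cc.u64 $0, $1, $2;", "=l,l,l"(i64 %a3, i64 %b3)

add_co maps to PTX add.cc (sets the carry flag), while add_cio maps to addc.cc (consumes and propagates it).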


@@ -6,7 +6,7 @@
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ./compilers/bitops
import ./intrinsics/bitops
# ############################################################
#


@@ -24,7 +24,7 @@ type
ctx*: ContextRef
module*: ModuleRef
builder*: BuilderRef
i1_t*, i32_t*, i64_t*, void_t*: TypeRef
i1_t*, i32_t*, i64_t*, i128_t*, void_t*: TypeRef
backend*: Backend
Backend* = enum
@@ -54,7 +54,8 @@ proc new*(T: type Assembler_LLVM, backend: Backend, moduleName: cstring): Assemb
result.builder = result.ctx.createBuilder()
result.i1_t = result.ctx.int1_t()
result.i32_t = result.ctx.int32_t()
result.i64_t = result.ctx.int32_t()
result.i64_t = result.ctx.int64_t()
result.i128_t = result.ctx.int128_t()
result.void_t = result.ctx.void_t()
result.backend = backend


@@ -20,7 +20,7 @@ import
# instructions -> inline assembly -> argument mapping
# Inline assembly looks like this:
#
# C: asm volatile ("add.cc.u64 %0, %1, %2;" : "=l"(c) : "l"(a), "l"(b) : "memory" );
# LLVM: call i64 asm "add.cc.u64 $0, $1, $2;", "=l,l,l,~{memory}"(i64 %1, i64 %2)
#
@@ -30,16 +30,16 @@ import
# 2. Generate u32 and u64 `getInlineAsm()` definition (that is associated with an LLVM IR ContextRef)
# 3. Create an initialization proc to be called after initializing the LLVM ContextRef
# For each instruction, return a routine with signature that mirrors LLVM builtin instructions:
#
# proc myInstr(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef =
# let numBits = lhs.getTypeOf().getIntTypeWidth()
# if numBits == 32:
# builder.call2(inlineAsmFnType, inlineAsmFn32, [arg0, arg1, ...], name)
# elif numBits == 64:
# builder.call2(inlineAsmFnType, inlineAsmFn64, [arg0, arg1, ...], name)
# else:
# doAssert false, "Unsupported int" & $numBits
#
# To create `inlineAsmFn32` and `inlineAsmFn64` we may use `getInlineAsm` just before the corresponding
# builder.call2. This allows us to define freestanding functions.
# The potential issue is the overhead of repeated definition of add/sub/mul/muladd
@@ -94,7 +94,7 @@ macro genInstr(body: untyped): untyped =
let fnTy = ident"fnTy"
let ctx = ident"ctx"
let lhs = op[2][0][3][0]
instrBody.add quote do:
let `ctx` = builder.getContext()
# lhs: ValueRef or uint32 or uint64
@@ -143,6 +143,8 @@ macro genInstr(body: untyped): untyped =
# We could have generic constraint string generation, but we only have 2 arities to support
# and codegen without quote do would be even more verbose and hard to read.
# TODO: commutative inputs
if arity == 2:
let op0 = operands[0]
let op1 = operands[1]
@@ -201,7 +203,7 @@ macro genInstr(body: untyped): untyped =
else:
instrBody.add quote do:
let `asmString` = static(`instr` & ".u") & $`numBits` & static(" " & `instrParam`)
instrBody.add quote do:
# Chapter 6 of https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf
# inteldialect is not supported (but the NVPTX dialect is akin to the Intel dialect)
@@ -217,8 +219,7 @@ macro genInstr(body: untyped): untyped =
hasSideEffects = LlvmBool(0),
isAlignStack = LlvmBool(0),
dialect = InlineAsmDialectATT,
canThrow = LlvmBool(0)
)
canThrow = LlvmBool(0))
# 5. Call it
let opArray = nnkBracket.newTree()
@@ -235,8 +236,7 @@ macro genInstr(body: untyped): untyped =
# builder.call2(ty, inlineASM, [lhs, rhs], name)
instrBody.add newCall(
ident"call2", ident"builder", fnTy,
inlineASM, opArray, ident"name"
)
inlineASM, opArray, ident"name")
# 6. Create the function signature
var opDefs: seq[NimNode]
@@ -273,8 +273,7 @@ macro genInstr(body: untyped): untyped =
name = nnkPostfix.newTree(ident"*", instrName),
params = opDefs,
procType = nnkProcDef,
body = instrBody
)
body = instrBody)
# Inline PTX assembly
# ------------------------------------------------------------
@@ -293,7 +292,7 @@ macro genInstr(body: untyped): untyped =
#
# https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
# There is a separate constraint letter for each PTX register type:
#
# "h" = .u16 reg
# "r" = .u32 reg
# "l" = .u64 reg
@@ -304,13 +303,13 @@ macro genInstr(body: untyped): untyped =
#
#
# 1.2.3. Incorrect Optimization
#
# The compiler assumes that an asm() statement has no side effects except to change the output operands. To ensure that the asm is not deleted or moved during generation of PTX, you should use the volatile keyword, e.g.:
#
# asm volatile ("mov.u32 %0, %%clock;" : "=r"(x));
#
# Normally any memory that is written to will be specified as an out operand, but if there is a hidden side effect on user memory (for example, indirect access of a memory location via an operand), or if you want to stop any memory optimizations around the asm() statement performed during generation of PTX, you can add a “memory” clobbers specification after a 3rd colon, e.g.:
#
# asm volatile ("mov.u32 %0, %%clock;" : "=r"(x) :: "memory");
# asm ("st.u32 [%0], %1;" : "r"(p), "r"(x) :: "memory");
#
@@ -331,7 +330,7 @@ macro genInstr(body: untyped): untyped =
genInstr():
# The PTX is without size indicator i.e. add.cc instead of add.cc.u32
# Both versions will be generated.
#
# op name: ("ptx", "args;", "constraints", [params])
@@ -356,7 +355,7 @@ genInstr():
op mulhiadd_cio: ("madc.hi.cc", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
# Conditional mov / select
# slct r, a, b, c;
# r <- (c >= 0) ? a : b;
op slct: ({"slct",".s32"}, "$0, $1, $2, $3;", "=rl,rln,rln,rn", [ifPos, ifNeg, condition])
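
As an illustration of the mapping above, calling the generated slct builder on 32-bit operands is expected to emit IR along these lines (a sketch: the rl constraints resolve to r for u32 and l for u64, and value names are illustrative):

  %r = call i32 asm "slct.s32 $0, $1, $2, $3;", "=r,r,r,r"(i32 %ifPos, i32 %ifNeg, i32 %cond)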


@@ -13,7 +13,7 @@ import
multiplexers,
ct_division
],
compilers/[
intrinsics/[
addcarry_subborrow,
extended_precision,
compiler_optim_hints

research/codegen/nim.cfg (new file, 1 line)

@@ -0,0 +1 @@
--path:../../constantine/platforms/code_generator

research/codegen/x86.nim (new file, 95 lines)

@@ -0,0 +1,95 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
./bindings/c_abi,
./llvm, ./ir,
./x86_inlineasm,
../primitives
export x86_inlineasm
# ############################################################
#
# x86 API
#
# ############################################################
proc defMulExt*(asy: Assembler_LLVM, wordSize: int): FnDef =
let procName = if wordSize == 64: cstring"hw_mulExt64"
else: cstring"hw_mulExt32"
let doublePrec_t = if wordSize == 64: asy.i128_t
else: asy.i64_t
let mulExtTy = if wordSize == 64: function_t(doublePrec_t, [asy.i64_t, asy.i64_t])
else: function_t(doublePrec_t, [asy.i32_t, asy.i32_t])
let mulExtKernel = asy.module.addFunction(procName, mulExtTy)
let blck = asy.ctx.appendBasicBlock(mulExtKernel, "mulExtBody")
asy.builder.positionAtEnd(blck)
let bld = asy.builder
let a = bld.zext(mulExtKernel.getParam(0), doublePrec_t)
let b = bld.zext(mulExtKernel.getParam(1), doublePrec_t)
let r = bld.mul(a, b)
bld.ret r
return (mulExtTy, mulExtKernel)
proc defHi*(asy: Assembler_LLVM, wordSize: int): FnDef =
let procName = if wordSize == 64: cstring"hw_hi64"
else: cstring"hw_hi32"
let doublePrec_t = if wordSize == 64: asy.i128_t
else: asy.i64_t
let singlePrec_t = if wordSize == 64: asy.i64_t
else: asy.i32_t
let hiTy = function_t(singlePrec_t, [doublePrec_t])
let hiKernel = asy.module.addFunction(procName, hiTy)
let blck = asy.ctx.appendBasicBlock(hiKernel, "hiBody")
asy.builder.positionAtEnd(blck)
let bld = asy.builder
# %1 = zext i32 64 to i128
let shift = bld.zext(constInt(asy.i32_t, culonglong wordSize, signExtend = LlvmBool(0)), doublePrec_t)
# %hiLarge = lshr i128 %input, %1
let hiLarge = bld.lshr(hiKernel.getParam(0), shift)
# %hi = trunc i128 %hiLarge to i64
let hi = bld.trunc(hiLarge, singlePrec_t)
bld.ret hi
return (hiTy, hiKernel)
proc defLo*(asy: Assembler_LLVM, wordSize: int): FnDef =
let procName = if wordSize == 64: cstring"hw_lo64"
else: cstring"hw_lo32"
let doublePrec_t = if wordSize == 64: asy.i128_t
else: asy.i64_t
let singlePrec_t = if wordSize == 64: asy.i64_t
else: asy.i32_t
let loTy = function_t(singlePrec_t, [doublePrec_t])
let loKernel = asy.module.addFunction(procName, loTy)
let blck = asy.ctx.appendBasicBlock(loKernel, "loBody")
asy.builder.positionAtEnd(blck)
let bld = asy.builder
# %lo = trunc i128 %input to i64
let lo = bld.trunc(loKernel.getParam(0), singlePrec_t)
bld.ret lo
return (loTy, loKernel)
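
Taken together, these three builders at wordSize = 64 should produce IR roughly equivalent to the following sketch (reconstructed from the builder calls above; SSA value names are illustrative):

  define i128 @hw_mulExt64(i64 %x, i64 %y) {
  mulExtBody:
    %a = zext i64 %x to i128
    %b = zext i64 %y to i128
    %r = mul i128 %a, %b
    ret i128 %r
  }

  define i64 @hw_hi64(i128 %input) {
  hiBody:
    %shift = zext i32 64 to i128
    %hiLarge = lshr i128 %input, %shift
    %hi = trunc i128 %hiLarge to i64
    ret i64 %hi
  }

  define i64 @hw_lo64(i128 %input) {
  loBody:
    %lo = trunc i128 %input to i64
    ret i64 %lo
  }

LLVM is then expected to legalize the i128 multiply into x86 mul (or mulx with the +bmi2 target feature), as outlined in the x86_inlineasm notes below.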


@@ -0,0 +1,209 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
std/[macros, strutils],
./llvm
# ############################################################
#
# x86 Inline ASM
#
# ############################################################
macro genInstr(body: untyped): untyped =
result = newStmtList()
body.expectKind(nnkStmtList)
for op in body:
op.expectKind(nnkCommand)
doAssert op[0].eqIdent"op"
let instrName = op[1]
# For each op, generate a builder proc
op[2][0].expectKind(nnkTupleConstr)
op[2][0][0].expectKind(nnkStrLit)
op[2][0][1].expectKind(nnkStrLit)
op[2][0][2].expectKind(nnkStrLit)
op[2][0][3].expectKind(nnkBracket)
let instrBody = newStmtList()
# 1. Detect the size of registers
let numBits = ident"numBits"
let regTy = ident"regTy"
let fnTy = ident"fnTy"
let ctx = ident"ctx"
let lhs = op[2][0][3][0]
instrBody.add quote do:
let `ctx` = builder.getContext()
# lhs: ValueRef or uint32 or uint64
let `numBits` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf().getIntTypeWidth()
else: 8*sizeof(`lhs`)
let `regTy` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf()
elif `lhs` is uint32: `ctx`.int32_t()
elif `lhs` is uint64: `ctx`.int64_t()
else: {.error: "Unsupported input type " & $typeof(`lhs`).}
# 2. Create the LLVM asm signature
let operands = op[2][0][3]
let arity = operands.len
let constraintString = op[2][0][2]
let constraints = ident"constraints"
let instr = op[2][0][0]
if arity == 2:
if constraintString.strVal.startsWith('='):
if constraintString.strVal.endsWith('r'):
instrBody.add quote do:
let `fnTy` = function_t(`regTy`, [`regTy`, `regTy`])
else:
instrBody.add quote do:
let `fnTy` = function_t(`regTy`, [`regTy`, pointer_t(`regTy`)])
else:
# We only support out-of-place "=" functions.
# In-place with "+" requires alloca + load/stores in codegen.
# In-place functions can be rewritten to be out-of-place with "matching constraints".
error "Unsupported constraint: " & constraintString.strVal
else:
error "Unsupported arity: " & $arity
# 3. Nothing, we can use the constraint string as is on x86
# 4. Register the inline ASM with LLVM
let inlineASM = ident"inlineASM"
let instrParam = op[2][0][1]
let asmString = ident"asmString"
instrBody.add quote do:
let `asmString` = if numBits == 64: static(`instr` & "q") & static(" " & `instrParam`)
else: static(`instr` & "l") & static(" " & `instrParam`)
instrBody.add quote do:
let `inlineASM` = getInlineAsm(
ty = `fnTy`,
asmString = `asmString`,
constraints = `constraintString`,
# All carry/overflow instructions have a side effect on the carry flag and can't be reordered.
# However, function calls can't be reordered anyway.
# Relevant operations that affect flags are:
# - MUL, if the compiler decides not to use MULX
# - XOR, for zeroing a register
hasSideEffects = LlvmBool(0),
isAlignStack = LlvmBool(0),
dialect = InlineAsmDialectATT,
canThrow = LlvmBool(0))
# 5. Call it
let opArray = nnkBracket.newTree()
for op in operands:
# when op is ValueRef: op
# else: constInt(uint64(op))
opArray.add newCall(
bindSym"ValueRef",
nnkWhenStmt.newTree(
nnkElifBranch.newTree(nnkInfix.newTree(ident"is", op, bindSym"AnyValueRef"), op),
nnkElse.newTree(newCall(ident"constInt", regTy, newCall(ident"uint64", op)))
)
)
# builder.call2(ty, inlineASM, [lhs, rhs], name)
instrBody.add newCall(
ident"call2", ident"builder", fnTy,
inlineASM, opArray, ident"name")
# 6. Create the function signature
var opDefs: seq[NimNode]
opDefs.add ident"ValueRef" # Return type
opDefs.add newIdentDefs(ident"builder", bindSym"BuilderRef")
block:
var i = 0
for constraint in constraintString.strVal.split(','):
if constraint.startsWith('=') or constraint.startsWith("~{memory}"):
# Don't increment i
continue
elif constraint == "m":
opDefs.add newIdentDefs(operands[i], ident"ValueRef")
elif constraint.endsWith('r') or constraint.endsWith('0'):
opDefs.add newIdentDefs(
operands[i],
nnkInfix.newTree(ident"or",
nnkInfix.newTree(ident"or", ident"AnyValueRef", ident"uint32"),
ident"uint64")
)
else:
error "Unsupported constraint: " & constraint
i += 1
opDefs.add newIdentDefs(ident"name", bindSym"cstring", newLit"")
result.add newProc(
name = nnkPostfix.newTree(ident"*", instrName),
params = opDefs,
procType = nnkProcDef,
body = instrBody)
# Inline x86 assembly
# ------------------------------------------------------------
#
# We can generate add with carry via
# call { i8, i64 } @llvm.x86.addcarry.64(i8 %carryIn, i64 %a, i64 %b)
#
# We can generate multi-precision mul and mulx via
#
# define {i64, i64} @mul(i64 %x, i64 %y) #0 {
#
# %1 = zext i64 %x to i128
# %2 = zext i64 %y to i128
# %r = mul i128 %1, %2
# %3 = zext i32 64 to i128
# %4 = lshr i128 %r, %3
# %hi = trunc i128 %4 to i64
# %lo = trunc i128 %r to i64
#
# %res_tmp = insertvalue {i64, i64} undef, i64 %hi, 0
# %res = insertvalue {i64, i64} %res_tmp, i64 %lo, 1
#
# ret {i64, i64} %res
# }
#
# attributes #0 = {"target-features"="+bmi2"}
#
# mul:
# mov rax, rdi
# mul rsi
# mov rcx, rax
# mov rax, rdx
# mov rdx, rcx
# ret
#
# mul_bmi2:
# mov rdx, rdi
# mulx rax, rdx, rsi
# ret
#
# Note that mul(hi: var rdx, lo: var rax, a: reg/mem64, b: rax)
# - clobbers carry (and many other) flags
# - has fixed output to rdx:rax registers
# while mulx(hi: var reg64, lo: var reg64, a: reg/mem64, b: rdx)
# - does not clobber flags
# - has flexible register outputs
genInstr():
# We are only concerned about the ADCX/ADOX instructions
# which have no intrinsics and cannot be generated through instruction combining,
# unlike llvm.x86.addcarry.64 which can generate adc
# (cf/of, r) <- a+b+(cf/of)
op adcx_rr: ("adcx", "%2, %0;", "=r,%0,r", [lhs, rhs])
op adcx_rm: ("adcx", "%2, %0;", "=r,0,m", [lhs, rhs])
op adox_rr: ("adox", "%2, %0;", "=r,%0,r", [lhs, rhs])
op adox_rm: ("adox", "%2, %0;", "=r,0,m", [lhs, rhs])
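
For example, invoking the generated adcx_rr builder on two 64-bit values should produce a call like the following (a sketch; the "q" suffix comes from the numBits == 64 branch of asmString above, and value names are illustrative):

  %acc = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %a, i64 %b)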


@@ -6,7 +6,7 @@
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../../constantine/platforms/gpu/llvm
import ../../constantine/platforms/code_generator/llvm
echo "LLVM JIT compiler Hello World"


@@ -6,7 +6,7 @@
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../../constantine/platforms/gpu/[llvm, nvidia, bindings/c_abi]
import ../../constantine/platforms/code_generator/[llvm, nvidia, bindings/c_abi]
# ############################################################
#


@@ -11,12 +11,12 @@ import
# Standard library
std/[unittest, times],
# Internal
../../constantine/platforms/gpu/[llvm, nvidia, ir],
../../constantine/platforms/code_generator/[llvm, nvidia, ir],
../../constantine/platforms/static_for,
../../constantine/math/config/curves,
../../constantine/math/io/io_bigints,
../../constantine/math/arithmetic,
../../constantine/math_gpu/fields_nvidia,
../../constantine/math_codegen/fields_nvidia,
# Test utilities
../../helpers/prng_unsafe