From 33c3a2e8c41a266a85aef7581a6cf3b73584a237 Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy <mamy_github@numforge.co>
Date: Thu, 27 Apr 2023 21:52:51 +0200
Subject: [PATCH] [Research] x86 code generator (#234)

* rename compilers -> intrinsics, math_gpu -> math_codegen

* stash x86 codegen in research
---
 .../fields_nvidia.nim                         |  20 +-
 constantine/platforms/bithacks.nim            |   2 +-
 .../{gpu => code_generator}/README.md         |   0
 .../bindings/c_abi.nim                        |   0
 .../bindings/llvm_abi.nim                     |   0
 .../bindings/nvidia_abi.nim                   |   0
 .../platforms/{gpu => code_generator}/ir.nim  |   5 +-
 .../{gpu => code_generator}/llvm.nim          |   0
 .../{gpu => code_generator}/nvidia.nim        |   0
 .../nvidia_inlineasm.nim                      |  39 ++--
 .../addcarry_subborrow.nim                    |   0
 .../{compilers => intrinsics}/bitops.nim      |   0
 .../compiler_optim_hints.nim                  |   0
 .../extended_precision.nim                    |   0
 .../extended_precision_64bit_uint128.nim      |   0
 .../extended_precision_x86_64_msvc.nim        |   0
 constantine/platforms/primitives.nim          |   2 +-
 research/codegen/nim.cfg                      |   1 +
 research/codegen/x86.nim                      |  95 ++++++++
 research/codegen/x86_inlineasm.nim            | 209 ++++++++++++++++++
 tests/gpu/hello_world_llvm.nim                |   2 +-
 tests/gpu/hello_world_nvidia.nim              |   2 +-
 tests/gpu/t_nvidia_fp.nim                     |   4 +-
 23 files changed, 343 insertions(+), 38 deletions(-)
 rename constantine/{math_gpu => math_codegen}/fields_nvidia.nim (96%)
 rename constantine/platforms/{gpu => code_generator}/README.md (100%)
 rename constantine/platforms/{gpu => code_generator}/bindings/c_abi.nim (100%)
 rename constantine/platforms/{gpu => code_generator}/bindings/llvm_abi.nim (100%)
 rename constantine/platforms/{gpu => code_generator}/bindings/nvidia_abi.nim (100%)
 rename constantine/platforms/{gpu => code_generator}/ir.nim (98%)
 rename constantine/platforms/{gpu => code_generator}/llvm.nim (100%)
 rename constantine/platforms/{gpu => code_generator}/nvidia.nim (100%)
 rename constantine/platforms/{gpu => code_generator}/nvidia_inlineasm.nim (98%)
 rename constantine/platforms/{compilers => intrinsics}/addcarry_subborrow.nim (100%)
 rename constantine/platforms/{compilers => intrinsics}/bitops.nim (100%)
 rename constantine/platforms/{compilers => intrinsics}/compiler_optim_hints.nim (100%)
 rename constantine/platforms/{compilers => intrinsics}/extended_precision.nim (100%)
 rename constantine/platforms/{compilers => intrinsics}/extended_precision_64bit_uint128.nim (100%)
 rename constantine/platforms/{compilers => intrinsics}/extended_precision_x86_64_msvc.nim (100%)
 create mode 100644 research/codegen/nim.cfg
 create mode 100644 research/codegen/x86.nim
 create mode 100644 research/codegen/x86_inlineasm.nim

diff --git a/constantine/math_gpu/fields_nvidia.nim b/constantine/math_codegen/fields_nvidia.nim
similarity index 96%
rename from constantine/math_gpu/fields_nvidia.nim
rename to constantine/math_codegen/fields_nvidia.nim
index afe5c5d..36e086a 100644
--- a/constantine/math_gpu/fields_nvidia.nim
+++ b/constantine/math_codegen/fields_nvidia.nim
@@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
 import
-  ../platforms/gpu/[llvm, nvidia, ir]
+  ../platforms/code_generator/[llvm, nvidia, ir]
 
 # ############################################################
 #
@@ -21,10 +21,10 @@ import
 proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) =
   ## If a >= Modulus: r <- a-M
   ## else:            r <- a
-  ## 
+  ##
   ## This is constant-time straightline code.
   ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU.
-  ## 
+  ##
   ## To be used when the final substraction can
   ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
 
@@ -48,15 +48,15 @@ proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field,
   let underflowedModulus = bld.sub_bi(overflowedLimbs, 0'u32)
 
   for i in 0 ..< N:
-    r[i] = bld.slct(scratch[i], a[i], underflowedModulus) 
+    r[i] = bld.slct(scratch[i], a[i], underflowedModulus)
 
 proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) =
   ## If a >= Modulus: r <- a-M
   ## else:            r <- a
-  ## 
+  ##
   ## This is constant-time straightline code.
   ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU.
-  ## 
+  ##
   ## To be used when the modulus does not use the full bitwidth of the storing words
   ## (say using 255 bits for the modulus out of 256 available in words)
 
@@ -65,7 +65,7 @@ proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r
   let scratch = bld.makeArray(fieldTy)
   let M = cm.getModulus(field)
   let N = M.len
- 
+
   # Now substract the modulus, and test a < M with the last borrow
   scratch[0] = bld.sub_bo(a[0], M[0])
   for i in 1 ..< N:
@@ -80,7 +80,7 @@ proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r
 proc field_add_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef =
   ## Generate an optimized modular addition kernel
   ## with parameters `a, b, modulus: Limbs -> Limbs`
-  
+
   let procName = cm.genSymbol(block:
     case field
     of fp: opFpAdd
@@ -94,14 +94,14 @@ proc field_add_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef
   asy.builder.positionAtEnd(blck)
 
   let bld = asy.builder
-  
+
   let r = bld.asArray(addModKernel.getParam(0), fieldTy)
   let a = bld.asArray(addModKernel.getParam(1), fieldTy)
   let b = bld.asArray(addModKernel.getParam(2), fieldTy)
 
   let t = bld.makeArray(fieldTy)
   let N = cm.getNumWords(field)
-  
+
   t[0] = bld.add_co(a[0], b[0])
   for i in 1 ..< N:
     t[i] = bld.add_cio(a[i], b[i])
diff --git a/constantine/platforms/bithacks.nim b/constantine/platforms/bithacks.nim
index 85e972b..7fcdca9 100644
--- a/constantine/platforms/bithacks.nim
+++ b/constantine/platforms/bithacks.nim
@@ -6,7 +6,7 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import ./compilers/bitops
+import ./intrinsics/bitops
 
 # ############################################################
 #
diff --git a/constantine/platforms/gpu/README.md b/constantine/platforms/code_generator/README.md
similarity index 100%
rename from constantine/platforms/gpu/README.md
rename to constantine/platforms/code_generator/README.md
diff --git a/constantine/platforms/gpu/bindings/c_abi.nim b/constantine/platforms/code_generator/bindings/c_abi.nim
similarity index 100%
rename from constantine/platforms/gpu/bindings/c_abi.nim
rename to constantine/platforms/code_generator/bindings/c_abi.nim
diff --git a/constantine/platforms/gpu/bindings/llvm_abi.nim b/constantine/platforms/code_generator/bindings/llvm_abi.nim
similarity index 100%
rename from constantine/platforms/gpu/bindings/llvm_abi.nim
rename to constantine/platforms/code_generator/bindings/llvm_abi.nim
diff --git a/constantine/platforms/gpu/bindings/nvidia_abi.nim b/constantine/platforms/code_generator/bindings/nvidia_abi.nim
similarity index 100%
rename from constantine/platforms/gpu/bindings/nvidia_abi.nim
rename to constantine/platforms/code_generator/bindings/nvidia_abi.nim
diff --git a/constantine/platforms/gpu/ir.nim b/constantine/platforms/code_generator/ir.nim
similarity index 98%
rename from constantine/platforms/gpu/ir.nim
rename to constantine/platforms/code_generator/ir.nim
index a8709ed..ab02553 100644
--- a/constantine/platforms/gpu/ir.nim
+++ b/constantine/platforms/code_generator/ir.nim
@@ -24,7 +24,7 @@ type
     ctx*: ContextRef
     module*: ModuleRef
     builder*: BuilderRef
-    i1_t*, i32_t*, i64_t*, void_t*: TypeRef
+    i1_t*, i32_t*, i64_t*, i128_t*, void_t*: TypeRef
     backend*: Backend
 
   Backend* = enum
@@ -54,7 +54,8 @@ proc new*(T: type Assembler_LLVM, backend: Backend, moduleName: cstring): Assemb
   result.builder = result.ctx.createBuilder()
   result.i1_t = result.ctx.int1_t()
   result.i32_t = result.ctx.int32_t()
-  result.i64_t = result.ctx.int32_t()
+  result.i64_t = result.ctx.int64_t()
+  result.i128_t = result.ctx.int128_t()
   result.void_t = result.ctx.void_t()
   result.backend = backend
 
diff --git a/constantine/platforms/gpu/llvm.nim b/constantine/platforms/code_generator/llvm.nim
similarity index 100%
rename from constantine/platforms/gpu/llvm.nim
rename to constantine/platforms/code_generator/llvm.nim
diff --git a/constantine/platforms/gpu/nvidia.nim b/constantine/platforms/code_generator/nvidia.nim
similarity index 100%
rename from constantine/platforms/gpu/nvidia.nim
rename to constantine/platforms/code_generator/nvidia.nim
diff --git a/constantine/platforms/gpu/nvidia_inlineasm.nim b/constantine/platforms/code_generator/nvidia_inlineasm.nim
similarity index 98%
rename from constantine/platforms/gpu/nvidia_inlineasm.nim
rename to constantine/platforms/code_generator/nvidia_inlineasm.nim
index 8aea197..b17bd89 100644
--- a/constantine/platforms/gpu/nvidia_inlineasm.nim
+++ b/constantine/platforms/code_generator/nvidia_inlineasm.nim
@@ -20,7 +20,7 @@ import
 # instructions -> inline assembly -> argument mapping
 
 # Inline assembly looks like this:
-# 
+#
 # C:    asm volatile ("add.cc.u64 %0, %1, %2;" : "=l"(c) : "l"(a), "l"(b) : "memory" );
 # LLVM: call i64 asm "add.cc.u64 $0, $1, $2;", "=l,l,l,~{memory}"(i64 %1, i64 %2)
 #
@@ -30,16 +30,16 @@ import
 # 2. Generate u32 and u64 `getInlineAsm()` definition (that is associated with an LLVM IR ContextRef)
 # 3. Create an initialization proc to be called after initializing the LLVM ContextRef
 #    For each instruction, return a routine with signature that mirrors LLVM builtin instructions:
-#    
+#
 #    proc myInstr(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef =
 #      let numBits = lhs.getTypeOf().getIntTypeWidth()
-#      if numBits == 32: 
+#      if numBits == 32:
 #        builder.call2(inlineAsmFnType, inlineAsmFn32, [arg0, arg1, ...], name)
-#      elif numBits == 64: 
+#      elif numBits == 64:
 #        builder.call2(inlineAsmFnType, inlineAsmFn64, [arg0, arg1, ...], name)
 #      else:
 #        doAssert false, "Unsupported int" & $numBits
-# 
+#
 # To create `inlineAsmFn32` and `inlineAsmFn64` we may use `getInlineAsm` just before the corresponding
 # builder.call2. This allows us to define freestanding functions.
 # The potential issue is the overhead of repeated definition of add/sub/mul/muladd
@@ -94,7 +94,7 @@ macro genInstr(body: untyped): untyped =
     let fnTy = ident"fnTy"
     let ctx = ident"ctx"
     let lhs = op[2][0][3][0]
-    
+
     instrBody.add quote do:
       let `ctx` = builder.getContext()
       # lhs: ValueRef or uint32 or uint64
@@ -143,6 +143,8 @@ macro genInstr(body: untyped): untyped =
 
     # We could have generic constraint string generation, but we only have 2 arities to support
     # and codegen without quote do would be even more verbose and hard to read.
+
+    # TODO: commutative inputs
     if arity == 2:
       let op0 = operands[0]
       let op1 = operands[1]
@@ -201,7 +203,7 @@ macro genInstr(body: untyped): untyped =
     else:
       instrBody.add quote do:
         let `asmString` = static(`instr` & ".u") & $`numBits` & static(" " & `instrParam`)
-    
+
     instrBody.add quote do:
       # Chapter 6 of https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf
       # inteldialect is not supported (but the NVPTX dialect is akin to intel dialect)
@@ -217,8 +219,7 @@ macro genInstr(body: untyped): untyped =
         hasSideEffects = LlvmBool(0),
         isAlignStack = LlvmBool(0),
         dialect = InlineAsmDialectATT,
-        canThrow = LlvmBool(0)
-      ) 
+        canThrow = LlvmBool(0))
 
     # 5. Call it
     let opArray = nnkBracket.newTree()
@@ -235,8 +236,7 @@ macro genInstr(body: untyped): untyped =
     # builder.call2(ty, inlineASM, [lhs, rhs], name)
     instrBody.add newCall(
       ident"call2", ident"builder", fnTy,
-      inlineASM, opArray, ident"name"
-    )
+      inlineASM, opArray, ident"name")
 
     # 6. Create the function signature
     var opDefs: seq[NimNode]
@@ -273,8 +273,7 @@ macro genInstr(body: untyped): untyped =
       name = nnkPostfix.newTree(ident"*", instrName),
       params = opDefs,
       procType = nnkProcDef,
-      body = instrBody
-    )
+      body = instrBody)
 
 # Inline PTX assembly
 # ------------------------------------------------------------
@@ -293,7 +292,7 @@ macro genInstr(body: untyped): untyped =
 #
 # https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
 # There is a separate constraint letter for each PTX register type:
-# 
+#
 # "h" = .u16 reg
 # "r" = .u32 reg
 # "l" = .u64 reg
@@ -304,13 +303,13 @@ macro genInstr(body: untyped): untyped =
 #
 #
 # 1.2.3. Incorrect Optimization
-# 
+#
 # The compiler assumes that an asm() statement has no side effects except to change the output operands. To ensure that the asm is not deleted or moved during generation of PTX, you should use the volatile keyword, e.g.:
-# 
+#
 # asm volatile ("mov.u32 %0, %%clock;" : "=r"(x));
-# 
+#
 # Normally any memory that is written to will be specified as an out operand, but if there is a hidden side effect on user memory (for example, indirect access of a memory location via an operand), or if you want to stop any memory optimizations around the asm() statement performed during generation of PTX, you can add a “memory” clobbers specification after a 3rd colon, e.g.:
-# 
+#
 # asm volatile ("mov.u32 %0, %%clock;" : "=r"(x) :: "memory");
 # asm ("st.u32 [%0], %1;" : "r"(p), "r"(x) :: "memory");
 #
@@ -331,7 +330,7 @@ macro genInstr(body: untyped): untyped =
 
 genInstr():
   # The PTX is without size indicator i.e. add.cc instead of add.cc.u32
-  # Both version will be generated. 
+  # Both version will be generated.
   #
   # op name:       ("ptx",        "args;",            "constraints", [params])
 
@@ -356,7 +355,7 @@ genInstr():
   op mulhiadd_cio: ("madc.hi.cc", "$0, $1, $2, $3;", "=rl,rln,rln,rln", [lmul, rmul, addend])
 
   # Conditional mov / select
-  
+
   # slct r, a, b, c;
   # r <- (c >= 0) ? a : b;
   op slct:         ({"slct",".s32"},     "$0, $1, $2, $3;", "=rl,rln,rln,rn", [ifPos, ifNeg, condition])
diff --git a/constantine/platforms/compilers/addcarry_subborrow.nim b/constantine/platforms/intrinsics/addcarry_subborrow.nim
similarity index 100%
rename from constantine/platforms/compilers/addcarry_subborrow.nim
rename to constantine/platforms/intrinsics/addcarry_subborrow.nim
diff --git a/constantine/platforms/compilers/bitops.nim b/constantine/platforms/intrinsics/bitops.nim
similarity index 100%
rename from constantine/platforms/compilers/bitops.nim
rename to constantine/platforms/intrinsics/bitops.nim
diff --git a/constantine/platforms/compilers/compiler_optim_hints.nim b/constantine/platforms/intrinsics/compiler_optim_hints.nim
similarity index 100%
rename from constantine/platforms/compilers/compiler_optim_hints.nim
rename to constantine/platforms/intrinsics/compiler_optim_hints.nim
diff --git a/constantine/platforms/compilers/extended_precision.nim b/constantine/platforms/intrinsics/extended_precision.nim
similarity index 100%
rename from constantine/platforms/compilers/extended_precision.nim
rename to constantine/platforms/intrinsics/extended_precision.nim
diff --git a/constantine/platforms/compilers/extended_precision_64bit_uint128.nim b/constantine/platforms/intrinsics/extended_precision_64bit_uint128.nim
similarity index 100%
rename from constantine/platforms/compilers/extended_precision_64bit_uint128.nim
rename to constantine/platforms/intrinsics/extended_precision_64bit_uint128.nim
diff --git a/constantine/platforms/compilers/extended_precision_x86_64_msvc.nim b/constantine/platforms/intrinsics/extended_precision_x86_64_msvc.nim
similarity index 100%
rename from constantine/platforms/compilers/extended_precision_x86_64_msvc.nim
rename to constantine/platforms/intrinsics/extended_precision_x86_64_msvc.nim
diff --git a/constantine/platforms/primitives.nim b/constantine/platforms/primitives.nim
index 60abc08..c7cbc21 100644
--- a/constantine/platforms/primitives.nim
+++ b/constantine/platforms/primitives.nim
@@ -13,7 +13,7 @@ import
     multiplexers,
     ct_division
   ],
-  compilers/[
+  intrinsics/[
     addcarry_subborrow,
     extended_precision,
     compiler_optim_hints
diff --git a/research/codegen/nim.cfg b/research/codegen/nim.cfg
new file mode 100644
index 0000000..ffb8d7c
--- /dev/null
+++ b/research/codegen/nim.cfg
@@ -0,0 +1 @@
+--path:../../constantine/platforms/code_generator
\ No newline at end of file
diff --git a/research/codegen/x86.nim b/research/codegen/x86.nim
new file mode 100644
index 0000000..2dd83ab
--- /dev/null
+++ b/research/codegen/x86.nim
@@ -0,0 +1,95 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ./bindings/c_abi,
+  ./llvm, ./ir,
+  ./x86_inlineasm,
+  ../primitives
+
+export x86_inlineasm
+
+# ############################################################
+#
+#                     x86 API
+#
+# ############################################################
+
+proc defMulExt*(asy: Assembler_LLVM, wordSize: int): FnDef =
+  ## Adds `hw_mulExt64` (wordSize == 64) or `hw_mulExt32` (any other wordSize) to `asy.module`.
+  let procName = if wordSize == 64: cstring"hw_mulExt64"
+                 else: cstring"hw_mulExt32"
+  # The result type is twice as wide as the operands: i128 for 64-bit words, i64 otherwise.
+  let doublePrec_t = if wordSize == 64: asy.i128_t
+                     else: asy.i64_t
+
+  let mulExtTy = if wordSize == 64: function_t(doublePrec_t, [asy.i64_t, asy.i64_t])
+                 else: function_t(doublePrec_t, [asy.i32_t, asy.i32_t])
+  let mulExtKernel = asy.module.addFunction(procName, mulExtTy)
+  let blck = asy.ctx.appendBasicBlock(mulExtKernel, "mulExtBody")
+  asy.builder.positionAtEnd(blck)
+  # NOTE: `asy.builder` is now positioned at the end of the new "mulExtBody" block.
+  let bld = asy.builder
+  # Zero-extend both parameters to the double-width type before multiplying.
+  let a = bld.zext(mulExtKernel.getParam(0), doublePrec_t)
+  let b = bld.zext(mulExtKernel.getParam(1), doublePrec_t)
+  let r = bld.mul(a, b)
+
+  bld.ret r
+  # Hand back the function type together with the function value.
+  return (mulExtTy, mulExtKernel)
+
+proc defHi*(asy: Assembler_LLVM, wordSize: int): FnDef =
+  ## Adds `hw_hi64` (wordSize == 64) or `hw_hi32` (any other wordSize): upper word of a double-width input.
+  let procName = if wordSize == 64: cstring"hw_hi64"
+                 else: cstring"hw_hi32"
+  let doublePrec_t = if wordSize == 64: asy.i128_t
+                     else: asy.i64_t
+  let singlePrec_t = if wordSize == 64: asy.i64_t
+                     else: asy.i32_t
+  # Signature: one double-width argument in, one single-width word out.
+  let hiTy = function_t(singlePrec_t, [doublePrec_t])
+
+  let hiKernel = asy.module.addFunction(procName, hiTy)
+  let blck = asy.ctx.appendBasicBlock(hiKernel, "hiBody")
+  asy.builder.positionAtEnd(blck)
+  # NOTE: `asy.builder` is now positioned at the end of the new "hiBody" block.
+  let bld = asy.builder
+  # The IR sketched in the comments below is the wordSize == 64 case; the shift amount is `wordSize` itself.
+  # %1 = zext i32 64 to i128
+  let shift = bld.zext(constInt(asy.i32_t, culonglong wordSize, signExtend = LlvmBool(0)), doublePrec_t)
+  # %hiLarge = lshr i128 %input, %1
+  let hiLarge = bld.lshr(hiKernel.getParam(0), shift)
+  # %hi = trunc i128 %hiLarge to i64
+  let hi = bld.trunc(hiLarge, singlePrec_t)
+
+  bld.ret hi
+  # Hand back the function type together with the function value.
+  return (hiTy, hiKernel)
+
+proc defLo*(asy: Assembler_LLVM, wordSize: int): FnDef =
+  ## Adds `hw_lo64` (wordSize == 64) or `hw_lo32` (any other wordSize): lower word of a double-width input.
+  let procName = if wordSize == 64: cstring"hw_lo64"
+                 else: cstring"hw_lo32"
+  let doublePrec_t = if wordSize == 64: asy.i128_t
+                     else: asy.i64_t
+  let singlePrec_t = if wordSize == 64: asy.i64_t
+                     else: asy.i32_t
+  # Signature: one double-width argument in, one single-width word out.
+  let loTy = function_t(singlePrec_t, [doublePrec_t])
+
+  let loKernel = asy.module.addFunction(procName, loTy)
+  let blck = asy.ctx.appendBasicBlock(loKernel, "loBody")
+  asy.builder.positionAtEnd(blck)
+  # NOTE: `asy.builder` is now positioned at the end of the new "loBody" block.
+  let bld = asy.builder
+  # Truncation keeps the low bits; the IR sketched below is the wordSize == 64 case.
+  # %lo = trunc i128 %input to i64
+  let lo = bld.trunc(loKernel.getParam(0), singlePrec_t)
+  bld.ret lo
+  return (loTy, loKernel)
diff --git a/research/codegen/x86_inlineasm.nim b/research/codegen/x86_inlineasm.nim
new file mode 100644
index 0000000..9f604b9
--- /dev/null
+++ b/research/codegen/x86_inlineasm.nim
@@ -0,0 +1,209 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  std/[macros, strutils],
+  ./llvm
+
+# ############################################################
+#
+#                   x86 Inline ASM
+#
+# ############################################################
+
+macro genInstr(body: untyped): untyped =
+  ## For every `op name: ("instr", "operands;", "constraints", [params])` entry in `body`,
+  ## generates an exported proc `name` that builds the x86 inline asm via `getInlineAsm` and calls it with `call2`.
+  result = newStmtList()
+  body.expectKind(nnkStmtList)
+  for op in body:
+    op.expectKind(nnkCommand)
+    doAssert op[0].eqIdent"op"
+
+    let instrName = op[1]
+    # For each op, generate a builder proc
+    op[2][0].expectKind(nnkTupleConstr)
+    op[2][0][0].expectKind(nnkStrLit)
+    op[2][0][1].expectKind(nnkStrLit)
+    op[2][0][2].expectKind(nnkStrLit)
+    op[2][0][3].expectKind(nnkBracket)
+
+    let instrBody = newStmtList()
+
+    # 1. Detect the size of registers
+    let numBits = ident"numBits"
+    let regTy = ident"regTy"
+    let fnTy = ident"fnTy"
+    let ctx = ident"ctx"
+    let lhs = op[2][0][3][0]
+
+    instrBody.add quote do:
+      let `ctx` = builder.getContext()
+      # lhs: ValueRef or uint32 or uint64
+      let `numBits` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf().getIntTypeWidth()
+                      else: 8*sizeof(`lhs`)
+      let `regTy` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf()
+                    elif `lhs` is uint32: `ctx`.int32_t()
+                    elif `lhs` is uint64: `ctx`.int64_t()
+                    else: {.error "Unsupported input type " & $typeof(`lhs`).}
+
+    # 2. Create the LLVM asm signature
+    let operands = op[2][0][3]
+    let arity = operands.len
+
+    let constraintString = op[2][0][2]
+    let constraints = ident"constraints"
+
+    let instr = op[2][0][0]
+
+    if arity == 2:
+      if constraintString.strVal.startsWith('='):
+        if constraintString.strVal.endsWith('r'):
+          instrBody.add quote do:
+            let `fnTy` = function_t(`regTy`, [`regTy`, `regTy`])
+        else:
+          instrBody.add quote do:
+            let `fnTy` = function_t(`regTy`, [`regTy`, pointer_t(`regTy`)])
+      else:
+        # We only support out of place "=" function.
+        # In-place with "+" requires alloca + load/stores in codegen
+        # in-place functions can be rewritten to be out-place with "matching constraints"
+        error "Unsupported constraint: " & constraintString.strVal
+    else:
+      error "Unsupported arity: " & $arity
+
+    # 3. Nothing, we can use the constraint string as is on x86
+
+    # 4. Register the inline ASM with LLVM
+    let inlineASM = ident"inlineASM"
+    let instrParam = op[2][0][1]
+    let asmString = ident"asmString"
+
+    instrBody.add quote do:
+      let `asmString` = if `numBits` == 64: static(`instr` & "q") & static(" " & `instrParam`)
+                        else: static(`instr` & "l") & static(" " & `instrParam`)
+
+    instrBody.add quote do:
+      let `inlineASM` = getInlineAsm(
+        ty = `fnTy`,
+        asmString = `asmString`,
+        constraints = `constraintString`,
+        # All carry/overflow instructions have side effects on the carry flag and can't be reordered
+        # However, function calls can't be reordered.
+        # Relevant operations that affect flags are:
+        # - MUL, if the compiler decides not to use MULX
+        # - XOR, for zeroing a register
+        hasSideEffects = LlvmBool(0),
+        isAlignStack = LlvmBool(0),
+        dialect = InlineAsmDialectATT,
+        canThrow = LlvmBool(0))
+
+    # 5. Call it
+    let opArray = nnkBracket.newTree()
+    for op in operands:
+      # ValueRef(when op is AnyValueRef: op
+      #          else: constInt(regTy, uint64(op)))
+      opArray.add newCall(
+        bindSym"ValueRef",
+        nnkWhenStmt.newTree(
+          nnkElifBranch.newTree(nnkInfix.newTree(ident"is", op, bindSym"AnyValueRef"), op),
+          nnkElse.newTree(newCall(ident"constInt", regTy, newCall(ident"uint64", op)))
+        )
+      )
+    # builder.call2(ty, inlineASM, [lhs, rhs], name)
+    instrBody.add newCall(
+      ident"call2", ident"builder", fnTy,
+      inlineASM, opArray, ident"name")
+
+    # 6. Create the function signature
+    var opDefs: seq[NimNode]
+    opDefs.add ident"ValueRef" # Return type
+    opDefs.add newIdentDefs(ident"builder", bindSym"BuilderRef")
+    block:
+      var i = 0
+      for constraint in constraintString.strVal.split(','):
+        if constraint.startsWith('=') or constraint.startsWith("~{memory}"):
+          # Don't increment i
+          continue
+        elif constraint == "m":
+          opDefs.add newIdentDefs(operands[i], ident"ValueRef")
+        elif constraint.endsWith('r') or constraint.endsWith('0'):
+          opDefs.add newIdentDefs(
+            operands[i],
+            nnkInfix.newTree(ident"or",
+              nnkInfix.newTree(ident"or", ident"AnyValueRef", ident"uint32"),
+              ident"uint64")
+          )
+        else:
+          error "Unsupported constraint: " & constraint
+        i += 1
+    opDefs.add newIdentDefs(ident"name", bindSym"cstring", newLit"")
+
+    result.add newProc(
+      name = nnkPostfix.newTree(ident"*", instrName),
+      params = opDefs,
+      procType = nnkProcDef,
+      body = instrBody)
+
+# Inline x86 assembly
+# ------------------------------------------------------------
+#
+# We can generate add with carry via
+#   call { i8, i64 } @llvm.x86.addcarry.64(i8 %carryIn, i64 %a, i64 %b)
+#
+# We can generate multi-precision mul and mulx via
+#
+#    define {i64, i64} @mul(i64 %x, i64 %y) #0 {
+#
+#      %1 = zext i64 %x to i128
+#      %2 = zext i64 %y to i128
+#      %r = mul i128 %1, %2
+#      %3 = zext i32 64 to i128
+#      %4 = lshr i128 %r, %3
+#      %hi = trunc i128 %4 to i64
+#      %lo = trunc i128 %r to i64
+#
+#      %res_tmp = insertvalue {i64, i64} undef, i64 %hi, 0
+#      %res = insertvalue {i64, i64} %res_tmp, i64 %lo, 1
+#
+#      ret {i64, i64} %res
+#    }
+#
+#    attributes #0 = {"target-features"="+bmi2"}
+#
+#    mul:
+#            mov     rax, rdi
+#            mul     rsi
+#            mov     rcx, rax
+#            mov     rax, rdx
+#            mov     rdx, rcx
+#            ret
+#
+#    mul_bmi2:
+#        mov     rdx, rdi
+#        mulx    rax, rdx, rsi
+#        ret
+#
+# Note that mul(hi: var rdx, lo: var rax, a: reg/mem64, b: rax)
+#   - clobbers carry (and many other) flags
+#   - has fixed output to rdx:rax registers
+# while mulx(hi: var reg64, lo: var reg64, a: reg/mem64, b: rdx)
+#   - does not clobber flags
+#   - has flexible register outputs
+
+
+genInstr():
+  # We are only concerned about the ADCX/ADOX instructions
+  # which do not have intrinsics or cannot be generated through instruction combining
+  # unlike llvm.x86.addcarry.64 that can generate adc
+  # NOTE: LLVM IR asm templates reference operands as `$N` (GCC-style `%N` is not substituted).
+  # (cf/of, r) <- a+b+(cf/of)
+  op adcx_rr: ("adcx", "$2, $0;", "=r,%0,r", [lhs, rhs])
+  op adcx_rm: ("adcx", "$2, $0;", "=r,0,m", [lhs, rhs])
+  op adox_rr: ("adox", "$2, $0;", "=r,%0,r", [lhs, rhs])
+  op adox_rm: ("adox", "$2, $0;", "=r,0,m", [lhs, rhs])
\ No newline at end of file
diff --git a/tests/gpu/hello_world_llvm.nim b/tests/gpu/hello_world_llvm.nim
index c35392f..6e3f310 100644
--- a/tests/gpu/hello_world_llvm.nim
+++ b/tests/gpu/hello_world_llvm.nim
@@ -6,7 +6,7 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import ../../constantine/platforms/gpu/llvm
+import ../../constantine/platforms/code_generator/llvm
 
 echo "LLVM JIT compiler Hello World"
 
diff --git a/tests/gpu/hello_world_nvidia.nim b/tests/gpu/hello_world_nvidia.nim
index 7745e98..ae98954 100644
--- a/tests/gpu/hello_world_nvidia.nim
+++ b/tests/gpu/hello_world_nvidia.nim
@@ -6,7 +6,7 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import ../../constantine/platforms/gpu/[llvm, nvidia, bindings/c_abi]
+import ../../constantine/platforms/code_generator/[llvm, nvidia, bindings/c_abi]
 
 # ############################################################
 #
diff --git a/tests/gpu/t_nvidia_fp.nim b/tests/gpu/t_nvidia_fp.nim
index c99c967..d5954ad 100644
--- a/tests/gpu/t_nvidia_fp.nim
+++ b/tests/gpu/t_nvidia_fp.nim
@@ -11,12 +11,12 @@ import
   # Standard library
   std/[unittest, times],
   # Internal
-  ../../constantine/platforms/gpu/[llvm, nvidia, ir],
+  ../../constantine/platforms/code_generator/[llvm, nvidia, ir],
   ../../constantine/platforms/static_for,
   ../../constantine/math/config/curves,
   ../../constantine/math/io/io_bigints,
   ../../constantine/math/arithmetic,
-  ../../constantine/math_gpu/fields_nvidia,
+  ../../constantine/math_codegen/fields_nvidia,
   # Test utilities
   ../../helpers/prng_unsafe