From 1920adc42e6762c48c227f1e075cb46794bc41dc Mon Sep 17 00:00:00 2001
From: Jacek Sieka <jacek@status.im>
Date: Thu, 8 Aug 2024 16:29:06 +0200
Subject: [PATCH 1/6] Reuse stint primitives for limbs

This PR makes bncurve less slow by reusing stint integer primitives and
unrolling a few loops and arrays to avoid array length checks and the
like.

To give an idea, it brings down processing 8k nimbus-eth1 blocks around
the 18M block height mark from 24 to 16 minutes - this is quite
significant given that a lot of time in eth1 is spent reading the
database - this is at least an order of magnitude of bncurve improvement
but probably quite a lot more - how much doesn't greatly matter but now
there's at least a decent baseline for any future performance work ;)

Of course, reusing private primitives from `stint` is not pretty - the
plan is to extract them to a separate library, work started in
https://github.com/status-im/nim-stew/pull/187.
---
 bncurve.nimble    |   3 +-
 bncurve/arith.nim | 136 +++++++++++++---------------------------------
 bncurve/fp.nim    |   2 -
 bncurve/fq12.nim  |   2 -
 bncurve/fq2.nim   |   2 -
 bncurve/fq6.nim   |   2 -
 6 files changed, 41 insertions(+), 106 deletions(-)

diff --git a/bncurve.nimble b/bncurve.nimble
index d013f12..1941738 100644
--- a/bncurve.nimble
+++ b/bncurve.nimble
@@ -8,7 +8,8 @@ skipDirs      = @["tests", "Nim", "nim"]
 ### Dependencies
 
 requires "nim >= 1.6.0",
-         "nimcrypto"
+         "nimcrypto",
+         "stint"
 
 task test, "Run all tests":
   for tprog in @[
diff --git a/bncurve/arith.nim b/bncurve/arith.nim
index ffaf167..a8c582f 100644
--- a/bncurve/arith.nim
+++ b/bncurve/arith.nim
@@ -10,7 +10,9 @@ import options, endians
 import nimcrypto/[utils, sysrand]
 export options
 
-{.deadCodeElim: on.}
+# TODO replace private stint operations with an integer primitive library
+import stint/private/primitives/[addcarry_subborrow, extended_precision]
+import stint/private/datatypes
 
 type
   BNU256* = array[4, uint64]
@@ -68,16 +70,7 @@ proc getBit*(a: openArray[uint64], n: int): bool {.inline, noinit.} =
   let bit = n - (part shl 6)
   result = ((a[part] and (1'u64 shl bit)) != 0)
 
-template splitU64(n: uint64, hi, lo: untyped) =
-  ## Split 64bit unsigned integer to 32bit parts
-  hi = n shr 32
-  lo = n and 0xFFFF_FFFF'u64
-
-template combineU64(hi, lo: untyped): uint64 =
-  ## Combine 64bit unsigned integer from 32bit parts
-  (hi shl 32) or lo
-
-proc div2*(a: var BNU256) {.inline.} =
+proc div2(a: var BNU256) {.inline.} =
   ## Divide integer ``a`` in place by ``2``.
   var t = a[3] shl 63
   a[3] = a[3] shr 1
@@ -90,7 +83,7 @@ proc div2*(a: var BNU256) {.inline.} =
   a[0] = a[0] shr 1
   a[0] = a[0] or t
 
-proc mul2*(a: var BNU256) {.inline.} =
+proc mul2(a: var BNU256) {.inline.} =
   ## Multiply integer ``a`` in place by ``2``.
   var last = 0'u64
   for i in a.mitems():
@@ -99,92 +92,42 @@ proc mul2*(a: var BNU256) {.inline.} =
     i = i or last
     last = tmp
 
-proc adc(a, b: uint64, carry: var uint64): uint64 {.inline, noinit.} =
-  ## Calculate ``a + b`` and return result, set ``carry`` to addition
-  ## operation carry.
-  var a0, a1, b0, b1, c, r0, r1: uint64
-  splitU64(a, a1, a0)
-  splitU64(b, b1, b0)
-  let tmp0 = a0 + b0 + carry
-  splitU64(tmp0, c, r0)
-  let tmp1 = a1 + b1 + c
-  splitU64(tmp1, c, r1)
-  carry = c
-  result = combineU64(r1, r0)
-
-proc addNoCarry*(a: var BNU256, b: BNU256) {.inline.} =
+proc addNoCarry(a: var BNU256, b: BNU256) {.inline.} =
   ## Calculate integer addition ``a = a + b``.
-  var carry = 0'u64
-  a[0] = adc(a[0], b[0], carry)
-  a[1] = adc(a[1], b[1], carry)
-  a[2] = adc(a[2], b[2], carry)
-  a[3] = adc(a[3], b[3], carry)
-  doAssert(carry == 0)
+  var carry: Carry
+  staticFor i, 0, 4:
+    addC(carry, a[i], a[i], b[i], carry)
 
-proc subNoBorrow*(a: var BNU256, b: BNU256) {.inline.} =
+proc subNoBorrow(a: var BNU256, b: BNU256) {.inline.} =
   ## Calculate integer substraction ``a = a - b``.
-  proc sbb(a: uint64, b: uint64,
-           borrow: var uint64): uint64 {.inline, noinit.}=
-    var a0, a1, b0, b1, t0, r0, r1: uint64
-    splitU64(a, a1, a0)
-    splitU64(b, b1, b0)
-    let tmp0 = (1'u64 shl 32) + a0 - b0 - borrow
-    splitU64(tmp0, t0, r0)
-    let tmp1 = (1'u64 shl 32) + a1 - b1 - uint64(t0 == 0'u64)
-    splitU64(tmp1, t0, r1)
-    borrow = uint64(t0 == 0)
-    result = combineU64(r1, r0)
-  var borrow = 0'u64
-  a[0] = sbb(a[0], b[0], borrow)
-  a[1] = sbb(a[1], b[1], borrow)
-  a[2] = sbb(a[2], b[2], borrow)
-  a[3] = sbb(a[3], b[3], borrow)
-  doAssert(borrow == 0)
-
-proc macDigit(acc: var openArray[uint64], pos: int, b: openArray[uint64],
-              c: uint64) =
-  proc macWithCarry(a, b, c: uint64, carry: var uint64): uint64 {.noinit.} =
-    var
-      bhi, blo, chi, clo, ahi, alo, carryhi, carrylo: uint64
-      xhi, xlo, yhi, ylo, zhi, zlo, rhi, rlo: uint64
-    splitU64(b, bhi, blo)
-    splitU64(c, chi, clo)
-    splitU64(a, ahi, alo)
-    splitU64(carry, carryhi, carrylo)
-    splitU64(blo * clo + alo + carrylo, xhi, xlo)
-    splitU64(blo * chi, yhi, ylo)
-    splitU64(bhi * clo, zhi, zlo)
-    splitU64(xhi + ylo + zlo + ahi + carryhi, rhi, rlo)
-    carry = (bhi * chi) + rhi + yhi + zhi
-    result = combineU64(rlo, xlo)
+  var borrow: Borrow
+  staticFor i, 0, 4:
+    subB(borrow, a[i], a[i], b[i], borrow)
 
+proc macDigit[N, N2: static int](
+    acc: var array[N, uint64], pos: static int, b: array[N2, uint64], c: uint64) =
   if c == 0'u64:
     return
-  var carry = 0'u64
-  for i in pos..<len(acc):
-    if (i - pos) < len(b):
-      acc[i] = macWithCarry(acc[i], b[i - pos], c, carry)
-    elif carry != 0:
-      acc[i] = macWithCarry(acc[i], 0'u64, c, carry)
-    else:
-      break
-  doAssert(carry == 0)
 
-proc mulReduce(a: var BNU256, by: BNU256, modulus: BNU256,
-               inv: uint64) =
+  var carry = 0'u64
+
+  staticFor i, pos, N:
+    when (i - pos) < len(b):
+      muladd2(carry, acc[i], b[i-pos], c, acc[i], carry)
+    else:
+      muladd2(carry, acc[i], 0, c, acc[i], carry)
+
+proc mulReduce(a: var BNU256, by: BNU256, modulus: BNU256, inv: uint64) =
   var res: array[4 * 2, uint64]
-  var k: uint64
-  macDigit(res, 0, by, a[0])
-  macDigit(res, 1, by, a[1])
-  macDigit(res, 2, by, a[2])
-  macDigit(res, 3, by, a[3])
-  for i in 0..<4:
-    k = inv * res[i]
+  staticFor i, 0, 4:
+    macDigit(res, i, by, a[i])
+
+  staticFor i, 0, 4:
+    let k = inv * res[i]
     macDigit(res, i, modulus, k)
-  a[0] = res[4]
-  a[1] = res[5]
-  a[2] = res[6]
-  a[3] = res[7]
+
+  staticFor i, 0, 4:
+    a[i] = res[i + 4]
 
 proc compare*(a: BNU256, b: BNU256): int {.noinit, inline.}=
   ## Compare integers ``a`` and ``b``.
@@ -267,15 +210,14 @@ proc into*(t: typedesc[BNU512], c1: BNU256,
   macDigit(result, 1, modulo, c1[1])
   macDigit(result, 2, modulo, c1[2])
   macDigit(result, 3, modulo, c1[3])
-  var carry = 0'u64
-  for i in 0..<len(result):
-    if len(c0) > i:
-      result[i] = adc(result[i], c0[i], carry)
-    elif carry != 0'u64:
-      result[i] = adc(result[i], 0'u64, carry)
+  var carry: Carry
+  staticFor i, 0, len(result):
+    when len(c0) > i:
+      addC(carry, result[i], result[i], c0[i], carry)
     else:
-      break
-  doAssert(carry == 0'u64)
+      addC(carry, result[i], result[i], 0'u64, carry)
+
+  doAssert(carry == 0)
 
 proc fromBytes*(dst: var BNU256, src: openArray[byte]): bool =
   ## Create 256bit integer from big-endian bytes representation ``src``.
diff --git a/bncurve/fp.nim b/bncurve/fp.nim
index a23bd57..5c7a26b 100644
--- a/bncurve/fp.nim
+++ b/bncurve/fp.nim
@@ -8,8 +8,6 @@
 # those terms.
 import arith, options
 
-{.deadCodeElim: on.}
-
 template fieldImplementation(finame, fimodulus, firsquared, fircubed,
                              fionep, fiinv: untyped): untyped {.dirty.} =
   type finame* = distinct BNU256
diff --git a/bncurve/fq12.nim b/bncurve/fq12.nim
index 97756f3..d9e1cf2 100644
--- a/bncurve/fq12.nim
+++ b/bncurve/fq12.nim
@@ -9,8 +9,6 @@
 import options
 import fq6, fq2, fp, arith
 
-{.deadCodeElim: on.}
-
 const frobeniusCoeffsC1: array[4, FQ2] = [
   FQ2.one(),
   FQ2(
diff --git a/bncurve/fq2.nim b/bncurve/fq2.nim
index 7bad05f..80d57bd 100644
--- a/bncurve/fq2.nim
+++ b/bncurve/fq2.nim
@@ -9,8 +9,6 @@
 import options
 import fp, arith
 
-{.deadCodeElim: on.}
-
 type
   FQ2* = object
     c0*: FQ
diff --git a/bncurve/fq6.nim b/bncurve/fq6.nim
index f74dc2f..bca4fcd 100644
--- a/bncurve/fq6.nim
+++ b/bncurve/fq6.nim
@@ -9,8 +9,6 @@
 import options
 import fq2, fp, arith
 
-{.deadCodeElim: on.}
-
 const frobeniusCoeffsC1: array[4, FQ2] = [
   FQ2.one(),
   FQ2(

From dca7819f2b8428a22967ee3450d31bda2147f90c Mon Sep 17 00:00:00 2001
From: Jacek Sieka <jacek@status.im>
Date: Thu, 8 Aug 2024 17:18:21 +0200
Subject: [PATCH 2/6] add back 32-bit support

---
 bncurve/arith.nim | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/bncurve/arith.nim b/bncurve/arith.nim
index a8c582f..c5528ef 100644
--- a/bncurve/arith.nim
+++ b/bncurve/arith.nim
@@ -104,6 +104,13 @@ proc subNoBorrow(a: var BNU256, b: BNU256) {.inline.} =
   staticFor i, 0, 4:
     subB(borrow, a[i], a[i], b[i], borrow)
 
+when sizeof(int) == 4:
+  import stint/private/primitives/compiletime_fallback
+
+  # TODO a future intops library should expose this on 32-bit platforms too!
+  proc muladd2(hi, lo: var uint64, a, b, c1, c2: uint64): uint64 =
+    muladd2_nim(hi, lo, a, b, c1, c2)
+
 proc macDigit[N, N2: static int](
     acc: var array[N, uint64], pos: static int, b: array[N2, uint64], c: uint64) =
   if c == 0'u64:

From 8aadf13bb5023b16bb49812545eb83e99f7a0c68 Mon Sep 17 00:00:00 2001
From: Jacek Sieka <jacek@status.im>
Date: Thu, 8 Aug 2024 17:36:32 +0200
Subject: [PATCH 3/6] oops

---
 bncurve/arith.nim | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bncurve/arith.nim b/bncurve/arith.nim
index c5528ef..dbbca8d 100644
--- a/bncurve/arith.nim
+++ b/bncurve/arith.nim
@@ -108,7 +108,7 @@ when sizeof(int) == 4:
   import stint/private/primitives/compiletime_fallback
 
   # TODO a future intops library should expose this on 32-bit platforms too!
-  proc muladd2(hi, lo: var uint64, a, b, c1, c2: uint64): uint64 =
+  proc muladd2(hi, lo: var uint64, a, b, c1, c2: uint64) =
     muladd2_nim(hi, lo, a, b, c1, c2)
 
 proc macDigit[N, N2: static int](

From 5b38e791dc5de590a967cb6678c4de2365e8fada Mon Sep 17 00:00:00 2001
From: Jacek Sieka <jacek@status.im>
Date: Thu, 8 Aug 2024 18:07:16 +0200
Subject: [PATCH 4/6] more fallbacks

---
 bncurve/arith.nim | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/bncurve/arith.nim b/bncurve/arith.nim
index dbbca8d..c591a8b 100644
--- a/bncurve/arith.nim
+++ b/bncurve/arith.nim
@@ -11,7 +11,20 @@ import nimcrypto/[utils, sysrand]
 export options
 
 # TODO replace private stint operations with an integer primitive library
-import stint/private/primitives/[addcarry_subborrow, extended_precision]
+when sizeof(int) == 4:
+  import stint/private/primitives/compiletime_fallback
+
+  # TODO a future intops library should expose this on 32-bit platforms too!
+  func addC*(cOut: var Carry, sum: var uint64, a, b: uint64, cIn: Carry) {.inline.} =
+    addC_nim(cOut, sum, a, b, cIn)
+  func subB*(bOut: var Borrow, diff: var uint64, a, b: uint64, bIn: Borrow) {.inline.} =
+    subB_nim(bOuit, diff, a, b, bIn)
+  proc muladd2(hi, lo: var uint64, a, b, c1, c2: uint64) =
+    muladd2_nim(hi, lo, a, b, c1, c2)
+
+else:
+  import stint/private/primitives/[addcarry_subborrow, extended_precision]
+
 import stint/private/datatypes
 
 type
@@ -104,13 +117,6 @@ proc subNoBorrow(a: var BNU256, b: BNU256) {.inline.} =
   staticFor i, 0, 4:
     subB(borrow, a[i], a[i], b[i], borrow)
 
-when sizeof(int) == 4:
-  import stint/private/primitives/compiletime_fallback
-
-  # TODO a future intops library should expose this on 32-bit platforms too!
-  proc muladd2(hi, lo: var uint64, a, b, c1, c2: uint64) =
-    muladd2_nim(hi, lo, a, b, c1, c2)
-
 proc macDigit[N, N2: static int](
     acc: var array[N, uint64], pos: static int, b: array[N2, uint64], c: uint64) =
   if c == 0'u64:

From 3ffca4d248c122d7e722e4d65021a98ec057a14e Mon Sep 17 00:00:00 2001
From: Jacek Sieka <jacek@status.im>
Date: Thu, 8 Aug 2024 18:28:51 +0200
Subject: [PATCH 5/6] imports

---
 bncurve/arith.nim | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bncurve/arith.nim b/bncurve/arith.nim
index c591a8b..c084ed9 100644
--- a/bncurve/arith.nim
+++ b/bncurve/arith.nim
@@ -11,6 +11,8 @@ import nimcrypto/[utils, sysrand]
 export options
 
 # TODO replace private stint operations with an integer primitive library
+import stint/private/datatypes
+
 when sizeof(int) == 4:
   import stint/private/primitives/compiletime_fallback
 
@@ -25,8 +27,6 @@ when sizeof(int) == 4:
 else:
   import stint/private/primitives/[addcarry_subborrow, extended_precision]
 
-import stint/private/datatypes
-
 type
   BNU256* = array[4, uint64]
   BNU512* = array[8, uint64]

From 03c2569177d1c6c5085ea83038cf38f5a17414f9 Mon Sep 17 00:00:00 2001
From: Jacek Sieka <jacek@status.im>
Date: Thu, 8 Aug 2024 18:36:07 +0200
Subject: [PATCH 6/6] aaaaaaaaaaarrrrrrrrrrggggggggggggghhhhhhhhhhhhhhh

---
 bncurve/arith.nim | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bncurve/arith.nim b/bncurve/arith.nim
index c084ed9..f87599b 100644
--- a/bncurve/arith.nim
+++ b/bncurve/arith.nim
@@ -20,7 +20,7 @@ when sizeof(int) == 4:
   func addC*(cOut: var Carry, sum: var uint64, a, b: uint64, cIn: Carry) {.inline.} =
     addC_nim(cOut, sum, a, b, cIn)
   func subB*(bOut: var Borrow, diff: var uint64, a, b: uint64, bIn: Borrow) {.inline.} =
-    subB_nim(bOuit, diff, a, b, bIn)
+    subB_nim(bOut, diff, a, b, bIn)
   proc muladd2(hi, lo: var uint64, a, b, c1, c2: uint64) =
     muladd2_nim(hi, lo, a, b, c1, c2)