From 1920adc42e6762c48c227f1e075cb46794bc41dc Mon Sep 17 00:00:00 2001 From: Jacek Sieka Date: Thu, 8 Aug 2024 16:29:06 +0200 Subject: [PATCH 1/6] Reuse stint primitives for limbs This PR makes bncurve less slow by reusing stint integer primitives and unrolling a few loops and arrays to avoid array length checks and the like. To give an idea, it brings down processing 8k nimbus-eth1 blocks around the 18M block height mark from 24 to 16 minutes - this is quite significant given that a lot of time in eth1 is spent reading the database - this is at least an order of magnitude of bncurve improvement but probably quite a lot more - how much doesn't greatly matter but now there's at least a decent baseline for any future performance work ;) Of course, reusing private primitives from `stint` is not pretty - the plan is to extract them to a separate library, work started in https://github.com/status-im/nim-stew/pull/187. --- bncurve.nimble | 3 +- bncurve/arith.nim | 136 +++++++++++++--------------------------------- bncurve/fp.nim | 2 - bncurve/fq12.nim | 2 - bncurve/fq2.nim | 2 - bncurve/fq6.nim | 2 - 6 files changed, 41 insertions(+), 106 deletions(-) diff --git a/bncurve.nimble b/bncurve.nimble index d013f12..1941738 100644 --- a/bncurve.nimble +++ b/bncurve.nimble @@ -8,7 +8,8 @@ skipDirs = @["tests", "Nim", "nim"] ### Dependencies requires "nim >= 1.6.0", - "nimcrypto" + "nimcrypto", + "stint" task test, "Run all tests": for tprog in @[ diff --git a/bncurve/arith.nim b/bncurve/arith.nim index ffaf167..a8c582f 100644 --- a/bncurve/arith.nim +++ b/bncurve/arith.nim @@ -10,7 +10,9 @@ import options, endians import nimcrypto/[utils, sysrand] export options -{.deadCodeElim: on.} +# TODO replace private stint operations with an integer primitive library +import stint/private/primitives/[addcarry_subborrow, extended_precision] +import stint/private/datatypes type BNU256* = array[4, uint64] @@ -68,16 +70,7 @@ proc getBit*(a: openArray[uint64], n: int): bool {.inline, noinit.} 
= let bit = n - (part shl 6) result = ((a[part] and (1'u64 shl bit)) != 0) -template splitU64(n: uint64, hi, lo: untyped) = - ## Split 64bit unsigned integer to 32bit parts - hi = n shr 32 - lo = n and 0xFFFF_FFFF'u64 - -template combineU64(hi, lo: untyped): uint64 = - ## Combine 64bit unsigned integer from 32bit parts - (hi shl 32) or lo - -proc div2*(a: var BNU256) {.inline.} = +proc div2(a: var BNU256) {.inline.} = ## Divide integer ``a`` in place by ``2``. var t = a[3] shl 63 a[3] = a[3] shr 1 @@ -90,7 +83,7 @@ proc div2*(a: var BNU256) {.inline.} = a[0] = a[0] shr 1 a[0] = a[0] or t -proc mul2*(a: var BNU256) {.inline.} = +proc mul2(a: var BNU256) {.inline.} = ## Multiply integer ``a`` in place by ``2``. var last = 0'u64 for i in a.mitems(): @@ -99,92 +92,42 @@ proc mul2*(a: var BNU256) {.inline.} = i = i or last last = tmp -proc adc(a, b: uint64, carry: var uint64): uint64 {.inline, noinit.} = - ## Calculate ``a + b`` and return result, set ``carry`` to addition - ## operation carry. - var a0, a1, b0, b1, c, r0, r1: uint64 - splitU64(a, a1, a0) - splitU64(b, b1, b0) - let tmp0 = a0 + b0 + carry - splitU64(tmp0, c, r0) - let tmp1 = a1 + b1 + c - splitU64(tmp1, c, r1) - carry = c - result = combineU64(r1, r0) - -proc addNoCarry*(a: var BNU256, b: BNU256) {.inline.} = +proc addNoCarry(a: var BNU256, b: BNU256) {.inline.} = ## Calculate integer addition ``a = a + b``. - var carry = 0'u64 - a[0] = adc(a[0], b[0], carry) - a[1] = adc(a[1], b[1], carry) - a[2] = adc(a[2], b[2], carry) - a[3] = adc(a[3], b[3], carry) - doAssert(carry == 0) + var carry: Carry + staticFor i, 0, 4: + addC(carry, a[i], a[i], b[i], carry) -proc subNoBorrow*(a: var BNU256, b: BNU256) {.inline.} = +proc subNoBorrow(a: var BNU256, b: BNU256) {.inline.} = ## Calculate integer substraction ``a = a - b``. 
- proc sbb(a: uint64, b: uint64, - borrow: var uint64): uint64 {.inline, noinit.}= - var a0, a1, b0, b1, t0, r0, r1: uint64 - splitU64(a, a1, a0) - splitU64(b, b1, b0) - let tmp0 = (1'u64 shl 32) + a0 - b0 - borrow - splitU64(tmp0, t0, r0) - let tmp1 = (1'u64 shl 32) + a1 - b1 - uint64(t0 == 0'u64) - splitU64(tmp1, t0, r1) - borrow = uint64(t0 == 0) - result = combineU64(r1, r0) - var borrow = 0'u64 - a[0] = sbb(a[0], b[0], borrow) - a[1] = sbb(a[1], b[1], borrow) - a[2] = sbb(a[2], b[2], borrow) - a[3] = sbb(a[3], b[3], borrow) - doAssert(borrow == 0) - -proc macDigit(acc: var openArray[uint64], pos: int, b: openArray[uint64], - c: uint64) = - proc macWithCarry(a, b, c: uint64, carry: var uint64): uint64 {.noinit.} = - var - bhi, blo, chi, clo, ahi, alo, carryhi, carrylo: uint64 - xhi, xlo, yhi, ylo, zhi, zlo, rhi, rlo: uint64 - splitU64(b, bhi, blo) - splitU64(c, chi, clo) - splitU64(a, ahi, alo) - splitU64(carry, carryhi, carrylo) - splitU64(blo * clo + alo + carrylo, xhi, xlo) - splitU64(blo * chi, yhi, ylo) - splitU64(bhi * clo, zhi, zlo) - splitU64(xhi + ylo + zlo + ahi + carryhi, rhi, rlo) - carry = (bhi * chi) + rhi + yhi + zhi - result = combineU64(rlo, xlo) + var borrow: Borrow + staticFor i, 0, 4: + subB(borrow, a[i], a[i], b[i], borrow) +proc macDigit[N, N2: static int]( + acc: var array[N, uint64], pos: static int, b: array[N2, uint64], c: uint64) = if c == 0'u64: return - var carry = 0'u64 - for i in pos.. i: - result[i] = adc(result[i], c0[i], carry) - elif carry != 0'u64: - result[i] = adc(result[i], 0'u64, carry) + var carry: Carry + staticFor i, 0, len(result): + when len(c0) > i: + addC(carry, result[i], result[i], c0[i], carry) else: - break - doAssert(carry == 0'u64) + addC(carry, result[i], result[i], 0'u64, carry) + + doAssert(carry == 0) proc fromBytes*(dst: var BNU256, src: openArray[byte]): bool = ## Create 256bit integer from big-endian bytes representation ``src``. 
diff --git a/bncurve/fp.nim b/bncurve/fp.nim index a23bd57..5c7a26b 100644 --- a/bncurve/fp.nim +++ b/bncurve/fp.nim @@ -8,8 +8,6 @@ # those terms. import arith, options -{.deadCodeElim: on.} - template fieldImplementation(finame, fimodulus, firsquared, fircubed, fionep, fiinv: untyped): untyped {.dirty.} = type finame* = distinct BNU256 diff --git a/bncurve/fq12.nim b/bncurve/fq12.nim index 97756f3..d9e1cf2 100644 --- a/bncurve/fq12.nim +++ b/bncurve/fq12.nim @@ -9,8 +9,6 @@ import options import fq6, fq2, fp, arith -{.deadCodeElim: on.} - const frobeniusCoeffsC1: array[4, FQ2] = [ FQ2.one(), FQ2( diff --git a/bncurve/fq2.nim b/bncurve/fq2.nim index 7bad05f..80d57bd 100644 --- a/bncurve/fq2.nim +++ b/bncurve/fq2.nim @@ -9,8 +9,6 @@ import options import fp, arith -{.deadCodeElim: on.} - type FQ2* = object c0*: FQ diff --git a/bncurve/fq6.nim b/bncurve/fq6.nim index f74dc2f..bca4fcd 100644 --- a/bncurve/fq6.nim +++ b/bncurve/fq6.nim @@ -9,8 +9,6 @@ import options import fq2, fp, arith -{.deadCodeElim: on.} - const frobeniusCoeffsC1: array[4, FQ2] = [ FQ2.one(), FQ2( From dca7819f2b8428a22967ee3450d31bda2147f90c Mon Sep 17 00:00:00 2001 From: Jacek Sieka Date: Thu, 8 Aug 2024 17:18:21 +0200 Subject: [PATCH 2/6] add back 32-bit support --- bncurve/arith.nim | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bncurve/arith.nim b/bncurve/arith.nim index a8c582f..c5528ef 100644 --- a/bncurve/arith.nim +++ b/bncurve/arith.nim @@ -104,6 +104,13 @@ proc subNoBorrow(a: var BNU256, b: BNU256) {.inline.} = staticFor i, 0, 4: subB(borrow, a[i], a[i], b[i], borrow) +when sizeof(int) == 4: + import stint/private/primitives/compiletime_fallback + + # TODO a future intops library should expose this on 32-bit platforms too! 
+ proc muladd2(hi, lo: var uint64, a, b, c1, c2: uint64): uint64 = + muladd2_nim(hi, lo, a, b, c1, c2) + proc macDigit[N, N2: static int]( acc: var array[N, uint64], pos: static int, b: array[N2, uint64], c: uint64) = if c == 0'u64: From 8aadf13bb5023b16bb49812545eb83e99f7a0c68 Mon Sep 17 00:00:00 2001 From: Jacek Sieka Date: Thu, 8 Aug 2024 17:36:32 +0200 Subject: [PATCH 3/6] oops --- bncurve/arith.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bncurve/arith.nim b/bncurve/arith.nim index c5528ef..dbbca8d 100644 --- a/bncurve/arith.nim +++ b/bncurve/arith.nim @@ -108,7 +108,7 @@ when sizeof(int) == 4: import stint/private/primitives/compiletime_fallback # TODO a future intops library should expose this on 32-bit platforms too! - proc muladd2(hi, lo: var uint64, a, b, c1, c2: uint64): uint64 = + proc muladd2(hi, lo: var uint64, a, b, c1, c2: uint64) = muladd2_nim(hi, lo, a, b, c1, c2) proc macDigit[N, N2: static int]( From 5b38e791dc5de590a967cb6678c4de2365e8fada Mon Sep 17 00:00:00 2001 From: Jacek Sieka Date: Thu, 8 Aug 2024 18:07:16 +0200 Subject: [PATCH 4/6] more fallbacks --- bncurve/arith.nim | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/bncurve/arith.nim b/bncurve/arith.nim index dbbca8d..c591a8b 100644 --- a/bncurve/arith.nim +++ b/bncurve/arith.nim @@ -11,7 +11,20 @@ import nimcrypto/[utils, sysrand] export options # TODO replace private stint operations with an integer primitive library -import stint/private/primitives/[addcarry_subborrow, extended_precision] +when sizeof(int) == 4: + import stint/private/primitives/compiletime_fallback + + # TODO a future intops library should expose this on 32-bit platforms too! 
+ func addC*(cOut: var Carry, sum: var uint64, a, b: uint64, cIn: Carry) {.inline.} = + addC_nim(cOut, sum, a, b, cIn) + func subB*(bOut: var Borrow, diff: var uint64, a, b: uint64, bIn: Borrow) {.inline.} = + subB_nim(bOuit, diff, a, b, bIn) + proc muladd2(hi, lo: var uint64, a, b, c1, c2: uint64) = + muladd2_nim(hi, lo, a, b, c1, c2) + +else: + import stint/private/primitives/[addcarry_subborrow, extended_precision] + import stint/private/datatypes type @@ -104,13 +117,6 @@ proc subNoBorrow(a: var BNU256, b: BNU256) {.inline.} = staticFor i, 0, 4: subB(borrow, a[i], a[i], b[i], borrow) -when sizeof(int) == 4: - import stint/private/primitives/compiletime_fallback - - # TODO a future intops library should expose this on 32-bit platforms too! - proc muladd2(hi, lo: var uint64, a, b, c1, c2: uint64) = - muladd2_nim(hi, lo, a, b, c1, c2) - proc macDigit[N, N2: static int]( acc: var array[N, uint64], pos: static int, b: array[N2, uint64], c: uint64) = if c == 0'u64: From 3ffca4d248c122d7e722e4d65021a98ec057a14e Mon Sep 17 00:00:00 2001 From: Jacek Sieka Date: Thu, 8 Aug 2024 18:28:51 +0200 Subject: [PATCH 5/6] imports --- bncurve/arith.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bncurve/arith.nim b/bncurve/arith.nim index c591a8b..c084ed9 100644 --- a/bncurve/arith.nim +++ b/bncurve/arith.nim @@ -11,6 +11,8 @@ import nimcrypto/[utils, sysrand] export options # TODO replace private stint operations with an integer primitive library +import stint/private/datatypes + when sizeof(int) == 4: import stint/private/primitives/compiletime_fallback @@ -25,8 +27,6 @@ when sizeof(int) == 4: else: import stint/private/primitives/[addcarry_subborrow, extended_precision] -import stint/private/datatypes - type BNU256* = array[4, uint64] BNU512* = array[8, uint64] From 03c2569177d1c6c5085ea83038cf38f5a17414f9 Mon Sep 17 00:00:00 2001 From: Jacek Sieka Date: Thu, 8 Aug 2024 18:36:07 +0200 Subject: [PATCH 6/6] 
Fix `bOuit` typo in 32-bit `subB` fallback --- bncurve/arith.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bncurve/arith.nim b/bncurve/arith.nim index c084ed9..f87599b 100644 --- a/bncurve/arith.nim +++ b/bncurve/arith.nim @@ -20,7 +20,7 @@ when sizeof(int) == 4: func addC*(cOut: var Carry, sum: var uint64, a, b: uint64, cIn: Carry) {.inline.} = addC_nim(cOut, sum, a, b, cIn) func subB*(bOut: var Borrow, diff: var uint64, a, b: uint64, bIn: Borrow) {.inline.} = - subB_nim(bOuit, diff, a, b, bIn) + subB_nim(bOut, diff, a, b, bIn) proc muladd2(hi, lo: var uint64, a, b, c1, c2: uint64) = muladd2_nim(hi, lo, a, b, c1, c2)