From 2613356281ed33626bd9cc8b2cbaf68ae11a0867 Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy <mamy_github@numforge.co>
Date: Sun, 14 Jun 2020 15:39:06 +0200
Subject: [PATCH] Endomorphism acceleration for Scalar Multiplication (#44)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add MultiScalar recoding from "Efficient and Secure Algorithms for GLV-Based Scalar Multiplication" by Faz et al

* precompute cube root of unity - Add VM precomputation of Fp - workaround upstream bug https://github.com/nim-lang/Nim/issues/14585

* Add the φ-accelerated lookup table builder

* Add a dedicated bithacks file

* cosmetic import consistency

* Build the φ precompute table with n-1 EC additions instead of 2^(n-1) additions

* remove binary

* Add the GLV precomputations to the sage scripts

* You can't avoid it, bigint multiplication is needed at one point

* Add bigint multiplication discarding some low words

* Implement the lattice decomposition in sage

* Proper decomposition for BN254

* Prepare the code for a new scalar mul

* We compile, and now debugging hunt

* More helpers to debug GLV scalar Mul

* Fix conditional negation

* Endomorphism accelerated scalar mul working for BN254 curve

* Implement endomorphism acceleration for BLS12-381 (needed cofactor clearing of the point)

* fix nimble test script after bench rename
---
 ...ch_ec_swei_proj_g1.nim => bench_ec_g1.nim} |  12 +-
 benchmarks/bench_elliptic_template.nim        |  59 +-
 constantine.nimble                            |  40 +-
 constantine/arithmetic.nim                    |   2 -
 constantine/arithmetic/bigints.nim            | 130 ++--
 constantine/arithmetic/finite_fields.nim      |  17 +-
 .../arithmetic/finite_fields_inversion.nim    |   4 +-
 constantine/arithmetic/limbs.nim              | 146 ++++-
 constantine/arithmetic/limbs_montgomery.nim   |  49 +-
 constantine/config/curves.nim                 |  20 +-
 constantine/config/curves_declaration.nim     |   4 +
 constantine/config/curves_derived.nim         |  40 +-
 constantine/config/curves_parser.nim          |  25 +-
 .../precomputed.nim => config/precompute.nim} | 134 +++-
 constantine/config/type_bigint.nim            |  53 ++
 constantine/config/type_fp.nim                |  28 +
 .../elliptic/ec_endomorphism_accel.nim        | 600 ++++++++++++++++++
 .../elliptic/ec_endomorphism_params.nim       | 135 ++++
 constantine/elliptic/ec_scalar_mul.nim        | 229 +++++++
 .../elliptic/ec_weierstrass_projective.nim    | 215 +------
 constantine/io/io_bigints.nim                 |   3 +-
 constantine/io/io_ec.nim                      |  13 +-
 constantine/io/io_fields.nim                  |   2 +-
 constantine/primitives.nim                    |   6 +-
 constantine/primitives/bithacks.nim           |  77 +++
 constantine/primitives/constant_time.nim      |  56 +-
 constantine/primitives/extended_precision.nim |  16 +-
 .../research/addcarry_subborrow_compiler.nim  |   4 +-
 .../quadratic_extensions.nim                  |   1 -
 formal_verification/bls12_381_q_64.nim        |   2 +-
 helpers/static_for.nim                        |   2 +-
 sage/curve_family_bls12.sage                  |  43 ++
 sage/curve_family_bn.sage                     |  50 +-
 sage/lattice_decomposition_bls12_381_g1.sage  | 202 ++++++
 ...lattice_decomposition_bn254_snarks_g1.sage | 193 ++++++
 sage/testgen_bls12_381.sage                   |   8 +-
 sage/testgen_bn254_snarks.sage                |   1 +
 tests/test_bigints.nim                        | 127 +++-
 ...vs_gmp.nim => test_bigints_mod_vs_gmp.nim} |   4 +-
 tests/test_bigints_mul_high_words_vs_gmp.nim  | 151 +++++
 tests/test_bigints_mul_vs_gmp.nim             | 140 ++++
 tests/test_bigints_multimod.nim               |   2 +-
 tests/test_ec_bls12_381.nim                   |  93 +--
 tests/test_ec_bn254.nim                       |  13 +-
 tests/test_ec_weierstrass_projective_g1.nim   |  20 +-
 tests/test_finite_fields.nim                  |   4 +-
 tests/test_finite_fields_mulsquare.nim        |  15 +-
 tests/test_finite_fields_powinv.nim           |  17 +-
 tests/test_finite_fields_sqrt.nim             |  18 +-
 tests/test_finite_fields_vs_gmp.nim           |   2 +-
 tests/test_precomputed.nim                    |  37 ++
 51 files changed, 2760 insertions(+), 504 deletions(-)
 rename benchmarks/{bench_ec_swei_proj_g1.nim => bench_ec_g1.nim} (81%)
 rename constantine/{arithmetic/precomputed.nim => config/precompute.nim} (78%)
 create mode 100644 constantine/config/type_bigint.nim
 create mode 100644 constantine/config/type_fp.nim
 create mode 100644 constantine/elliptic/ec_endomorphism_accel.nim
 create mode 100644 constantine/elliptic/ec_endomorphism_params.nim
 create mode 100644 constantine/elliptic/ec_scalar_mul.nim
 create mode 100644 constantine/primitives/bithacks.nim
 create mode 100644 sage/lattice_decomposition_bls12_381_g1.sage
 create mode 100644 sage/lattice_decomposition_bn254_snarks_g1.sage
 rename tests/{test_bigints_vs_gmp.nim => test_bigints_mod_vs_gmp.nim} (98%)
 create mode 100644 tests/test_bigints_mul_high_words_vs_gmp.nim
 create mode 100644 tests/test_bigints_mul_vs_gmp.nim
 create mode 100644 tests/test_precomputed.nim

diff --git a/benchmarks/bench_ec_swei_proj_g1.nim b/benchmarks/bench_ec_g1.nim
similarity index 81%
rename from benchmarks/bench_ec_swei_proj_g1.nim
rename to benchmarks/bench_ec_g1.nim
index 5d5c3d5..175e226 100644
--- a/benchmarks/bench_ec_swei_proj_g1.nim
+++ b/benchmarks/bench_ec_g1.nim
@@ -51,14 +51,16 @@ proc main() =
     separator()
     doublingBench(ECP_SWei_Proj[Fp[curve]], Iters)
     separator()
-    scalarMulBench(ECP_SWei_Proj[Fp[curve]], scratchSpaceSize = 1 shl 2, MulIters)
+    scalarMulUnsafeDoubleAddBench(ECP_SWei_Proj[Fp[curve]], MulIters)
     separator()
-    scalarMulBench(ECP_SWei_Proj[Fp[curve]], scratchSpaceSize = 1 shl 3, MulIters)
+    scalarMulGenericBench(ECP_SWei_Proj[Fp[curve]], scratchSpaceSize = 1 shl 2, MulIters)
     separator()
-    scalarMulBench(ECP_SWei_Proj[Fp[curve]], scratchSpaceSize = 1 shl 4, MulIters)
+    scalarMulGenericBench(ECP_SWei_Proj[Fp[curve]], scratchSpaceSize = 1 shl 3, MulIters)
+    separator()
+    scalarMulGenericBench(ECP_SWei_Proj[Fp[curve]], scratchSpaceSize = 1 shl 4, MulIters)
+    separator()
+    scalarMulGLV(ECP_SWei_Proj[Fp[curve]], MulIters)
     separator()
-    # scalarMulUnsafeDoubleAddBench(ECP_SWei_Proj[Fp[curve]], MulIters)
-    # separator()
   separator()
 
 main()
diff --git a/benchmarks/bench_elliptic_template.nim b/benchmarks/bench_elliptic_template.nim
index d1ccf73..a654b5b 100644
--- a/benchmarks/bench_elliptic_template.nim
+++ b/benchmarks/bench_elliptic_template.nim
@@ -17,11 +17,14 @@ import
   ../constantine/config/curves,
   ../constantine/arithmetic,
   ../constantine/io/io_bigints,
+  ../constantine/elliptic/[ec_weierstrass_projective, ec_scalar_mul, ec_endomorphism_accel],
   # Helpers
   ../helpers/[prng_unsafe, static_for],
   ./platforms,
   # Standard library
-  std/[monotimes, times, strformat, strutils, macros]
+  std/[monotimes, times, strformat, strutils, macros],
+  # Reference unsafe scalar multiplication
+  ../tests/support/ec_reference_scalar_mult
 
 var rng: RngState
 let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
@@ -71,15 +74,15 @@ when SupportsGetTicks:
 echo "\n=================================================================================================================\n"
 
 proc separator*() =
-  echo "-".repeat(157)
+  echo "-".repeat(177)
 
 proc report(op, elliptic: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
   let ns = inNanoseconds((stop-start) div iters)
   let throughput = 1e9 / float64(ns)
   when SupportsGetTicks:
-    echo &"{op:<40} {elliptic:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
+    echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
   else:
-    echo &"{op:<40} {elliptic:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
+    echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
 
 macro fixEllipticDisplay(T: typedesc): untyped =
   # At compile-time, enums are integers and their display is buggy
@@ -120,11 +123,11 @@ proc doublingBench*(T: typedesc, iters: int) =
   bench("EC Double G1", T, iters):
     r.double(P)
 
-proc scalarMulBench*(T: typedesc, scratchSpaceSize: static int, iters: int) =
+proc scalarMulGenericBench*(T: typedesc, scratchSpaceSize: static int, iters: int) =
   const bits = T.F.C.getCurveOrderBitwidth()
 
   var r {.noInit.}: T
-  let P = rng.random_unsafe(T)
+  let P = rng.random_unsafe(T) # TODO: clear cofactor
 
   let exponent = rng.random_unsafe(BigInt[bits])
   var exponentCanonical{.noInit.}: array[(bits+7) div 8, byte]
@@ -132,22 +135,32 @@ proc scalarMulBench*(T: typedesc, scratchSpaceSize: static int, iters: int) =
 
   var scratchSpace{.noInit.}: array[scratchSpaceSize, T]
 
-  bench("EC ScalarMul G1 (scratchsize = " & $scratchSpaceSize & ')', T, iters):
+  bench("EC ScalarMul Generic G1 (scratchsize = " & $scratchSpaceSize & ')', T, iters):
     r = P
-    r.scalarMul(exponentCanonical, scratchSpace)
+    r.scalarMulGeneric(exponentCanonical, scratchSpace)
 
-# import ../tests/support/ec_reference_scalar_mult
-#
-# proc scalarMulUnsafeDoubleAddBench*(T: typedesc, iters: int) =
-#   const bits = T.F.C.getCurveOrderBitwidth()
-#
-#   var r {.noInit.}: T
-#   let P = rng.random_unsafe(T)
-#
-#   let exponent = rng.random_unsafe(BigInt[bits])
-#   var exponentCanonical{.noInit.}: array[(bits+7) div 8, byte]
-#   exponentCanonical.exportRawUint(exponent, bigEndian)
-#
-#   bench("EC ScalarMul G1 (unsafe DoubleAdd)", T, iters):
-#     r = P
-#     r.unsafe_ECmul_double_add(exponentCanonical)
+proc scalarMulGLV*(T: typedesc, iters: int) =
+  const bits = T.F.C.getCurveOrderBitwidth()
+
+  var r {.noInit.}: T
+  let P = rng.random_unsafe(T) # TODO: clear cofactor
+
+  let exponent = rng.random_unsafe(BigInt[bits])
+
+  bench("EC ScalarMul G1 (GLV endomorphism accelerated)", T, iters):
+    r = P
+    r.scalarMulGLV(exponent)
+
+proc scalarMulUnsafeDoubleAddBench*(T: typedesc, iters: int) =
+  const bits = T.F.C.getCurveOrderBitwidth()
+
+  var r {.noInit.}: T
+  let P = rng.random_unsafe(T) # TODO: clear cofactor
+
+  let exponent = rng.random_unsafe(BigInt[bits])
+  var exponentCanonical{.noInit.}: array[(bits+7) div 8, byte]
+  exponentCanonical.exportRawUint(exponent, bigEndian)
+
+  bench("EC ScalarMul G1 (unsafe reference DoubleAdd)", T, iters):
+    r = P
+    r.unsafe_ECmul_double_add(exponentCanonical)
diff --git a/constantine.nimble b/constantine.nimble
index e718117..d21c12f 100644
--- a/constantine.nimble
+++ b/constantine.nimble
@@ -59,7 +59,7 @@ task test, "Run all tests":
   test "", "tests/test_bigints.nim"
   test "", "tests/test_bigints_multimod.nim"
 
-  test "", "tests/test_bigints_vs_gmp.nim"
+  test "", "tests/test_bigints_mod_vs_gmp.nim"
 
   # Field
   test "", "tests/test_io_fields"
@@ -70,6 +70,9 @@ task test, "Run all tests":
 
   test "", "tests/test_finite_fields_vs_gmp.nim"
 
+  # Precompute
+  test "", "tests/test_precomputed"
+
   # Towers of extension fields
   test "", "tests/test_fp2.nim"
   test "", "tests/test_fp6.nim"
@@ -89,7 +92,7 @@ task test, "Run all tests":
     test "-d:Constantine32", "tests/test_bigints.nim"
     test "-d:Constantine32", "tests/test_bigints_multimod.nim"
 
-    test "-d:Constantine32", "tests/test_bigints_vs_gmp.nim"
+    test "-d:Constantine32", "tests/test_bigints_mod_vs_gmp.nim"
 
     # Field
     test "-d:Constantine32", "tests/test_io_fields"
@@ -100,6 +103,9 @@ task test, "Run all tests":
 
     test "-d:Constantine32", "tests/test_finite_fields_vs_gmp.nim"
 
+    # Precompute
+    test "-d:Constantine32", "tests/test_precomputed"
+
     # Towers of extension fields
     test "-d:Constantine32", "tests/test_fp2.nim"
     test "-d:Constantine32", "tests/test_fp6.nim"
@@ -118,7 +124,7 @@ task test, "Run all tests":
     runBench("bench_fp2")
     runBench("bench_fp6")
     runBench("bench_fp12")
-    runBench("bench_ec_swei_proj_g1")
+    runBench("bench_ec_g1")
 
 task test_no_gmp, "Run tests that don't require GMP":
   # -d:testingCurves is configured in a *.nim.cfg for convenience
@@ -138,6 +144,9 @@ task test_no_gmp, "Run tests that don't require GMP":
   test "", "tests/test_finite_fields_sqrt.nim"
   test "", "tests/test_finite_fields_powinv.nim"
 
+  # Precompute
+  test "", "tests/test_precomputed"
+
   # Towers of extension fields
   test "", "tests/test_fp2.nim"
   test "", "tests/test_fp6.nim"
@@ -164,6 +173,9 @@ task test_no_gmp, "Run tests that don't require GMP":
     test "-d:Constantine32", "tests/test_finite_fields_sqrt.nim"
     test "-d:Constantine32", "tests/test_finite_fields_powinv.nim"
 
+    # Precompute
+    test "-d:Constantine32", "tests/test_precomputed"
+
     # Towers of extension fields
     test "-d:Constantine32", "tests/test_fp2.nim"
     test "-d:Constantine32", "tests/test_fp6.nim"
@@ -182,7 +194,7 @@ task test_no_gmp, "Run tests that don't require GMP":
     runBench("bench_fp2")
     runBench("bench_fp6")
     runBench("bench_fp12")
-    runBench("bench_ec_swei_proj_g1")
+    runBench("bench_ec_g1")
 
 task test_parallel, "Run all tests in parallel (via GNU parallel)":
   # -d:testingCurves is configured in a *.nim.cfg for convenience
@@ -197,7 +209,8 @@ task test_parallel, "Run all tests in parallel (via GNU parallel)":
   test "", "tests/test_bigints.nim", cmdFile
   test "", "tests/test_bigints_multimod.nim", cmdFile
 
-  test "", "tests/test_bigints_vs_gmp.nim", cmdFile
+  test "", "tests/test_bigints_mul_vs_gmp.nim", cmdFile
+  test "", "tests/test_bigints_mod_vs_gmp.nim", cmdFile
 
   # Field
   test "", "tests/test_io_fields", cmdFile
@@ -233,7 +246,8 @@ task test_parallel, "Run all tests in parallel (via GNU parallel)":
     test "-d:Constantine32", "tests/test_bigints.nim", cmdFile
     test "-d:Constantine32", "tests/test_bigints_multimod.nim", cmdFile
 
-    test "-d:Constantine32", "tests/test_bigints_vs_gmp.nim", cmdFile
+    test "-d:Constantine32", "tests/test_bigints_mul_vs_gmp.nim", cmdFile
+    test "-d:Constantine32", "tests/test_bigints_mod_vs_gmp.nim", cmdFile
 
     # Field
     test "-d:Constantine32", "tests/test_io_fields", cmdFile
@@ -268,7 +282,7 @@ task test_parallel, "Run all tests in parallel (via GNU parallel)":
     runBench("bench_fp2")
     runBench("bench_fp6")
     runBench("bench_fp12")
-    runBench("bench_ec_swei_proj_g1")
+    runBench("bench_ec_g1")
 
 task bench_fp, "Run benchmark 𝔽p with your default compiler":
   runBench("bench_fp")
@@ -306,11 +320,11 @@ task bench_fp12_gcc, "Run benchmark 𝔽p12 with gcc":
 task bench_fp12_clang, "Run benchmark 𝔽p12 with clang":
   runBench("bench_fp12", "clang")
 
-task bench_ec_swei_proj_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC":
-  runBench("bench_ec_swei_proj_g1")
+task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC":
+  runBench("bench_ec_g1")
 
-task bench_ec_swei_proj_g1_gcc, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC":
-  runBench("bench_ec_swei_proj_g1", "gcc")
+task bench_ec_gcc, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC":
+  runBench("bench_ec_g1", "gcc")
 
-task bench_ec_swei_proj_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang":
-  runBench("bench_ec_swei_proj_g1", "clang")
+task bench_ec_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang":
+  runBench("bench_ec_g1", "clang")
diff --git a/constantine/arithmetic.nim b/constantine/arithmetic.nim
index 7d2c6ca..5a85716 100644
--- a/constantine/arithmetic.nim
+++ b/constantine/arithmetic.nim
@@ -7,11 +7,9 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
 import
-  arithmetic/[limbs, limbs_modular, limbs_montgomery],
   arithmetic/bigints,
   arithmetic/[finite_fields, finite_fields_inversion]
 
 export
-  limbs, limbs_modular, limbs_montgomery,
   bigints,
   finite_fields, finite_fields_inversion
diff --git a/constantine/arithmetic/bigints.nim b/constantine/arithmetic/bigints.nim
index 26280d0..9b682ea 100644
--- a/constantine/arithmetic/bigints.nim
+++ b/constantine/arithmetic/bigints.nim
@@ -7,10 +7,12 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
 import
-  ../config/common,
+  ../config/[common, type_bigint],
   ../primitives,
   ./limbs, ./limbs_montgomery, ./limbs_modular
 
+export BigInt
+
 # ############################################################
 #
 #                        BigInts
@@ -55,41 +57,6 @@ import
 # having redundant representations that forces costly reductions before multiplications.
 # https://github.com/mratsim/constantine/issues/15
 
-func wordsRequired(bits: int): int {.compileTime.} =
-  ## Compute the number of limbs required
-  # from the **announced** bit length
-  (bits + WordBitWidth - 1) div WordBitWidth
-
-type
-  BigInt*[bits: static int] = object
-    ## Fixed-precision big integer
-    ##
-    ## - "bits" is the announced bit-length of the BigInt
-    ##   This is public data, usually equal to the curve prime bitlength.
-    ##
-    ## - "limbs" is an internal field that holds the internal representation
-    ##   of the big integer. Least-significant limb first. Within limbs words are native-endian.
-    ##
-    ## This internal representation can be changed
-    ## without notice and should not be used by external applications or libraries.
-    limbs*: array[bits.wordsRequired, SecretWord]
-
-# For unknown reason, `bits` doesn't semcheck if
-#   `limbs: Limbs[bits.wordsRequired]`
-# with
-#   `Limbs[N: static int] = distinct array[N, SecretWord]`
-# so we don't set Limbs as a distinct type
-
-debug:
-  import strutils
-
-  func `$`*(a: BigInt): string =
-    result = "BigInt["
-    result.add $BigInt.bits
-    result.add "](limbs: "
-    result.add a.limbs.toString()
-    result.add ")"
-
 # No exceptions allowed
 {.push raises: [].}
 {.push inline.}
@@ -123,6 +90,14 @@ func cswap*(a, b: var BigInt, ctl: CTBool) =
   ## memory accesses are done (unless the compiler tries to be clever)
   cswap(a.limbs, b.limbs, ctl)
 
+func copyTruncatedFrom*[dBits, sBits: static int](dst: var BigInt[dBits], src: BigInt[sBits]) =
+  ## Copy `src` into `dst`
+  ## if `dst` is not big enough, only the low words are copied
+  ## if `src` is smaller than `dst` the higher words of `dst` will NOT be overwritten
+
+  for wordIdx in 0 ..< min(dst.limbs.len, src.limbs.len):
+    dst.limbs[wordIdx] = src.limbs[wordIdx]
+
 # Comparison
 # ------------------------------------------------------------
 
@@ -161,11 +136,17 @@ func cadd*(a: var BigInt, b: BigInt, ctl: SecretBool): SecretBool =
   (SecretBool) cadd(a.limbs, b.limbs, ctl)
 
 func csub*(a: var BigInt, b: BigInt, ctl: SecretBool): SecretBool =
-  ## Constant-time in-place conditional addition
-  ## The addition is only performed if ctl is "true"
-  ## The result carry is always computed.
+  ## Constant-time in-place conditional substraction
+  ## The substraction is only performed if ctl is "true"
+  ## The result borrow is always computed.
   (SecretBool) csub(a.limbs, b.limbs, ctl)
 
+func csub*(a: var BigInt, b: SecretWord, ctl: SecretBool): SecretBool =
+  ## Constant-time in-place conditional substraction
+  ## The substraction is only performed if ctl is "true"
+  ## The result borrow is always computed.
+  (SecretBool) csub(a.limbs, b, ctl)
+
 func cdouble*(a: var BigInt, ctl: SecretBool): SecretBool =
   ## Constant-time in-place conditional doubling
   ## The doubling is only performed if ctl is "true"
@@ -182,11 +163,36 @@ func add*(a: var BigInt, b: SecretWord): SecretBool =
   ## Returns the carry
   (SecretBool) add(a.limbs, b)
 
+func `+=`*(a: var BigInt, b: BigInt) =
+  ## Constant-time in-place addition
+  ## Discards the carry
+  discard add(a.limbs, b.limbs)
+
+func `+=`*(a: var BigInt, b: SecretWord) =
+  ## Constant-time in-place addition
+  ## Discards the carry
+  discard add(a.limbs, b)
+
 func sub*(a: var BigInt, b: BigInt): SecretBool =
   ## Constant-time in-place substraction
   ## Returns the borrow
   (SecretBool) sub(a.limbs, b.limbs)
 
+func sub*(a: var BigInt, b: SecretWord): SecretBool =
+  ## Constant-time in-place substraction
+  ## Returns the borrow
+  (SecretBool) sub(a.limbs, b)
+
+func `-=`*(a: var BigInt, b: BigInt) =
+  ## Constant-time in-place substraction
+  ## Discards the borrow
+  discard sub(a.limbs, b.limbs)
+
+func `-=`*(a: var BigInt, b: SecretWord) =
+  ## Constant-time in-place substraction
+  ## Discards the borrow
+  discard sub(a.limbs, b)
+
 func double*(a: var BigInt): SecretBool =
   ## Constant-time in-place doubling
   ## Returns the carry
@@ -222,6 +228,34 @@ func cneg*(a: var BigInt, ctl: CTBool) =
   ## Negate if ``ctl`` is true
   a.limbs.cneg(ctl)
 
+func prod*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits], b: BigInt[bBits]) =
+  ## Multi-precision multiplication
+  ## r <- a*b
+  ## `a`, `b`, `r` can have different sizes
+  ## if r.bits < a.bits + b.bits
+  ## the multiplication will overflow.
+  ## It will be truncated if it cannot fit in r limbs.
+  r.limbs.prod(a.limbs, b.limbs)
+
+func prod_high_words*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits], b: BigInt[bBits], lowestWordIndex: static int) =
+  ## Multi-precision multiplication keeping only high words
+  ## r <- a*b >> (2^WordBitWidth)^lowestWordIndex
+  ##
+  ## `a`, `b`, `r` can have a different number of limbs
+  ## if `r`.limbs.len < a.limbs.len + b.limbs.len - lowestWordIndex
+  ## The result will be truncated, i.e. it will be
+  ## a * b >> (2^WordBitWidth)^lowestWordIndex (mod (2^WordBitwidth)^r.limbs.len)
+  ##
+  # This is useful for
+  # - Barret reduction
+  # - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
+  #   with K and C known at compile-time.
+  #   We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
+  #   Precompute P = K*M/C at compile-time
+  #   and at runtime do P*a/M <=> P*a >> WordBitWidth*w
+  #   i.e. prod_high_words(result, P, a, w)
+  r.limbs.prod_high_words(a.limbs, b.limbs, lowestWordIndex)
+
 # Bit Manipulation
 # ------------------------------------------------------------
 
@@ -231,6 +265,24 @@ func shiftRight*(a: var BigInt, k: int) =
   ## k MUST be less than the base word size (2^31 or 2^63)
   a.limbs.shiftRight(k)
 
+func bit*[bits: static int](a: BigInt[bits], index: int): Ct[uint8] =
+  ## Access an individual bit of `a`
+  ## Bits are accessed as-if the bit representation is bigEndian
+  ## for a 8-bit "big-integer" we have
+  ## (b7, b6, b5, b4, b3, b2, b1, b0)
+  ## for a 256-bit big-integer
+  ## (b255, b254, ..., b1, b0)
+  const SlotShift = log2(WordBitWidth.uint32)
+  const SelectMask = WordBitWidth - 1
+  const BitMask = SecretWord 1
+
+  let slot = a.limbs[index shr SlotShift] # LimbEndianness is littleEndian
+  result = ct(slot shr (index and SelectMask) and BitMask, uint8)
+
+func bit0*(a: BigInt): Ct[uint8] =
+  ## Access the least significant bit
+  ct(a.limbs[0] and SecretWord(1), uint8)
+
 # ############################################################
 #
 #                   Modular BigInt
diff --git a/constantine/arithmetic/finite_fields.nim b/constantine/arithmetic/finite_fields.nim
index e46bcc2..ecab650 100644
--- a/constantine/arithmetic/finite_fields.nim
+++ b/constantine/arithmetic/finite_fields.nim
@@ -26,23 +26,10 @@
 
 import
   ../primitives,
-  ../config/[common, curves],
+  ../config/[common, type_fp, curves],
   ./bigints, ./limbs_montgomery
 
-type
-  Fp*[C: static Curve] = object
-    ## All operations on a field are modulo P
-    ## P being the prime modulus of the Curve C
-    ## Internally, data is stored in Montgomery n-residue form
-    ## with the magic constant chosen for convenient division (a power of 2 depending on P bitsize)
-    mres*: matchingBigInt(C)
-
-debug:
-  func `$`*[C: static Curve](a: Fp[C]): string =
-    result = "Fp[" & $C
-    result.add "]("
-    result.add $a.mres
-    result.add ')'
+export Fp
 
 # No exceptions allowed
 {.push raises: [].}
diff --git a/constantine/arithmetic/finite_fields_inversion.nim b/constantine/arithmetic/finite_fields_inversion.nim
index 661a884..a59706b 100644
--- a/constantine/arithmetic/finite_fields_inversion.nim
+++ b/constantine/arithmetic/finite_fields_inversion.nim
@@ -24,7 +24,7 @@ template repeat(num: int, body: untyped) =
 
 # Secp256k1
 # ------------------------------------------------------------
-func invmod_addchain(r: var Fp[Secp256k1], a: Fp[Secp256k1]) =
+func invmod_addchain(r: var Fp[Secp256k1], a: Fp[Secp256k1]) {.used.}=
   ## We invert via Little Fermat's theorem
   ## a^(-1) ≡ a^(p-2) (mod p)
   ## with p = "0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F"
@@ -120,7 +120,7 @@ func invmod_addchain(r: var Fp[Secp256k1], a: Fp[Secp256k1]) =
 # Note: it only works for u positive, in particular BN254 doesn't work :/
 #       Is there a way to only use a^-u or even powers?
 
-func invmod_addchain_bn[C](r: var Fp[C], a: Fp[C]) =
+func invmod_addchain_bn[C](r: var Fp[C], a: Fp[C]) {.used.}=
   ## Inversion on BN prime fields with positive base parameter `u`
   ## via Little Fermat theorem and leveraging the prime low Hamming weight
   ##
diff --git a/constantine/arithmetic/limbs.nim b/constantine/arithmetic/limbs.nim
index 53a36b1..a25822c 100644
--- a/constantine/arithmetic/limbs.nim
+++ b/constantine/arithmetic/limbs.nim
@@ -8,7 +8,8 @@
 
 import
   ../config/common,
-  ../primitives
+  ../primitives,
+  ../../helpers/static_for
 
 # ############################################################
 #
@@ -43,21 +44,6 @@ type Limbs*[N: static int] = array[N, SecretWord]
   ##
   ## but for unknown reason, it prevents semchecking `bits`
 
-debug:
-  import strutils
-
-  func toString*(a: Limbs): string =
-    result = "["
-    result.add " 0x" & toHex(BaseType(a[0]))
-    for i in 1 ..< a.len:
-      result.add ", 0x" & toHex(BaseType(a[i]))
-    result.add "])"
-
-  func toHex*(a: Limbs): string =
-    result = "0x"
-    for i in countdown(a.len-1, 0):
-      result.add toHex(BaseType(a[i]))
-
 # No exceptions allowed
 {.push raises: [].}
 
@@ -180,7 +166,28 @@ func isOdd*(a: Limbs): SecretBool =
   ## Returns true if a is odd
   SecretBool(a[0] and SecretWord(1))
 
-# Arithmetic
+# Bit manipulation
+# ------------------------------------------------------------
+
+func shiftRight*(a: var Limbs, k: int) {.inline.}=
+  ## Shift right by k.
+  ##
+  ## k MUST be less than the base word size (2^32 or 2^64)
+  # We don't reuse shr as this is an in-place operation
+  # Do we need to return the shifted out part?
+  #
+  # Note: for speed, loading a[i] and a[i+1]
+  #       instead of a[i-1] and a[i]
+  #       is probably easier to parallelize for the compiler
+  #       (antidependence WAR vs loop-carried dependence RAW)
+
+  # checkWordShift(k)
+
+  for i in 0 ..< a.len-1:
+    a[i] = (a[i] shr k) or (a[i+1] shl (WordBitWidth - k))
+  a[a.len-1] = a[a.len-1] shr k
+
+# Basic Arithmetic
 # ------------------------------------------------------------
 
 func add*(a: var Limbs, b: Limbs): Carry =
@@ -229,6 +236,14 @@ func sub*(a: var Limbs, b: Limbs): Borrow =
   for i in 0 ..< a.len:
     subB(result, a[i], a[i], b[i], result)
 
+func sub*(a: var Limbs, w: SecretWord): Borrow =
+  ## Limbs substraction, sub a number that fits in a word
+  ## Returns the borrow
+  result = Borrow(0)
+  subB(result, a[0], a[0], w, result)
+  for i in 1 ..< a.len:
+    subB(result, a[i], a[i], Zero, result)
+
 func csub*(a: var Limbs, b: Limbs, ctl: SecretBool): Borrow =
   ## Limbs conditional substraction
   ## Returns the borrow
@@ -244,6 +259,17 @@ func csub*(a: var Limbs, b: Limbs, ctl: SecretBool): Borrow =
     subB(result, diff, a[i], b[i], result)
     ctl.ccopy(a[i], diff)
 
+func csub*(a: var Limbs, w: SecretWord, ctl: SecretBool): Borrow =
+  ## Limbs conditional substraction, sub a number that fits in a word
+  ## Returns the borrow
+  result = Carry(0)
+  var diff: SecretWord
+  subB(result, diff, a[0], w, result)
+  ctl.ccopy(a[0], diff)
+  for i in 1 ..< a.len:
+    subB(result, diff, a[i], Zero, result)
+    ctl.ccopy(a[i], diff)
+
 func diff*(r: var Limbs, a, b: Limbs): Borrow =
   ## Diff `a` and `b` into `r`
   ## `r` is initialized/overwritten
@@ -266,33 +292,85 @@ func cneg*(a: var Limbs, ctl: CTBool) =
   # So we need to xor all words and then add 1
   # The "+1" might carry
   # So we fuse the 2 steps
-  let mask = -SecretWord(ctl)              # Obtain a 0xFF... or 0x00... mask
+  let mask = -SecretWord(ctl)        # Obtain a 0xFF... or 0x00... mask
   var carry = SecretWord(ctl)
   for i in 0 ..< a.len:
     let t = (a[i] xor mask) + carry  # XOR with mask and add 0x01 or 0x00 respectively
-    carry = SecretWord(t < carry)          # Carry on
+    carry = SecretWord(t < carry)    # Carry on
     a[i] = t
 
-# Bit manipulation
+{.pop.} # inline
+
+# Multiplication
 # ------------------------------------------------------------
 
-func shiftRight*(a: var Limbs, k: int) =
-  ## Shift right by k.
+func prod*[rLen, aLen, bLen](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
+  ## Multi-precision multiplication
+  ## r <- a*b
   ##
-  ## k MUST be less than the base word size (2^32 or 2^64)
-  # We don't reuse shr as this is an in-place operation
-  # Do we need to return the shifted out part?
+  ## `a`, `b`, `r` can have a different number of limbs
+  ## if `r`.limbs.len < a.limbs.len + b.limbs.len
+  ## The result will be truncated, i.e. it will be
+  ## a * b (mod (2^WordBitwidth)^r.limbs.len)
+
+  # We use Product Scanning / Comba multiplication
+  var t, u, v = SecretWord(0)
+  var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems
+
+  staticFor i, 0, min(a.len+b.len, r.len):
+    const ib = min(b.len-1, i)
+    const ia = i - ib
+    staticFor j, 0, min(a.len - ia, ib+1):
+      mulAcc(t, u, v, a[ia+j], b[ib-j])
+
+    z[i] = v
+    v = u
+    u = t
+    t = SecretWord(0)
+
+  r = z
+
+func prod_high_words*[rLen, aLen, bLen](
+       r: var Limbs[rLen],
+       a: Limbs[aLen], b: Limbs[bLen],
+       lowestWordIndex: static int) =
+  ## Multi-precision multiplication keeping only high words
+  ## r <- a*b >> (2^WordBitWidth)^lowestWordIndex
+  ##
+  ## `a`, `b`, `r` can have a different number of limbs
+  ## if `r`.limbs.len < a.limbs.len + b.limbs.len - lowestWordIndex
+  ## The result will be truncated, i.e. it will be
+  ## a * b >> (2^WordBitWidth)^lowestWordIndex (mod (2^WordBitwidth)^r.limbs.len)
   #
-  # Note: for speed, loading a[i] and a[i+1]
-  #       instead of a[i-1] and a[i]
-  #       is probably easier to parallelize for the compiler
-  #       (antidependence WAR vs loop-carried dependence RAW)
+  # This is useful for
+  # - Barret reduction
+  # - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
+  #   with K and C known at compile-time.
+  #   We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
+  #   Precompute P = K*M/C at compile-time
+  #   and at runtime do P*a/M <=> P*a >> (WordBitWidth*w)
+  #   i.e. prod_high_words(result, P, a, w)
 
-  # checkWordShift(k)
+  # We use Product Scanning / Comba multiplication
+  var t, u, v = SecretWord(0) # Will raise warning on empty iterations
+  var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems
 
-  for i in 0 ..< a.len-1:
-    a[i] = (a[i] shr k) or (a[i+1] shl (WordBitWidth - k))
-  a[a.len-1] = a[a.len-1] shr k
+  # The previous 2 columns can affect the lowest word due to carries
+  # but not the ones before (we accumulate in 3 words (t, u, v))
+  const w = lowestWordIndex - 2
+
+  staticFor i, max(0, w), min(a.len+b.len, r.len+lowestWordIndex):
+    const ib = min(b.len-1, i)
+    const ia = i - ib
+    staticFor j, 0, min(a.len - ia, ib+1):
+      mulAcc(t, u, v, a[ia+j], b[ib-j])
+
+    when i >= lowestWordIndex:
+      z[i-lowestWordIndex] = v
+    v = u
+    u = t
+    t = SecretWord(0)
+
+  r = z
 
-{.pop.} # inline
 {.pop.} # raises no exceptions
diff --git a/constantine/arithmetic/limbs_montgomery.nim b/constantine/arithmetic/limbs_montgomery.nim
index 683d065..76edde4 100644
--- a/constantine/arithmetic/limbs_montgomery.nim
+++ b/constantine/arithmetic/limbs_montgomery.nim
@@ -7,10 +7,12 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
 import
+  # Stadard library
+  std/macros,
+  # Internal
   ../config/common,
   ../primitives,
-  ./limbs,
-  macros
+  ./limbs
 
 # ############################################################
 #
@@ -118,7 +120,7 @@ func montyMul_CIOS_nocarry(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
   discard t.csub(M, not(t < M))
   r = t
 
-func montyMul_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
+func montyMul_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =
   ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
   # - Analyzing and Comparing Montgomery Multiplication Algorithms
   #   Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
@@ -166,6 +168,43 @@ func montyMul_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
   discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)^w
   r = t
 
+func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
+  ## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS)
+  # - Architectural Enhancements for Montgomery
+  #   Multiplication on Embedded RISC Processors
+  #   Johann Großschädl and Guy-Armand Kamendje, 2003
+  #   https://pure.tugraz.at/ws/portalfiles/portal/2887154/ACNS2003_AEM.pdf
+  #
+  # - New Speed Records for Montgomery Modular
+  #   Multiplication on 8-bit AVR Microcontrollers
+  #   Zhe Liu and Johann Großschädl, 2013
+  #   https://eprint.iacr.org/2013/882.pdf
+  var z: typeof(r) # zero-init, ensure on stack and removes in-place problems in tower fields
+  const L = r.len
+  var t, u, v = SecretWord(0)
+
+  staticFor i, 0, L:
+    staticFor j, 0, i:
+      mulAcc(t, u, v, a[j], b[i-j])
+      mulAcc(t, u, v, z[j], M[i-j])
+    mulAcc(t, u, v, a[i], b[0])
+    z[i] = v * SecretWord(m0ninv)
+    mulAcc(t, u, v, z[i], M[0])
+    v = u
+    u = t
+    t = SecretWord(0)
+  staticFor i, L, 2*L:
+    staticFor j, i-L+1, L:
+      mulAcc(t, u, v, a[j], b[i-j])
+      mulAcc(t, u, v, z[j], M[i-j])
+    z[i-L] = v
+    v = u
+    u = t
+    t = SecretWord(0)
+
+  discard z.csub(M, v.isNonZero() or not(z < M))
+  r = z
+
 func montySquare_CIOS_nocarry(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
   ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
   ## and no-carry optimization.
@@ -253,7 +292,7 @@ func montySquare_CIOS(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
     #  (_, t[N])  <- t[N+1] + C
     var carryR: Carry
     addC(carryR, t[N-1], tN, C, Carry(0))
-    addC(carryR, tN, SecretWord(tNp1), Zero, carryR)
+    addC(carryR, tN, tNp1, Zero, carryR)
 
   discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)^w
   r = t
@@ -293,7 +332,7 @@ func montyMul*(
   when canUseNoCarryMontyMul:
     montyMul_CIOS_nocarry(r, a, b, M, m0ninv)
   else:
-    montyMul_CIOS(r, a, b, M, m0ninv)
+    montyMul_FIPS(r, a, b, M, m0ninv)
 
 func montySquare*(r: var Limbs, a, M: Limbs,
                   m0ninv: static BaseType, canUseNoCarryMontySquare: static bool) =
diff --git a/constantine/config/curves.nim b/constantine/config/curves.nim
index f5896c1..cc6ecdb 100644
--- a/constantine/config/curves.nim
+++ b/constantine/config/curves.nim
@@ -8,10 +8,10 @@
 
 import
   # Standard library
-  macros,
+  std/macros,
   # Internal
-  ./curves_declaration, ./curves_derived, ./curves_parser,
-  ../arithmetic/bigints
+  ./type_bigint, ./common,
+  ./curves_declaration, ./curves_derived, ./curves_parser
 
 export CurveFamily, Curve, SexticTwist
 
@@ -170,6 +170,16 @@ macro getBN_param_6u_minus_1_BE*(C: static Curve): untyped =
   ## of a BN curve in canonical big-endian representation
   result = bindSym($C & "_BN_6u_minus_1_BE")
 
+# Endomorphism
+# -------------------------------------------------------
+macro getCubicRootOfUnity_mod_p*(C: static Curve): untyped =
+  ## Get a non-trivial cubic root of unity (mod p) with p the prime field
+  result = bindSym($C & "_cubicRootOfUnity_mod_p")
+
+macro getCubicRootOfUnity_mod_r*(C: static Curve): untyped =
+  ## Get a non-trivial cubic root of unity (mod r) with r the curve order
+  result = bindSym($C & "_cubicRootOfUnity_mod_r")
+
 # ############################################################
 #
 #                Debug info printed at compile-time
@@ -187,12 +197,16 @@ macro debugConsts(): untyped {.used.} =
     let modulus = bindSym(curveName & "_Modulus")
     let r2modp = bindSym(curveName & "_R2modP")
     let negInvModWord = bindSym(curveName & "_NegInvModWord")
+    let cubeRootOfUnity = ident(curveName & "_cubicRootOfUnity")
 
     result.add quote do:
       echo "Curve ", `curveName`,':'
       echo "  Field Modulus:                 ", `modulus`
       echo "  Montgomery R² (mod P):         ", `r2modp`
       echo "  Montgomery -1/P[0] (mod 2^", WordBitWidth, "): ", `negInvModWord`
+      when declared(`cubeRootOfUnity`):
+        echo "  Cube root of unity:            ", `cubeRootOfUnity`
+
   result.add quote do:
     echo "----------------------------------------------------------------------------"
 
diff --git a/constantine/config/curves_declaration.nim b/constantine/config/curves_declaration.nim
index 25ccd75..3efdfc7 100644
--- a/constantine/config/curves_declaration.nim
+++ b/constantine/config/curves_declaration.nim
@@ -91,6 +91,9 @@ declareCurves:
     family: BarretoNaehrig
     bn_u_bitwidth: 63
     bn_u: "0x44E992B44A6909F1" # u: 4965661367192848881
+    cubicRootOfUnity_modP: "0x30644e72e131a0295e6dd9e7e0acccb0c28f069fbb966e3de4bd44e5607cfd48"
+    # For sanity checks
+    cubicRootOfUnity_modR: "0x30644e72e131a029048b6e193fd84104cc37a73fec2bc5e9b8ca0b2d36636f23"
 
     # G1 Equation: Y^2 = X^3 + 3
     # G2 Equation: Y^2 = X^3 + 3/(9+𝑖)
@@ -141,6 +144,7 @@ declareCurves:
     modulus: "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab"
     family: BarretoLynnScott
     # u: -(2^63 + 2^62 + 2^60 + 2^57 + 2^48 + 2^16)
+    cubicRootOfUnity_mod_p: "0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4897d29650fb85f9b409427eb4f49fffd8bfd00000000aaac"
 
     # G1 Equation: y² = x³ + 4
     # G2 Equation: y² = x³ + 4 (1+i)
diff --git a/constantine/config/curves_derived.nim b/constantine/config/curves_derived.nim
index a055114..93805d4 100644
--- a/constantine/config/curves_derived.nim
+++ b/constantine/config/curves_derived.nim
@@ -8,10 +8,12 @@
 
 import
   # Standard library
-  macros,
+  std/macros,
   # Internal
-  ../arithmetic/precomputed,
-  ./curves_declaration
+  ./precompute,
+  ./curves_declaration,
+  ./type_fp,
+  ../io/io_bigints
 
 {.experimental: "dynamicBindSym".}
 
@@ -30,8 +32,6 @@ macro genDerivedConstants*(): untyped =
   # "for curve in low(Curve) .. high(Curve):"
   # As an ugly workaround, we count
   # The item at position 0 is a pragma
-  let curveList = Curve.getType[1].getType
-
   result = newStmtList()
 
   template used(name: string): NimNode =
@@ -124,6 +124,34 @@ macro genDerivedConstants*(): untyped =
       )
     )
 
+    # const MyCurve_cubicRootOfUnity_mod_p
+    block:
+      let cubicHex = ident(curve & "_cubicRootOfUnity_modP_Hex")
+      let cubic = used(curve & "_cubicRootOfUnity_mod_p")
+      let M = bindSym(curve & "_Modulus")
+      let r2modM = ident(curve & "_R2modP")
+      let m0ninv = ident(curve & "_NegInvModWord")
+      result.add quote do:
+        when declared(`cubicHex`):
+          const `cubic` = block:
+            var cubic: Fp[Curve(`curveSym`)]
+            montyResidue_precompute(
+              cubic.mres,
+              fromHex(cubic.mres.typeof, `cubicHex`),
+              `M`, `r2modM`, `m0ninv`
+            )
+            cubic
+    # const MyCurve_cubicRootOfUnity_mod_r
+    block: # For scalar decomposition sanity checks
+      let cubicHex = ident(curve & "_cubicRootOfUnity_modR_Hex")
+      let cubic = used(curve & "_cubicRootOfUnity_mod_r")
+      let getCurveOrderBitwidth = ident"getCurveOrderBitwidth"
+      result.add quote do:
+        when declared(`cubicHex`):
+          const `cubic` = fromHex(BigInt[
+            `getCurveOrderBitwidth`(Curve(`curveSym`))
+          ], `cubicHex`)
+
     if CurveFamilies[curveSym] == BarretoNaehrig:
       # when declared(MyCurve_BN_param_u):
       #   const MyCurve_BN_u_BE = toCanonicalIntRepr(MyCurve_BN_param_u)
@@ -148,3 +176,5 @@ macro genDerivedConstants*(): untyped =
           bnStmts
         )
       )
+
+  # echo result.toStrLit()
diff --git a/constantine/config/curves_parser.nim b/constantine/config/curves_parser.nim
index c85a70f..98162e2 100644
--- a/constantine/config/curves_parser.nim
+++ b/constantine/config/curves_parser.nim
@@ -10,7 +10,8 @@ import
   # Standard library
   std/[macros, strutils],
   # Internal
-  ../io/io_bigints, ../arithmetic/[bigints, precomputed]
+  ../io/io_bigints,
+  ./type_bigint
 
 # Parsing is done in 2 steps:
 # 1. All declared parameters are collected in a {.compileTime.} seq[CurveParams]
@@ -108,6 +109,10 @@ type
     sexticTwist: SexticTwist
     sexticNonResidue_fp2: NimNode # nnkPar(nnkIntLit, nnkIntLit)
 
+    # Endomorphisms
+    cubicRootOfUnity_modP: NimNode # nnkStrLit
+    cubicRootOfUnity_modR: NimNode # nnkStrLit
+
     family: CurveFamily
     # BN family
     # ------------------------
@@ -181,6 +186,10 @@ proc parseCurveDecls(defs: var seq[CurveParams], curves: NimNode) =
         params.bn_u_bitwidth = sectionVal
       elif sectionId.eqIdent"bn_u":
         params.bn_u = sectionVal
+      elif sectionId.eqident"cubicRootOfUnity_modP":
+        params.cubicRootOfUnity_modP = sectionVal
+      elif sectionId.eqident"cubicRootOfUnity_modR":
+        params.cubicRootOfUnity_modR = sectionVal
       elif sectionId.eqIdent"eq_form":
         params.eq_form = parseEnum[CurveEquationForm]($sectionVal)
       elif sectionId.eqIdent"coef_a":
@@ -271,6 +280,7 @@ proc genMainConstants(defs: var seq[CurveParams]): NimNode =
     MapCurveFamily.add nnkExprColonExpr.newTree(
         curve, newLit(family)
     )
+
     # Curve equation
     # -----------------------------------------------
     curveEllipticStmts.add newConstStmt(
@@ -313,6 +323,19 @@ proc genMainConstants(defs: var seq[CurveParams]): NimNode =
         curveDef.sexticNonResidue_fp2
       )
 
+    # Endomorphisms
+    # -----------------------------------------------
+    if not curveDef.cubicRootOfUnity_modP.isNil:
+      curveExtraStmts.add newConstStmt(
+        exported($curve & "_cubicRootOfUnity_modP_Hex"),
+        curveDef.cubicRootOfUnity_modP
+      )
+    if not curveDef.cubicRootOfUnity_modR.isNil:
+      curveExtraStmts.add newConstStmt(
+        exported($curve & "_cubicRootOfUnity_modR_Hex"),
+        curveDef.cubicRootOfUnity_modR
+      )
+
     # BN curves
     # -----------------------------------------------
     if family == BarretoNaehrig:
diff --git a/constantine/arithmetic/precomputed.nim b/constantine/config/precompute.nim
similarity index 78%
rename from constantine/arithmetic/precomputed.nim
rename to constantine/config/precompute.nim
index a2c1f41..b6c7f6b 100644
--- a/constantine/arithmetic/precomputed.nim
+++ b/constantine/config/precompute.nim
@@ -7,17 +7,18 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
 import
-  ./bigints,
-  ../primitives/constant_time,
-  ../config/common,
+  ./type_bigint, ./common,
+  ../primitives,
   ../io/io_bigints
 
 # Precomputed constants
-# ############################################################
+# We need alternate code paths for the VM
+# for various reasons
+# ------------------------------------------------------------
 
 # ############################################################
 #
-#                   Modular primitives
+#                   BigInt primitives
 #
 # ############################################################
 #
@@ -25,7 +26,7 @@ import
 # Those are NOT tagged compile-time, using CTBool seems to confuse the VM
 
 # We don't use distinct types here, they confuse the VM
-# Similarly, using addC / subB confuses the VM
+# Similarly, using addC / subB from primitives confuses the VM
 
 # As we choose to use the full 32/64 bits of the integers and there is no carry flag
 # in the compile-time VM we need a portable (and slow) "adc" and "sbb".
@@ -36,9 +37,15 @@ const
   HalfBase = (BaseType(1) shl HalfWidth)
   HalfMask = HalfBase - 1
 
+func hi(n: BaseType): BaseType =
+  result = n shr HalfWidth
+
+func lo(n: BaseType): BaseType =
+  result = n and HalfMask
+
 func split(n: BaseType): tuple[hi, lo: BaseType] =
-  result.hi = n shr HalfWidth
-  result.lo = n and HalfMask
+  result.hi = n.hi
+  result.lo = n.lo
 
 func merge(hi, lo: BaseType): BaseType =
   (hi shl HalfWidth) or lo
@@ -104,6 +111,56 @@ func sub(a: var BigInt, w: BaseType): bool =
 
   result = bool(borrow)
 
+func mul(hi, lo: var BaseType, u, v: BaseType) =
+  ## Extended precision multiplication
+  ## (hi, lo) <- u * v
+  var x0, x1, x2, x3: BaseType
+
+  let
+    (uh, ul) = u.split()
+    (vh, vl) = v.split()
+
+  x0 = ul * vl
+  x1 = ul * vh
+  x2 = uh * vl
+  x3 = uh * vh
+
+  x1 += hi(x0)          # This can't carry
+  x1 += x2              # but this can
+  if x1 < x2:           # if carry, add it to x3
+    x3 += HalfBase
+
+  hi = x3 + hi(x1)
+  lo = merge(x1, lo(x0))
+
+func muladd1(hi, lo: var BaseType, a, b, c: BaseType) {.inline.} =
+  ## Extended precision multiplication + addition
+  ## (hi, lo) <- a*b + c
+  ##
+  ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
+  ##       so adding any c cannot overflow
+  var carry: BaseType
+  mul(hi, lo, a, b)
+  addC(carry, lo, lo, c, 0)
+  addC(carry, hi, hi, 0, carry)
+
+func muladd2(hi, lo: var BaseType, a, b, c1, c2: BaseType) {.inline.}=
+  ## Extended precision multiplication + addition + addition
+  ## (hi, lo) <- a*b + c1 + c2
+  ##
+  ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
+  ##       so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
+  ##       and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
+  var carry1, carry2: BaseType
+
+  mul(hi, lo, a, b)
+  # Carry chain 1
+  addC(carry1, lo, lo, c1, 0)
+  addC(carry1, hi, hi, 0, carry1)
+  # Carry chain 2
+  addC(carry2, lo, lo, c2, 0)
+  addC(carry2, hi, hi, 0, carry2)
+
 func cadd(a: var BigInt, b: BigInt, ctl: bool): bool =
   ## In-place optional addition
   ##
@@ -147,6 +204,22 @@ func doubleMod(a: var BigInt, M: BigInt) =
   ctl = ctl or not a.csub(M, false)
   discard csub(a, M, ctl)
 
+func `<`(a, b: BigInt): bool =
+  var diff, borrow: BaseType
+  for i in 0 ..< a.limbs.len:
+    subB(borrow, diff, BaseType(a.limbs[i]), BaseType(b.limbs[i]), borrow)
+
+  result = bool borrow
+
+func shiftRight*(a: var BigInt, k: int) =
+  ## Shift right by k.
+  ##
+  ## k MUST be less than the base word size (2^32 or 2^64)
+
+  for i in 0 ..< a.limbs.len-1:
+    a.limbs[i] = (a.limbs[i] shr k) or (a.limbs[i+1] shl (WordBitWidth - k))
+  a.limbs[a.limbs.len-1] = a.limbs[a.limbs.len-1] shr k
+
 # ############################################################
 #
 #          Montgomery Magic Constants precomputation
@@ -413,3 +486,48 @@ func bn_6u_minus_1_BE*[bits: static int](
 
   # Export
   result.exportRawUint(u_ext, bigEndian)
+
+# ############################################################
+#
+#       Compile-time Conversion to Montgomery domain
+#
+# ############################################################
+# This is needed to avoid recursive dependencies
+
+func montyMul_precompute(r: var BigInt, a, b, M: BigInt, m0ninv: BaseType) =
+  ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
+  var t: typeof(M)   # zero-init
+  const N = t.limbs.len
+  var tN: BaseType
+  var tNp1: BaseType
+
+  var tmp: BaseType # Distinct types bug in the VM are a huge pain ...
+
+  for i in 0 ..< N:
+    var A: BaseType
+    for j in 0 ..< N:
+      muladd2(A, tmp, BaseType(a.limbs[j]), BaseType(b.limbs[i]), BaseType(t.limbs[j]), A)
+      t.limbs[j] = SecretWord(tmp)
+    addC(tNp1, tN, tN, A, 0)
+
+    var C, lo: BaseType
+    let m = BaseType(t.limbs[0]) * m0ninv
+    muladd1(C, lo, m, BaseType(M.limbs[0]), BaseType(t.limbs[0]))
+    for j in 1 ..< N:
+      muladd2(C, tmp, m, BaseType(M.limbs[j]), BaseType(t.limbs[j]), C)
+      t.limbs[j-1] = SecretWord(tmp)
+
+    var carry: BaseType
+    addC(carry, tmp, tN, C, 0)
+    t.limbs[N-1] = SecretWord(tmp)
+    addC(carry, tN, tNp1, 0, carry)
+
+  discard t.csub(M, (tN != 0) or not(t < M))
+  r = t
+
+func montyResidue_precompute*(r: var BigInt, a, M, r2modM: BigInt,
+                              m0ninv: BaseType) =
+  ## Transform a bigint ``a`` from it's natural representation (mod N)
+  ## to a the Montgomery n-residue representation
+  ## This is intended for compile-time precomputations-only
+  montyMul_precompute(r, a, r2ModM, M, m0ninv)
diff --git a/constantine/config/type_bigint.nim b/constantine/config/type_bigint.nim
new file mode 100644
index 0000000..340cfff
--- /dev/null
+++ b/constantine/config/type_bigint.nim
@@ -0,0 +1,53 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import ./common
+
+func wordsRequired*(bits: int): int {.compileTime.} =
+  ## Compute the number of limbs required
+  # from the **announced** bit length
+  (bits + WordBitWidth - 1) div WordBitWidth
+
+type
+  BigInt*[bits: static int] = object
+    ## Fixed-precision big integer
+    ##
+    ## - "bits" is the announced bit-length of the BigInt
+    ##   This is public data, usually equal to the curve prime bitlength.
+    ##
+    ## - "limbs" is an internal field that holds the internal representation
+    ##   of the big integer. Least-significant limb first. Within limbs words are native-endian.
+    ##
+    ## This internal representation can be changed
+    ## without notice and should not be used by external applications or libraries.
+    limbs*: array[bits.wordsRequired, SecretWord]
+
+
+debug:
+  import std/strutils
+
+  type Limbs[N: static int] = array[N, SecretWord]
+
+  func toString*(a: Limbs): string =
+    result = "["
+    result.add " 0x" & toHex(BaseType(a[0]))
+    for i in 1 ..< a.len:
+      result.add ", 0x" & toHex(BaseType(a[i]))
+    result.add "])"
+
+  func toHex*(a: Limbs): string =
+    result = "0x"
+    for i in countdown(a.len-1, 0):
+      result.add toHex(BaseType(a[i]))
+
+  func `$`*(a: BigInt): string =
+    result = "BigInt["
+    result.add $BigInt.bits
+    result.add "](limbs: "
+    result.add a.limbs.toString()
+    result.add ")"
diff --git a/constantine/config/type_fp.nim b/constantine/config/type_fp.nim
new file mode 100644
index 0000000..a206021
--- /dev/null
+++ b/constantine/config/type_fp.nim
@@ -0,0 +1,28 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ./common,
+  ./curves_declaration
+
+type
+  Fp*[C: static Curve] = object
+    ## All operations on a field are modulo P
+    ## P being the prime modulus of the Curve C
+    ## Internally, data is stored in Montgomery n-residue form
+    ## with the magic constant chosen for convenient division (a power of 2 depending on P bitsize)
+    mres*: matchingBigInt(C)
+
+debug:
+  import ./type_bigint
+
+  func `$`*[C: static Curve](a: Fp[C]): string =
+    result = "Fp[" & $C
+    result.add "]("
+    result.add $a.mres
+    result.add ')'
diff --git a/constantine/elliptic/ec_endomorphism_accel.nim b/constantine/elliptic/ec_endomorphism_accel.nim
new file mode 100644
index 0000000..eece904
--- /dev/null
+++ b/constantine/elliptic/ec_endomorphism_accel.nim
@@ -0,0 +1,600 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Standard Library
+  std/typetraits,
+  # Internal
+  ../../helpers/static_for,
+  ../primitives,
+  ../config/[common, curves, type_bigint],
+  ../arithmetic,
+  ../io/io_bigints,
+  ../towers,
+  ./ec_weierstrass_affine,
+  ./ec_weierstrass_projective,
+  ./ec_endomorphism_params
+
+# ############################################################
+#
+#             Endomorphism acceleration for
+#                 Scalar Multiplication
+#
+# ############################################################
+#
+# This files implements endomorphism-acceleration of scalar multiplication
+# using:
+# - GLV endomorphism on G1 (Gallant-Lambert-Vanstone)
+# - GLV and GLS endomorphisms on G2 (Galbraith-Lin-Scott)
+# - NAF recoding (windowed Non-Adjacent-Form)
+
+
+# Secret scalar + dynamic point
+# ----------------------------------------------------------------
+#
+# This section targets the case where the scalar multiplication [k]P
+# involves:
+# - a secret scalar `k`, hence requiring constant-time operations
+# - a dynamic `P`
+#
+# For example signing a message
+#
+# When P is known ahead of time (for example it's the generator)
+# We can precompute the point decomposition with plain scalar multiplication
+# and not require a fast endomorphism.
+# (For example generating a public-key)
+
+type
+  Recoded[LengthInDigits: static int] = distinct array[LengthInDigits, byte]
+  GLV_SAC[M, LengthInDigits: static int] = array[M, Recoded[LengthInDigits]]
+    ## GLV-Based Sign-Aligned-Column representation
+    ## see Faz-Hernandez, 2013
+    ##
+    ## (i) Length of every sub-scalar is fixed and given by
+    ##     l = ⌈log2 r/m⌉ + 1 where r is the prime subgroup order
+    ##     and m the number of dimensions of the GLV endomorphism
+    ## (ii) Exactly one subscalar which should be odd
+    ##      is expressed by a signed nonzero representation
+    ##      with all digits ∈ {1, −1}
+    ## (iii) Other subscalars have digits  ∈ {0, 1, −1}
+    ##
+    ## We pack the representation, using 2 bits per digit:
+    ##  0 = 0b00
+    ##  1 = 0b01
+    ## -1 = 0b11
+    ##
+    ## This means that GLV_SAC uses twice the size of a canonical integer
+    ##
+    ## Digit-Endianness is bigEndian
+
+const
+  BitSize   = 2
+  Shift     = 2    # log2(4) - we can store 4 digit per byte
+  ByteMask  = 3    # we need (mod 4) to access a packed bytearray
+  DigitMask = 0b11 # Digits take 2-bit
+
+# template signExtend_2bit(recoded: byte): int8 =
+#   ## We need to extend:
+#   ## - 0b00 to 0b0000_0000 ( 0)
+#   ## - 0b01 to 0b0000_0001 ( 1)
+#   ## - 0b11 to 0b1111_1111 (-1)
+#   ##
+#   ## This can be done by shifting left to have
+#   ## - 0b00 to 0b0000_0000
+#   ## - 0b01 to 0b0100_0000
+#   ## - 0b11 to 0b1100_0000
+#   ##
+#   ## And then an arithmetic right shift (SAR)
+#   ##
+#   ## However there is no builtin SAR
+#   ## we can get it in C by right-shifting
+#   ## with the main compilers/platforms
+#   ## (GCC, Clang, MSVC, ...)
+#   ## but this is implementation defined behavior
+#   ## Nim `ashr` uses C signed right shifting
+#   ##
+#   ## We could check the compiler to ensure we only use
+#   ## well documented behaviors: https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
+#   ## but if we can avoid that altogether in a crypto library
+#   ##
+#   ## Instead we use signed bitfield which are automatically sign-extended
+#   ## in a portable way as sign extension is automatic for builtin types
+
+type
+  SignExtender = object
+    ## Uses C builtin types sign extension to sign extend 2-bit to 8-bit
+    ## in a portable way as sign extension is automatic for builtin types
+    ## http://graphics.stanford.edu/~seander/bithacks.html#FixedSignExtend
+    digit {.bitsize:2.}: int8
+
+# TODO: use unsigned to avoid checks and potentially leaking secrets
+#       or push checks off (or prove that checks can be elided once Nim has Z3 in the compiler)
+proc `[]`(recoding: Recoded,
+          digitIdx: int): int8 {.inline.}=
+  ## 0 <= digitIdx < LengthInDigits
+  ## returns digit ∈ {0, 1, −1}
+  const len = Recoded.LengthInDigits
+  assert digitIdx < len
+
+  let slot = distinctBase(recoding)[
+    len-1 - (digitIdx shr Shift)
+  ]
+  let recoded = slot shr (BitSize*(digitIdx and ByteMask)) and DigitMask
+  var signExtender: SignExtender
+  # Hack with C assignment that return values
+  {.emit: [result, " = ", signExtender, ".digit = ", recoded, ";"].}
+  # " # Fix highlighting bug in VScode
+
+
+proc `[]=`(recoding: var Recoded,
+           digitIdx: int, value: int8) {.inline.}=
+  ## 0 <= digitIdx < LengthInDigits
+  ## returns digit ∈ {0, 1, −1}
+  ## This is write-once
+  const len = Recoded.LengthInDigits
+  assert digitIdx < Recoded.LengthInDigits
+
+  let slot = distinctBase(recoding)[
+    len-1 - (digitIdx shr Shift)
+  ].addr
+
+  let shifted = byte((value and DigitMask) shl (BitSize*(digitIdx and ByteMask)))
+  slot[] = slot[] or shifted
+
+
+func nDimMultiScalarRecoding[M, L: static int](
+    dst: var GLV_SAC[M, L],
+    src: MultiScalar[M, L]
+  ) =
+  ## This recodes N scalar for GLV multi-scalar multiplication
+  ## with side-channel resistance.
+  ##
+  ## Precondition src[0] is odd
+  #
+  # - Efficient and Secure Algorithms for GLV-Based Scalar
+  #   Multiplication and their Implementation on GLV-GLS
+  #   Curves (Extended Version)
+  #   Armando Faz-Hernández, Patrick Longa, Ana H. Sánchez, 2013
+  #   https://eprint.iacr.org/2013/158.pdf
+  #
+  # Algorithm 1 Protected Recoding Algorithm for the GLV-SAC Representation.
+  # ------------------------------------------------------------------------
+  #
+  # Input: m l-bit positive integers kj = (kj_l−1, ..., kj_0)_2 for
+  # 0 ≤ j < m, an odd “sign-aligner” kJ ∈ {kj}^m, where
+  # l = ⌈log2 r/m⌉ + 1, m is the GLV dimension and r is
+  # the prime subgroup order.
+  # Output: (bj_l−1 , ..., bj_0)GLV-SAC for 0 ≤ j < m, where
+  # bJ_i ∈ {1, −1}, and bj_i ∈ {0, bJ_i} for 0 ≤ j < m with
+  # j != J.
+  # ------------------------------------------------------------------------
+  #
+  # 1: bJ_l-1 = 1
+  # 2: for i = 0 to (l − 2) do
+  # 3:   bJ_i = 2kJ_i+1 - 1
+  # 4: for j = 0 to (m − 1), j != J do
+  # 5:   for i = 0 to (l − 1) do
+  # 6:     bj_i = bJ_i kj_0
+  # 7:     kj = ⌊kj/2⌋ − ⌊bj_i/2⌋
+  # 8: return (bj_l−1 , . . . , bj_0)_GLV-SAC for 0 ≤ j < m.
+  #
+  # - Guide to Pairing-based Cryptography
+  #   Chapter 6: Scalar Multiplication and Exponentiation in Pairing Groups
+  #   Joppe Bos, Craig Costello, Michael Naehrig
+  #
+  # We choose kJ = k0
+  #
+  # Implementation strategy and points of attention
+  # - The subscalars kj must support extracting the least significant bit
+  # - The subscalars kj must support floor division by 2
+  #   For that floored division, kj is 0 or positive
+  # - The subscalars kj must support individual bit accesses
+  # - The subscalars kj must support addition by a small value (0 or 1)
+  # Hence we choose to use our own BigInt representation.
+  #
+  # - The digit bji must support floor division by 2
+  #   For that floored division, bji may be negative!!!
+  # In particular floored division of -1 is -1 not 0.
+  # This means that arithmetic right shift must be used instead of logical right shift
+
+  # assert src[0].isOdd - Only happen on implementation error, we don't want to leak a single bit
+
+  var k = src # Keep the source multiscalar in registers
+  template b: untyped {.dirty.} = dst
+
+  b[0][L-1] = 1
+  for i in 0 .. L-2:
+    b[0][i] = 2 * k[0].bit(i+1).int8 - 1
+  for j in 1 .. M-1:
+    for i in 0 .. L-1:
+      let bji = b[0][i] * k[j].bit0.int8
+      b[j][i] = bji
+      # In the following equation
+      #   kj = ⌊kj/2⌋ − ⌊bj_i/2⌋
+      # We have ⌊bj_i/2⌋ (floor division)
+      # = -1 if bj_i == -1
+      # = 0  if bj_i ∈ {0, 1}
+      # So we turn that statement in an addition
+      # by the opposite
+      k[j].div2()
+      k[j] += SecretWord -bji.ashr(1)
+
+func buildLookupTable[M: static int, F](
+       P: ECP_SWei_Proj[F],
+       endomorphisms: array[M-1, ECP_SWei_Proj[F]],
+       lut: var array[1 shl (M-1), ECP_SWei_Proj[F]],
+     ) =
+  ## Build the lookup table from the base point P
+  ## and the curve endomorphism
+  #
+  # Algorithm
+  # Compute P[u] = P0 + u0 P1 +...+ um−2 Pm−1 for all 0≤u<2^m−1, where
+  # u= (um−2,...,u0)_2.
+  #
+  # Traduction:
+  #   for u in 0 ..< 2^(m-1)
+  #     lut[u] = P0
+  #     iterate on the bit representation of u
+  #       if the bit is set, add the matching endomorphism to lut[u]
+  #
+  # Note: This is for variable/unknown point P
+  #       when P is fixed at compile-time (for example is the generator point)
+  #       alternative algorithms are more efficient.
+  #
+  # Implementation:
+  #   We optimize the basic algorithm to reuse already computed table entries
+  #   by noticing for example that:
+  #   - 6 represented as 0b0110 requires P0 + P2 + P3
+  #   - 2 represented as 0b0010 already required P0 + P2
+  #   To find the already computed table entry, we can index
+  #   the table with the current `u` with the MSB unset
+  #   and add to it the endormorphism at the index matching the MSB position
+  #
+  #   This scheme ensures 1 addition per table entry instead of a number
+  #   of addition dependent on `u` Hamming Weight
+  #
+  # TODO:
+  # 1. Window method for M == 2
+  # 2. Have P in affine coordinate and build the table with mixed addition
+  #    assuming endomorphism φi(P) do not affect the Z coordinates
+  #    (if table is big enough/inversion cost is amortized)
+  # 3. Use Montgomery simultaneous inversion to have the table in
+  #    affine coordinate so that we can use mixed addition in teh main loop
+  lut[0] = P
+  for u in 1'u32 ..< 1 shl (M-1):
+    # The recoding allows usage of 2^(n-1) table instead of the usual 2^n with NAF
+    let msb = u.log2() # No undefined, u != 0
+    lut[u].sum(lut[u.clearBit(msb)], endomorphisms[msb])
+    # } # highlight bug, ...
+
+func tableIndex(glv: GLV_SAC, bit: int): SecretWord =
+  ## Compose the secret table index from
+  ## the GLV-SAC representation and the "bit" accessed
+  # TODO:
+  #   We are currently storing 2-bit for 0, 1, -1 in the GLV-SAC representation
+  #   but since columns have all the same sign, determined by k0,
+  #   we only need 0 and 1 dividing storage per 2
+  staticFor i, 1, GLV_SAC.M:
+    result = result or SecretWord((glv[i][bit] and 1) shl (i-1))
+
+func isNeg(glv: GLV_SAC, bit: int): SecretBool =
+  ## Returns true if the bit requires substraction
+  SecretBool(glv[0][bit] < 0)
+
+func secretLookup[T](dst: var T, table: openArray[T], index: SecretWord) =
+  ## Load a table[index] into `dst`
+  ## This is constant-time, whatever the `index`, its value is not leaked
+  ## This is also protected against cache-timing attack by always scanning the whole table
+  for i in 0 ..< table.len:
+    let selector = SecretWord(i) == index
+    dst.ccopy(table[i], selector)
+
+func scalarMulGLV*[scalBits](
+       P: var ECP_SWei_Proj,
+       scalar: BigInt[scalBits]
+     ) =
+  ## Elliptic Curve Scalar Multiplication
+  ##
+  ##   P <- [k] P
+  ##
+  ## This is a scalar multiplication accelerated by an endomorphism
+  ## via the GLV (Gallant-lambert-Vanstone) decomposition.
+  const C = P.F.C # curve
+  static: doAssert: scalBits == C.getCurveOrderBitwidth()
+  when P.F is Fp:
+    const M = 2
+
+  # 1. Compute endomorphisms
+  var endomorphisms: array[M-1, typeof(P)] # TODO: zero-init not required
+  endomorphisms[0] = P
+  endomorphisms[0].x *= C.getCubicRootOfUnity_mod_p()
+
+  # 2. Decompose scalar into mini-scalars
+  const L = (C.getCurveOrderBitwidth() + M - 1) div M + 1
+  var miniScalars: array[M, BigInt[L]] # TODO: zero-init not required
+  when C == BN254_Snarks:
+    scalar.decomposeScalar_BN254_Snarks_G1(
+      miniScalars
+    )
+  elif C == BLS12_381:
+    scalar.decomposeScalar_BLS12_381_G1(
+      miniScalars
+    )
+  else:
+    {.error: "Unsupported curve for GLV acceleration".}
+
+  # 3. TODO: handle negative mini-scalars
+  #    Either negate the associated base and the scalar (in the `endomorphisms` array)
+  #    Or use Algorithm 3 from Faz et al which can encode the sign
+  #    in the GLV representation at the low low price of 1 bit
+
+  # 4. Precompute lookup table
+  var lut: array[1 shl (M-1), ECP_SWei_Proj] # TODO: zero-init not required
+  buildLookupTable(P, endomorphisms, lut)
+  # TODO: Montgomery simultaneous inversion (or other simultaneous inversion techniques)
+  #       so that we use mixed addition formulas in the main loop
+
+  # 5. Recode the miniscalars
+  #    we need the base miniscalar (that encodes the sign)
+  #    to be odd, and this in constant-time to protect the secret least-significant bit.
+  var k0isOdd = miniScalars[0].isOdd()
+  discard miniScalars[0].csub(SecretWord(1), not k0isOdd)
+
+  var recoded: GLV_SAC[2, L] # zero-init required
+  recoded.nDimMultiScalarRecoding(miniScalars)
+
+  # 6. Proceed to GLV accelerated scalar multiplication
+  var Q: typeof(P) # TODO: zero-init not required
+  Q.secretLookup(lut, recoded.tableIndex(L-1))
+  Q.cneg(recoded.isNeg(L-1))
+
+  for i in countdown(L-2, 0):
+    Q.double()
+    var tmp: typeof(Q) # TODO: zero-init not required
+    tmp.secretLookup(lut, recoded.tableIndex(i))
+    tmp.cneg(recoded.isNeg(i))
+    Q += tmp
+
+  # Now we need to correct if the sign miniscalar was not odd
+  # we missed an addition
+  lut[0] += Q # Contains Q + P0
+  P = Q
+  P.ccopy(lut[0], not k0isOdd)
+
+# Sanity checks
+# ----------------------------------------------------------------
+# See page 7 of
+#
+# - Efficient and Secure Algorithms for GLV-Based Scalar
+#   Multiplication and their Implementation on GLV-GLS
+#   Curves (Extended Version)
+#   Armando Faz-Hernández, Patrick Longa, Ana H. Sánchez, 2013
+#   https://eprint.iacr.org/2013/158.pdf
+
+when isMainModule:
+  import ../io/io_bigints
+
+  proc toString(glvSac: GLV_SAC): string =
+    for j in 0 ..< glvSac.M:
+      result.add "k" & $j & ": ["
+      for i in countdown(glvSac.LengthInDigits-1, 0):
+        result.add " " & (block:
+          case glvSac[j][i]
+          of -1: "1\u{0305}"
+          of 0: "0"
+          of 1: "1"
+          else:
+            raise newException(ValueError, "Unexpected encoded value: " & $glvSac[j][i])
+        ) # " # Unbreak VSCode highlighting bug
+      result.add " ]\n"
+
+
+  iterator bits(u: SomeInteger): tuple[bitIndex: int32, bitValue: uint8] =
+    ## bit iterator, starts from the least significant bit
+    var u = u
+    var idx = 0'i32
+    while u != 0:
+      yield (idx, uint8(u and 1))
+      u = u shr 1
+      inc idx
+
+  func buildLookupTable_naive[M: static int](
+         P: string,
+         endomorphisms: array[M-1, string],
+         lut: var array[1 shl (M-1), string],
+       ) =
+    ## Checking the LUT by building strings of endomorphisms additions
+    ## This naively translates the lookup table algorithm
+    ## Compute P[u] = P0 + u0 P1 +...+ um−2 Pm−1 for all 0≤u<2m−1, where
+    ## u= (um−2,...,u0)_2.
+    ## The number of additions done per entries is equal to the
+    ## iteration variable `u` Hamming Weight
+    for u in 0 ..< 1 shl (M-1):
+      lut[u] = P
+    for u in 0 ..< 1 shl (M-1):
+      for idx, bit in bits(u):
+        if bit == 1:
+          lut[u] &= " + " & endomorphisms[idx]
+
+  func buildLookupTable_reuse[M: static int](
+         P: string,
+         endomorphisms: array[M-1, string],
+         lut: var array[1 shl (M-1), string],
+       ) =
+    ## Checking the LUT by building strings of endomorphisms additions
+    ## This reuses previous table entries so that only one addition is done
+    ## per new entries
+    lut[0] = P
+    for u in 1'u32 ..< 1 shl (M-1):
+      let msb = u.log2() # No undefined, u != 0
+      lut[u] = lut[u.clearBit(msb)] & " + " & endomorphisms[msb]
+
+  proc main_lut() =
+    const M = 4              # GLS-4 decomposition
+    const miniBitwidth = 4   # Bitwidth of the miniscalars resulting from scalar decomposition
+
+    var k: MultiScalar[M, miniBitwidth]
+    var kRecoded: GLV_SAC[M, miniBitwidth+1]
+
+    k[0].fromUint(11)
+    k[1].fromUint(6)
+    k[2].fromuint(14)
+    k[3].fromUint(3)
+
+    kRecoded.nDimMultiScalarRecoding(k)
+
+    echo kRecoded.toString()
+
+    var lut: array[1 shl (M-1), string]
+    let
+      P = "P0"
+      endomorphisms = ["P1", "P2", "P3"]
+
+    buildLookupTable_naive(P, endomorphisms, lut)
+    echo lut
+    doAssert lut[0] == "P0"
+    doAssert lut[1] == "P0 + P1"
+    doAssert lut[2] == "P0 + P2"
+    doAssert lut[3] == "P0 + P1 + P2"
+    doAssert lut[4] == "P0 + P3"
+    doAssert lut[5] == "P0 + P1 + P3"
+    doAssert lut[6] == "P0 + P2 + P3"
+    doAssert lut[7] == "P0 + P1 + P2 + P3"
+
+    var lut_reuse: array[1 shl (M-1), string]
+    buildLookupTable_reuse(P, endomorphisms, lut_reuse)
+    echo lut_reuse
+    doAssert lut == lut_reuse
+
+  main_lut()
+  echo "---------------------------------------------"
+
+  proc main_decomp() =
+    const M = 2
+    const scalBits = BN254_Snarks.getCurveOrderBitwidth()
+    const miniBits = (scalBits+M-1) div M
+
+    block:
+      let scalar = BigInt[scalBits].fromHex(
+        "0x24a0b87203c7a8def0018c95d7fab106373aebf920265c696f0ae08f8229b3f3"
+      )
+
+      var decomp: MultiScalar[M, miniBits]
+      decomposeScalar_BN254_Snarks_G1(scalar, decomp)
+
+      doAssert: bool(decomp[0] == BigInt[127].fromHex"14928105460c820ccc9a25d0d953dbfe")
+      doAssert: bool(decomp[1] == BigInt[127].fromHex"13a2f911eb48a578844b901de6f41660")
+
+    block:
+      let scalar = BigInt[scalBits].fromHex(
+        "24554fa6d0c06f6dc51c551dea8b058cd737fc8d83f7692fcebdd1842b3092c4"
+      )
+
+      var decomp: MultiScalar[M, miniBits]
+      decomposeScalar_BN254_Snarks_G1(scalar, decomp)
+
+      doAssert: bool(decomp[0] == BigInt[127].fromHex"28cf7429c3ff8f7e82fc419e90cc3a2")
+      doAssert: bool(decomp[1] == BigInt[127].fromHex"457efc201bdb3d2e6087df36430a6db6")
+
+    block:
+      let scalar = BigInt[scalBits].fromHex(
+        "288c20b297b9808f4e56aeb70eabf269e75d055567ff4e05fe5fb709881e6717"
+      )
+
+      var decomp: MultiScalar[M, miniBits]
+      decomposeScalar_BN254_Snarks_G1(scalar, decomp)
+
+      doAssert: bool(decomp[0] == BigInt[127].fromHex"4da8c411566c77e00c902eb542aaa66b")
+      doAssert: bool(decomp[1] == BigInt[127].fromHex"5aa8f2f15afc3217f06677702bd4e41a")
+
+
+  main_decomp()
+
+  echo "---------------------------------------------"
+
+  # This tests the multiplication against the Table 1
+  # of the paper
+
+  # Coef       Decimal    Binary        GLV-SAC recoded
+  # | k0 |     | 11 |   | 0 1 0 1 1 |   | 1 -1 1 -1 1 |
+  # | k1 |  =  |  6 | = | 0 0 1 1 0 | = | 1 -1 0 -1 0 |
+  # | k2 |     | 14 |   | 0 1 1 1 0 |   | 1  0 0 -1 0 |
+  # | k3 |     |  3 |   | 0 0 0 1 1 |   | 0  0 1 -1 1 |
+
+  #   i                |         3               2             1             0
+  # -------------------+----------------------------------------------------------------------
+  #  2Q                |   2P0+2P1+2P2    2P0+2P1+4P2    6P0+4P1+8P2+2P3  10P0+6P1+14P2+2P3
+  # Q + sign_i T[ki]   |    P0+P1+2P2   3P0+2P1+4P2+P3   5P0+3P1+7P2+P3   11P0+6P1+14P2+3P3
+
+  type Endo = enum
+    P0
+    P1
+    P2
+    P3
+
+  func buildLookupTable_reuse[M: static int](
+         P: Endo,
+         endomorphisms: array[M-1, Endo],
+         lut: var array[1 shl (M-1), set[Endo]],
+       ) =
+    ## Checking the LUT by building strings of endomorphisms additions
+    ## This reuses previous table entries so that only one addition is done
+    ## per new entries
+    lut[0].incl P
+    for u in 1'u32 ..< 1 shl (M-1):
+      let msb = u.log2() # No undefined, u != 0
+      lut[u] = lut[u.clearBit(msb)] + {endomorphisms[msb]}
+
+
+  proc mainFullMul() =
+    const M = 4                # GLS-4 decomposition
+    const miniBitwidth = 4     # Bitwidth of the miniscalars resulting from scalar decomposition
+    const L = miniBitwidth + 1 # Bitwidth of the recoded scalars
+
+    var k: MultiScalar[M, miniBitwidth]
+    var kRecoded: GLV_SAC[M, L]
+
+    k[0].fromUint(11)
+    k[1].fromUint(6)
+    k[2].fromuint(14)
+    k[3].fromUint(3)
+
+    kRecoded.nDimMultiScalarRecoding(k)
+
+    echo kRecoded.toString()
+
+    var lut: array[1 shl (M-1), set[Endo]]
+    let
+      P = P0
+      endomorphisms = [P1, P2, P3]
+
+    buildLookupTable_reuse(P, endomorphisms, lut)
+    echo lut
+
+    var Q: array[Endo, int]
+
+    # Multiplication
+    assert bool k[0].isOdd()
+    # Q = sign_l-1 P[K_l-1]
+    let idx = kRecoded.tableIndex(L-1)
+    for p in lut[int(idx)]:
+      Q[p] = if kRecoded.isNeg(L-1).bool: -1 else: 1
+    # Loop
+    for i in countdown(L-2, 0):
+      # Q = 2Q
+      for val in Q.mitems: val *= 2
+      echo "2Q:                    ", Q
+      # Q = Q + sign_l-1 P[K_l-1]
+      let idx = kRecoded.tableIndex(i)
+      for p in lut[int(idx)]:
+        Q[p] += (if kRecoded.isNeg(i).bool: -1 else: 1)
+      echo "Q + sign_l-1 P[K_l-1]: ", Q
+
+    echo Q
+
+  mainFullMul()
diff --git a/constantine/elliptic/ec_endomorphism_params.nim b/constantine/elliptic/ec_endomorphism_params.nim
new file mode 100644
index 0000000..1f00a75
--- /dev/null
+++ b/constantine/elliptic/ec_endomorphism_params.nim
@@ -0,0 +1,135 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Internal
+  ../../helpers/static_for,
+  ../primitives,
+  ../config/[common, curves, type_bigint],
+  ../arithmetic,
+  ../io/io_bigints,
+  ../towers,
+  ./ec_weierstrass_projective
+
+# Parameters for GLV endomorphisms acceleration
+# ----------------------------------------------------------------------------------------
+# TODO: cleanup, those should be derived in the config folder
+#       and stored in a constant
+
+type
+  MultiScalar*[M, LengthInBits: static int] = array[M, BigInt[LengthInBits]]
+    ## Decomposition of a secret scalar in multiple scalars
+
+# Chapter 6.3.1 - Guide to Pairing-based Cryptography
+const Lattice_BN254_Snarks_G1: array[2, array[2, tuple[b: BigInt[127], isNeg: bool]]] = [
+  # Curve of order 254 -> mini scalars of size 127
+  # u = 0x44E992B44A6909F1
+  [(BigInt[127].fromHex"0x89d3256894d213e3", false),                  # 2u + 1
+   (BigInt[127].fromHex"0x6f4d8248eeb859fd0be4e1541221250b", false)], # 6u² + 4u + 1
+  [(BigInt[127].fromHex"0x6f4d8248eeb859fc8211bbeb7d4f1128", false),  # 6u² + 2u
+   (BigInt[127].fromHex"0x89d3256894d213e3", true)]                   # -2u - 1
+]
+
+const Babai_BN254_Snarks_G1 = [
+  # Vector for Babai rounding
+  BigInt[127].fromHex"0x89d3256894d213e3",                            # 2u + 1
+  BigInt[127].fromHex"0x6f4d8248eeb859fd0be4e1541221250b"             # 6u² + 4u + 1
+]
+
+func decomposeScalar_BN254_Snarks_G1*[M, scalBits, L: static int](
+       scalar: BigInt[scalBits],
+       miniScalars: var MultiScalar[M, L]
+     ) =
+  ## Decompose a secret scalar into mini-scalar exploiting
+  ## BN254_Snarks specificities.
+  ##
+  ## TODO: Generalize to all BN curves
+  ##       - needs a Lattice type
+  ##       - needs to better support negative bigints, (extra bit for sign?)
+
+  static: doAssert L == (scalBits + M - 1) div M + 1
+  # 𝛼0 = (0x2d91d232ec7e0b3d7 * s) >> 256
+  # 𝛼1 = (0x24ccef014a773d2d25398fd0300ff6565 * s) >> 256
+  const
+    w = BN254_Snarks.getCurveOrderBitwidth().wordsRequired()
+    alphaHats = (BigInt[66].fromHex"0x2d91d232ec7e0b3d7",
+                 BigInt[130].fromHex"0x24ccef014a773d2d25398fd0300ff6565")
+
+  var alphas{.noInit.}: array[M, BigInt[scalBits]] # TODO size 66+254 and 130+254
+
+  staticFor i, 0, M:
+    alphas[i].prod_high_words(alphaHats[i], scalar, w)
+
+  # We have k0 = s - 𝛼0 b00 - 𝛼1 b10
+  # and kj = 0 - 𝛼j b0j - 𝛼1 b1j
+  var k: array[M, BigInt[scalBits]]
+  k[0] = scalar
+  for miniScalarIdx in 0 ..< M:
+    for basisIdx in 0 ..< M:
+      var alphaB {.noInit.}: BigInt[scalBits]
+      alphaB.prod(alphas[basisIdx], Lattice_BN254_Snarks_G1[basisIdx][miniScalarIdx].b) # TODO small lattice size
+      if Lattice_BN254_Snarks_G1[basisIdx][miniScalarIdx].isNeg:
+        k[miniScalarIdx] += alphaB
+      else:
+        k[miniScalarIdx] -= alphaB
+
+    miniScalars[miniScalarIdx].copyTruncatedFrom(k[miniScalarIdx])
+
+const Lattice_BLS12_381_G1: array[2, array[2, tuple[b: BigInt[128], isNeg: bool]]] = [
+  # Curve of order 254 -> mini scalars of size 127
+  # u = 0x44E992B44A6909F1
+  [(BigInt[128].fromHex"0xac45a4010001a40200000000ffffffff", false),  # u² - 1
+   (BigInt[128].fromHex"0x1", true)],                                 # -1
+  [(BigInt[128].fromHex"0x1", false),                                 # 1
+   (BigInt[128].fromHex"0xac45a4010001a4020000000100000000", false)]  # u²
+]
+
+const Babai_BLS12_381_G1 = [
+  # Vector for Babai rounding
+  BigInt[128].fromHex"0xac45a4010001a4020000000100000000",
+  BigInt[128].fromHex"0x1"
+]
+
+func decomposeScalar_BLS12_381_G1*[M, scalBits, L: static int](
+       scalar: BigInt[scalBits],
+       miniScalars: var MultiScalar[M, L]
+     ) =
+  ## Decompose a secret scalar into mini-scalar exploiting
+  ## BLS12_381 specificities.
+  ##
+  ## TODO: Generalize to all BLS curves
+  ##       - needs a Lattice type
+  ##       - needs to better support negative bigints, (extra bit for sign?)
+
+  static: doAssert L == (scalBits + M - 1) div M + 1
+  # 𝛼0 = (0x2d91d232ec7e0b3d7 * s) >> 256
+  # 𝛼1 = (0x24ccef014a773d2d25398fd0300ff6565 * s) >> 256
+  const
+    w = BLS12_381.getCurveOrderBitwidth().wordsRequired()
+    alphaHats = (BigInt[129].fromHex"0x17c6becf1e01faadd63f6e522f6cfee30",
+                 BigInt[2].fromHex"0x2")
+
+  var alphas{.noInit.}: array[M, BigInt[scalBits]] # TODO size 256+255 and 132+255
+
+  staticFor i, 0, M:
+    alphas[i].prod_high_words(alphaHats[i], scalar, w)
+
+  # We have k0 = s - 𝛼0 b00 - 𝛼1 b10
+  # and kj = 0 - 𝛼j b0j - 𝛼1 b1j
+  var k: array[M, BigInt[scalBits]]
+  k[0] = scalar
+  for miniScalarIdx in 0 ..< M:
+    for basisIdx in 0 ..< M:
+      var alphaB {.noInit.}: BigInt[scalBits]
+      alphaB.prod(alphas[basisIdx], Lattice_BLS12_381_G1[basisIdx][miniScalarIdx].b) # TODO small lattice size
+      if Lattice_BLS12_381_G1[basisIdx][miniScalarIdx].isNeg:
+        k[miniScalarIdx] += alphaB
+      else:
+        k[miniScalarIdx] -= alphaB
+
+    miniScalars[miniScalarIdx].copyTruncatedFrom(k[miniScalarIdx])
diff --git a/constantine/elliptic/ec_scalar_mul.nim b/constantine/elliptic/ec_scalar_mul.nim
new file mode 100644
index 0000000..3fc8cfd
--- /dev/null
+++ b/constantine/elliptic/ec_scalar_mul.nim
@@ -0,0 +1,229 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../primitives,
+  ../config/[common, curves],
+  ../arithmetic,
+  ../towers,
+  ./ec_weierstrass_projective,
+  ./ec_endomorphism_accel
+
+# ############################################################
+#                                                            #
+#                   Scalar Multiplication                    #
+#                                                            #
+# ############################################################
+#
+# Scalar multiplication is a key algorithm for cryptographic protocols:
+# - it is slow,
+# - it is performance critical as it is used to generate signatures and authenticate messages
+# - it is a high-value target as the "scalar" is very often the user secret key
+#
+# A safe scalar multiplication MUST:
+# - Use no branching (to prevent timing and simple power analysis attacks)
+# - Always do the same memory accesses (in particular for table lookups) (to prevent cache-timing attacks)
+# - Not expose the bitlength of the exponent (use the curve order bitlength instead)
+#
+# Constantine does not make an extra effort to defend against the smart-cards
+# and embedded device attacks:
+# - Differential Power-Analysis which may allow for example retrieving bit content depending on the cost of writing 0 or 1
+#   (Address-bit DPA by Itoh, Izu and Takenaka)
+# - Electro-Magnetic which can be used in a similar way to power analysis but based on EM waves
+# - Fault Attacks which can be used by actively introducing faults (via a laser for example) in an algorithm
+#
+# The current security efforts are focused on preventing attacks
+# that are effective remotely including through the network,
+# a colocated VM or a malicious process on your phone.
+#
+# - Survey for Performance & Security Problems of Passive Side-channel Attacks     Countermeasures in ECC\
+#   Rodrigo Abarúa, Claudio Valencia, and Julio López, 2019\
+#   https://eprint.iacr.org/2019/010
+#
+# - State-of-the-art of secure ECC implementations:a survey on known side-channel attacks and countermeasures\
+#   Junfeng Fan,XuGuo, Elke De Mulder, Patrick Schaumont, Bart Preneel and Ingrid Verbauwhede, 2010
+#   https://www.esat.kuleuven.be/cosic/publications/article-1461.pdf
+
+template checkScalarMulScratchspaceLen(len: int) =
+  ## CHeck that there is a minimum of scratchspace to hold the temporaries
+  debug:
+    assert len >= 2, "Internal Error: the scratchspace for scalar multiplication should be equal or greater than 2"
+
+func getWindowLen(bufLen: int): uint =
+  ## Compute the maximum window size that fits in the scratchspace buffer
+  checkScalarMulScratchspaceLen(bufLen)
+  result = 4
+  while (1 shl result) + 1 > bufLen:
+    dec result
+
+func scalarMulPrologue(
+       P: var ECP_SWei_Proj,
+       scratchspace: var openarray[ECP_SWei_Proj]
+     ): uint =
+  ## Setup the scratchspace
+  ## Returns the fixed-window size for scalar mul with window optimization
+  result = scratchspace.len.getWindowLen()
+  # Precompute window content, special case for window = 1
+  # (i.e scratchspace has only space for 2 temporaries)
+  # The content scratchspace[2+k] is set at [k]P
+  # with scratchspace[0] untouched
+  if result == 1:
+    scratchspace[1] = P
+  else:
+    scratchspace[2] = P
+    for k in 2 ..< 1 shl result:
+      scratchspace[k+1].sum(scratchspace[k], P)
+
+  # Set a to infinity
+  P.setInf()
+
+func scalarMulDoubling(
+       P: var ECP_SWei_Proj,
+       exponent: openArray[byte],
+       tmp: var ECP_SWei_Proj,
+       window: uint,
+       acc, acc_len: var uint,
+       e: var int
+     ): tuple[k, bits: uint] {.inline.} =
+  ## Doubling steps of doubling and add for scalar multiplication
+  ## Get the next k bits in range [1, window)
+  ## and double k times
+  ## Returns the number of doubling done and the corresponding bits.
+  ##
+  ## Updates iteration variables and accumulators
+  #
+  # ⚠️: Extreme care should be used to not leak
+  #    the exponent bits nor its real bitlength
+  #    i.e. if the exponent is zero but encoded in a
+  #    256-bit integer, only "256" should leak
+  #    as for most applications like ECDSA or BLS signature schemes
+  #    the scalar is the user secret key.
+
+  # Get the next bits
+  # acc/acc_len must be uint to avoid Nim runtime checks leaking bits
+  # e is public
+  var k = window
+  if acc_len < window:
+    if e < exponent.len:
+      acc = (acc shl 8) or exponent[e].uint
+      inc e
+      acc_len += 8
+    else: # Drained all exponent bits
+      k = acc_len
+
+  let bits = (acc shr (acc_len - k)) and ((1'u32 shl k) - 1)
+  acc_len -= k
+
+  # We have k bits and can do k doublings
+  for i in 0 ..< k:
+    tmp.double(P)
+    P = tmp
+
+  return (k, bits)
+
+
+func scalarMulGeneric*(
+       P: var ECP_SWei_Proj,
+       scalar: openArray[byte],
+       scratchspace: var openArray[ECP_SWei_Proj]
+     ) =
+  ## Elliptic Curve Scalar Multiplication
+  ##
+  ##   P <- [k] P
+  ##
+  ## This uses fixed-window optimization if possible
+  ## `scratchspace` MUST be of size 2 .. 2^4
+  ##
+  ## This is suitable to use with secret `scalar`, in particular
+  ## to derive a public key from a private key or
+  ## to sign a message.
+  ##
+  ## Particular care has been given to defend against the following side-channel attacks:
+  ## - timing attacks: all exponents of the same length
+  ##   will take the same time including
+  ##   a "zero" exponent of length 256-bit
+  ## - cache-timing attacks: Constantine does use a precomputed table
+  ##   but when extracting a value from the table
+  ##   the whole table is always accessed with the same pattern
+  ##   preventing malicious attacks through CPU cache delay analysis.
+  ## - simple power-analysis and electromagnetic attacks: Constantine always do the same
+  ##   double and add sequences and those cannot be analyzed to distinguish
+  ##   the exponent 0 and 1.
+  ##
+  ## I.e. As far as the author know, Constantine implements all countermeasures to the known
+  ##      **remote** attacks on ECC implementations.
+  ##
+  ## Disclaimer:
+  ##   Constantine is provided as-is without any guarantees.
+  ##   Use at your own risks.
+  ##   Thorough evaluation of your threat model, the security of any cryptographic library you are considering,
+  ##   and the secrets you put in jeopardy is strongly advised before putting data at risk.
+  ##   The author would like to remind users that the best code can only mitigate
+  ##   but not protect against human failures which are the weakest links and largest
+  ##   backdoors to secrets exploited today.
+  ##
+  ## Constantine is resistant to
+  ## - Fault Injection attacks: Constantine does not have branches that could
+  ##   be used to skip some additions and reveal which were dummy and which were real.
+  ##   Dummy operations are like the double-and-add-always timing attack countermeasure.
+  ##
+  ##
+  ## Constantine DOES NOT defend against Address-Bit Differential Power Analysis attacks by default,
+  ## which allow differentiating between writing a 0 or a 1 to a memory cell.
+  ## This is a threat for smart-cards and embedded devices (for example to handle authentication to a cable or satellite service)
+  ## Constantine can be extended to use randomized projective coordinates to foil this attack.
+
+  let window = scalarMulPrologue(P, scratchspace)
+
+  # We process bits with from most to least significant.
+  # At each loop iteration with have acc_len bits in acc.
+  # To maintain constant-time the number of iterations
+  # or the number of operations or memory accesses should be the same
+  # regardless of acc & acc_len
+  var
+    acc, acc_len: uint
+    e = 0
+  while acc_len > 0 or e < scalar.len:
+    let (k, bits) = scalarMulDoubling(
+      P, scalar, scratchspace[0],
+      window, acc, acc_len, e
+    )
+
+    # Window lookup: we set scratchspace[1] to the lookup value
+    # If the window length is 1 it's already set.
+    if window > 1:
+      # otherwise we need a constant-time lookup
+      # in particular we need the same memory accesses, we can't
+      # just index the openarray with the bits to avoid cache attacks.
+      for i in 1 ..< 1 shl k:
+        let ctl = SecretWord(i) == SecretWord(bits)
+        scratchspace[1].ccopy(scratchspace[1+i], ctl)
+
+    # Multiply with the looked-up value
+    # we need to keep the product only ig the exponent bits are not all zeroes
+    scratchspace[0].sum(P, scratchspace[1])
+    P.ccopy(scratchspace[0], SecretWord(bits).isNonZero())
+
+func scalarMul*(
+       P: var ECP_SWei_Proj,
+       scalar: BigInt
+     ) {.inline.} =
+  ## Elliptic Curve Scalar Multiplication
+  ##
+  ##   P <- [k] P
+  # This calls endomorphism accelerated scalar mul if available
+  # or the generic scalar mul otherwise
+  when ECP_SWei_Proj.F.C in {BN254_Snarks, BLS12_381}:
+    # ⚠️ This requires the cofactor to be cleared
+    scalarMulGLV(P, scalar)
+  else:
+    var
+      scratchSpace: array[1 shl 4, ECP_SWei_Proj]
+      scalarCanonicalBE: array[(scalar.bits+7)div 8, byte] # canonical big endian representation
+    scalarCanonicalBE.exportRawUint(scalar, bigEndian)   # Export is constant-time
+    P.scalarMulGeneric(scratchSpace)
diff --git a/constantine/elliptic/ec_weierstrass_projective.nim b/constantine/elliptic/ec_weierstrass_projective.nim
index bf61f47..d818b8a 100644
--- a/constantine/elliptic/ec_weierstrass_projective.nim
+++ b/constantine/elliptic/ec_weierstrass_projective.nim
@@ -108,6 +108,15 @@ func neg*(P: var ECP_SWei_Proj) =
   ## Negate ``P``
   P.y.neg(P.y)
 
+func cneg*(P: var ECP_SWei_Proj, ctl: CTBool) =
+  ## Conditional negation.
+  ## Negate if ``ctl`` is true
+  var Q{.noInit.}: typeof(P)
+  Q.x = P.x
+  Q.y.neg(P.y)
+  Q.z = P.z
+  P.ccopy(Q, ctl)
+
 func sum*[F](
        r: var ECP_SWei_Proj[F],
        P, Q: ECP_SWei_Proj[F]
@@ -274,197 +283,23 @@ func double*[F](
   else:
     {.error: "Not implemented.".}
 
-# ############################################################
-#                                                            #
-#                   Scalar Multiplication                    #
-#                                                            #
-# ############################################################
-#
-# Scalar multiplication is a key algorithm for cryptographic protocols:
-# - it is slow,
-# - it is performance critical as it is used to generate signatures and authenticate messages
-# - it is a high-value target as the "scalar" is very often the user secret key
-#
-# A safe scalar multiplication MUST:
-# - Use no branching (to prevent timing and simple power analysis attacks)
-# - Always do the same memory accesses (in particular for table lookups) (to prevent cache-timing attacks)
-# - Not expose the bitlength of the exponent (use the curve order bitlength instead)
-#
-# Constantine does not make an extra effort to defend against the smart-cards
-# and embedded device attacks:
-# - Differential Power-Analysis which may allow for example retrieving bit content depending on the cost of writing 0 or 1
-#   (Address-bit DPA by Itoh, Izu and Takenaka)
-# - Electro-Magnetic which can be used in a similar way to power analysis but based on EM waves
-# - Fault Attacks which can be used by actively introducing faults (via a laser for example) in an algorithm
-#
-# The current security efforts are focused on preventing attacks
-# that are effective remotely including through the network,
-# a colocated VM or a malicious process on your phone.
-#
-# - Survey for Performance & Security Problems of Passive Side-channel Attacks     Countermeasures in ECC\
-#   Rodrigo Abarúa, Claudio Valencia, and Julio López, 2019\
-#   https://eprint.iacr.org/2019/010
-#
-# - State-of-the-art of secure ECC implementations:a survey on known side-channel attacks and countermeasures\
-#   Junfeng Fan,XuGuo, Elke De Mulder, Patrick Schaumont, Bart Preneel and Ingrid Verbauwhede, 2010
-#   https://www.esat.kuleuven.be/cosic/publications/article-1461.pdf
+func `+=`*[F](P: var ECP_SWei_Proj[F], Q: ECP_SWei_Proj[F]) =
+  var tmp {.noInit.}: ECP_SWei_Proj[F]
+  tmp.sum(P, Q)
+  P = tmp
 
-template checkScalarMulScratchspaceLen(len: int) =
-  ## CHeck that there is a minimum of scratchspace to hold the temporaries
-  debug:
-    assert len >= 2, "Internal Error: the scratchspace for scalar multiplication should be equal or greater than 2"
+func double*[F](P: var ECP_SWei_Proj[F]) =
+  var tmp {.noInit.}: ECP_SWei_Proj[F]
+  tmp.double(P)
+  P = tmp
 
-func getWindowLen(bufLen: int): uint =
-  ## Compute the maximum window size that fits in the scratchspace buffer
-  checkScalarMulScratchspaceLen(bufLen)
-  result = 4
-  while (1 shl result) + 1 > bufLen:
-    dec result
+func affineFromProjective*[F](aff: var ECP_SWei_Proj[F], proj: ECP_SWei_Proj) =
+  # TODO: for now we reuse projective coordinate backend
+  # TODO: simultaneous inversion
 
-func scalarMulPrologue(
-       P: var ECP_SWei_Proj,
-       scratchspace: var openarray[ECP_SWei_Proj]
-     ): uint =
-  ## Setup the scratchspace
-  ## Returns the fixed-window size for scalar mul with window optimization
-  result = scratchspace.len.getWindowLen()
-  # Precompute window content, special case for window = 1
-  # (i.e scratchspace has only space for 2 temporaries)
-  # The content scratchspace[2+k] is set at [k]P
-  # with scratchspace[0] untouched
-  if result == 1:
-    scratchspace[1] = P
-  else:
-    scratchspace[2] = P
-    for k in 2 ..< 1 shl result:
-      scratchspace[k+1].sum(scratchspace[k], P)
+  var invZ {.noInit.}: F
+  invZ.inv(proj.z)
 
-  # Set a to infinity
-  P.setInf()
-
-func scalarMulDoubling(
-       P: var ECP_SWei_Proj,
-       exponent: openArray[byte],
-       tmp: var ECP_SWei_Proj,
-       window: uint,
-       acc, acc_len: var uint,
-       e: var int
-     ): tuple[k, bits: uint] {.inline.} =
-  ## Doubling steps of doubling and add for scalar multiplication
-  ## Get the next k bits in range [1, window)
-  ## and double k times
-  ## Returns the number of doubling done and the corresponding bits.
-  ##
-  ## Updates iteration variables and accumulators
-  #
-  # ⚠️: Extreme care should be used to not leak
-  #    the exponent bits nor its real bitlength
-  #    i.e. if the exponent is zero but encoded in a
-  #    256-bit integer, only "256" should leak
-  #    as for most applications like ECDSA or BLS signature schemes
-  #    the scalar is the user secret key.
-
-  # Get the next bits
-  # acc/acc_len must be uint to avoid Nim runtime checks leaking bits
-  # e is public
-  var k = window
-  if acc_len < window:
-    if e < exponent.len:
-      acc = (acc shl 8) or exponent[e].uint
-      inc e
-      acc_len += 8
-    else: # Drained all exponent bits
-      k = acc_len
-
-  let bits = (acc shr (acc_len - k)) and ((1'u32 shl k) - 1)
-  acc_len -= k
-
-  # We have k bits and can do k doublings
-  for i in 0 ..< k:
-    tmp.double(P)
-    P = tmp
-
-  return (k, bits)
-
-
-func scalarMul*(
-       P: var ECP_SWei_Proj,
-       scalar: openArray[byte],
-       scratchspace: var openArray[ECP_SWei_Proj]
-     ) =
-  ## Elliptic Curve Scalar Multiplication
-  ##
-  ##   P <- [k] P
-  ##
-  ## This uses fixed-window optimization if possible
-  ## `scratchspace` MUST be of size 2 .. 2^4
-  ##
-  ## This is suitable to use with secret `scalar`, in particular
-  ## to derive a public key from a private key or
-  ## to sign a message.
-  ##
-  ## Particular care has been given to defend against the following side-channel attacks:
-  ## - timing attacks: all exponents of the same length
-  ##   will take the same time including
-  ##   a "zero" exponent of length 256-bit
-  ## - cache-timing attacks: Constantine does use a precomputed table
-  ##   but when extracting a value from the table
-  ##   the whole table is always accessed with the same pattern
-  ##   preventing malicious attacks through CPU cache delay analysis.
-  ## - simple power-analysis and electromagnetic attacks: Constantine always do the same
-  ##   double and add sequences and those cannot be analyzed to distinguish
-  ##   the exponent 0 and 1.
-  ##
-  ## I.e. As far as the author know, Constantine implements all countermeasures to the known
-  ##      **remote** attacks on ECC implementations.
-  ##
-  ## Disclaimer:
-  ##   Constantine is provided as-is without any guarantees.
-  ##   Use at your own risks.
-  ##   Thorough evaluation of your threat model, the security of any cryptographic library you are considering,
-  ##   and the secrets you put in jeopardy is strongly advised before putting data at risk.
-  ##   The author would like to remind users that the best code can only mitigate
-  ##   but not protect against human failures which are the weakest links and largest
-  ##   backdoors to secrets exploited today.
-  ##
-  ## Constantine is resistant to
-  ## - Fault Injection attacks: Constantine does not have branches that could
-  ##   be used to skip some additions and reveal which were dummy and which were real.
-  ##   Dummy operations are like the double-and-add-always timing attack countermeasure.
-  ##
-  ##
-  ## Constantine DOES NOT defend against Address-Bit Differential Power Analysis attacks by default,
-  ## which allow differentiating between writing a 0 or a 1 to a memory cell.
-  ## This is a threat for smart-cards and embedded devices (for example to handle authentication to a cable or satellite service)
-  ## Constantine can be extended to use randomized projective coordinates to foil this attack.
-
-  let window = scalarMulPrologue(P, scratchspace)
-
-  # We process bits with from most to least significant.
-  # At each loop iteration with have acc_len bits in acc.
-  # To maintain constant-time the number of iterations
-  # or the number of operations or memory accesses should be the same
-  # regardless of acc & acc_len
-  var
-    acc, acc_len: uint
-    e = 0
-  while acc_len > 0 or e < scalar.len:
-    let (k, bits) = scalarMulDoubling(
-      P, scalar, scratchspace[0],
-      window, acc, acc_len, e
-    )
-
-    # Window lookup: we set scratchspace[1] to the lookup value
-    # If the window length is 1 it's already set.
-    if window > 1:
-      # otherwise we need a constant-time lookup
-      # in particular we need the same memory accesses, we can't
-      # just index the openarray with the bits to avoid cache attacks.
-      for i in 1 ..< 1 shl k:
-        let ctl = SecretWord(i) == SecretWord(bits)
-        scratchspace[1].ccopy(scratchspace[1+i], ctl)
-
-    # Multiply with the looked-up value
-    # we need to keep the product only ig the exponent bits are not all zeroes
-    scratchspace[0].sum(P, scratchspace[1])
-    P.ccopy(scratchspace[0], SecretWord(bits).isNonZero())
+  aff.z.setOne()
+  aff.x.prod(proj.x, invZ)
+  aff.y.prod(proj.y, invZ)
diff --git a/constantine/io/io_bigints.nim b/constantine/io/io_bigints.nim
index 8e12306..47b2075 100644
--- a/constantine/io/io_bigints.nim
+++ b/constantine/io/io_bigints.nim
@@ -12,8 +12,7 @@
 
 import
   ../primitives/constant_time,
-  ../arithmetic/bigints,
-  ../config/common
+  ../config/[common, type_bigint]
 
 # ############################################################
 #
diff --git a/constantine/io/io_ec.nim b/constantine/io/io_ec.nim
index ca2c57d..d487a5e 100644
--- a/constantine/io/io_ec.nim
+++ b/constantine/io/io_ec.nim
@@ -9,7 +9,6 @@
 import
   ./io_bigints, ./io_fields,
   ../config/curves,
-  ../arithmetic/[bigints, finite_fields],
   ../elliptic/[
     ec_weierstrass_affine,
     ec_weierstrass_projective
@@ -38,12 +37,14 @@ func toHex*(P: ECP_SWei_Proj): string =
   ## TODO: only normalize and don't display the Z coordinate
   ##
   ## This proc output may change format in the future
-  result = $P.F.C & "(x: "
-  result &= P.x.tohex(bigEndian)
+
+  var aff {.noInit.}: typeof(P)
+  aff.affineFromProjective(P)
+
+  result = $aff.F.C & "(x: "
+  result &= aff.x.tohex(bigEndian)
   result &= ", y: "
-  result &= P.y.tohex(bigEndian)
-  result &= ", z: "
-  result &= P.y.tohex(bigEndian)
+  result &= aff.y.tohex(bigEndian)
   result &= ')'
 
 func fromHex*(dst: var ECP_SWei_Proj, x, y: string): bool {.raises: [ValueError].}=
diff --git a/constantine/io/io_fields.nim b/constantine/io/io_fields.nim
index dbc1c43..cf132a2 100644
--- a/constantine/io/io_fields.nim
+++ b/constantine/io/io_fields.nim
@@ -9,7 +9,7 @@
 import
   ./io_bigints,
   ../config/curves,
-  ../arithmetic/[bigints, finite_fields]
+  ../arithmetic/finite_fields
 
 # No exceptions allowed
 {.push raises: [].}
diff --git a/constantine/primitives.nim b/constantine/primitives.nim
index d311f84..0bb449c 100644
--- a/constantine/primitives.nim
+++ b/constantine/primitives.nim
@@ -11,11 +11,13 @@ import
   primitives/constant_time,
   primitives/multiplexers,
   primitives/addcarry_subborrow,
-  primitives/extended_precision
+  primitives/extended_precision,
+  primitives/bithacks
 
 export
   constant_time_types,
   constant_time,
   multiplexers,
   addcarry_subborrow,
-  extended_precision
+  extended_precision,
+  bithacks
diff --git a/constantine/primitives/bithacks.nim b/constantine/primitives/bithacks.nim
new file mode 100644
index 0000000..160ff2c
--- /dev/null
+++ b/constantine/primitives/bithacks.nim
@@ -0,0 +1,77 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#                           Bit hacks
+#
+# ############################################################
+
+# Bithacks
+# ------------------------------------------------------------
+# TODO: Nim std/bitops is unsatisfactory
+#       in particular the "noUndefined" flag
+#       for countLeadingZeroBits/countTrailingZeroBits
+#       is returning zero instead of the integer bitwidth
+#
+#       Furthermore it is not guaranteed constant-time
+#       And lastly, even compiler builtin may be slightly inefficient
+#       for example when doing fastLog2
+#       which is "31 - builtin_clz" we get
+#       `bsr + xor (from clz) + sub`
+#       instead of plain `bsr`
+#
+#       At the moment we don't need them to operate on secret data
+#
+# See: https://www.chessprogramming.org/BitScan
+#      https://www.chessprogramming.org/General_Setwise_Operations
+# and https://graphics.stanford.edu/%7Eseander/bithacks.html
+# for compendiums of bit manipulation
+
+proc clearMask[T: SomeInteger](v: T, mask: T): T {.inline.} =
+  ## Returns ``v``, with all the ``1`` bits from ``mask`` set to 0
+  v and not mask
+
+proc clearBit*[T: SomeInteger](v: T, bit: T): T {.inline.} =
+  ## Returns ``v``, with the bit at position ``bit`` set to 0
+  v.clearMask(1.T shl bit)
+
+func log2*(x: uint32): uint32 =
+  ## Find the log base 2 of a 32-bit or less integer.
+  ## using De Bruijn multiplication
+  ## Works at compile-time, guaranteed constant-time.
+  ## TODO: at runtime BitScanReverse or CountLeadingZero are more efficient
+  # https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
+  const lookup: array[32, uint8] = [0'u8, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18,
+    22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31]
+  var v = x
+  v = v or v shr 1 # first round down to one less than a power of 2
+  v = v or v shr 2
+  v = v or v shr 4
+  v = v or v shr 8
+  v = v or v shr 16
+  lookup[(v * 0x07C4ACDD'u32) shr 27]
+
+func log2*(x: uint64): uint64 {.inline, noSideEffect.} =
+  ## Find the log base 2 of a 32-bit or less integer.
+  ## using De Bruijn multiplication
+  ## Works at compile-time, guaranteed constant-time.
+  ## TODO: at runtime BitScanReverse or CountLeadingZero are more efficient
+  # https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
+  const lookup: array[64, uint8] = [0'u8, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54,
+    33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, 62,
+    57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31,
+    35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5, 63]
+  var v = x
+  v = v or v shr 1 # first round down to one less than a power of 2
+  v = v or v shr 2
+  v = v or v shr 4
+  v = v or v shr 8
+  v = v or v shr 16
+  v = v or v shr 32
+  lookup[(v * 0x03F6EAF2CD271461'u64) shr 58]
diff --git a/constantine/primitives/constant_time.nim b/constantine/primitives/constant_time.nim
index 1507ef9..715f8c6 100644
--- a/constantine/primitives/constant_time.nim
+++ b/constantine/primitives/constant_time.nim
@@ -40,6 +40,9 @@ template cfalse*(T: typedesc[Ct or BaseUint]): auto =
 template ct*[T: BaseUint](x: T): Ct[T] =
   (Ct[T])(x)
 
+template ct*(x: auto, T: typedesc[BaseUint]): Ct[T] =
+  (Ct[T])(x)
+
 template `$`*[T](x: Ct[T]): string =
   $T(x)
 
@@ -119,60 +122,17 @@ template `-`*[T: Ct](x: T): T =
     {.emit:[neg, " = -", x, ";"].}
     neg
 
-# ############################################################
-#
-#                           Bit hacks
-#
-# ############################################################
-
-template isMsbSet*[T: Ct](x: T): CTBool[T] =
-  ## Returns the most significant bit of an integer
-  const msb_pos = T.sizeof * 8 - 1
-  (CTBool[T])(x shr msb_pos)
-
-func log2*(x: uint32): uint32 =
-  ## Find the log base 2 of a 32-bit or less integer.
-  ## using De Bruijn multiplication
-  ## Works at compile-time, guaranteed constant-time.
-  ## Note: at runtime BitScanReverse or CountLeadingZero are more efficient
-  ##       but log2 is never needed at runtime.
-  # https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
-  const lookup: array[32, uint8] = [0'u8, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18,
-    22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31]
-  var v = x
-  v = v or v shr 1 # first round down to one less than a power of 2
-  v = v or v shr 2
-  v = v or v shr 4
-  v = v or v shr 8
-  v = v or v shr 16
-  lookup[(v * 0x07C4ACDD'u32) shr 27]
-
-func log2*(x: uint64): uint64 {.inline, noSideEffect.} =
-  ## Find the log base 2 of a 32-bit or less integer.
-  ## using De Bruijn multiplication
-  ## Works at compile-time, guaranteed constant-time.
-  ## Note: at runtime BitScanReverse or CountLeadingZero are more efficient
-  ##       but log2 is never needed at runtime.
-  # https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
-  const lookup: array[64, uint8] = [0'u8, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54,
-    33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, 62,
-    57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31,
-    35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5, 63]
-  var v = x
-  v = v or v shr 1 # first round down to one less than a power of 2
-  v = v or v shr 2
-  v = v or v shr 4
-  v = v or v shr 8
-  v = v or v shr 16
-  v = v or v shr 32
-  lookup[(v * 0x03F6EAF2CD271461'u64) shr 58]
-
 # ############################################################
 #
 #             Hardened Boolean primitives
 #
 # ############################################################
 
+template isMsbSet[T: Ct](x: T): CTBool[T] =
+  ## Returns the most significant bit of an integer
+  const msb_pos = T.sizeof * 8 - 1
+  (CTBool[T])(x shr msb_pos)
+
 template fmap[T: Ct](x: CTBool[T], op: untyped, y: CTBool[T]): CTBool[T] =
   CTBool[T](op(T(x), T(y)))
 
diff --git a/constantine/primitives/extended_precision.nim b/constantine/primitives/extended_precision.nim
index 22a91f6..ae0bc6f 100644
--- a/constantine/primitives/extended_precision.nim
+++ b/constantine/primitives/extended_precision.nim
@@ -12,7 +12,10 @@
 #
 # ############################################################
 
-import ./constant_time_types, ./addcarry_subborrow
+import
+  ./constant_time_types,
+  ./addcarry_subborrow,
+  ./constant_time
 
 # ############################################################
 #
@@ -88,7 +91,7 @@ when sizeof(int) == 8:
       from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2
     else:
       from ./extended_precision_64bit_uint128 import unsafeDiv2n1n, mul, muladd1, muladd2
-  export unsafeDiv2n1n, muladd1, muladd2
+  export unsafeDiv2n1n, mul, muladd1, muladd2
 
 # ############################################################
 #
@@ -126,3 +129,12 @@ func mulDoubleAdd2*[T: Ct[uint32]|Ct[uint64]](r2: var Carry, r1, r0: var T, a, b
   # result at most in (carry: 1, hi: 0xFFFFFFFF_FFFFFFFF, lo: 0x00000000_00000000)
   addC(carry, r0, r0, dLo, Carry(0))
   addC(carry, r1, r1, T(dHi), carry)
+
+func mulAcc*[T: Ct[uint32]|Ct[uint64]](t, u, v: var T, a, b: T) {.inline.} =
+  ## (t, u, v) <- (t, u, v) + a * b
+  var UV: array[2, T]
+  var carry: Carry
+  mul(UV[1], UV[0], a, b)
+  addC(carry, v, v, UV[0], Carry(0))
+  addC(carry, u, u, UV[1], carry)
+  t += T(carry)
diff --git a/constantine/primitives/research/addcarry_subborrow_compiler.nim b/constantine/primitives/research/addcarry_subborrow_compiler.nim
index 63d2523..3391577 100644
--- a/constantine/primitives/research/addcarry_subborrow_compiler.nim
+++ b/constantine/primitives/research/addcarry_subborrow_compiler.nim
@@ -17,7 +17,7 @@
 #
 # This overcome the bad GCC codegen aven with addcary_u64 intrinsic.
 
-import macros
+import std/macros
 
 func wordsRequired(bits: int): int {.compileTime.} =
   ## Compute the number of limbs required
@@ -86,7 +86,7 @@ func `+=`(a: var BigInt, b: BigInt) {.noinline.}=
 
 # #############################################
 when isMainModule:
-  import random
+  import std/random
   proc rand(T: typedesc[BigInt]): T =
     for i in 0 ..< result.limbs.len:
       result.limbs[i] = uint64(rand(high(int)))
diff --git a/constantine/tower_field_extensions/quadratic_extensions.nim b/constantine/tower_field_extensions/quadratic_extensions.nim
index f7a6a12..f0b1538 100644
--- a/constantine/tower_field_extensions/quadratic_extensions.nim
+++ b/constantine/tower_field_extensions/quadratic_extensions.nim
@@ -8,7 +8,6 @@
 
 import
   ../arithmetic,
-  ../config/common,
   ../primitives,
   ./tower_common
 
diff --git a/formal_verification/bls12_381_q_64.nim b/formal_verification/bls12_381_q_64.nim
index b5da75e..07cb51c 100644
--- a/formal_verification/bls12_381_q_64.nim
+++ b/formal_verification/bls12_381_q_64.nim
@@ -128,7 +128,7 @@ func fromHex(output: var openArray[byte], hexStr: string, order: static[Endianne
 # -------------------------------------------------------------------------
 
 when isMainModule:
-  import random, std/monotimes, times, strformat, ../benchmarks/platforms
+  import std/[random, monotimes, times, strformat], ../benchmarks/platforms
 
   const Iters = 1_000_000
   const InvIters = 1000
diff --git a/helpers/static_for.nim b/helpers/static_for.nim
index 0232664..ffedcad 100644
--- a/helpers/static_for.nim
+++ b/helpers/static_for.nim
@@ -1,4 +1,4 @@
-import macros
+import std/macros
 
 proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode =
   # Replace "what" ident node by "by"
diff --git a/sage/curve_family_bls12.sage b/sage/curve_family_bls12.sage
index a8324a0..bda3566 100644
--- a/sage/curve_family_bls12.sage
+++ b/sage/curve_family_bls12.sage
@@ -30,11 +30,54 @@ def compute_curve_characteristic(u_str):
   else:
       print('  Parameter u (hex):  0x' + u.hex())
 
+  print(f'  p mod 3:           ' + str(p % 3))
   print(f'  p mod 4:           ' + str(p % 4))
   print(f'  p mod 8:           ' + str(p % 8))
   print(f'  p mod 12:          ' + str(p % 12))
   print(f'  p mod 16:          ' + str(p % 16))
 
+  print()
+
+  print(f'  Endomorphism-based acceleration when p mod 3 == 1')
+  print(f'    Endomorphism can be field multiplication by one of the non-trivial cube root of unity 𝜑')
+  print(f'      Rationale:')
+  print(f'        curve equation is y² = x³ + b, and y² = (x𝜑)³ + b <=> y² = x³ + b (with 𝜑³ == 1) so we are still on the curve')
+  print(f'        this means that multiplying by 𝜑 the x-coordinate is equivalent to a scalar multiplication by some λᵩ')
+  print(f'        with λᵩ² + λᵩ + 1 ≡ 0 (mod CurveOrder), see below. Hence we have a 2 dimensional decomposition of the scalar multiplication')
+  print(f'        i.e. For any [s]P, we can find a corresponding [k1]P + [k2][λᵩ]P with [λᵩ]P being a simple field multiplication by 𝜑')
+  print(f'      Finding cube roots:')
+  print(f'         x³−1=0 <=> (x−1)(x²+x+1) = 0, if x != 1, x solves (x²+x+1) = 0 <=> x = (-1±√3)/2')
+  print(f'         cube roots of unity: ' + str(['0x' + Integer(root).hex() for root in GF(p)(1).nth_root(3, all=True)]))
+  print(f'    GLV-2 decomposition of s into (k1, k2) on G1')
+  print(f'      (k1, k2) = (s, 0) - 𝛼1 b1 - 𝛼2 b2')
+  print(f'      𝛼i = 𝛼\u0302i * s / r')
+  print(f'        Lattice b1: ' + str(['0x' + b.hex() for b in [u^2-1, -1]]))
+  print(f'        Lattice b2: ' + str(['0x' + b.hex() for b in [1, u^2]]))
+
+  # Babai rounding
+  ahat1 = u^2
+  ahat2 = 1
+  # We want a1 = ahat1 * s/r with m = 2 (for a 2-dim decomposition) and r the curve order
+  # To handle rounding errors we instead multiply by
+  # 𝜈 = (2^WordBitWidth)^w (i.e. the same as the R magic constant for Montgomery arithmetic)
+  # with 𝜈 > r and w minimal so that 𝜈 > r
+  # a1 = ahat1*𝜈/r * s/𝜈
+  v = int(r).bit_length()
+  print(f'      r.bit_length(): {v}')
+  v = int(((v + 64 - 1) // 64) * 64) # round to next multiple of 64
+  print(f'      𝜈 > r, 𝜈: 2^{v}')
+  print(f'      Babai roundings')
+  print(f'        𝛼\u03021: ' + '0x' + ahat1.hex())
+  print(f'        𝛼\u03022: ' + '0x' + ahat2.hex())
+  print(f'      Handle rounding errors')
+  print(f'        𝛼\u03051 = 𝛼\u03021 * s / r with 𝛼1 = (𝛼\u03021 * 𝜈/r) * s/𝜈')
+  print(f'        𝛼\u03052 = 𝛼\u03022 * s / r with 𝛼2 = (𝛼\u03022 * 𝜈/r) * s/𝜈')
+  print(f'        -----------------------------------------------------')
+  l1 = Integer(ahat1 << v) // r
+  l2 = Integer(ahat2 << v) // r
+  print(f'        𝛼1 = (0x{l1.hex()} * s) >> {v}')
+  print(f'        𝛼2 = (0x{l2.hex()} * s) >> {v}')
+
 if __name__ == "__main__":
   # Usage
   # sage sage/curve_family_bls12.sage '-(2^63 + 2^62 + 2^60 + 2^57 + 2^48 + 2^16)'
diff --git a/sage/curve_family_bn.sage b/sage/curve_family_bn.sage
index 2b81286..a546487 100644
--- a/sage/curve_family_bn.sage
+++ b/sage/curve_family_bn.sage
@@ -22,22 +22,68 @@ def compute_curve_characteristic(u_str):
   r = 36*u^4 + 36*u^3 + 18*u^2 + 6*u + 1
 
   print(f'BN family - {p.nbits()} bits')
-  print('  Prime modulus:     0x' + p.hex())
-  print('  Curve order:       0x' + r.hex())
+  print('  Prime modulus p:     0x' + p.hex())
+  print('  Curve order r:       0x' + r.hex())
   print('  Parameter u:       ' + u_str)
   if u < 0:
       print('  Parameter u (hex): -0x' + (-u).hex())
   else:
       print('  Parameter u (hex):  0x' + u.hex())
 
+  print(f'  p mod 3:           ' + str(p % 3))
   print(f'  p mod 4:           ' + str(p % 4))
   print(f'  p mod 8:           ' + str(p % 8))
   print(f'  p mod 12:          ' + str(p % 12))
   print(f'  p mod 16:          ' + str(p % 16))
 
+  print()
+
+  print(f'  Endomorphism-based acceleration when p mod 3 == 1')
+  print(f'    Endomorphism can be field multiplication by one of the non-trivial cube root of unity 𝜑')
+  print(f'      Rationale:')
+  print(f'        curve equation is y² = x³ + b, and y² = (x𝜑)³ + b <=> y² = x³ + b (with 𝜑³ == 1) so we are still on the curve')
+  print(f'        this means that multiplying by 𝜑 the x-coordinate is equivalent to a scalar multiplication by some λᵩ')
+  print(f'        with λᵩ² + λᵩ + 1 ≡ 0 (mod r) and 𝜑² + 𝜑 + 1 ≡ 0 (mod p), see below.')
+  print(f'        Hence we have a 2 dimensional decomposition of the scalar multiplication')
+  print(f'        i.e. For any [s]P, we can find a corresponding [k1]P + [k2][λᵩ]P with [λᵩ]P being a simple field multiplication by 𝜑')
+  print(f'      Finding cube roots:')
+  print(f'         x³−1=0 <=> (x−1)(x²+x+1) = 0, if x != 1, x solves (x²+x+1) = 0 <=> x = (-1±√3)/2')
+  print(f'         cube roots of unity 𝜑 (mod p): ' + str(['0x' + Integer(root).hex() for root in GF(p)(1).nth_root(3, all=True)]))
+  print(f'         cube roots of unity λᵩ (mod r): ' + str(['0x' + Integer(root).hex() for root in GF(r)(1).nth_root(3, all=True)]))
+  print(f'    GLV-2 decomposition of s into (k1, k2) on G1')
+  print(f'      (k1, k2) = (s, 0) - 𝛼1 b1 - 𝛼2 b2')
+  print(f'      𝛼i = 𝛼\u0302i * s / r')
+  print(f'        Lattice b1: ' + str(['0x' + b.hex() for b in [2*u+1, 6*u^2+4*u+1]]))
+  print(f'        Lattice b2: ' + str(['0x' + b.hex() for b in [6*u^2+2*u, -2*u-1]]))
+
+  # Babai rounding
+  ahat1 = 2*u+1
+  ahat2 = 6*u^2+4*u+1
+  # We want a1 = ahat1 * s/r with m = 2 (for a 2-dim decomposition) and r the curve order
+  # To handle rounding errors we instead multiply by
+  # 𝜈 = (2^WordBitWidth)^w (i.e. the same as the R magic constant for Montgomery arithmetic)
+  # with 𝜈 > r and w minimal so that 𝜈 > r
+  # a1 = ahat1*𝜈/r * s/𝜈
+  v = int(r).bit_length()
+  print(f'      r.bit_length(): {v}')
+  v = int(((v + 64 - 1) // 64) * 64) # round to next multiple of 64
+  print(f'      𝜈 > r, 𝜈: 2^{v}')
+  print(f'      Babai roundings')
+  print(f'        𝛼\u03021: ' + '0x' + ahat1.hex())
+  print(f'        𝛼\u03022: ' + '0x' + ahat2.hex())
+  print(f'      Handle rounding errors')
+  print(f'        𝛼1 = 𝛼\u03021 * s / r with 𝛼1 = (𝛼\u03021 * 𝜈/r) * s/𝜈')
+  print(f'        𝛼2 = 𝛼\u03022 * s / r with 𝛼2 = (𝛼\u03022 * 𝜈/r) * s/𝜈')
+  print(f'        -----------------------------------------------------')
+  l1 = Integer(ahat1 << v) // r
+  l2 = Integer(ahat2 << v) // r
+  print(f'        𝛼1 = (0x{l1.hex()} * s) >> {v}')
+  print(f'        𝛼2 = (0x{l2.hex()} * s) >> {v}')
+
 if __name__ == "__main__":
   # Usage
   # sage sage/curve_family_bn.sage '-(2^62 + 2^55 + 1)'
+  # sage sage/curve_family_bn.sage 4965661367192848881
 
   from argparse import ArgumentParser
 
diff --git a/sage/lattice_decomposition_bls12_381_g1.sage b/sage/lattice_decomposition_bls12_381_g1.sage
new file mode 100644
index 0000000..3d082c8
--- /dev/null
+++ b/sage/lattice_decomposition_bls12_381_g1.sage
@@ -0,0 +1,202 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#                    BN254 GLV Endomorphism
+#                     Lattice Decomposition
+#
+# ############################################################
+
+# Parameters
+u = -(2^63 + 2^62 + 2^60 + 2^57 + 2^48 + 2^16)
+p = (u - 1)^2 * (u^4 - u^2 + 1)//3 + u
+r = u^4 - u^2 + 1
+cofactor = Integer('0x396c8c005555e1568c00aaab0000aaab')
+print('p  : ' + p.hex())
+print('r  : ' + r.hex())
+
+# Cube root of unity (mod r) formula for any BLS12 curves
+lambda1_r = u^2 - 1
+assert lambda1_r^3 % r == 1
+print('λᵩ1  : ' + lambda1_r.hex())
+print('λᵩ1+r: ' + (lambda1_r+r).hex())
+
+lambda2_r = u^4
+assert lambda2_r^3 % r == 1
+print('λᵩ2  : ' + lambda2_r.hex())
+
+# Finite fields
+F       = GF(p)
+# K2.<u>  = PolynomialRing(F)
+# F2.<beta>  = F.extension(u^2+9)
+# K6.<v>  = PolynomialRing(F2)
+# F6.<eta>  = F2.extension(v^3-beta)
+# K12.<w> = PolynomialRing(F6)
+# K12.<gamma> = F6.extension(w^2-eta)
+
+# Curves
+b = 4
+G1 = EllipticCurve(F, [0, b])
+# G2 = EllipticCurve(F2, [0, b*beta])
+
+(phi1, phi2) = (root for root in GF(p)(1).nth_root(3, all=True) if root != 1)
+print('𝜑1  :' + Integer(phi1).hex())
+print('𝜑2  :' + Integer(phi2).hex())
+assert phi1^3 % p == 1
+assert phi2^3 % p == 1
+
+# Test generator
+set_random_seed(1337)
+
+# Check
+def checkEndo():
+    Prand = G1.random_point()
+    assert Prand != G1([0, 1, 0]) # Infinity
+
+    # Clear cofactor
+    P = Prand * cofactor
+
+    (Px, Py, Pz) = P
+    Qendo1 = G1([Px*phi1 % p, Py, Pz])
+    Qendo2 = G1([Px*phi2 % p, Py, Pz])
+
+    Q1 = lambda1_r * P
+    Q2 = lambda2_r * P
+
+    assert P != Q1
+    assert P != Q2
+
+    assert (F(Px)*F(phi1))^3 == F(Px)^3
+    assert (F(Px)*F(phi2))^3 == F(Px)^3
+
+    assert Q1 == Qendo2
+    assert Q2 == Qendo2
+
+    print('Endomorphism OK with 𝜑2')
+
+checkEndo()
+
+# Lattice
+b = [
+  [u^2-1, -1],
+  [1,  u^2]
+]
+# Babai rounding
+ahat = [u^2, 1]
+v = int(r).bit_length()
+v = int(((v + 64 - 1) // 64) * 64) # round to next multiple of 64
+
+l = [Integer(a << v) // r for a in ahat]
+print('𝛼\u03051: ' + l[0].hex())
+print('𝛼\u03052: ' + l[1].hex())
+
+def getGLV2_decomp(scalar):
+
+    a0 = (l[0] * scalar) >> v
+    a1 = (l[1] * scalar) >> v
+
+    k0 = scalar - a0 * b[0][0] - a1 * b[1][0]
+    k1 = 0      - a0 * b[0][1] - a1 * b[1][1]
+
+    assert int(k0).bit_length() <= (int(r).bit_length() + 1) // 2
+    assert int(k1).bit_length() <= (int(r).bit_length() + 1) // 2
+
+    assert scalar == (k0 + k1 * (lambda1_r % r)) % r
+    assert scalar == (k0 + k1 * (lambda2_r % r)) % r
+
+    return k0, k1
+
+def recodeScalars(k):
+    m = 2
+    L = ((int(r).bit_length() + m-1) // m) + 1 # l = ⌈log2 r/m⌉ + 1
+
+    b = [[0] * L, [0] * L]
+    b[0][L-1] = 1
+    for i in range(0, L-1): # l-2 inclusive
+        b[0][i] = 2 * ((k[0] >> (i+1)) & 1) - 1
+    for j in range(1, m):
+        for i in range(0, L):
+            b[j][i] = b[0][i] * (k[j] & 1)
+            k[j] = (k[j]//2) - (b[j][i] // 2)
+
+    return b
+
+def buildLut(P0, P1):
+    m = 2
+    lut = [0] * (1 << (m-1))
+    lut[0] = P0
+    lut[1] = P0 + P1
+    return lut
+
+def pointToString(P):
+    (Px, Py, Pz) = P
+    return '(x: ' + Integer(Px).hex() + ', y: ' + Integer(Py).hex() + ', z: ' + Integer(Pz).hex() + ')'
+
+def scalarMulGLV(scalar, P0):
+    m = 2
+    L = ((int(r).bit_length() + m-1) // m) + 1 # l = ⌈log2 r/m⌉ + 1
+
+    print('L: ' + str(L))
+
+    print('scalar: ' + Integer(scalar).hex())
+
+    k0, k1 = getGLV2_decomp(scalar)
+    print('k0: ' + k0.hex())
+    print('k1: ' + k1.hex())
+
+    P1 = (lambda1_r % r) * P0
+    (Px, Py, Pz) = P0
+    P1_endo = G1([Px*phi2 % p, Py, Pz])
+    assert P1 == P1_endo
+
+    expected = scalar * P0
+    decomp = k0*P0 + k1*P1
+    assert expected == decomp
+
+    print('------ recode scalar -----------')
+    even = k0 & 1 == 1
+    if even:
+        k0 -= 1
+
+    b = recodeScalars([k0, k1])
+    print('b0: ' + str(list(reversed(b[0]))))
+    print('b1: ' + str(list(reversed(b[1]))))
+
+    print('------------ lut ---------------')
+
+    lut = buildLut(P0, P1)
+
+    print('------------ mul ---------------')
+    print('b0 L-1: ' + str(b[0][L-1]))
+    Q = b[0][L-1] * lut[b[1][L-1] & 1]
+    for i in range(L-2, -1, -1):
+        Q *= 2
+        Q += b[0][i] * lut[b[1][i] & 1]
+
+    if even:
+        Q += P0
+
+    print('final Q: ' + pointToString(Q))
+    print('expected: ' + pointToString(expected))
+    assert Q == expected # TODO debug
+
+# Test generator
+set_random_seed(1337)
+
+for i in range(1):
+    print('---------------------------------------')
+    # scalar = randrange(r) # Pick an integer below curve order
+    # P = G1.random_point()
+    scalar = Integer('0xf7e60a832eb77ac47374bc93251360d6c81c21add62767ff816caf11a20d8db')
+    P = G1([
+        Integer('0xf9679bb02ee7f352fff6a6467a5e563ec8dd38c86a48abd9e8f7f241f1cdd29d54bc3ddea3a33b62e0d7ce22f3d244a'),
+        Integer('0x50189b992cf856846b30e52205ff9ef72dc081e9680726586231cbc29a81a162120082585f401e00382d5c86fb1083f'),
+        Integer(1)
+    ])
+    scalarMulGLV(scalar, P)
diff --git a/sage/lattice_decomposition_bn254_snarks_g1.sage b/sage/lattice_decomposition_bn254_snarks_g1.sage
new file mode 100644
index 0000000..ab2fb49
--- /dev/null
+++ b/sage/lattice_decomposition_bn254_snarks_g1.sage
@@ -0,0 +1,193 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#                    BN254 GLV Endomorphism
+#                     Lattice Decomposition
+#
+# ############################################################
+
+# Parameters
+u = Integer('0x44E992B44A6909F1')
+p = 36*u^4 + 36*u^3 + 24*u^2 + 6*u + 1
+r = 36*u^4 + 36*u^3 + 18*u^2 + 6*u + 1
+cofactor = 1
+
+# Cube root of unity (mod r) formula for any BN curves
+lambda1_r = (-(36*u^3+18*u^2+6*u+2))
+assert lambda1_r^3 % r == 1
+print('λᵩ1  : ' + lambda1_r.hex())
+print('λᵩ1+r: ' + (lambda1_r+r).hex())
+
+lambda2_r = (36*u^4-1)
+assert lambda2_r^3 % r == 1
+print('λᵩ2  : ' + lambda2_r.hex())
+
+# Finite fields
+F       = GF(p)
+# K2.<u>  = PolynomialRing(F)
+# F2.<beta>  = F.extension(u^2+9)
+# K6.<v>  = PolynomialRing(F2)
+# F6.<eta>  = F2.extension(v^3-beta)
+# K12.<w> = PolynomialRing(F6)
+# K12.<gamma> = F6.extension(w^2-eta)
+
+# Curves
+b = 3
+G1 = EllipticCurve(F, [0, b])
+# G2 = EllipticCurve(F2, [0, b/beta])
+
+(phi1, phi2) = (root for root in GF(p)(1).nth_root(3, all=True) if root != 1)
+print('𝜑1  :' + Integer(phi1).hex())
+print('𝜑2  :' + Integer(phi2).hex())
+
+# Test generator
+set_random_seed(1337)
+
+# Check
+def checkEndo():
+    P = G1.random_point()
+    assert P != G1([0, 1, 0]) # Infinity
+
+    (Px, Py, Pz) = P
+    Qendo1 = G1([Px*phi1 % p, Py, Pz])
+    Qendo2 = G1([Px*phi2 % p, Py, Pz])
+
+    Q1 = lambda1_r * P
+    Q2 = lambda2_r * P
+
+    assert P != Q1
+    assert P != Q2
+
+    assert (F(Px)*F(phi1))^3 == F(Px)^3
+    assert (F(Px)*F(phi2))^3 == F(Px)^3
+
+    assert Q1 == Qendo1
+    assert Q2 == Qendo1
+
+    print('Endomorphism OK with 𝜑1')
+
+checkEndo()
+
+# Lattice
+b = [
+  [2*u+1, 6*u^2+4*u+1],
+  [6*u^2+2*u,  -2*u-1]
+]
+# Babai rounding
+ahat = [2*u+1, 6*u^2+4*u+1]
+v = int(r).bit_length()
+v = int(((v + 64 - 1) // 64) * 64) # round to next multiple of 64
+
+l = [Integer(a << v) // r for a in ahat]
+
+def getGLV2_decomp(scalar):
+
+    a0 = (l[0] * scalar) >> v
+    a1 = (l[1] * scalar) >> v
+
+    k0 = scalar - a0 * b[0][0] - a1 * b[1][0]
+    k1 = 0      - a0 * b[0][1] - a1 * b[1][1]
+
+    assert int(k0).bit_length() <= (int(r).bit_length() + 1) // 2
+    assert int(k1).bit_length() <= (int(r).bit_length() + 1) // 2
+
+    assert scalar == (k0 + k1 * (lambda1_r % r)) % r
+    assert scalar == (k0 + k1 * (lambda2_r % r)) % r
+
+    return k0, k1
+
+def recodeScalars(k):
+    m = 2
+    l = ((int(r).bit_length() + m-1) // m) + 1 # l = ⌈log2 r/m⌉ + 1
+
+    b = [[0] * l, [0] * l]
+    b[0][l-1] = 1
+    for i in range(0, l-1): # l-2 inclusive
+        b[0][i] = 2 * ((k[0] >> (i+1)) & 1) - 1
+    for j in range(1, m):
+        for i in range(0, l):
+            b[j][i] = b[0][i] * (k[j] & 1)
+            k[j] = (k[j]//2) - (b[j][i] // 2)
+
+    return b
+
+def buildLut(P0, P1):
+    m = 2
+    lut = [0] * (1 << (m-1))
+    lut[0] = P0
+    lut[1] = P0 + P1
+    return lut
+
+def pointToString(P):
+    (Px, Py, Pz) = P
+    return '(x: ' + Integer(Px).hex() + ', y: ' + Integer(Py).hex() + ', z: ' + Integer(Pz).hex() + ')'
+
+def scalarMulGLV(scalar, P0):
+    m = 2
+    L = ((int(r).bit_length() + m-1) // m) + 1 # l = ⌈log2 r/m⌉ + 1
+
+    print('L: ' + str(L))
+
+    print('scalar: ' + Integer(scalar).hex())
+
+    k0, k1 = getGLV2_decomp(scalar)
+    print('k0: ' + k0.hex())
+    print('k1: ' + k1.hex())
+
+    P1 = (lambda1_r % r) * P0
+    (Px, Py, Pz) = P0
+    P1_endo = G1([Px*phi1 % p, Py, Pz])
+    assert P1 == P1_endo
+
+    expected = scalar * P0
+    decomp = k0*P0 + k1*P1
+    assert expected == decomp
+
+    print('------ recode scalar -----------')
+    even = k0 & 1 == 1
+    if even:
+        k0 -= 1
+
+    b = recodeScalars([k0, k1])
+    print('b0: ' + str(list(reversed(b[0]))))
+    print('b1: ' + str(list(reversed(b[1]))))
+
+    print('------------ lut ---------------')
+
+    lut = buildLut(P0, P1)
+
+    print('------------ mul ---------------')
+    print('b0 L-1: ' + str(b[0][L-1]))
+    Q = b[0][L-1] * lut[b[1][L-1] & 1]
+    for i in range(L-2, -1, -1):
+        Q *= 2
+        Q += b[0][i] * lut[b[1][i] & 1]
+
+    if even:
+        Q += P0
+
+    print('final Q: ' + pointToString(Q))
+    print('expected: ' + pointToString(expected))
+    assert Q == expected # TODO debug
+
+# Test generator
+set_random_seed(1337)
+
+for i in range(1):
+    print('---------------------------------------')
+    # scalar = randrange(r) # Pick an integer below curve order
+    # P = G1.random_point()
+    scalar = Integer('0x0e08a292f940cfb361cc82bc24ca564f51453708c9745a9cf8707b11c84bc448')
+    P = G1([
+        Integer('0x22d3af0f3ee310df7fc1a2a204369ac13eb4a48d969a27fcd2861506b2dc0cd7'),
+        Integer('0x1c994169687886ccd28dd587c29c307fb3cab55d796d73a5be0bbf9aab69912e'),
+        Integer(1)
+    ])
+    scalarMulGLV(scalar, P)
diff --git a/sage/testgen_bls12_381.sage b/sage/testgen_bls12_381.sage
index ee96098..d681f64 100644
--- a/sage/testgen_bls12_381.sage
+++ b/sage/testgen_bls12_381.sage
@@ -37,7 +37,11 @@ set_random_seed(1337)
 
 for i in range(10):
     print('---------------------------------------')
-    P = G1.random_point()
+    Prand = G1.random_point()
+
+    # Clear cofactor
+    P = Prand * cofactor
+
     (Px, Py, Pz) = P
     print('Px: ' + Integer(Px).hex())
     print('Py: ' + Integer(Py).hex())
@@ -50,7 +54,7 @@ for i in range(10):
     print('Qx: ' + Integer(Qx).hex())
     print('Qy: ' + Integer(Qy).hex())
     print('Qz: ' + Integer(Qz).hex())
-
+print('=========================================')
 
 # CurveOrder sanity check
 #
diff --git a/sage/testgen_bn254_snarks.sage b/sage/testgen_bn254_snarks.sage
index 0e9a5f0..9afd828 100644
--- a/sage/testgen_bn254_snarks.sage
+++ b/sage/testgen_bn254_snarks.sage
@@ -50,6 +50,7 @@ for i in range(10):
     print('Qx: ' + Integer(Qx).hex())
     print('Qy: ' + Integer(Qy).hex())
     print('Qz: ' + Integer(Qz).hex())
+print('=========================================')
 
 # CurveOrder sanity check
 #
diff --git a/tests/test_bigints.nim b/tests/test_bigints.nim
index ed4f50f..45726bb 100644
--- a/tests/test_bigints.nim
+++ b/tests/test_bigints.nim
@@ -6,7 +6,7 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import  unittest,
+import  std/unittest,
         ../constantine/io/io_bigints,
         ../constantine/arithmetic,
         ../constantine/config/common,
@@ -138,6 +138,131 @@ proc mainArith() =
         discard a.add(SecretWord 1)
         check: bool(a == expected)
 
+  suite "Multi-precision multiplication":
+    test "Same size operand into double size result":
+      block:
+        var r: BigInt[256]
+        let a = BigInt[128].fromHex"0x12345678_FF11FFAA_00321321_CAFECAFE"
+        let b = BigInt[128].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF"
+
+        let expected = BigInt[256].fromHex"fd5bdef43d64113f371ab5d8843beca889c07fd549b84d8a5001a8f102e0722"
+
+        r.prod(a, b)
+        check: bool(r == expected)
+        r.prod(b, a)
+        check: bool(r == expected)
+
+    test "Different size into large result":
+      block:
+        var r: BigInt[200]
+        let a = BigInt[29].fromHex"0x12345678"
+        let b = BigInt[128].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF"
+
+        let expected = BigInt[200].fromHex"fd5bdee65f787f665f787f665f787f65621ca08"
+
+        r.prod(a, b)
+        check: bool(r == expected)
+        r.prod(b, a)
+        check: bool(r == expected)
+
+    test "Destination is properly zero-padded if multiplicands are too short":
+      block:
+        var r = BigInt[200].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DE"
+        let a = BigInt[29].fromHex"0x12345678"
+        let b = BigInt[128].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF"
+
+        let expected = BigInt[200].fromHex"fd5bdee65f787f665f787f665f787f65621ca08"
+
+        r.prod(a, b)
+        check: bool(r == expected)
+        r.prod(b, a)
+        check: bool(r == expected)
+
+  suite "Multi-precision multiplication keeping only high words":
+    test "Same size operand into double size result - discard first word":
+      block:
+        var r: BigInt[256]
+        let a = BigInt[128].fromHex"0x12345678_FF11FFAA_00321321_CAFECAFE"
+        let b = BigInt[128].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF"
+
+        when WordBitWidth == 32:
+          let expected = BigInt[256].fromHex"fd5bdef43d64113f371ab5d8843beca889c07fd549b84d8a5001a8f"
+        else:
+          let expected = BigInt[256].fromHex"fd5bdef43d64113f371ab5d8843beca889c07fd549b84d8"
+
+        r.prod_high_words(a, b, 1)
+        check: bool(r == expected)
+        r.prod_high_words(b, a, 1)
+        check: bool(r == expected)
+
+    test "Same size operand into double size result - discard first 3 words":
+      block:
+        var r: BigInt[256]
+        let a = BigInt[128].fromHex"0x12345678_FF11FFAA_00321321_CAFECAFE"
+        let b = BigInt[128].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF"
+
+        when WordBitWidth == 32:
+          let expected = BigInt[256].fromHex"fd5bdef43d64113f371ab5d8843beca889c07fd"
+        else:
+          let expected = BigInt[256].fromHex"fd5bdef43d64113"
+
+        r.prod_high_words(a, b, 3)
+        check: bool(r == expected)
+        r.prod_high_words(b, a, 3)
+        check: bool(r == expected)
+
+    test "All lower words trigger a carry":
+      block:
+        var r: BigInt[256]
+        let a = BigInt[256].fromHex"0xFFFFF000_FFFFF111_FFFFFFFA_FFFFFFFF_FFFFFFFF_FFFFFFFF_FFFFFFFF_FFFFFFFF"
+        let b = BigInt[256].fromHex"0xFFFFFFFF_FFFFF222_FFFFFFFB_FFFFFFFF_FFFFFFFF_FFFFFFFF_FFFFFFFF_FFFFFFFF"
+
+        # Full product:
+        # fffff000_ffffe335_00ddc21a_00cf3972_00008109_00000013_ffffffff_fffffffe
+        # 00000fff_00001ccb_00000009_00000000_00000000_00000000_00000000_00000001
+        let expected = BigInt[256].fromHex"0xfffff000_ffffe335_00ddc21a_00cf3972_00008109_00000013_ffffffff_fffffffe"
+        when WordBitWidth == 32:
+          const startWord = 8
+        else:
+          const startWord = 4
+
+        r.prod_high_words(a, b, startWord)
+        check: bool(r == expected)
+        r.prod_high_words(b, a, startWord)
+        check: bool(r == expected)
+
+    test "Different size into large result":
+      block:
+        var r: BigInt[200]
+        let a = BigInt[29].fromHex"0x12345678"
+        let b = BigInt[128].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF"
+
+        when WordBitWidth == 32:
+          let expected = BigInt[200].fromHex"fd5bdee65f787f665f787f6"
+        else:
+          let expected = BigInt[200].fromHex"fd5bdee"
+
+        r.prod_high_words(a, b, 2)
+        check: bool(r == expected)
+        r.prod_high_words(b, a, 2)
+        check: bool(r == expected)
+
+    test "Destination is properly zero-padded if multiplicands are too short":
+      block:
+        var r = BigInt[200].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DE"
+        let a = BigInt[29].fromHex"0x12345678"
+        let b = BigInt[128].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF"
+
+        when WordBitWidth == 32:
+          let expected = BigInt[200].fromHex"fd5bdee65f787f665f787f6"
+        else:
+          let expected = BigInt[200].fromHex"fd5bdee"
+
+        r.prod_high_words(a, b, 2)
+        check: bool(r == expected)
+        r.prod_high_words(b, a, 2)
+        check: bool(r == expected)
+
   suite "Modular operations - small modulus":
     # Vectors taken from Stint - https://github.com/status-im/nim-stint
     test "100 mod 13":
diff --git a/tests/test_bigints_vs_gmp.nim b/tests/test_bigints_mod_vs_gmp.nim
similarity index 98%
rename from tests/test_bigints_vs_gmp.nim
rename to tests/test_bigints_mod_vs_gmp.nim
index fe71145..de7fb01 100644
--- a/tests/test_bigints_vs_gmp.nim
+++ b/tests/test_bigints_mod_vs_gmp.nim
@@ -8,7 +8,7 @@
 
 import
   # Standard library
-  random, macros, times, strutils,
+  std/[random, macros, times, strutils],
   # Third-party
   gmp, stew/byteutils,
   # Internal
@@ -148,7 +148,7 @@ proc main() =
       # Reexport as bigEndian for debugging
       discard mpz_export(aBuf[0].addr, aW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, a)
       discard mpz_export(mBuf[0].addr, mW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, m)
-      "\nModulus with operand\n" &
+      "\nModulus with operands\n" &
       "  a (" & align($aBits, 4) & "-bit):   " & aBuf.toHex & "\n" &
       "  m (" & align($mBits, 4) & "-bit):   " & mBuf.toHex & "\n" &
       "failed:" & "\n" &
diff --git a/tests/test_bigints_mul_high_words_vs_gmp.nim b/tests/test_bigints_mul_high_words_vs_gmp.nim
new file mode 100644
index 0000000..85c48f2
--- /dev/null
+++ b/tests/test_bigints_mul_high_words_vs_gmp.nim
@@ -0,0 +1,151 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Standard library
+  std/[random, macros, times, strutils],
+  # Third-party
+  gmp, stew/byteutils,
+  # Internal
+  ../constantine/io/io_bigints,
+  ../constantine/arithmetic,
+  ../constantine/primitives,
+  ../constantine/config/[common, type_bigint]
+
+# We test up to 1024-bit, more is really slow
+
+var bitSizeRNG {.compileTime.} = initRand(1234)
+
+macro testRandomModSizes(numSizes: static int, rBits, aBits, bBits, wordsStartIndex, body: untyped): untyped =
+  ## Generate `numSizes` random bit sizes known at compile-time to test against GMP
+  ## for A mod M
+  result = newStmtList()
+
+  for _ in 0 ..< numSizes:
+    let aBitsVal = bitSizeRNG.rand(126 .. 2048)
+    let bBitsVal = bitSizeRNG.rand(126 .. 2048)
+    let rBitsVal = bitSizeRNG.rand(62 .. 4096+128)
+    let wordsStartIndexVal = bitSizeRNG.rand(1 .. wordsRequired(4096+128))
+
+    result.add quote do:
+      block:
+        const `aBits` = `aBitsVal`
+        const `bBits` = `bBitsVal`
+        const `rBits` = `rBitsVal`
+        const `wordsStartIndex` = `wordsStartIndexVal`
+
+        block:
+          `body`
+
+const # https://gmplib.org/manual/Integer-Import-and-Export.html
+  GMP_WordLittleEndian {.used.} = -1'i32
+  GMP_WordNativeEndian {.used.} = 0'i32
+  GMP_WordBigEndian {.used.} = 1'i32
+
+  GMP_MostSignificantWordFirst = 1'i32
+  GMP_LeastSignificantWordFirst {.used.} = -1'i32
+
+proc main() =
+  var gmpRng: gmp_randstate_t
+  gmp_randinit_mt(gmpRng)
+  # The GMP seed varies between run so that
+  # test coverage increases as the library gets tested.
+  # This requires to dump the seed in the console or the function inputs
+  # to be able to reproduce a bug
+  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+  echo "GMP seed: ", seed
+  gmp_randseed_ui(gmpRng, seed)
+
+  var r, a, b: mpz_t
+  mpz_init(r)
+  mpz_init(a)
+  mpz_init(b)
+
+  testRandomModSizes(128, rBits, aBits, bBits, wordsStartIndex):
+    # echo "--------------------------------------------------------------------------------"
+    echo "Testing: random mul_high_words  r (", align($rBits, 4),
+      "-bit, keeping from ", wordsStartIndex,
+      " word index) <- a (", align($aBits, 4),
+      "-bit) * b (", align($bBits, 4), "-bit) (full mul bits: ", align($(aBits+bBits), 4),
+      "), r large enough? ", wordsRequired(rBits) >= wordsRequired(aBits+bBits) - wordsStartIndex
+
+    # Generate random value in the range 0 ..< 2^aBits
+    mpz_urandomb(a, gmpRng, aBits)
+    # Generate random modulus and ensure the MSB is set
+    mpz_urandomb(b, gmpRng, bBits)
+    mpz_setbit(r, aBits+bBits)
+
+    # discard gmp_printf(" -- %#Zx mod %#Zx\n", a.addr, m.addr)
+
+    #########################################################
+    # Conversion buffers
+    const aLen = (aBits + 7) div 8
+    const bLen = (bBits + 7) div 8
+
+    var aBuf: array[aLen, byte]
+    var bBuf: array[bLen, byte]
+
+    {.push warnings: off.} # deprecated csize
+    var aW, bW: csize # Word written by GMP
+    {.pop.}
+
+    discard mpz_export(aBuf[0].addr, aW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, a)
+    discard mpz_export(bBuf[0].addr, bW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, b)
+
+    # Since the modulus is using all bits, it's we can test for exact amount copy
+    doAssert aLen >= aW, "Expected at most " & $aLen & " bytes but wrote " & $aW & " for " & toHex(aBuf) & " (big-endian)"
+    doAssert bLen >= bW, "Expected at most " & $bLen & " bytes but wrote " & $bW & " for " & toHex(bBuf) & " (big-endian)"
+
+    # Build the bigint
+    let aTest = BigInt[aBits].fromRawUint(aBuf.toOpenArray(0, aW-1), bigEndian)
+    let bTest = BigInt[bBits].fromRawUint(bBuf.toOpenArray(0, bW-1), bigEndian)
+
+    #########################################################
+    # Multiplication + drop low words
+    mpz_mul(r, a, b)
+    var shift: mpz_t
+    mpz_init(shift)
+    r.mpz_tdiv_q_2exp(r, WordBitwidth * wordsStartIndex)
+
+    # If a*b overflow the result size we truncate
+    const numWords = wordsRequired(rBits)
+    when numWords < wordsRequired(aBits+bBits):
+      echo "  truncating from ", wordsRequired(aBits+bBits), " words to ", numWords, " (2^", WordBitwidth * numWords, ")"
+      r.mpz_tdiv_r_2exp(r, WordBitwidth * numWords)
+
+    # Constantine
+    var rTest: BigInt[rBits]
+    rTest.prod_high_words(aTest, bTest, wordsStartIndex)
+
+    #########################################################
+    # Check
+    const rLen = numWords * WordBitWidth
+    var rGMP: array[rLen, byte]
+    {.push warnings: off.} # deprecated csize
+    var rW: csize # Word written by GMP
+    {.pop.}
+    discard mpz_export(rGMP[0].addr, rW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, r)
+
+    var rConstantine: array[rLen, byte]
+    exportRawUint(rConstantine, rTest, bigEndian)
+
+    # Note: in bigEndian, GMP aligns left while constantine aligns right
+    doAssert rGMP.toOpenArray(0, rW-1) == rConstantine.toOpenArray(rLen-rW, rLen-1), block:
+      # Reexport as bigEndian for debugging
+      discard mpz_export(aBuf[0].addr, aW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, a)
+      discard mpz_export(bBuf[0].addr, bW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, b)
+      "\nMultiplication with operands\n" &
+      "  a (" & align($aBits, 4) & "-bit):   " & aBuf.toHex & "\n" &
+      "  b (" & align($bBits, 4) & "-bit):   " & bBuf.toHex & "\n" &
+      "  keeping words starting from:   " & $wordsStartIndex & "\n" &
+      "into r of size " & align($rBits, 4) & "-bit failed:" & "\n" &
+      "  GMP:            " & rGMP.toHex() & "\n" &
+      "  Constantine:    " & rConstantine.toHex() & "\n" &
+      "(Note that GMP aligns bytes left while constantine aligns bytes right)"
+
+main()
diff --git a/tests/test_bigints_mul_vs_gmp.nim b/tests/test_bigints_mul_vs_gmp.nim
new file mode 100644
index 0000000..9b34fb6
--- /dev/null
+++ b/tests/test_bigints_mul_vs_gmp.nim
@@ -0,0 +1,140 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Standard library
+  std/[random, macros, times, strutils],
+  # Third-party
+  gmp, stew/byteutils,
+  # Internal
+  ../constantine/io/io_bigints,
+  ../constantine/arithmetic,
+  ../constantine/primitives,
+  ../constantine/config/[common, type_bigint]
+
+# We test up to 1024-bit, more is really slow
+
+var bitSizeRNG {.compileTime.} = initRand(1234)
+
+macro testRandomModSizes(numSizes: static int, rBits, aBits, bBits, body: untyped): untyped =
+  ## Generate `numSizes` random bit sizes known at compile-time to test against GMP
+  ## for A mod M
+  result = newStmtList()
+
+  for _ in 0 ..< numSizes:
+    let aBitsVal = bitSizeRNG.rand(126 .. 2048)
+    let bBitsVal = bitSizeRNG.rand(126 .. 2048)
+    let rBitsVal = bitSizeRNG.rand(62 .. 4096+128)
+
+    result.add quote do:
+      block:
+        const `aBits` = `aBitsVal`
+        const `bBits` = `bBitsVal`
+        const `rBits` = `rBitsVal`
+        block:
+          `body`
+
+const # https://gmplib.org/manual/Integer-Import-and-Export.html
+  GMP_WordLittleEndian {.used.} = -1'i32
+  GMP_WordNativeEndian {.used.} = 0'i32
+  GMP_WordBigEndian {.used.} = 1'i32
+
+  GMP_MostSignificantWordFirst = 1'i32
+  GMP_LeastSignificantWordFirst {.used.} = -1'i32
+
+proc main() =
+  var gmpRng: gmp_randstate_t
+  gmp_randinit_mt(gmpRng)
+  # The GMP seed varies between run so that
+  # test coverage increases as the library gets tested.
+  # This requires to dump the seed in the console or the function inputs
+  # to be able to reproduce a bug
+  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+  echo "GMP seed: ", seed
+  gmp_randseed_ui(gmpRng, seed)
+
+  var r, a, b: mpz_t
+  mpz_init(r)
+  mpz_init(a)
+  mpz_init(b)
+
+  testRandomModSizes(128, rBits, aBits, bBits):
+    # echo "--------------------------------------------------------------------------------"
+    echo "Testing: random mul  r (", align($rBits, 4), "-bit) <- a (", align($aBits, 4), "-bit) * b (", align($bBits, 4), "-bit) (full mul bits: ", align($(aBits+bBits), 4), "), r large enough? ", rBits >= aBits+bBits
+
+    # Generate random value in the range 0 ..< 2^aBits
+    mpz_urandomb(a, gmpRng, aBits)
+    # Generate random modulus and ensure the MSB is set
+    mpz_urandomb(b, gmpRng, bBits)
+    mpz_setbit(r, aBits+bBits)
+
+    # discard gmp_printf(" -- %#Zx mod %#Zx\n", a.addr, m.addr)
+
+    #########################################################
+    # Conversion buffers
+    const aLen = (aBits + 7) div 8
+    const bLen = (bBits + 7) div 8
+
+    var aBuf: array[aLen, byte]
+    var bBuf: array[bLen, byte]
+
+    {.push warnings: off.} # deprecated csize
+    var aW, bW: csize # Word written by GMP
+    {.pop.}
+
+    discard mpz_export(aBuf[0].addr, aW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, a)
+    discard mpz_export(bBuf[0].addr, bW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, b)
+
+    # Since the modulus is using all bits, it's we can test for exact amount copy
+    doAssert aLen >= aW, "Expected at most " & $aLen & " bytes but wrote " & $aW & " for " & toHex(aBuf) & " (big-endian)"
+    doAssert bLen >= bW, "Expected at most " & $bLen & " bytes but wrote " & $bW & " for " & toHex(bBuf) & " (big-endian)"
+
+    # Build the bigint
+    let aTest = BigInt[aBits].fromRawUint(aBuf.toOpenArray(0, aW-1), bigEndian)
+    let bTest = BigInt[bBits].fromRawUint(bBuf.toOpenArray(0, bW-1), bigEndian)
+
+    #########################################################
+    # Multiplication
+    mpz_mul(r, a, b)
+
+    # If a*b overflow the result size we truncate
+    const numWords = wordsRequired(rBits)
+    when numWords < wordsRequired(aBits+bBits):
+      echo "  truncating from ", wordsRequired(aBits+bBits), " words to ", numWords, " (2^", WordBitwidth * numWords, ")"
+      r.mpz_tdiv_r_2exp(r, WordBitwidth * numWords)
+
+    # Constantine
+    var rTest: BigInt[rBits]
+    rTest.prod(aTest, bTest)
+
+    #########################################################
+    # Check
+    const rLen = numWords * WordBitWidth
+    var rGMP: array[rLen, byte]
+    {.push warnings: off.} # deprecated csize
+    var rW: csize # Word written by GMP
+    {.pop.}
+    discard mpz_export(rGMP[0].addr, rW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, r)
+
+    var rConstantine: array[rLen, byte]
+    exportRawUint(rConstantine, rTest, bigEndian)
+
+    # Note: in bigEndian, GMP aligns left while constantine aligns right
+    doAssert rGMP.toOpenArray(0, rW-1) == rConstantine.toOpenArray(rLen-rW, rLen-1), block:
+      # Reexport as bigEndian for debugging
+      discard mpz_export(aBuf[0].addr, aW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, a)
+      discard mpz_export(bBuf[0].addr, bW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, b)
+      "\nMultiplication with operands\n" &
+      "  a (" & align($aBits, 4) & "-bit):   " & aBuf.toHex & "\n" &
+      "  b (" & align($bBits, 4) & "-bit):   " & bBuf.toHex & "\n" &
+      "into r of size " & align($rBits, 4) & "-bit failed:" & "\n" &
+      "  GMP:            " & rGMP.toHex() & "\n" &
+      "  Constantine:    " & rConstantine.toHex() & "\n" &
+      "(Note that GMP aligns bytes left while constantine aligns bytes right)"
+
+main()
diff --git a/tests/test_bigints_multimod.nim b/tests/test_bigints_multimod.nim
index e020b4c..3c5e5e8 100644
--- a/tests/test_bigints_multimod.nim
+++ b/tests/test_bigints_multimod.nim
@@ -8,7 +8,7 @@
 
 import
   # Standard library
-  unittest,
+  std/unittest,
   # Third-party
   ../constantine/io/io_bigints,
   ../constantine/arithmetic,
diff --git a/tests/test_ec_bls12_381.nim b/tests/test_ec_bls12_381.nim
index b487965..ffe8740 100644
--- a/tests/test_ec_bls12_381.nim
+++ b/tests/test_ec_bls12_381.nim
@@ -8,12 +8,12 @@
 
 import
   # Standard library
-  unittest, times,
+  std/[unittest, times],
   # Internals
   ../constantine/config/[common, curves],
   ../constantine/arithmetic,
   ../constantine/io/[io_bigints, io_ec],
-  ../constantine/elliptic/[ec_weierstrass_projective],
+  ../constantine/elliptic/[ec_weierstrass_projective, ec_scalar_mul, ec_endomorphism_accel],
   # Test utilities
   ./support/ec_reference_scalar_mult
 
@@ -33,119 +33,122 @@ proc test(
     var Q: EC
     let qOK = Q.fromHex(Qx, Qy)
 
-    let exponent = EC.F.C.matchingBigInt.fromHex(scalar)
+    let exponent = BigInt[EC.F.C.getCurveOrderBitwidth()].fromHex(scalar)
     var exponentCanonical: array[(exponent.bits+7) div 8, byte]
     exponentCanonical.exportRawUint(exponent, bigEndian)
 
     var
       impl = P
       reference = P
+      endo = P
       scratchSpace: array[1 shl 4, EC]
 
-    impl.scalarMul(exponentCanonical, scratchSpace)
+    impl.scalarMulGeneric(exponentCanonical, scratchSpace)
     reference.unsafe_ECmul_double_add(exponentCanonical)
+    endo.scalarMulGLV(exponent)
 
     doAssert: bool(Q == reference)
     doAssert: bool(Q == impl)
+    doAssert: bool(Q == endo)
 
-suite "BLS12_381 implementation (and unsafe reference impl) vs SageMath":
+suite "Scalar Multiplication (cofactor cleared): BLS12_381 implementation (and unsafe reference impl) vs SageMath":
   # Generated via sage sage/testgen_bls12_381.sage
   test(
     id = 1,
     EC = ECP_SWei_Proj[Fp[BLS12_381]],
-    Px = "f21eda282230f72b855d48055e68ab3825da87831fa5147a64fa071bade4c26bddd45e8b602e62df4d907414a6ec1b4",
-    Py = "531b38866cb35c19951f4a1ac62242f11fa714a1b99c6116a630fa75e7f4407fcd1ae9770a821c5899a777d341c915a",
+    Px = "f9679bb02ee7f352fff6a6467a5e563ec8dd38c86a48abd9e8f7f241f1cdd29d54bc3ddea3a33b62e0d7ce22f3d244a",
+    Py = "50189b992cf856846b30e52205ff9ef72dc081e9680726586231cbc29a81a162120082585f401e00382d5c86fb1083f",
     scalar = "f7e60a832eb77ac47374bc93251360d6c81c21add62767ff816caf11a20d8db",
-    Qx = "18d7ca3fb93d7300a0484233f3bac9bca00b45595a4b9caf66aa0b2237f6fd51559a24a634f3876451332c5f754438b2",
-    Qy = "edbb203999303fc99ef04368412da4b3555f999c703b425dedff3fdc799317c292751c46275b27990c53d933de2db63"
+    Qx = "c344f3bcc86df380186311fa502b7943a436a629380f8ee1960515522eedc58fe67ddd47615487668bcf12842c524d8",
+    Qy = "189e0c154f2631ad26e24ca73d84fb60a21d385fe205df04cf9f2f6fc0c3aa72afe9fbea71a930fa71d9bbfddb2fa571"
   )
 
   test(
     id = 2,
     EC = ECP_SWei_Proj[Fp[BLS12_381]],
-    Px = "9ca8e33d8a330b04b052af6cf44bf2ed08cc93d83a4eb48cbb0cabfe02ffb2ef910df44862b271354352f15b70e45b5",
-    Py = "102f6d07ef45f51de9a4ecef5ec34eae16833f4761c2ddfbe2b414173c3580721135e5bbb74269ab85ba83cb03020d9b",
+    Px = "17d71835ff84f150fabf5c77ac90bf7f6249143abd1f5d8a46a76f243d424d82e1e258fc7983ba8af97a2462adebe090",
+    Py = "d3e108ee1332067cbe4f4193eae10381acb69f493b40e53d9dee59506b49c6564c9056494a7f987982eb4069512c1c6",
     scalar = "5f10367bdae7aa872d90b5ac209321ce5a15181ce22848d032a8d452055cbfd0",
-    Qx = "a50d49e3d8757f994aae312dedd55205687c432bc9d97efbe69e87bef4256b87af1b665a669d06657cda6ff01ee42df",
-    Qy = "160d50aaa21f9d5b4faada77e4f91d8d4f152a0fcca4d30d271d74b20c1bba8638128f99f52d9603d4a24f8e27219bcd"
+    Qx = "21073bee733a07b15d83afcd4e6ee11b01e6137fd5ad4589c5045e12d79a9a9490a3ebc59f30633a60fc3635a3c1e51",
+    Qy = "eb7a97a9d3dfff1667b8fa559bdcdf37c7767e6afb8ca93ad9dd44feb93761e10aa2c4c1a79728a21cd4a6f705398b5"
   )
 
   test(
     id = 3,
     EC = ECP_SWei_Proj[Fp[BLS12_381]],
-    Px = "173c28687d23de83c950131e548485e8e09d4053d32b814d13b618ee4159e8b61bf6320148ddabcedf2b04d3c9787cd4",
-    Py = "277f935b4e0a90155915960c617f395dcadead1c7297cf92916add07308fc3f0493aa6dabf31d1f15953f56ac37d3d9",
+    Px = "f92c9572692e8f3d450483a7a9bb4694e3b54c9cd09441a4dd7f579b0a6984e47f8090c31c172b33d87f3de186d6b58",
+    Py = "286ede4cb2ae19ead4932d5550c5d3ec8ce3a3ada5e1ed6d202e93dd1b16d3513f0f9b62adc6323f18e272a426ee955",
     scalar = "4c321d72220c098fc0fd52306de98f8be9446bf854cf1e4d8dbae62375d18faf",
-    Qx = "16259e878b5921bbe1e5672cccea0f29fedbb93b8ce1bae4d4b602b6dd5708c6d4e5d82ff92868828c46fd333aadf82d",
-    Qy = "16d09713f4fe5705f2e3491aa9a1d5827fb3b280f5a1fdde0b01a2b75f5803d528d5f5603cc0e9da29d6a07b8e14df7c"
+    Qx = "4bb385e937582ae32aa7ba89632fcef2eace3f7b57309d979cf35298a430de9ef4d9ac5ba2335c1a4b6e7e5c38d0036",
+    Qy = "1801154d3a7b0daea772345b7f72a4c88c9677743f267da63490dad4dece2ecc9ec02d4d4d063086ee5d356aa2db914e"
   )
 
   test(
     id = 4,
     EC = ECP_SWei_Proj[Fp[BLS12_381]],
-    Px = "177d32dfa6e97daf5ea8aee520bc5c33b7bee37cba114fda52dd948030ad81abdffbdda402c930791e1048ad531b2c80",
-    Py = "14e9f915698dadd2220586a8bcc921b1f855273c3c0e41a88569e5a9fd2a4e886eeff9a7a11b02ec286987c5a52d55ce",
+    Px = "ec23ff3435b8ebd5e8e0a879d432e11eb974161664b1341fd28f1ffc4c228bf6ada2ae4a565f18c9b66f67a7573502d",
+    Py = "10c4b647be08db0b49b75320ae891f9f9c5d7bb7c798947e800d681d205d1b24b12e4dfa993d1bd16851b00356627cc1",
     scalar = "1738857afb76c55f615c2a20b44ca90dcb3267d804ec23fddea431dbee4eb37f",
-    Qx = "a4bfcfc65eb16562752f5c164349ef673477e19fe020de84eddbc2958f6d40bbbba39fc67ee8c8fdf007922fec97f79",
-    Qy = "106ccd382d15773e6097f8ea6f012cbec15184d6f4ea08bac2842ed419f0e555f1a43f7434b2e017f9e02971d07eb59d"
+    Qx = "dc7ae7801152918ee3c13590407b4242a80d0b855a0bf585d3dc30719601d2d5d9e01e99ae735003ecb7c20ef48265",
+    Qy = "142c01a6aa390426a4ce2f36df43f86442732c35d4e05e5b67f3623832944f0ea5a29138624cb939330652a3cfb282b5"
   )
 
   test(
     id = 5,
     EC = ECP_SWei_Proj[Fp[BLS12_381]],
-    Px = "1bc38e70e62770489063d3b7a3bebbc6735ab8dc389576ff03272c2883045aa051d74f8407920530b4c719b140cda81",
-    Py = "bd24e4fb09ed4098d61e3d2cb456f03d7818ded79dfba9cfe7956829797b12e10f1766c46c1a2e1cf2957295124c782",
+    Px = "df127083c2a5ef2388b02af913c0e4002a52a82db9e5ecbf23ee4f557d3b61c91ebcfe9d4973070b46bc5ea6897bca1",
+    Py = "318960aeea262ec23ffdd42ec1ba72ae6fa2186a1e2a0fc2659073fb7b5adfb50d581a4d998a94d1accf78b1b3a0163",
     scalar = "19c47811813444020c999a2b263940b5054cf45bb8ad8e086ff126bfcd5507e1",
-    Qx = "b310d4688f2c9f8cd4c030b62ed27341f4c71341fe9c56858a949a2d51670eb6ebe1339163bdb833e692b0ee0cf4e92",
-    Qy = "c92300561e1acb1e1ae6a1b75f83b9d2d2cb5f07c3f8ea945990ceb75e7ea12c4aec115227c13a05be92f5caed9268e"
+    Qx = "5f93c42fd76a29063efa2ee92607e0b3ae7edc4e419b3914661e5162d6beaeb96a34d2007ff817bc102651f61dca8d1",
+    Qy = "18dde8666bb1d0a379719d7d1b1512de809b70e49d9553303274ea872e56f7f39da551d6bcb7c57ae88ec7dc1fb354a4"
   )
 
   test(
     id = 6,
     EC = ECP_SWei_Proj[Fp[BLS12_381]],
-    Px = "48cddafaca93d33caf910a7a6f74dc3489d53da9fa2f940b70b6dcf538cc08da1a369809ab86a8ee49cead0ed6bfef6",
-    Py = "173f8dfb384aea011bed89aaca625085dc2940d0775e5f2647fc7574ce822643d0d7b1b39e9a51b9f5a0dca7486bddd0",
+    Px = "101123de23c0f240c583c2368c4118dc942db219c55f58cf54acd500c1fcfa06f651ad75319ebf840cbdb6bddea7fde4",
+    Py = "5268587d4b844b0708e0336d1bbf48da185aaf5b948eccc3b565d00a856dd55882b9bb31c52af0e275b168cb35eb7b0",
     scalar = "43ffcda71e45a3e90b7502d92b30a0b06c54c95a91aa21e0438677b1c2714ecb",
-    Qx = "ef1e4967a3eb19318a66d092eada9810bebf301c168cea7c73fad9d98f7d4c2bde1071fd142c3da90830509f22a82b5",
-    Qy = "da537922dcb6bf79e4d09237c1a3c5804e3a83b6f18ccb26991d50d77c81bef76139fa73d39c684c7c1616151b1058b"
+    Qx = "f9871b682c1c76c7f4f0a7ca57ad876c10dc108b65b76987264873278d9f54db95101c173aed06d07062efc7d47ca0c",
+    Qy = "20d9628d611e72a4251a1f2357d4f53e68e4915383b6a0d126273d216b1a8c5e2cb7b2688ad702ef1682f4c5228fcd9"
   )
 
   test(
     id = 7,
     EC = ECP_SWei_Proj[Fp[BLS12_381]],
-    Px = "9d56cb273bdeef945078066192b74d2f3077f00f5bd1a50b338c44f7c640005a614f9c6fc89cb4678140b2a721c69a8",
-    Py = "107b42b9a0c22b9e9cd2191b90fede2ab280532ea26806338a5b28533cf9431bde1a8010677a5078c63482953d4f2451",
+    Px = "1457ba1bae6eb3afae3261941c65c93e3ae7d784907d15b8d559100da5e13fd29e4a4d6e3103b781a95237b7b2d80a8e",
+    Py = "6a869a47cb48d01e7d29660932afd7617720262b55de5f430b8aa3d74f9fd2b9d3a07ce192425da58014764fc9532cd",
     scalar = "64ad0d6c36dba5368e71f0010aebf860288f54611e5aaf18082bae7a404ebfd8",
-    Qx = "e0c78d1e1ed993fdeb14e4872965bc90014aa39c728c457a720bf3123ebcdcb17ac553a619b9b7073ada436565d4bb4",
-    Qy = "c2d9ba441ed90bae4f1597da90e434f1668fda320e4fa04cddcdce0eacb3bc54185d5f7cde826f5bd0e3d59b2424906"
+    Qx = "93e540e26190e161038d985d40f2ab897cbc2346be7d8f2b201a689b59d4020a8740e252606f2f79ba0e121ccc9976d",
+    Qy = "10568d68f1b993aa1eded3869eda14e509f1cb4d8553bdf97feee175467cea4c0c1316fdb4e5a68440ad04b96b2d3bfc"
   )
 
   test(
     id = 8,
     EC = ECP_SWei_Proj[Fp[BLS12_381]],
-    Px = "150a83a868fa6a74dbc5658445ea99ec47009572f303ce1d3c76370804c5a8c26d40c8b4b35a6585612d704c5fb090cb",
-    Py = "31e73ed0aedebcf0b58d60c16f2e5ddd2d4eb2a6e34177939efcca0767cde241966b5950c3333c62ccddee51de26fe6",
+    Px = "2615f843e8fe68d4c337bcf83b2cf13cbae638edd0740f1eac520dc2146afa3b8d36c540878c1d207ef913634b1e593",
+    Py = "1787d6eeeceb6e7793073f0bbe7bae522529c126b650c43d5d41e732c581a57df1bfb818061b7b4e6c9145da5df2c43e",
     scalar = "b0ac3d0e685583075aa46c03a00859dfbec24ccb36e2cae3806d82275adcc03",
-    Qx = "9c5e69fbd492a64e5811af7cc69e42bc14d8626f6d384d3f479d8e06c20ec5f460a1e3839f33899b4a9e0ada876ac6e",
-    Qy = "16990d7d308897c74b87368f847df3ac0bb6609091c8d39b22d5778a4229f0bb92fea385d27db41e237dcfb0d05bd0e7"
+    Qx = "d95ed29c2e15fd2205d83a71478341d6022deb93af4d49f704437678a72ce141d2f6043aa0e34e26f60d17e16b97053",
+    Qy = "b37cbded112c84116b74ff311b10d148f3e203cb88d4a011b096c74cd2bfdb27255727de4aa8299ae10b32d661d48a7"
   )
 
   test(
     id = 9,
     EC = ECP_SWei_Proj[Fp[BLS12_381]],
-    Px = "69498486a06c18f836a8e9ed507bbb563d6d03545e03e08f628e8fbd2e5d098e58950071d516ffd044d92b3a8b07184",
-    Py = "18a169f06fc94f40cd131bdcd23e48e95b1276c0c8daacf56c3a5e278e89ee1094c94aa516113aa4a2455da149f0f989",
+    Px = "10bc0c4e1ed87246a9d4d7d38546369f275a245f6e1d3b882e8c9a7f05bc6ee8ff97a96a54084c2bef15ed8bfefb1465",
+    Py = "1782377e5f588576b5ab42fea224e88873dda957202f0c6d72ce8728c2d58dc654be77226fbda385d5f269354e4a176a",
     scalar = "23941bb3c3659423d6fdafb7cff52e0e02de0ac91e64c537c6203d64905b63d0",
-    Qx = "482e085550f5e514dd98f2d9b119c284ac165514d228c8f7a179f2b442968984873223af2255a499dc931c63543c0ba",
-    Qy = "151ce80ca51dd09243d2b1a7937096d6b7494e89190da5ab7604cd913dc4105c871e48c815fefadee2906b8b401e7e71"
+    Qx = "83f1e7e8bd963c1ccd837dae7bc9336531aaf0aee717537a9a7e2712e220f74cdb73a99f331c0eb6b377be3dafc211f",
+    Qy = "cd87773d072b1305dfc85c2983aecae2ab316e5e8f31306c32d58d6ce2e431b12685d18c58b6a35ad2113c5b689eeb"
   )
 
   test(
     id = 10,
     EC = ECP_SWei_Proj[Fp[BLS12_381]],
-    Px = "98cc20aa561769b7ee569304503a94752e236bba52938fed7f3093d5867f65361dc8b48c83bd7db490c26736196e20e",
-    Py = "10a68394358903122bd649bd30b473f4d3b4f0830bfe7da1c48ae87d9429d8fd26f5b4be8d8fd8e4214017044696da29",
+    Px = "be4f9f721d98a761a5562bd80ea06f369e9cbb7d33bbb2f0191d4b77d0fd2a10c4083b54157b525f36c522ca3a6ca09",
+    Py = "166c315ecdd20acb3c5efcc7e038b17d0b37a06ffbf77873f15fc0cd091a1e4102a8b8bf5507919453759e744391b04d",
     scalar = "4203156dcf70582ea8cbd0388104f47fd5a18ae336b2fed8458e1e4e74d7baf5",
-    Qx = "18ff1dfd96799b7d0bffaa7480121c3a719047815ae41419f1bd1fdd593288bed8827b3d9e45a3a1e01bf7d603b5ba0",
-    Qy = "49b95ca2c0f75dfb15fc07e5692d23f8eb38cb1cc9c48cd0e93a80adbff135a3945cc7a5d53d2b7510d6ee7cf97308d"
+    Qx = "c72bc7087cd22993b7f6d2e49026abfde678a384073ed373b95df722b1ab658eb5ae42211e5528af606e38b59511bc6",
+    Qy = "96d80593b42fe44e64793e490b1257af0aa26b36773aac93c3686fdb14975917cf60a1a19e32623218d0722dbb88a85"
   )
diff --git a/tests/test_ec_bn254.nim b/tests/test_ec_bn254.nim
index d553eff..6044477 100644
--- a/tests/test_ec_bn254.nim
+++ b/tests/test_ec_bn254.nim
@@ -8,12 +8,12 @@
 
 import
   # Standard library
-  unittest, times,
+  std/[unittest, times],
   # Internals
   ../constantine/config/[common, curves],
   ../constantine/arithmetic,
   ../constantine/io/[io_bigints, io_ec],
-  ../constantine/elliptic/[ec_weierstrass_projective],
+  ../constantine/elliptic/[ec_weierstrass_projective, ec_scalar_mul, ec_endomorphism_accel],
   # Test utilities
   ./support/ec_reference_scalar_mult
 
@@ -33,22 +33,25 @@ proc test(
     var Q: EC
     let qOK = Q.fromHex(Qx, Qy)
 
-    let exponent = EC.F.C.matchingBigInt.fromHex(scalar)
+    let exponent = BigInt[EC.F.C.getCurveOrderBitwidth()].fromHex(scalar)
     var exponentCanonical: array[(exponent.bits+7) div 8, byte]
     exponentCanonical.exportRawUint(exponent, bigEndian)
 
     var
       impl = P
       reference = P
+      endo = P
       scratchSpace: array[1 shl 4, EC]
 
-    impl.scalarMul(exponentCanonical, scratchSpace)
+    impl.scalarMulGeneric(exponentCanonical, scratchSpace)
     reference.unsafe_ECmul_double_add(exponentCanonical)
+    endo.scalarMulGLV(exponent)
 
     doAssert: bool(Q == reference)
     doAssert: bool(Q == impl)
+    doAssert: bool(Q == endo)
 
-suite "BN254 implementation (and unsafe reference impl) vs SageMath":
+suite "Scalar Multiplication: BN254 implementation (and unsafe reference impl) vs SageMath":
   # Generated via sage sage/testgen_bn254_snarks.sage
   test(
     id = 1,
diff --git a/tests/test_ec_weierstrass_projective_g1.nim b/tests/test_ec_weierstrass_projective_g1.nim
index ef0ac03..d901aa8 100644
--- a/tests/test_ec_weierstrass_projective_g1.nim
+++ b/tests/test_ec_weierstrass_projective_g1.nim
@@ -8,12 +8,12 @@
 
 import
   # Standard library
-  unittest, times,
+  std/[unittest, times],
   # Internals
   ../constantine/config/[common, curves],
   ../constantine/arithmetic,
   ../constantine/io/io_bigints,
-  ../constantine/elliptic/[ec_weierstrass_affine, ec_weierstrass_projective],
+  ../constantine/elliptic/[ec_weierstrass_affine, ec_weierstrass_projective, ec_scalar_mul],
   # Test utilities
   ../helpers/prng_unsafe,
   ./support/ec_reference_scalar_mult
@@ -194,7 +194,7 @@ suite "Elliptic curve in Short Weierstrass form y² = x³ + a x + b with project
           reference = a
           scratchSpace{.noInit.}: array[1 shl 4, ECP_SWei_Proj[F]]
 
-        impl.scalarMul(exponentCanonical, scratchSpace)
+        impl.scalarMulGeneric(exponentCanonical, scratchSpace)
         reference.unsafe_ECmul_double_add(exponentCanonical)
 
         check:
@@ -223,7 +223,7 @@ suite "Elliptic curve in Short Weierstrass form y² = x³ + a x + b with project
           reference = a
           scratchSpace{.noInit.}: array[1 shl 4, ECP_SWei_Proj[F]]
 
-        impl.scalarMul(exponentCanonical, scratchSpace)
+        impl.scalarMulGeneric(exponentCanonical, scratchSpace)
         reference.unsafe_ECmul_double_add(exponentCanonical)
 
         check:
@@ -256,7 +256,7 @@ suite "Elliptic curve in Short Weierstrass form y² = x³ + a x + b with project
           reference = a
           scratchSpace{.noInit.}: array[1 shl 4, ECP_SWei_Proj[F]]
 
-        impl.scalarMul(exponentCanonical, scratchSpace)
+        impl.scalarMulGeneric(exponentCanonical, scratchSpace)
         reference.unsafe_ECmul_double_add(exponentCanonical)
 
         check:
@@ -288,7 +288,7 @@ suite "Elliptic curve in Short Weierstrass form y² = x³ + a x + b with project
           reference = a
           scratchSpace{.noInit.}: array[1 shl 4, ECP_SWei_Proj[F]]
 
-        impl.scalarMul(exponentCanonical, scratchSpace)
+        impl.scalarMulGeneric(exponentCanonical, scratchSpace)
         reference.unsafe_ECmul_double_add(exponentCanonical)
 
         check:
@@ -323,20 +323,20 @@ suite "Elliptic curve in Short Weierstrass form y² = x³ + a x + b with project
         fImpl.sum(a, b)
         fReference.sum(a, b)
 
-        fImpl.scalarMul(exponentCanonical, scratchSpace)
+        fImpl.scalarMulGeneric(exponentCanonical, scratchSpace)
         fReference.unsafe_ECmul_double_add(exponentCanonical)
 
         # [k]a + [k]b - Distributed
         var kaImpl = a
         var kaRef = a
 
-        kaImpl.scalarMul(exponentCanonical, scratchSpace)
+        kaImpl.scalarMulGeneric(exponentCanonical, scratchSpace)
         kaRef.unsafe_ECmul_double_add(exponentCanonical)
 
         var kbImpl = b
         var kbRef = b
 
-        kbImpl.scalarMul(exponentCanonical, scratchSpace)
+        kbImpl.scalarMulGeneric(exponentCanonical, scratchSpace)
         kbRef.unsafe_ECmul_double_add(exponentCanonical)
 
         var kakbImpl{.noInit.}, kakbRef{.noInit.}: ECP_SWei_Proj[F]
@@ -370,7 +370,7 @@ suite "Elliptic curve in Short Weierstrass form y² = x³ + a x + b with project
           reference = a
           scratchSpace{.noInit.}: array[1 shl 4, ECP_SWei_Proj[F]]
 
-        impl.scalarMul(exponentCanonical, scratchSpace)
+        impl.scalarMulGeneric(exponentCanonical, scratchSpace)
         reference.unsafe_ECmul_double_add(exponentCanonical)
 
         check: bool(impl == reference)
diff --git a/tests/test_finite_fields.nim b/tests/test_finite_fields.nim
index f85f70b..fd169a4 100644
--- a/tests/test_finite_fields.nim
+++ b/tests/test_finite_fields.nim
@@ -6,13 +6,11 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import  unittest,
+import  std/unittest,
         ../constantine/arithmetic,
         ../constantine/io/io_fields,
         ../constantine/config/curves
 
-import ../constantine/io/io_bigints
-
 static: doAssert defined(testingCurves), "This modules requires the -d:testingCurves compile option"
 
 proc main() =
diff --git a/tests/test_finite_fields_mulsquare.nim b/tests/test_finite_fields_mulsquare.nim
index d44bf49..d04923c 100644
--- a/tests/test_finite_fields_mulsquare.nim
+++ b/tests/test_finite_fields_mulsquare.nim
@@ -6,12 +6,15 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import  std/unittest, std/times,
-        ../constantine/arithmetic,
-        ../constantine/io/[io_bigints, io_fields],
-        ../constantine/config/[curves, common],
-        # Test utilities
-        ../helpers/prng_unsafe
+import
+  # Standard library
+  std/[unittest, times],
+  # Internal
+  ../constantine/arithmetic,
+  ../constantine/io/[io_bigints, io_fields],
+  ../constantine/config/[curves, common],
+  # Test utilities
+  ../helpers/prng_unsafe
 
 const Iters = 128
 
diff --git a/tests/test_finite_fields_powinv.nim b/tests/test_finite_fields_powinv.nim
index bb59f24..9d29980 100644
--- a/tests/test_finite_fields_powinv.nim
+++ b/tests/test_finite_fields_powinv.nim
@@ -6,13 +6,16 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import  ../constantine/arithmetic,
-        ../constantine/io/[io_bigints, io_fields],
-        ../constantine/config/curves,
-        # Test utilities
-        ../helpers/prng_unsafe,
-        # Standard library
-        std/unittest, std/times
+import
+  # Standard library
+  std/[unittest, times],
+  # Internal
+  ../constantine/arithmetic,
+  ../constantine/io/[io_bigints, io_fields],
+  ../constantine/config/curves,
+  # Test utilities
+  ../helpers/prng_unsafe
+
 
 static: doAssert defined(testingCurves), "This modules requires the -d:testingCurves compile option"
 
diff --git a/tests/test_finite_fields_sqrt.nim b/tests/test_finite_fields_sqrt.nim
index c6b8150..4bffb71 100644
--- a/tests/test_finite_fields_sqrt.nim
+++ b/tests/test_finite_fields_sqrt.nim
@@ -6,14 +6,16 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import  ../constantine/[arithmetic, primitives],
-        ../constantine/io/[io_fields],
-        ../constantine/config/[curves, common],
-        # Test utilities
-        ../helpers/prng_unsafe,
-        # Standard library
-        std/tables,
-        std/unittest, std/times
+import
+  # Standard library
+  std/[tables, unittest, times],
+  # Internal
+  ../constantine/[arithmetic, primitives],
+  ../constantine/io/[io_fields],
+  ../constantine/config/[curves, common],
+  # Test utilities
+  ../helpers/prng_unsafe
+
 
 const Iters = 128
 
diff --git a/tests/test_finite_fields_vs_gmp.nim b/tests/test_finite_fields_vs_gmp.nim
index 518730f..63fa2ff 100644
--- a/tests/test_finite_fields_vs_gmp.nim
+++ b/tests/test_finite_fields_vs_gmp.nim
@@ -8,7 +8,7 @@
 
 import
   # Standard library
-  random, macros, times, strutils,
+  std/[random, macros, times, strutils],
   # Third-party
   gmp, stew/byteutils,
   # Internal
diff --git a/tests/test_precomputed.nim b/tests/test_precomputed.nim
new file mode 100644
index 0000000..b787af7
--- /dev/null
+++ b/tests/test_precomputed.nim
@@ -0,0 +1,37 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import  std/unittest,
+        ../constantine/arithmetic,
+        ../constantine/config/curves,
+        ../constantine/io/[io_bigints, io_fields]
+
+proc checkCubeRootOfUnity(curve: static Curve) =
+  test $curve & " cube root of unity (mod p)":
+    var cru = curve.getCubicRootOfUnity_mod_p()
+    cru.square()
+    cru *= curve.getCubicRootOfUnity_mod_p()
+
+    check: bool cru.isOne()
+
+  test $curve & " cube root of unity (mod r)":
+    var cru: BigInt[3 * curve.getCurveOrderBitwidth()]
+    cru.prod(curve.getCubicRootOfUnity_mod_r(), curve.getCubicRootOfUnity_mod_r())
+    cru.prod(cru, curve.getCubicRootOfUnity_mod_r())
+
+    var r: BigInt[curve.getCurveOrderBitwidth()]
+    r.reduce(cru, curve.getCurveOrder)
+
+    check: bool r.isOne()
+
+proc main() =
+  suite "Sanity checks on precomputed values":
+    checkCubeRootOfUnity(BN254_Snarks)
+    # checkCubeRootOfUnity(BLS12_381)
+
+main()