diff --git a/constantine/field_fp.nim b/constantine/field_fp.nim
index fa72a02..a38e9d6 100644
--- a/constantine/field_fp.nim
+++ b/constantine/field_fp.nim
@@ -16,6 +16,8 @@
 import
   ./word_types, ./bigints
 
+from ./private/word_types_internal import unsafe_div2n1n
+
 type
   Fp*[P: static BigInt] = object
     ## P is a prime number
@@ -64,63 +66,18 @@ func `+`*(a, b: Fp): Fp =
   ctl = ctl or not sub(result, Fp.P, False)
   sub(result, Fp.P, ctl)
 
-# ############################################################
-#
-#                Montgomery domain primitives
-#
-# ############################################################
+template scaleadd_impl(a: var Fp, c: Limb) =
+  ## Scale-accumulate
+  ##
+  ## With a word W = 2^LimbBitSize and a field Fp
+  ## Does a <- a * W + c (mod p)
 
-from bitops import fastLog2
-  # This will only be used at compile-time
-  # so no constant-time worries (it is constant-time if using the De Bruijn multiplication)
+  when Fp.P.bits <= LimbBitSize:
+    # If the prime fits in a single limb
+    var q: Limb
 
-func montyMagic*(M: static BigInt): static Limb =
-  ## Returns the Montgomery domain magic number for the input modulus:
-  ##   -1/M[0] mod LimbSize
-  ## M[0] is the least significant limb of M
-  ## M must be odd and greater than 2.
-
-  # Test vectors: https://www.researchgate.net/publication/4107322_Montgomery_modular_multiplication_architecture_for_public_key_cryptosystems
-  # on p354
-  # Reference C impl: http://www.hackersdelight.org/hdcodetxt/mont64.c.txt
-
-  # ######################################################################
-  # Implementation of modular multiplication inverse
-  # Assuming 2 positive integers a and m the modulo
-  #
-  # We are looking for z that solves `az ≡ 1 mod m`
-  #
-  # References:
-  #   - Knuth, The Art of Computer Programming, Vol2 p342
-  #   - Menezes, Handbook of Applied Cryptography (HAC), p610
-  #     http://cacr.uwaterloo.ca/hac/about/chap14.pdf
-
-  # Starting from the extended GCD formula (Bezout identity),
-  # `ax + by = gcd(x,y)` with input x,y and outputs a, b, gcd
-  # We assume a and m are coprimes, i.e. gcd is 1, otherwise no inverse
-  # `ax + my = 1` <=> `ax + my ≡ 1 mod m` <=> `ax ≡ 1 mod m`
-
-  # For Montgomery magic number, we are in a special case
-  # where a = M and m = 2^LimbSize.
-  # For a and m to be coprimes, a must be odd.
-
-  # M being a power of 2 greatly simplifies computation:
-  #  - https://crypto.stackexchange.com/questions/47493/how-to-determine-the-multiplicative-inverse-modulo-64-or-other-power-of-two
-  #  - http://groups.google.com/groups?selm=1994Apr6.093116.27805%40mnemosyne.cs.du.edu
-  #  - https://mumble.net/~campbell/2015/01/21/inverse-mod-power-of-two
-  #  - https://eprint.iacr.org/2017/411
-
-  # We have the following relation
-  # ax ≡ 1 (mod 2^k) <=> ax(2 - ax) ≡ 1 (mod 2^(2k))
-  #
-  # To get  -1/M0 mod LimbSize
-  # we can either negate the resulting x of `ax(2 - ax) ≡ 1 (mod 2^(2k))`
-  # or do ax(2 + ax) ≡ 1 (mod 2^(2k))
-
-  const
-    M0 = M.limbs[0]
-    k = fastLog2(LimbBitSize)
-
-  result = M0                # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
-  for _ in static(0 ..< k):
-    result *= 2 + M * result # x' = x(2 + ax) (`+` to avoid negating at the end)
+    # (hi, lo) = a * 2^63 + c
+    let hi = a[0] shr 1                            # 64 - 63 = 1
+    let lo = a[0] shl LimbBitSize or c             # Assumes most-significant bit in c is not set
+    unsafe_div2n1n(q, a[0], hi, lo, Fp.P.limbs[0]) # (hi, lo) mod P
+    return
diff --git a/constantine/montgomery.nim b/constantine/montgomery.nim
new file mode 100644
index 0000000..64a009e
--- /dev/null
+++ b/constantine/montgomery.nim
@@ -0,0 +1,73 @@
+# Constantine
+# Copyright (c) 2018 Status Research & Development GmbH
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#                Montgomery domain primitives
+#
+# ############################################################
+
+import
+  ./word_types, ./bigints, ./field_fp
+
+from bitops import fastLog2
+  # This will only be used at compile-time
+  # so no constant-time worries (it is constant-time if using the De Bruijn multiplication)
+
+func montyMagic*(M: static BigInt): static Limb =
+  ## Returns the Montgomery domain magic number for the input modulus:
+  ##   -1/M[0] mod LimbSize
+  ## M[0] is the least significant limb of M
+  ## M must be odd and greater than 2.
+
+  # Test vectors: https://www.researchgate.net/publication/4107322_Montgomery_modular_multiplication_architecture_for_public_key_cryptosystems
+  # on p354
+  # Reference C impl: http://www.hackersdelight.org/hdcodetxt/mont64.c.txt
+
+  # ######################################################################
+  # Implementation of modular multiplication inverse
+  # Assuming 2 positive integers a and m the modulo
+  #
+  # We are looking for z that solves `az ≡ 1 mod m`
+  #
+  # References:
+  #   - Knuth, The Art of Computer Programming, Vol2 p342
+  #   - Menezes, Handbook of Applied Cryptography (HAC), p610
+  #     http://cacr.uwaterloo.ca/hac/about/chap14.pdf
+
+  # Starting from the extended GCD formula (Bezout identity),
+  # `ax + by = gcd(x,y)` with input x,y and outputs a, b, gcd
+  # We assume a and m are coprimes, i.e. gcd is 1, otherwise no inverse
+  # `ax + my = 1` <=> `ax + my ≡ 1 mod m` <=> `ax ≡ 1 mod m`
+
+  # For Montgomery magic number, we are in a special case
+  # where a = M and m = 2^LimbSize.
+  # For a and m to be coprimes, a must be odd.
+
+  # M being a power of 2 greatly simplifies computation:
+  #  - https://crypto.stackexchange.com/questions/47493/how-to-determine-the-multiplicative-inverse-modulo-64-or-other-power-of-two
+  #  - http://groups.google.com/groups?selm=1994Apr6.093116.27805%40mnemosyne.cs.du.edu
+  #  - https://mumble.net/~campbell/2015/01/21/inverse-mod-power-of-two
+  #  - https://eprint.iacr.org/2017/411
+
+  # We have the following relation
+  # ax ≡ 1 (mod 2^k) <=> ax(2 - ax) ≡ 1 (mod 2^(2k))
+  #
+  # To get  -1/M0 mod LimbSize
+  # we can either negate the resulting x of `ax(2 - ax) ≡ 1 (mod 2^(2k))`
+  # or do ax(2 + ax) ≡ 1 (mod 2^(2k))
+
+  const
+    M0 = M.limbs[0]
+    k = fastLog2(LimbBitSize)
+
+  result = M0                # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
+  for _ in static(0 ..< k):
+    result *= 2 + M * result # x' = x(2 + ax) (`+` to avoid negating at the end)
+
+# func toMonty*[P: static BigInt](a: Fp[P], montyMagic: Limb): Montgomery[P] =
+
diff --git a/constantine/private/word_types_internal.nim b/constantine/private/word_types_internal.nim
new file mode 100644
index 0000000..2a0f63e
--- /dev/null
+++ b/constantine/private/word_types_internal.nim
@@ -0,0 +1,127 @@
+# Constantine
+# Copyright (c) 2018 Status Research & Development GmbH
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#  Unsafe constant-time primitives with specific restrictions
+#
+# ############################################################
+
+import ../word_types
+
+func asm_x86_64_div2n1n(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
+  ## Division uint128 by uint64
+  ## Warning ⚠️ :
+  ##   - if n_hi == d, quotient does not fit in an uint64
+  ##   - if n_hi > d result is undefined
+
+  # TODO !!! - Replace by constant-time, portable, non-assembly version
+
+  # DIV r/m64
+  # Divide RDX:RAX (n_hi:n_lo) by r/m64
+  #
+  # Inputs
+  #   - numerator high word in RDX,
+  #   - numerator low word in RAX,
+  #   - divisor as rm parameter (register or memory at the compiler discretion)
+  # Result
+  #   - Quotient in RAX
+  #   - Remainder in RDX
+  asm """
+    divq %[divisor]             // We name the register/memory divisor
+    : "=a" (`*q`), "=d" (`*r`)  // Don't forget to dereference the var hidden pointer
+    : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
+    :  // no register clobbered besides explicitly used RAX and RDX
+  """
+
+func unsafe_div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
+  ## Division uint128 by uint64
+  ## Warning ⚠️ :
+  ##   - if n_hi == d, quotient does not fit in an uint64
+  ##   - if n_hi > d result is undefined
+  ##
+  ## TODO, at the moment only x86_64 architecture are supported
+  ##       as we use assembly.
+  ##       Also we assume that the native integer division
+  ##       provided by the PU is constant-time
+
+  # Note, using C/Nim default `div` is inefficient
+  # and complicated to make constant-time
+  # See at the bottom.
+  #
+  # Furthermore compilers try to substitute division
+  # with a fast path that may have branches. It might also
+  # be the same at the hardware level.
+
+  type T = uint64
+
+  when not defined(amd64):
+    {.error: "At the moment only x86_64 architecture is supported".}
+  else:
+    asm_x86_64_div2n1n(T(q), T(r), T(n_hi), T(n_lo), T(d))
+
+when isMainModule:
+
+  var q, r: uint64
+
+  # (1 shl 64) div 3
+  let n_hi = 1'u64
+  let n_lo = 0'u64
+  let d = 3'u64
+
+  asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
+
+  doAssert q == 6148914691236517205'u64
+  doAssert r == 1
+
+# ############################################################
+#
+#             Non-constant-time portable div2n1n
+#
+# ############################################################
+
+# implementation from Stint: https://github.com/status-im/nim-stint/blob/edb1ade37309390cc641cee07ab62e5459d9ca44/stint/private/uint_div.nim#L131
+
+# func div2n1n[T: SomeunsignedInt](q, r: var T, n_hi, n_lo, d: T) =
+#
+#   # assert countLeadingZeroBits(d) == 0, "Divisor was not normalized"
+#
+#   const
+#     size = bitsof(q)
+#     halfSize = size div 2
+#     halfMask = (1.T shl halfSize) - 1.T
+#
+#   template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q,r: T] =
+#
+#     var (q, r) = divmod(n_hi, d_hi)
+#     let m = q * d_lo
+#     var r = (r shl halfSize) or n_lo
+#
+#     # Fix the reminder, we're at most 2 iterations off
+#     if r < m:
+#       dec q
+#       r += d
+#       if r >= d and r < m:
+#         dec q
+#         r += d
+#     r -= m
+#     (q, r)
+#
+#   let
+#     d_hi = d shr halfSize
+#     d_lo = d and halfMask
+#     n_lohi = nlo shr halfSize
+#     n_lolo = nlo and halfMask
+#
+#   # First half of the quotient
+#   let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo)
+#
+#   # Second half
+#   let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo)
+#
+#   q = (q1 shl halfSize) or q2
+#   r = r2