diff --git a/constantine/tower_field_extensions/fp2_complex.nim b/constantine/tower_field_extensions/fp2_complex.nim
index 5a95ddd..d496313 100644
--- a/constantine/tower_field_extensions/fp2_complex.nim
+++ b/constantine/tower_field_extensions/fp2_complex.nim
@@ -67,7 +67,7 @@ func square*(r: var Fp2, a: Fp2) =
   #           => (c0²-c1², 2 c0 c1)
   #           or
   #           => ((c0-c1)(c0+c1), 2 c0 c1)
-  #           => ((c0-c1)(c0-c1 + 2 c1), 2 c0 c1)
+  #           => ((c0-c1)(c0-c1 + 2 c1), c0 * 2 c1)
   #
   # Costs (naive implementation)
   # - 1 Multiplication 𝔽p
@@ -82,12 +82,51 @@ func square*(r: var Fp2, a: Fp2) =
   # - 1 Addition 𝔽p
   # - 1 Doubling 𝔽p
   # - 1 Substraction 𝔽p
-  # Stack: 6 * ModulusBitSize (4x 𝔽p element + 1 named temporaries + 1 multiplication temporary)
-  # as multiplications require a (shared) internal temporary
+  # Stack: 6 * ModulusBitSize (4x 𝔽p element + 1 named temporaries + 1 in-place multiplication temporary)
+  # as in-place multiplications require a (shared) internal temporary
 
   var c0mc1 {.noInit.}: typeof(a.c0)
   c0mc1.diff(a.c0, a.c1) # c0mc1 = c0 - c1                            [1 Sub]
   r.c1.double(a.c1)      # result.c1 = 2 c1                           [1 Dbl, 1 Sub]
   r.c0.sum(c0mc1, r.c1)  # result.c0 = c0 - c1 + 2 c1                 [1 Add, 1 Dbl, 1 Sub]
-  r.c0 *= c0mc1          # result.c0 = (c0 + c1)(c0 - c1) = c0² - c1² [1 Mul, 1 Add, 1 Dbl, 1 Sub]
-  r.c1 *= a.c0           # result.c1 = 2 c1 c0                        [2 Mul, 1 Add, 1 Dbl, 1 Sub]
+  r.c0 *= c0mc1          # result.c0 = (c0 + c1)(c0 - c1) = c0² - c1² [1 Mul, 1 Add, 1 Dbl, 1 Sub] - 𝔽p temporary
+  r.c1 *= a.c0           # result.c1 = 2 c1 c0                        [2 Mul, 1 Add, 1 Dbl, 1 Sub] - 𝔽p temporary
+
+func prod*(r: var Fp2, a, b: Fp2) =
+  ## Return a * b in 𝔽p2 in ``r``
+  ## ``r`` is initialized/overwritten
+  # (a0, a1) (b0, b1) => (a0 + a1𝑖) (b0 + b1𝑖)
+  #                   => (a0 b0 - a1 b1) + (a0 b1 + a1 b0) 𝑖
+  #
+  # In Fp, multiplication has cost O(n²) with n the number of limbs
+  # while addition has cost O(3n) (n for addition, n for overflow, n for conditional substraction)
+  # and substraction has cost O(2n) (n for substraction + underflow, n for conditional addition)
+  #
+  # Even for 256-bit primes, we are looking at always a minimum of n=5 limbs (with 2^63 words)
+  # where addition/substraction are significantly cheaper than multiplication
+  #
+  # So we always reframe the imaginary part using Karatsuba approach to save a multiplication
+  # (a0, a1) (b0, b1) => (a0 b0 - a1 b1) + 𝑖( (a0 + a1)(b0 + b1) - a0 b0 - a1 b1 )
+  #
+  # Costs (naive implementation)
+  # - 4 Multiplications 𝔽p
+  # - 1 Addition 𝔽p
+  # - 1 Substraction 𝔽p
+  # Stack: 6 * ModulusBitSize (4x 𝔽p element + 2x named temporaries)
+  #
+  # Costs (Karatsuba)
+  # - 3 Multiplications 𝔽p
+  # - 3 Substraction 𝔽p (2 are fused)
+  # - 2 Addition 𝔽p
+  # Stack: 6 * ModulusBitSize (4x 𝔽p element + 2x named temporaries + 1 in-place multiplication temporary)
+  var a0b0 {.noInit.}, a1b1 {.noInit.}: typeof(a.c0)
+  a0b0.prod(a.c0, b.c0)                                         # [1 Mul]
+  a1b1.prod(a.c1, b.c1)                                         # [2 Mul]
+
+  r.c0.sum(a.c0, a.c1)  # r0 = (a0 + a1)                        # [2 Mul, 1 Add]
+  r.c1.sum(b.c0, b.c1)  # r1 = (b0 + b1)                        # [2 Mul, 2 Add]
+  r.c1 *= c.c0          # r1 = (b0 + b1)(a0 + a1)               # [3 Mul, 2 Add] - 𝔽p temporary
+
+  r.c0.diff(a0b0, a1b1) # r0 = a0 b0 - a1 b1                    # [3 Mul, 2 Add, 1 Sub]
+  r.c1 -= a0b0          # r1 = (b0 + b1)(a0 + a1) - a0b0        # [3 Mul, 2 Add, 2 Sub]
+  r.c1 -= a1b1          # r1 = (b0 + b1)(a0 + a1) - a0b0 - a1b1 # [3 Mul, 2 Add, 3 Sub]