From 5d099c5d4585778d70fc7e9e104c9e0a10328b79 Mon Sep 17 00:00:00 2001
From: Jakub Nabaglo
Date: Wed, 13 Oct 2021 09:16:45 -0700
Subject: [PATCH] x86 ASM tricks for scalar Goldilocks multiplication (#299)

* ASM tricks for scalar Goldilocks multiplication

* Minor style

* Provide generic versions of `add/sub_with_wraparound`

* Minor bugfix
---
 src/field/goldilocks_field.rs | 70 ++++++++++++++++++++++++++++++++---
 src/lib.rs                    |  1 +
 2 files changed, 65 insertions(+), 6 deletions(-)

diff --git a/src/field/goldilocks_field.rs b/src/field/goldilocks_field.rs
index fbf31659..ae838f9d 100644
--- a/src/field/goldilocks_field.rs
+++ b/src/field/goldilocks_field.rs
@@ -254,6 +254,68 @@ impl Extendable<4> for GoldilocksField {
 
 impl RichField for GoldilocksField {}
 
+/// Fast addition modulo ORDER for x86-64.
+/// This function is marked unsafe for the following reasons:
+///  - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001.
+///  - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in
+///    the registers, so its use is not recommended when either input will be used again.
+#[inline(always)]
+#[cfg(target_arch = "x86_64")]
+unsafe fn add_with_wraparound(x: u64, y: u64) -> u64 {
+    let res_wrapped: u64;
+    let adjustment: u64;
+    asm!(
+        "add {0}, {1}",
+        // Trick. The carry flag is set iff the addition overflowed.
+        // sbb x, y does x := x - y - CF. In our case, x and y are both {1:e}, so it simply does
+        // {1:e} := 0xffffffff on overflow and {1:e} := 0 otherwise. {1:e} is the low 32 bits of
+        // {1}; the high 32 bits are zeroed on write. In the end, we end up with 0xffffffff in {1}
+        // on overflow; this happens to be EPSILON.
+        // Note that the CPU does not realize that the result of sbb x, x does not actually depend
+        // on x. We must write the result to a register that we know to be ready. We have a
+        // dependency on {1} anyway, so let's use it.
+        "sbb {1:e}, {1:e}",
+        inlateout(reg) x => res_wrapped,
+        inlateout(reg) y => adjustment,
+        options(pure, nomem, nostack),
+    );
+    res_wrapped.wrapping_add(adjustment) // Add EPSILON == subtract ORDER.
+}
+
+#[inline(always)]
+#[cfg(not(target_arch = "x86_64"))]
+unsafe fn add_with_wraparound(x: u64, y: u64) -> u64 {
+    let (res_wrapped, carry) = x.overflowing_add(y);
+    res_wrapped.wrapping_add(EPSILON * (carry as u64))
+}
+
+/// Fast subtraction modulo ORDER for x86-64.
+/// This function is marked unsafe for the following reasons:
+///  - It is only correct if x - y >= -ORDER.
+///  - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in
+///    the registers, so its use is not recommended when either input will be used again.
+#[inline(always)]
+#[cfg(target_arch = "x86_64")]
+unsafe fn sub_with_wraparound(x: u64, y: u64) -> u64 {
+    let res_wrapped: u64;
+    let adjustment: u64;
+    asm!(
+        "sub {0}, {1}",
+        "sbb {1:e}, {1:e}", // See add_with_wraparound.
+        inlateout(reg) x => res_wrapped,
+        inlateout(reg) y => adjustment,
+        options(pure, nomem, nostack),
+    );
+    res_wrapped.wrapping_sub(adjustment) // Subtract EPSILON == add ORDER.
+}
+
+#[inline(always)]
+#[cfg(not(target_arch = "x86_64"))]
+unsafe fn sub_with_wraparound(x: u64, y: u64) -> u64 {
+    let (res_wrapped, borrow) = x.overflowing_sub(y);
+    res_wrapped.wrapping_sub(EPSILON * (borrow as u64))
+}
+
 /// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the
 /// field order and `2^64`.
 #[inline]
@@ -262,13 +324,9 @@ fn reduce128(x: u128) -> GoldilocksField {
     let x_hi_hi = x_hi >> 32;
     let x_hi_lo = x_hi & EPSILON;
 
-    let (mut t0, borrow) = x_lo.overflowing_sub(x_hi_hi);
-    t0 = t0.wrapping_sub(EPSILON * (borrow as u64));
-
+    let t0 = unsafe { sub_with_wraparound(x_lo, x_hi_hi) };
     let t1 = x_hi_lo * EPSILON;
-
-    let (mut t2, carry) = t1.overflowing_add(t0);
-    t2 = t2.wrapping_add(EPSILON * (carry as u64));
+    let t2 = unsafe { add_with_wraparound(t0, t1) };
 
     GoldilocksField(t2)
 }
diff --git a/src/lib.rs b/src/lib.rs
index be2732dc..c72f783c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,4 @@
+#![feature(asm)]
 #![feature(destructuring_assignment)]
 #![feature(generic_const_exprs)]
 #![feature(specialization)]
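For reference, below is a minimal stand-alone sketch of the reduction this patch speeds up, written with the generic (non-asm) wraparound helpers and checked against a naive `u128 % ORDER` computation. It is illustrative only: the constants and the name `reduce128_generic` are local to the sketch (chosen to match the values implied by the comments above), not taken verbatim from the crate.

// Sketch, not part of the patch. ORDER is the Goldilocks prime 2^64 - 2^32 + 1;
// EPSILON = 2^64 mod ORDER = 2^32 - 1.
const ORDER: u64 = 0xffff_ffff_0000_0001;
const EPSILON: u64 = 0xffff_ffff;

/// x + y mod ORDER, assuming x + y < 2^64 + ORDER. On overflow the wrapped sum is
/// short by 2^64 ≡ EPSILON (mod ORDER), so add EPSILON back in.
fn add_with_wraparound(x: u64, y: u64) -> u64 {
    let (res_wrapped, carry) = x.overflowing_add(y);
    res_wrapped.wrapping_add(EPSILON * (carry as u64))
}

/// x - y mod ORDER, assuming x - y >= -ORDER. On underflow the wrapped difference is
/// 2^64 ≡ EPSILON too large, so subtract EPSILON.
fn sub_with_wraparound(x: u64, y: u64) -> u64 {
    let (res_wrapped, borrow) = x.overflowing_sub(y);
    res_wrapped.wrapping_sub(EPSILON * (borrow as u64))
}

/// Reduce a 128-bit value to 64 bits (possibly non-canonical, i.e. in [0, 2^64)).
/// Write x = x_lo + 2^64 * x_hi_lo + 2^96 * x_hi_hi with x_hi_lo, x_hi_hi < 2^32.
/// Since 2^64 ≡ EPSILON and 2^96 ≡ -1 (mod ORDER),
///     x ≡ x_lo - x_hi_hi + EPSILON * x_hi_lo (mod ORDER).
fn reduce128_generic(x: u128) -> u64 {
    let x_lo = x as u64;
    let x_hi = (x >> 64) as u64;
    let x_hi_hi = x_hi >> 32;
    let x_hi_lo = x_hi & EPSILON;

    let t0 = sub_with_wraparound(x_lo, x_hi_hi);
    let t1 = x_hi_lo * EPSILON; // Both factors are < 2^32, so this cannot overflow.
    add_with_wraparound(t0, t1)
}

fn main() {
    // Spot-check against the obvious (slow) reduction, canonicalizing the result first.
    let cases: [u128; 4] = [
        0,
        u64::MAX as u128,
        (ORDER as u128 - 1) * (ORDER as u128 - 1), // Largest product of two field elements.
        0x1234_5678_9abc_def0_0fed_cba9_8765_4321,
    ];
    for &x in &cases {
        let got = reduce128_generic(x) % ORDER;
        let want = (x % ORDER as u128) as u64;
        assert_eq!(got, want, "mismatch for x = {:#x}", x);
    }
    println!("reduce128 sketch agrees with the naive reduction");
}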