From 5d099c5d4585778d70fc7e9e104c9e0a10328b79 Mon Sep 17 00:00:00 2001
From: Jakub Nabaglo
Date: Wed, 13 Oct 2021 09:16:45 -0700
Subject: [PATCH] x86 ASM tricks for scalar Goldilocks multiplication (#299)

* ASM tricks for scalar Goldilocks multiplication

* Minor style

* Provide generic versions of `add/sub_with_wraparound`

* Minor bugfix
---
 src/field/goldilocks_field.rs | 70 ++++++++++++++++++++++++++++++++---
 src/lib.rs                    |  1 +
 2 files changed, 65 insertions(+), 6 deletions(-)

diff --git a/src/field/goldilocks_field.rs b/src/field/goldilocks_field.rs
index fbf31659..ae838f9d 100644
--- a/src/field/goldilocks_field.rs
+++ b/src/field/goldilocks_field.rs
@@ -254,6 +254,68 @@ impl Extendable<4> for GoldilocksField {
 
 impl RichField for GoldilocksField {}
 
+/// Fast addition modulo ORDER for x86-64.
+/// This function is marked unsafe for the following reasons:
+///  - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001.
+///  - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in
+///    the registers, so its use is not recommended when either input will be used again.
+#[inline(always)]
+#[cfg(target_arch = "x86_64")]
+unsafe fn add_with_wraparound(x: u64, y: u64) -> u64 {
+    let res_wrapped: u64;
+    let adjustment: u64;
+    asm!(
+        "add {0}, {1}",
+        // Trick. The carry flag is set iff the addition overflowed.
+        // sbb x, y does x := x - y - CF. In our case, x and y are both {1:e}, so it simply does
+        // {1:e} := 0xffffffff on overflow and {1:e} := 0 otherwise. {1:e} is the low 32 bits of
+        // {1}; the high 32 bits are zeroed on write. In the end, we end up with 0xffffffff in {1}
+        // on overflow; this happens to be EPSILON.
+        // Note that the CPU does not realize that the result of sbb x, x does not actually depend
+        // on x. We must write the result to a register that we know to be ready. We have a
+        // dependency on {1} anyway, so let's use it.
+        "sbb {1:e}, {1:e}",
+        inlateout(reg) x => res_wrapped,
+        inlateout(reg) y => adjustment,
+        options(pure, nomem, nostack),
+    );
+    res_wrapped.wrapping_add(adjustment) // Add EPSILON == subtract ORDER.
+}
+
+#[inline(always)]
+#[cfg(not(target_arch = "x86_64"))]
+unsafe fn add_with_wraparound(x: u64, y: u64) -> u64 {
+    let (res_wrapped, carry) = x.overflowing_add(y);
+    res_wrapped.wrapping_add(EPSILON * (carry as u64))
+}
+
+/// Fast subtraction modulo ORDER for x86-64.
+/// This function is marked unsafe for the following reasons:
+///  - It is only correct if x - y >= -ORDER.
+///  - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in
+///    the registers, so its use is not recommended when either input will be used again.
+#[inline(always)]
+#[cfg(target_arch = "x86_64")]
+unsafe fn sub_with_wraparound(x: u64, y: u64) -> u64 {
+    let res_wrapped: u64;
+    let adjustment: u64;
+    asm!(
+        "sub {0}, {1}",
+        "sbb {1:e}, {1:e}", // See add_with_wraparound.
+        inlateout(reg) x => res_wrapped,
+        inlateout(reg) y => adjustment,
+        options(pure, nomem, nostack),
+    );
+    res_wrapped.wrapping_sub(adjustment) // Subtract EPSILON == add ORDER.
+}
+
+#[inline(always)]
+#[cfg(not(target_arch = "x86_64"))]
+unsafe fn sub_with_wraparound(x: u64, y: u64) -> u64 {
+    let (res_wrapped, borrow) = x.overflowing_sub(y);
+    res_wrapped.wrapping_sub(EPSILON * (borrow as u64))
+}
+
 /// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the
 /// field order and `2^64`.
 #[inline]
@@ -262,13 +324,9 @@ fn reduce128(x: u128) -> GoldilocksField {
     let x_hi_hi = x_hi >> 32;
     let x_hi_lo = x_hi & EPSILON;
 
-    let (mut t0, borrow) = x_lo.overflowing_sub(x_hi_hi);
-    t0 = t0.wrapping_sub(EPSILON * (borrow as u64));
-
+    let t0 = unsafe { sub_with_wraparound(x_lo, x_hi_hi) };
     let t1 = x_hi_lo * EPSILON;
-
-    let (mut t2, carry) = t1.overflowing_add(t0);
-    t2 = t2.wrapping_add(EPSILON * (carry as u64));
+    let t2 = unsafe { add_with_wraparound(t0, t1) };
 
     GoldilocksField(t2)
 }
diff --git a/src/lib.rs b/src/lib.rs
index be2732dc..c72f783c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,4 @@
+#![feature(asm)]
 #![feature(destructuring_assignment)]
 #![feature(generic_const_exprs)]
 #![feature(specialization)]
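For reference, below is a minimal stand-alone sketch of the reduction this patch speeds up, written with the generic (non-asm) wraparound helpers and checked against a naive `u128 % ORDER` computation. It is illustrative only: the constants and the name `reduce128_generic` are local to the sketch (chosen to match the values implied by the comments above), not taken verbatim from the crate.

// Sketch, not part of the patch. ORDER is the Goldilocks prime 2^64 - 2^32 + 1;
// EPSILON = 2^64 mod ORDER = 2^32 - 1.
const ORDER: u64 = 0xffff_ffff_0000_0001;
const EPSILON: u64 = 0xffff_ffff;

/// x + y mod ORDER, assuming x + y < 2^64 + ORDER. On overflow the wrapped sum is
/// short by 2^64 ≡ EPSILON (mod ORDER), so add EPSILON back in.
fn add_with_wraparound(x: u64, y: u64) -> u64 {
    let (res_wrapped, carry) = x.overflowing_add(y);
    res_wrapped.wrapping_add(EPSILON * (carry as u64))
}

/// x - y mod ORDER, assuming x - y >= -ORDER. On underflow the wrapped difference is
/// 2^64 ≡ EPSILON too large, so subtract EPSILON.
fn sub_with_wraparound(x: u64, y: u64) -> u64 {
    let (res_wrapped, borrow) = x.overflowing_sub(y);
    res_wrapped.wrapping_sub(EPSILON * (borrow as u64))
}

/// Reduce a 128-bit value to 64 bits (possibly non-canonical, i.e. in [0, 2^64)).
/// Write x = x_lo + 2^64 * x_hi_lo + 2^96 * x_hi_hi with x_hi_lo, x_hi_hi < 2^32.
/// Since 2^64 ≡ EPSILON and 2^96 ≡ -1 (mod ORDER),
///     x ≡ x_lo - x_hi_hi + EPSILON * x_hi_lo (mod ORDER).
fn reduce128_generic(x: u128) -> u64 {
    let x_lo = x as u64;
    let x_hi = (x >> 64) as u64;
    let x_hi_hi = x_hi >> 32;
    let x_hi_lo = x_hi & EPSILON;

    let t0 = sub_with_wraparound(x_lo, x_hi_hi);
    let t1 = x_hi_lo * EPSILON; // Both factors are < 2^32, so this cannot overflow.
    add_with_wraparound(t0, t1)
}

fn main() {
    // Spot-check against the obvious (slow) reduction, canonicalizing the result first.
    let cases: [u128; 4] = [
        0,
        u64::MAX as u128,
        (ORDER as u128 - 1) * (ORDER as u128 - 1), // Largest product of two field elements.
        0x1234_5678_9abc_def0_0fed_cba9_8765_4321,
    ];
    for &x in &cases {
        let got = reduce128_generic(x) % ORDER;
        let want = (x % ORDER as u128) as u64;
        assert_eq!(got, want, "mismatch for x = {:#x}", x);
    }
    println!("reduce128 sketch agrees with the naive reduction");
}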