Minor optimizations to addition (#323)

This commit is contained in:
Jakub Nabaglo 2021-10-26 18:05:52 -07:00 committed by GitHub
parent bf421314f9
commit 7d39074e61
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 64 additions and 16 deletions

View File

@ -13,6 +13,7 @@ use crate::field::extension_field::quartic::QuarticExtension;
use crate::field::extension_field::Extendable;
use crate::field::field_types::{Field, PrimeField, RichField};
use crate::field::inversion::try_inverse_u64;
use crate::util::{assume, branch_hint};
const EPSILON: u64 = (1 << 32) - 1;
@ -136,13 +137,15 @@ impl PrimeField for GoldilocksField {
#[inline]
unsafe fn add_canonical_u64(&self, rhs: u64) -> Self {
let (res_wrapped, carry) = self.0.overflowing_add(rhs);
Self(res_wrapped.wrapping_add(EPSILON * (carry as u64)))
// Add EPSILON * carry cannot overflow unless rhs is not in canonical form.
Self(res_wrapped + EPSILON * (carry as u64))
}
#[inline]
unsafe fn sub_canonical_u64(&self, rhs: u64) -> Self {
let (res_wrapped, borrow) = self.0.overflowing_sub(rhs);
Self(res_wrapped.wrapping_sub(EPSILON * (borrow as u64)))
// Sub EPSILON * carry cannot underflow unless rhs is not in canonical form.
Self(res_wrapped - EPSILON * (borrow as u64))
}
}
@ -165,8 +168,21 @@ impl Add for GoldilocksField {
#[inline]
#[allow(clippy::suspicious_arithmetic_impl)]
fn add(self, rhs: Self) -> Self {
let (sum, over) = self.0.overflowing_add(rhs.to_canonical_u64());
Self(sum.wrapping_sub((over as u64) * Self::ORDER))
let (sum, over) = self.0.overflowing_add(rhs.0);
let (mut sum, over) = sum.overflowing_add((over as u64) * EPSILON);
if over {
// NB: self.0 > Self::ORDER && rhs.0 > Self::ORDER is necessary but not sufficient for
// double-overflow.
// This assume does two things:
// 1. If compiler knows that either self.0 or rhs.0 <= ORDER, then it can skip this
// check.
// 2. Hints to the compiler how rare this double-overflow is (thus handled better with
// a branch).
assume(self.0 > Self::ORDER && rhs.0 > Self::ORDER);
branch_hint();
sum += EPSILON; // Cannot overflow.
}
Self(sum)
}
}
@ -189,8 +205,21 @@ impl Sub for GoldilocksField {
#[inline]
#[allow(clippy::suspicious_arithmetic_impl)]
fn sub(self, rhs: Self) -> Self {
let (diff, under) = self.0.overflowing_sub(rhs.to_canonical_u64());
Self(diff.wrapping_add((under as u64) * Self::ORDER))
let (diff, under) = self.0.overflowing_sub(rhs.0);
let (mut diff, under) = diff.overflowing_sub((under as u64) * EPSILON);
if under {
// NB: self.0 < EPSILON - 1 && rhs.0 > Self::ORDER is necessary but not sufficient for
// double-underflow.
// This assume does two things:
// 1. If compiler knows that either self.0 >= EPSILON - 1 or rhs.0 <= ORDER, then it
// can skip this check.
// 2. Hints to the compiler how rare this double-underflow is (thus handled better
// with a branch).
assume(self.0 < EPSILON - 1 && rhs.0 > Self::ORDER);
branch_hint();
diff -= EPSILON; // Cannot underflow.
}
Self(diff)
}
}
@ -283,8 +312,6 @@ impl RichField for GoldilocksField {}
#[inline(always)]
#[cfg(target_arch = "x86_64")]
unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
use crate::util::assume;
let res_wrapped: u64;
let adjustment: u64;
asm!(
@ -304,14 +331,17 @@ unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
);
assume(x != 0 || (res_wrapped == y && adjustment == 0));
assume(y != 0 || (res_wrapped == x && adjustment == 0));
res_wrapped.wrapping_add(adjustment) // Add EPSILON == subtract ORDER.
// Add EPSILON == subtract ORDER.
// Cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect.
res_wrapped + adjustment
}
#[inline(always)]
#[cfg(not(target_arch = "x86_64"))]
unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
let (res_wrapped, carry) = x.overflowing_add(y);
res_wrapped.wrapping_add(EPSILON * (carry as u64))
// Below cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect.
res_wrapped + EPSILON * (carry as u64)
}
/// Fast subtraction modulo ORDER for x86-64.
@ -322,8 +352,6 @@ unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
#[inline(always)]
#[cfg(target_arch = "x86_64")]
unsafe fn sub_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
use crate::util::assume;
let res_wrapped: u64;
let adjustment: u64;
asm!(
@ -334,14 +362,17 @@ unsafe fn sub_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
options(pure, nomem, nostack),
);
assume(y != 0 || (res_wrapped == x && adjustment == 0));
res_wrapped.wrapping_sub(adjustment) // Subtract EPSILON == add ORDER.
// Subtract EPSILON == add ORDER.
// Cannot underflow unless the assumption x - y >= -ORDER is incorrect.
res_wrapped - adjustment
}
#[inline(always)]
#[cfg(not(target_arch = "x86_64"))]
unsafe fn sub_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
let (res_wrapped, borrow) = x.overflowing_sub(y);
res_wrapped.wrapping_sub(EPSILON * (borrow as u64))
// Below cannot underflow unless the assumption x - y >= -ORDER is incorrect.
res_wrapped - EPSILON * (borrow as u64)
}
/// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the

View File

@ -198,10 +198,27 @@ pub(crate) fn reverse_bits(n: usize, num_bits: usize) -> usize {
}
#[inline(always)]
pub(crate) unsafe fn assume(p: bool) {
pub(crate) fn assume(p: bool) {
debug_assert!(p);
if !p {
unreachable_unchecked();
unsafe {
unreachable_unchecked();
}
}
}
/// Try to force Rust to emit a branch. Example:
/// if x > 2 {
/// y = foo();
/// branch_hint();
/// } else {
/// y = bar();
/// }
/// This function has no semantics. It is a hint only.
#[inline(always)]
pub(crate) fn branch_hint() {
unsafe {
asm!("", options(nomem, nostack, preserves_flags));
}
}