mirror of
https://github.com/logos-storage/plonky2.git
synced 2026-01-08 00:33:06 +00:00
x86 ASM tricks for scalar Goldilocks multiplication (#299)
* ASM tricks for scalar Goldilocks multiplication * Minor style * Provide generic versions of `add/sub_with_wraparound` * Minor bugfix
This commit is contained in:
parent
164aa0947e
commit
5d099c5d45
@ -254,6 +254,68 @@ impl Extendable<4> for GoldilocksField {
|
|||||||
|
|
||||||
impl RichField for GoldilocksField {}
|
impl RichField for GoldilocksField {}
|
||||||
|
|
||||||
|
/// Fast addition modulo ORDER for x86-64.
|
||||||
|
/// This function is marked unsafe for the following reasons:
|
||||||
|
/// - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001.
|
||||||
|
/// - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in
|
||||||
|
/// the registers, so its use is not recommended when either input will be used again.
|
||||||
|
#[inline(always)]
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
unsafe fn add_with_wraparound(x: u64, y: u64) -> u64 {
|
||||||
|
let res_wrapped: u64;
|
||||||
|
let adjustment: u64;
|
||||||
|
asm!(
|
||||||
|
"add {0}, {1}",
|
||||||
|
// Trick. The carry flag is set iff the addition overflowed.
|
||||||
|
// sbb x, y does x := x - y - CF. In our case, x and y are both {1:e}, so it simply does
|
||||||
|
// {1:e} := 0xffffffff on overflow and {1:e} := 0 otherwise. {1:e} is the low 32 bits of
|
||||||
|
// {1}; the high 32-bits are zeroed on write. In the end, we end up with 0xffffffff in {1}
|
||||||
|
// on overflow; this happens be EPSILON.
|
||||||
|
// Note that the CPU does not realize that the result of sbb x, x does not actually depend
|
||||||
|
// on x. We must write the result to a register that we know to be ready. We have a
|
||||||
|
// dependency on {1} anyway, so let's use it.
|
||||||
|
"sbb {1:e}, {1:e}",
|
||||||
|
inlateout(reg) x => res_wrapped,
|
||||||
|
inlateout(reg) y => adjustment,
|
||||||
|
options(pure, nomem, nostack),
|
||||||
|
);
|
||||||
|
res_wrapped.wrapping_add(adjustment) // Add EPSILON == subtract ORDER.
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
#[cfg(not(target_arch = "x86_64"))]
|
||||||
|
unsafe fn add_with_wraparound(x: u64, y: u64) -> u64 {
|
||||||
|
let (res_wrapped, carry) = x.overflowing_add(y);
|
||||||
|
res_wrapped.wrapping_add(EPSILON * (carry as u64))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fast subtraction modulo ORDER for x86-64.
|
||||||
|
/// This function is marked unsafe for the following reasons:
|
||||||
|
/// - It is only correct if x - y >= -ORDER.
|
||||||
|
/// - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in
|
||||||
|
/// the registers, so its use is not recommended when either input will be used again.
|
||||||
|
#[inline(always)]
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
unsafe fn sub_with_wraparound(x: u64, y: u64) -> u64 {
|
||||||
|
let res_wrapped: u64;
|
||||||
|
let adjustment: u64;
|
||||||
|
asm!(
|
||||||
|
"sub {0}, {1}",
|
||||||
|
"sbb {1:e}, {1:e}", // See add_with_wraparound.
|
||||||
|
inlateout(reg) x => res_wrapped,
|
||||||
|
inlateout(reg) y => adjustment,
|
||||||
|
options(pure, nomem, nostack),
|
||||||
|
);
|
||||||
|
res_wrapped.wrapping_sub(adjustment) // Subtract EPSILON == add ORDER.
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
#[cfg(not(target_arch = "x86_64"))]
|
||||||
|
unsafe fn sub_with_wraparound(x: u64, y: u64) -> u64 {
|
||||||
|
let (res_wrapped, borrow) = x.overflowing_sub(y);
|
||||||
|
res_wrapped.wrapping_sub(EPSILON * (borrow as u64))
|
||||||
|
}
|
||||||
|
|
||||||
/// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the
|
/// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the
|
||||||
/// field order and `2^64`.
|
/// field order and `2^64`.
|
||||||
#[inline]
|
#[inline]
|
||||||
@ -262,13 +324,9 @@ fn reduce128(x: u128) -> GoldilocksField {
|
|||||||
let x_hi_hi = x_hi >> 32;
|
let x_hi_hi = x_hi >> 32;
|
||||||
let x_hi_lo = x_hi & EPSILON;
|
let x_hi_lo = x_hi & EPSILON;
|
||||||
|
|
||||||
let (mut t0, borrow) = x_lo.overflowing_sub(x_hi_hi);
|
let t0 = unsafe { sub_with_wraparound(x_lo, x_hi_hi) };
|
||||||
t0 = t0.wrapping_sub(EPSILON * (borrow as u64));
|
|
||||||
|
|
||||||
let t1 = x_hi_lo * EPSILON;
|
let t1 = x_hi_lo * EPSILON;
|
||||||
|
let t2 = unsafe { add_with_wraparound(t0, t1) };
|
||||||
let (mut t2, carry) = t1.overflowing_add(t0);
|
|
||||||
t2 = t2.wrapping_add(EPSILON * (carry as u64));
|
|
||||||
GoldilocksField(t2)
|
GoldilocksField(t2)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,3 +1,4 @@
|
|||||||
|
#![feature(asm)]
|
||||||
#![feature(destructuring_assignment)]
|
#![feature(destructuring_assignment)]
|
||||||
#![feature(generic_const_exprs)]
|
#![feature(generic_const_exprs)]
|
||||||
#![feature(specialization)]
|
#![feature(specialization)]
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user