Merge pull request #128
b2c9681
Make {mul,sqr}_inner use the same argument order as {mul,sqr} (Pieter Wuille)6793505
Convert YASM code into inline assembly (Pieter Wuille)f048615
Rewrite field assembly to match the C version (Pieter Wuille)
This commit is contained in:
commit
376b28b096
24
Makefile.am
24
Makefile.am
|
@ -1,12 +1,6 @@
|
|||
ACLOCAL_AMFLAGS = -I build-aux/m4
|
||||
|
||||
lib_LTLIBRARIES = libsecp256k1.la
|
||||
if USE_ASM
|
||||
COMMON_LIB = libsecp256k1_common.la
|
||||
else
|
||||
COMMON_LIB =
|
||||
endif
|
||||
noinst_LTLIBRARIES = $(COMMON_LIB)
|
||||
include_HEADERS = include/secp256k1.h
|
||||
noinst_HEADERS =
|
||||
noinst_HEADERS += src/scalar.h
|
||||
|
@ -47,13 +41,9 @@ noinst_HEADERS += src/field_impl.h
|
|||
pkgconfigdir = $(libdir)/pkgconfig
|
||||
pkgconfig_DATA = libsecp256k1.pc
|
||||
|
||||
if USE_ASM
|
||||
libsecp256k1_common_la_SOURCES = src/field_5x52_asm.asm
|
||||
endif
|
||||
|
||||
libsecp256k1_la_SOURCES = src/secp256k1.c
|
||||
libsecp256k1_la_CPPFLAGS = -I$(top_srcdir)/include $(SECP_INCLUDES)
|
||||
libsecp256k1_la_LIBADD = $(COMMON_LIB) $(SECP_LIBS)
|
||||
libsecp256k1_la_LIBADD = $(SECP_LIBS)
|
||||
|
||||
|
||||
noinst_PROGRAMS =
|
||||
|
@ -66,7 +56,7 @@ bench_sign_SOURCES = src/bench_sign.c
|
|||
bench_sign_LDADD = libsecp256k1.la $(SECP_LIBS)
|
||||
bench_sign_LDFLAGS = -static
|
||||
bench_inv_SOURCES = src/bench_inv.c
|
||||
bench_inv_LDADD = $(COMMON_LIB) $(SECP_LIBS)
|
||||
bench_inv_LDADD = $(SECP_LIBS)
|
||||
bench_inv_LDFLAGS = -static
|
||||
bench_inv_CPPFLAGS = $(SECP_INCLUDES)
|
||||
endif
|
||||
|
@ -75,15 +65,9 @@ if USE_TESTS
|
|||
noinst_PROGRAMS += tests
|
||||
tests_SOURCES = src/tests.c
|
||||
tests_CPPFLAGS = -DVERIFY $(SECP_INCLUDES) $(SECP_TEST_INCLUDES)
|
||||
tests_LDADD = $(COMMON_LIB) $(SECP_LIBS) $(SECP_TEST_LIBS)
|
||||
tests_LDADD = $(SECP_LIBS) $(SECP_TEST_LIBS)
|
||||
tests_LDFLAGS = -static
|
||||
TESTS = tests
|
||||
endif
|
||||
|
||||
EXTRA_DIST = autogen.sh nasm_lt.sh
|
||||
|
||||
#x86_64 only
|
||||
if USE_ASM
|
||||
.asm.lo:
|
||||
$(LIBTOOL) --mode=compile --tag YASM $(srcdir)/nasm_lt.sh $(YASM) -f $(YASM_BINFMT) $(YAFLAGS) -I$(srcdir) -I. $< -o $@
|
||||
endif
|
||||
EXTRA_DIST = autogen.sh
|
||||
|
|
|
@ -11,38 +11,16 @@ fi
|
|||
|
||||
dnl
|
||||
AC_DEFUN([SECP_64BIT_ASM_CHECK],[
|
||||
if test x"$host_cpu" == x"x86_64"; then
|
||||
AC_CHECK_PROG(YASM, yasm, yasm)
|
||||
else
|
||||
if test x"$set_field" = x"64bit_asm"; then
|
||||
AC_MSG_ERROR([$set_field field support explicitly requested but is not compatible with this host])
|
||||
fi
|
||||
fi
|
||||
if test x$YASM = x; then
|
||||
if test x"$set_field" = x"64bit_asm"; then
|
||||
AC_MSG_ERROR([$set_field field support explicitly requested but yasm was not found])
|
||||
fi
|
||||
has_64bit_asm=no
|
||||
else
|
||||
case x"$host_os" in
|
||||
xdarwin*)
|
||||
YASM_BINFMT=macho64
|
||||
;;
|
||||
x*-gnux32)
|
||||
YASM_BINFMT=elfx32
|
||||
;;
|
||||
*)
|
||||
YASM_BINFMT=elf64
|
||||
;;
|
||||
esac
|
||||
if $YASM -f help | grep -q $YASM_BINFMT; then
|
||||
has_64bit_asm=yes
|
||||
else
|
||||
if test x"$set_field" = x"64bit_asm"; then
|
||||
AC_MSG_ERROR([$set_field field support explicitly requested but yasm doesn't support $YASM_BINFMT format])
|
||||
fi
|
||||
AC_MSG_WARN([yasm too old for $YASM_BINFMT format])
|
||||
has_64bit_asm=no
|
||||
AC_MSG_CHECKING(for x86_64 assembly availability)
|
||||
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
|
||||
#include <stdint.h>]],[[
|
||||
uint64_t a = 11, tmp;
|
||||
__asm__ __volatile__("movq $0x100000000,%1; mulq %%rsi" : "+a"(a) : "S"(tmp) : "cc", "%rdx");
|
||||
]])],[has_64bit_asm=yes],[has_64bit_asm=no])
|
||||
AC_MSG_RESULT([$has_64bit_asm])
|
||||
if test x"$set_field" == x"64bit_asm"; then
|
||||
if test x"$has_64bit_asm" == x"no"; then
|
||||
AC_MSG_ERROR([$set_field field support explicitly requested but no x86_64 assembly available])
|
||||
fi
|
||||
fi
|
||||
])
|
||||
|
|
|
@ -283,7 +283,6 @@ AC_SUBST(SECP_INCLUDES)
|
|||
AC_SUBST(SECP_LIBS)
|
||||
AC_SUBST(SECP_TEST_LIBS)
|
||||
AC_SUBST(SECP_TEST_INCLUDES)
|
||||
AC_SUBST(YASM_BINFMT)
|
||||
AM_CONDITIONAL([USE_ASM], [test x"$set_field" == x"64bit_asm"])
|
||||
AM_CONDITIONAL([USE_TESTS], [test x"$use_tests" != x"no"])
|
||||
AM_CONDITIONAL([USE_BENCHMARK], [test x"$use_benchmark" != x"no"])
|
||||
|
|
57
nasm_lt.sh
57
nasm_lt.sh
|
@ -1,57 +0,0 @@
|
|||
#! /bin/sh
|
||||
command=""
|
||||
infile=""
|
||||
o_opt=no
|
||||
pic=no
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
-DPIC|-fPIC|-fpic|-Kpic|-KPIC)
|
||||
if [ "$pic" != "yes" ] ; then
|
||||
command="$command -DPIC"
|
||||
pic=yes
|
||||
fi
|
||||
;;
|
||||
-f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \
|
||||
-fobj|-fwin32|-fwin64|-frdf|-fieee|-fmacho|-fmacho64)
|
||||
# it's a file format specifier for nasm.
|
||||
command="$command $1"
|
||||
;;
|
||||
-f*)
|
||||
# maybe a code-generation flag for gcc.
|
||||
;;
|
||||
-[Ii]*)
|
||||
incdir=`echo "$1" | sed 's/^-[Ii]//'`
|
||||
if [ "x$incdir" = x -a "x$2" != x ] ; then
|
||||
case "$2" in
|
||||
-*) ;;
|
||||
*) incdir="$2"; shift;;
|
||||
esac
|
||||
fi
|
||||
if [ "x$incdir" != x ] ; then
|
||||
# In the case of NASM, the trailing slash is necessary.
|
||||
incdir=`echo "$incdir" | sed 's%/*$%/%'`
|
||||
command="$command -I$incdir"
|
||||
fi
|
||||
;;
|
||||
-o*)
|
||||
o_opt=yes
|
||||
command="$command $1"
|
||||
;;
|
||||
*.asm)
|
||||
infile=$1
|
||||
command="$command $1"
|
||||
;;
|
||||
*)
|
||||
command="$command $1"
|
||||
;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
if [ "$o_opt" != yes ] ; then
|
||||
# By default, NASM creates an output file
|
||||
# in the same directory as the input file.
|
||||
outfile="-o `echo $infile | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.o"
|
||||
command="$command $outfile"
|
||||
fi
|
||||
echo $command
|
||||
exec $command
|
|
@ -271,7 +271,7 @@ SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe_t *r, const secp256k1
|
|||
#define VERIFY_BITS(x, n) do { } while(0)
|
||||
#endif
|
||||
|
||||
SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uint32_t * SECP256K1_RESTRICT b, uint32_t *r) {
|
||||
SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t *a, const uint32_t * SECP256K1_RESTRICT b) {
|
||||
VERIFY_BITS(a[0], 30);
|
||||
VERIFY_BITS(a[1], 30);
|
||||
VERIFY_BITS(a[2], 30);
|
||||
|
@ -598,7 +598,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin
|
|||
/* [r9 r8 r7 r6 r5 r4 r3 r2 r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */
|
||||
}
|
||||
|
||||
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint32_t *a, uint32_t *r) {
|
||||
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t *a) {
|
||||
VERIFY_BITS(a[0], 30);
|
||||
VERIFY_BITS(a[1], 30);
|
||||
VERIFY_BITS(a[2], 30);
|
||||
|
@ -879,7 +879,7 @@ static void secp256k1_fe_mul(secp256k1_fe_t *r, const secp256k1_fe_t *a, const s
|
|||
secp256k1_fe_verify(b);
|
||||
VERIFY_CHECK(r != b);
|
||||
#endif
|
||||
secp256k1_fe_mul_inner(a->n, b->n, r->n);
|
||||
secp256k1_fe_mul_inner(r->n, a->n, b->n);
|
||||
#ifdef VERIFY
|
||||
r->magnitude = 1;
|
||||
r->normalized = 0;
|
||||
|
@ -892,7 +892,7 @@ static void secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
|
|||
VERIFY_CHECK(a->magnitude <= 8);
|
||||
secp256k1_fe_verify(a);
|
||||
#endif
|
||||
secp256k1_fe_sqr_inner(a->n, r->n);
|
||||
secp256k1_fe_sqr_inner(r->n, a->n);
|
||||
#ifdef VERIFY
|
||||
r->magnitude = 1;
|
||||
r->normalized = 0;
|
||||
|
|
|
@ -1,469 +0,0 @@
|
|||
;; Added by Diederik Huys, March 2013
|
||||
;;
|
||||
;; Provided public procedures:
|
||||
;; secp256k1_fe_mul_inner
|
||||
;; secp256k1_fe_sqr_inner
|
||||
;;
|
||||
;; Needed tools: YASM (http://yasm.tortall.net)
|
||||
;;
|
||||
;;
|
||||
|
||||
BITS 64
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__,macho64
|
||||
%define SYM(x) _ %+ x
|
||||
%else
|
||||
%define SYM(x) x
|
||||
%endif
|
||||
|
||||
;; Procedure ExSetMult
|
||||
;; Register Layout:
|
||||
;; INPUT: rdi = a->n
|
||||
;; rsi = b->n
|
||||
;; rdx = r->a
|
||||
;;
|
||||
;; INTERNAL: rdx:rax = multiplication accumulator
|
||||
;; r9:r8 = c
|
||||
;; r10-r13 = t0-t3
|
||||
;; r14 = b.n[0] / t4
|
||||
;; r15 = b.n[1] / t5
|
||||
;; rbx = b.n[2] / t6
|
||||
;; rcx = b.n[3] / t7
|
||||
;; rbp = Constant 0FFFFFFFFFFFFFh / t8
|
||||
;; rsi = b.n / b.n[4] / t9
|
||||
|
||||
GLOBAL SYM(secp256k1_fe_mul_inner)
|
||||
ALIGN 32
|
||||
SYM(secp256k1_fe_mul_inner):
|
||||
push rbp
|
||||
push rbx
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
push rdx
|
||||
mov r14,[rsi+8*0] ; preload b.n[0]. This will be the case until
|
||||
; b.n[0] is no longer needed, then we reassign
|
||||
; r14 to t4
|
||||
;; c=a.n[0] * b.n[0]
|
||||
mov rax,[rdi+0*8] ; load a.n[0]
|
||||
mov rbp,0FFFFFFFFFFFFFh
|
||||
mul r14 ; rdx:rax=a.n[0]*b.n[0]
|
||||
mov r15,[rsi+1*8]
|
||||
mov r10,rbp ; load modulus into target register for t0
|
||||
mov r8,rax
|
||||
and r10,rax ; only need lower qword of c
|
||||
shrd r8,rdx,52
|
||||
xor r9,r9 ; c < 2^64, so we ditch the HO part
|
||||
|
||||
;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
|
||||
mov rax,[rdi+0*8]
|
||||
mul r15
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+1*8]
|
||||
mul r14
|
||||
mov r11,rbp
|
||||
mov rbx,[rsi+2*8]
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
and r11,r8
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
;; c+=a.n[0 1 2] * b.n[2 1 0]
|
||||
mov rax,[rdi+0*8]
|
||||
mul rbx
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+1*8]
|
||||
mul r15
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+2*8]
|
||||
mul r14
|
||||
mov r12,rbp
|
||||
mov rcx,[rsi+3*8]
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
and r12,r8
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
|
||||
mov rax,[rdi+0*8]
|
||||
mul rcx
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+1*8]
|
||||
mul rbx
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+2*8]
|
||||
mul r15
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+3*8]
|
||||
mul r14
|
||||
mov r13,rbp
|
||||
mov rsi,[rsi+4*8] ; load b.n[4] and destroy pointer
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
and r13,r8
|
||||
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
|
||||
;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0]
|
||||
mov rax,[rdi+0*8]
|
||||
mul rsi
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+1*8]
|
||||
mul rcx
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+2*8]
|
||||
mul rbx
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+3*8]
|
||||
mul r15
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+4*8]
|
||||
mul r14
|
||||
mov r14,rbp ; load modulus into t4 and destroy a.n[0]
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
and r14,r8
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
;; c+=a.n[1 2 3 4] * b.n[4 3 2 1]
|
||||
mov rax,[rdi+1*8]
|
||||
mul rsi
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+2*8]
|
||||
mul rcx
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+3*8]
|
||||
mul rbx
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+4*8]
|
||||
mul r15
|
||||
mov r15,rbp
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
and r15,r8
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
;; c+=a.n[2 3 4] * b.n[4 3 2]
|
||||
mov rax,[rdi+2*8]
|
||||
mul rsi
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+3*8]
|
||||
mul rcx
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+4*8]
|
||||
mul rbx
|
||||
mov rbx,rbp
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
and rbx,r8
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
;; c+=a.n[3 4] * b.n[4 3]
|
||||
mov rax,[rdi+3*8]
|
||||
mul rsi
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,[rdi+4*8]
|
||||
mul rcx
|
||||
mov rcx,rbp
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
and rcx,r8
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
;; c+=a.n[4] * b.n[4]
|
||||
mov rax,[rdi+4*8]
|
||||
mul rsi
|
||||
;; mov rbp,rbp ; modulus already there!
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
and rbp,r8
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
mov rsi,r8 ; load c into t9 and destroy b.n[4]
|
||||
|
||||
;; *******************************************************
|
||||
common_exit_norm:
|
||||
mov rdi,01000003D10h ; load constant
|
||||
|
||||
mov rax,r15 ; get t5
|
||||
mul rdi
|
||||
add rax,r10 ; +t0
|
||||
adc rdx,0
|
||||
mov r10,0FFFFFFFFFFFFFh ; modulus. Sadly, we ran out of registers!
|
||||
mov r8,rax ; +c
|
||||
and r10,rax
|
||||
shrd r8,rdx,52
|
||||
xor r9,r9
|
||||
|
||||
mov rax,rbx ; get t6
|
||||
mul rdi
|
||||
add rax,r11 ; +t1
|
||||
adc rdx,0
|
||||
mov r11,0FFFFFFFFFFFFFh ; modulus
|
||||
add r8,rax ; +c
|
||||
adc r9,rdx
|
||||
and r11,r8
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
mov rax,rcx ; get t7
|
||||
mul rdi
|
||||
add rax,r12 ; +t2
|
||||
adc rdx,0
|
||||
pop rbx ; retrieve pointer to this.n
|
||||
mov r12,0FFFFFFFFFFFFFh ; modulus
|
||||
add r8,rax ; +c
|
||||
adc r9,rdx
|
||||
and r12,r8
|
||||
mov [rbx+2*8],r12 ; mov into this.n[2]
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
mov rax,rbp ; get t8
|
||||
mul rdi
|
||||
add rax,r13 ; +t3
|
||||
adc rdx,0
|
||||
mov r13,0FFFFFFFFFFFFFh ; modulus
|
||||
add r8,rax ; +c
|
||||
adc r9,rdx
|
||||
and r13,r8
|
||||
mov [rbx+3*8],r13 ; -> this.n[3]
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
mov rax,rsi ; get t9
|
||||
mul rdi
|
||||
add rax,r14 ; +t4
|
||||
adc rdx,0
|
||||
mov r14,0FFFFFFFFFFFFh ; !!!
|
||||
add r8,rax ; +c
|
||||
adc r9,rdx
|
||||
and r14,r8
|
||||
mov [rbx+4*8],r14 ; -> this.n[4]
|
||||
shrd r8,r9,48 ; !!!
|
||||
xor r9,r9
|
||||
|
||||
mov rax,01000003D1h
|
||||
mul r8
|
||||
add rax,r10
|
||||
adc rdx,0
|
||||
mov r10,0FFFFFFFFFFFFFh ; modulus
|
||||
mov r8,rax
|
||||
and rax,r10
|
||||
shrd r8,rdx,52
|
||||
mov [rbx+0*8],rax ; -> this.n[0]
|
||||
add r8,r11
|
||||
mov [rbx+1*8],r8 ; -> this.n[1]
|
||||
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rbx
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;; PROC ExSetSquare
|
||||
;; Register Layout:
|
||||
;; INPUT: rdi = a.n
|
||||
;; rsi = this.a
|
||||
;; INTERNAL: rdx:rax = multiplication accumulator
|
||||
;; r9:r8 = c
|
||||
;; r10-r13 = t0-t3
|
||||
;; r14 = a.n[0] / t4
|
||||
;; r15 = a.n[1] / t5
|
||||
;; rbx = a.n[2] / t6
|
||||
;; rcx = a.n[3] / t7
|
||||
;; rbp = 0FFFFFFFFFFFFFh / t8
|
||||
;; rsi = a.n[4] / t9
|
||||
GLOBAL SYM(secp256k1_fe_sqr_inner)
|
||||
ALIGN 32
|
||||
SYM(secp256k1_fe_sqr_inner):
|
||||
push rbp
|
||||
push rbx
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
push rsi
|
||||
mov rbp,0FFFFFFFFFFFFFh
|
||||
|
||||
;; c=a.n[0] * a.n[0]
|
||||
mov r14,[rdi+0*8] ; r14=a.n[0]
|
||||
mov r10,rbp ; modulus
|
||||
mov rax,r14
|
||||
mul rax
|
||||
mov r15,[rdi+1*8] ; a.n[1]
|
||||
add r14,r14 ; r14=2*a.n[0]
|
||||
mov r8,rax
|
||||
and r10,rax ; only need lower qword
|
||||
shrd r8,rdx,52
|
||||
xor r9,r9
|
||||
|
||||
;; c+=2*a.n[0] * a.n[1]
|
||||
mov rax,r14 ; r14=2*a.n[0]
|
||||
mul r15
|
||||
mov rbx,[rdi+2*8] ; rbx=a.n[2]
|
||||
mov r11,rbp ; modulus
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
and r11,r8
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
|
||||
mov rax,r14
|
||||
mul rbx
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,r15
|
||||
mov r12,rbp ; modulus
|
||||
mul rax
|
||||
mov rcx,[rdi+3*8] ; rcx=a.n[3]
|
||||
add r15,r15 ; r15=a.n[1]*2
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
and r12,r8 ; only need lower dword
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
|
||||
mov rax,r14
|
||||
mul rcx
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,r15 ; rax=2*a.n[1]
|
||||
mov r13,rbp ; modulus
|
||||
mul rbx
|
||||
mov rsi,[rdi+4*8] ; rsi=a.n[4]
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
and r13,r8
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
|
||||
mov rax,r14 ; last time we need 2*a.n[0]
|
||||
mul rsi
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,r15
|
||||
mul rcx
|
||||
mov r14,rbp ; modulus
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,rbx
|
||||
mul rax
|
||||
add rbx,rbx ; rcx=2*a.n[2]
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
and r14,r8
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
|
||||
mov rax,r15 ; last time we need 2*a.n[1]
|
||||
mul rsi
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,rbx
|
||||
mul rcx
|
||||
mov r15,rbp ; modulus
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
and r15,r8
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
|
||||
mov rax,rbx ; last time we need 2*a.n[2]
|
||||
mul rsi
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
|
||||
mov rax,rcx ; a.n[3]
|
||||
mul rax
|
||||
mov rbx,rbp ; modulus
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
and rbx,r8 ; only need lower dword
|
||||
lea rax,[2*rcx]
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
;; c+=2*a.n[3]*a.n[4]
|
||||
mul rsi
|
||||
mov rcx,rbp ; modulus
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
and rcx,r8 ; only need lower dword
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
;; c+=a.n[4]*a.n[4]
|
||||
mov rax,rsi
|
||||
mul rax
|
||||
;; mov rbp,rbp ; modulus is already there!
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
and rbp,r8
|
||||
shrd r8,r9,52
|
||||
xor r9,r9
|
||||
|
||||
mov rsi,r8
|
||||
|
||||
;; *******************************************************
|
||||
jmp common_exit_norm
|
||||
end
|
||||
|
||||
|
|
@ -1,13 +1,502 @@
|
|||
/**********************************************************************
|
||||
* Copyright (c) 2013 Pieter Wuille *
|
||||
* Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille *
|
||||
* Distributed under the MIT software license, see the accompanying *
|
||||
* file COPYING or http://www.opensource.org/licenses/mit-license.php.*
|
||||
**********************************************************************/
|
||||
|
||||
/**
|
||||
* Changelog:
|
||||
* - March 2013, Diederik Huys: original version
|
||||
* - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
|
||||
* - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
|
||||
*/
|
||||
|
||||
#ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_
|
||||
#define _SECP256K1_FIELD_INNER5X52_IMPL_H_
|
||||
|
||||
void __attribute__ ((sysv_abi)) secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r);
|
||||
void __attribute__ ((sysv_abi)) secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r);
|
||||
SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
|
||||
/**
|
||||
* Registers: rdx:rax = multiplication accumulator
|
||||
* r9:r8 = c
|
||||
* r15:rcx = d
|
||||
* r10-r14 = a0-a4
|
||||
* rbx = b
|
||||
* %2 = r
|
||||
* %0 = a / t?
|
||||
* rbp = R (0x1000003d10)
|
||||
*/
|
||||
__asm__ __volatile__(
|
||||
"pushq %%rbp\n"
|
||||
|
||||
"movq 0(%0),%%r10\n"
|
||||
"movq 8(%0),%%r11\n"
|
||||
"movq 16(%0),%%r12\n"
|
||||
"movq 24(%0),%%r13\n"
|
||||
"movq 32(%0),%%r14\n"
|
||||
"movq $0x1000003d10,%%rbp\n"
|
||||
|
||||
/* d += a3 * b0 */
|
||||
"movq 0(%%rbx),%%rax\n"
|
||||
"mulq %%r13\n"
|
||||
"movq %%rax,%%rcx\n"
|
||||
"movq %%rdx,%%r15\n"
|
||||
/* d += a2 * b1 */
|
||||
"movq 8(%%rbx),%%rax\n"
|
||||
"mulq %%r12\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* d += a1 * b2 */
|
||||
"movq 16(%%rbx),%%rax\n"
|
||||
"mulq %%r11\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* d = a0 * b3 */
|
||||
"movq 24(%%rbx),%%rax\n"
|
||||
"mulq %%r10\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* c = a4 * b4 */
|
||||
"movq 32(%%rbx),%%rax\n"
|
||||
"mulq %%r14\n"
|
||||
"movq %%rax,%%r8\n"
|
||||
"movq %%rdx,%%r9\n"
|
||||
/* d += (c & M) * R */
|
||||
"movq $0xfffffffffffff,%%rdx\n"
|
||||
"andq %%rdx,%%rax\n"
|
||||
"mulq %%rbp\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* c >>= 52 (%%r8 only) */
|
||||
"shrdq $52,%%r9,%%r8\n"
|
||||
/* t3 (stack) = d & M */
|
||||
"movq %%rcx,%0\n"
|
||||
"movq $0xfffffffffffff,%%rdx\n"
|
||||
"andq %%rdx,%0\n"
|
||||
"pushq %0\n"
|
||||
/* d >>= 52 */
|
||||
"shrdq $52,%%r15,%%rcx\n"
|
||||
"xorq %%r15,%%r15\n"
|
||||
/* d += a4 * b0 */
|
||||
"movq 0(%%rbx),%%rax\n"
|
||||
"mulq %%r14\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* d += a3 * b1 */
|
||||
"movq 8(%%rbx),%%rax\n"
|
||||
"mulq %%r13\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* d += a2 * b2 */
|
||||
"movq 16(%%rbx),%%rax\n"
|
||||
"mulq %%r12\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* d += a1 * b3 */
|
||||
"movq 24(%%rbx),%%rax\n"
|
||||
"mulq %%r11\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* d += a0 * b4 */
|
||||
"movq 32(%%rbx),%%rax\n"
|
||||
"mulq %%r10\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* d += c * R */
|
||||
"movq %%r8,%%rax\n"
|
||||
"mulq %%rbp\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* t4 = d & M (%0) */
|
||||
"movq %%rcx,%0\n"
|
||||
"movq $0xfffffffffffff,%%rdx\n"
|
||||
"andq %%rdx,%0\n"
|
||||
/* d >>= 52 */
|
||||
"shrdq $52,%%r15,%%rcx\n"
|
||||
"xorq %%r15,%%r15\n"
|
||||
/* tx = t4 >> 48 (%%rbp, overwrites R) */
|
||||
"movq %0,%%rbp\n"
|
||||
"shrq $48,%%rbp\n"
|
||||
/* t4 &= (M >> 4) (stack) */
|
||||
"movq $0xffffffffffff,%%rax\n"
|
||||
"andq %%rax,%0\n"
|
||||
"pushq %0\n"
|
||||
/* c = a0 * b0 */
|
||||
"movq 0(%%rbx),%%rax\n"
|
||||
"mulq %%r10\n"
|
||||
"movq %%rax,%%r8\n"
|
||||
"movq %%rdx,%%r9\n"
|
||||
/* d += a4 * b1 */
|
||||
"movq 8(%%rbx),%%rax\n"
|
||||
"mulq %%r14\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* d += a3 * b2 */
|
||||
"movq 16(%%rbx),%%rax\n"
|
||||
"mulq %%r13\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* d += a2 * b3 */
|
||||
"movq 24(%%rbx),%%rax\n"
|
||||
"mulq %%r12\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* d += a1 * b4 */
|
||||
"movq 32(%%rbx),%%rax\n"
|
||||
"mulq %%r11\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* u0 = d & M (%0) */
|
||||
"movq %%rcx,%0\n"
|
||||
"movq $0xfffffffffffff,%%rdx\n"
|
||||
"andq %%rdx,%0\n"
|
||||
/* d >>= 52 */
|
||||
"shrdq $52,%%r15,%%rcx\n"
|
||||
"xorq %%r15,%%r15\n"
|
||||
/* u0 = (u0 << 4) | tx (%0) */
|
||||
"shlq $4,%0\n"
|
||||
"orq %%rbp,%0\n"
|
||||
/* c += u0 * (R >> 4) */
|
||||
"movq $0x1000003d1,%%rax\n"
|
||||
"mulq %0\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* r[0] = c & M */
|
||||
"movq %%r8,%%rax\n"
|
||||
"movq $0xfffffffffffff,%%rdx\n"
|
||||
"andq %%rdx,%%rax\n"
|
||||
"movq %%rax,0(%2)\n"
|
||||
/* c >>= 52 */
|
||||
"shrdq $52,%%r9,%%r8\n"
|
||||
"xorq %%r9,%%r9\n"
|
||||
/* c += a1 * b0 */
|
||||
"movq 0(%%rbx),%%rax\n"
|
||||
"mulq %%r11\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* c += a0 * b1 */
|
||||
"movq 8(%%rbx),%%rax\n"
|
||||
"mulq %%r10\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* d += a4 * b2 */
|
||||
"movq 16(%%rbx),%%rax\n"
|
||||
"mulq %%r14\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* d += a3 * b3 */
|
||||
"movq 24(%%rbx),%%rax\n"
|
||||
"mulq %%r13\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* d += a2 * b4 */
|
||||
"movq 32(%%rbx),%%rax\n"
|
||||
"mulq %%r12\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* restore rdp = R */
|
||||
"movq $0x1000003d10,%%rbp\n"
|
||||
/* c += (d & M) * R */
|
||||
"movq %%rcx,%%rax\n"
|
||||
"movq $0xfffffffffffff,%%rdx\n"
|
||||
"andq %%rdx,%%rax\n"
|
||||
"mulq %%rbp\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* d >>= 52 */
|
||||
"shrdq $52,%%r15,%%rcx\n"
|
||||
"xorq %%r15,%%r15\n"
|
||||
/* r[1] = c & M */
|
||||
"movq %%r8,%%rax\n"
|
||||
"movq $0xfffffffffffff,%%rdx\n"
|
||||
"andq %%rdx,%%rax\n"
|
||||
"movq %%rax,8(%2)\n"
|
||||
/* c >>= 52 */
|
||||
"shrdq $52,%%r9,%%r8\n"
|
||||
"xorq %%r9,%%r9\n"
|
||||
/* c += a2 * b0 */
|
||||
"movq 0(%%rbx),%%rax\n"
|
||||
"mulq %%r12\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* c += a1 * b1 */
|
||||
"movq 8(%%rbx),%%rax\n"
|
||||
"mulq %%r11\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* c += a0 * b2 (last use of %%r10 = a0) */
|
||||
"movq 16(%%rbx),%%rax\n"
|
||||
"mulq %%r10\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* fetch t3 (%%r10, overwrites a0),t4 (%0) */
|
||||
"popq %0\n"
|
||||
"popq %%r10\n"
|
||||
/* d += a4 * b3 */
|
||||
"movq 24(%%rbx),%%rax\n"
|
||||
"mulq %%r14\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* d += a3 * b4 */
|
||||
"movq 32(%%rbx),%%rax\n"
|
||||
"mulq %%r13\n"
|
||||
"addq %%rax,%%rcx\n"
|
||||
"adcq %%rdx,%%r15\n"
|
||||
/* c += (d & M) * R */
|
||||
"movq %%rcx,%%rax\n"
|
||||
"movq $0xfffffffffffff,%%rdx\n"
|
||||
"andq %%rdx,%%rax\n"
|
||||
"mulq %%rbp\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* d >>= 52 (%%rcx only) */
|
||||
"shrdq $52,%%r15,%%rcx\n"
|
||||
/* r[2] = c & M */
|
||||
"movq %%r8,%%rax\n"
|
||||
"movq $0xfffffffffffff,%%rdx\n"
|
||||
"andq %%rdx,%%rax\n"
|
||||
"movq %%rax,16(%2)\n"
|
||||
/* c >>= 52 */
|
||||
"shrdq $52,%%r9,%%r8\n"
|
||||
"xorq %%r9,%%r9\n"
|
||||
/* c += t3 */
|
||||
"addq %%r10,%%r8\n"
|
||||
/* c += d * R */
|
||||
"movq %%rcx,%%rax\n"
|
||||
"mulq %%rbp\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* r[3] = c & M */
|
||||
"movq %%r8,%%rax\n"
|
||||
"movq $0xfffffffffffff,%%rdx\n"
|
||||
"andq %%rdx,%%rax\n"
|
||||
"movq %%rax,24(%2)\n"
|
||||
/* c >>= 52 (%%r8 only) */
|
||||
"shrdq $52,%%r9,%%r8\n"
|
||||
/* c += t4 (%%r8 only) */
|
||||
"addq %0,%%r8\n"
|
||||
/* r[4] = c */
|
||||
"movq %%r8,32(%2)\n"
|
||||
|
||||
"popq %%rbp\n"
|
||||
: "+S"(a)
|
||||
: "b"(b), "D"(r)
|
||||
: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
|
||||
/**
|
||||
* Registers: rdx:rax = multiplication accumulator
|
||||
* r9:r8 = c
|
||||
* rcx:rbx = d
|
||||
* r10-r14 = a0-a4
|
||||
* r15 = M (0xfffffffffffff)
|
||||
* %1 = r
|
||||
* %0 = a / t?
|
||||
* rbp = R (0x1000003d10)
|
||||
*/
|
||||
__asm__ __volatile__(
|
||||
"pushq %%rbp\n"
|
||||
|
||||
"movq 0(%0),%%r10\n"
|
||||
"movq 8(%0),%%r11\n"
|
||||
"movq 16(%0),%%r12\n"
|
||||
"movq 24(%0),%%r13\n"
|
||||
"movq 32(%0),%%r14\n"
|
||||
"movq $0x1000003d10,%%rbp\n"
|
||||
"movq $0xfffffffffffff,%%r15\n"
|
||||
|
||||
/* d = (a0*2) * a3 */
|
||||
"leaq (%%r10,%%r10,1),%%rax\n"
|
||||
"mulq %%r13\n"
|
||||
"movq %%rax,%%rbx\n"
|
||||
"movq %%rdx,%%rcx\n"
|
||||
/* d += (a1*2) * a2 */
|
||||
"leaq (%%r11,%%r11,1),%%rax\n"
|
||||
"mulq %%r12\n"
|
||||
"addq %%rax,%%rbx\n"
|
||||
"adcq %%rdx,%%rcx\n"
|
||||
/* c = a4 * a4 */
|
||||
"movq %%r14,%%rax\n"
|
||||
"mulq %%r14\n"
|
||||
"movq %%rax,%%r8\n"
|
||||
"movq %%rdx,%%r9\n"
|
||||
/* d += (c & M) * R */
|
||||
"andq %%r15,%%rax\n"
|
||||
"mulq %%rbp\n"
|
||||
"addq %%rax,%%rbx\n"
|
||||
"adcq %%rdx,%%rcx\n"
|
||||
/* c >>= 52 (%%r8 only) */
|
||||
"shrdq $52,%%r9,%%r8\n"
|
||||
/* t3 (stack) = d & M */
|
||||
"movq %%rbx,%0\n"
|
||||
"andq %%r15,%0\n"
|
||||
"pushq %0\n"
|
||||
/* d >>= 52 */
|
||||
"shrdq $52,%%rcx,%%rbx\n"
|
||||
"xorq %%rcx,%%rcx\n"
|
||||
/* a4 *= 2 */
|
||||
"addq %%r14,%%r14\n"
|
||||
/* d += a0 * a4 */
|
||||
"movq %%r10,%%rax\n"
|
||||
"mulq %%r14\n"
|
||||
"addq %%rax,%%rbx\n"
|
||||
"adcq %%rdx,%%rcx\n"
|
||||
/* d+= (a1*2) * a3 */
|
||||
"leaq (%%r11,%%r11,1),%%rax\n"
|
||||
"mulq %%r13\n"
|
||||
"addq %%rax,%%rbx\n"
|
||||
"adcq %%rdx,%%rcx\n"
|
||||
/* d += a2 * a2 */
|
||||
"movq %%r12,%%rax\n"
|
||||
"mulq %%r12\n"
|
||||
"addq %%rax,%%rbx\n"
|
||||
"adcq %%rdx,%%rcx\n"
|
||||
/* d += c * R */
|
||||
"movq %%r8,%%rax\n"
|
||||
"mulq %%rbp\n"
|
||||
"addq %%rax,%%rbx\n"
|
||||
"adcq %%rdx,%%rcx\n"
|
||||
/* t4 = d & M (%0) */
|
||||
"movq %%rbx,%0\n"
|
||||
"andq %%r15,%0\n"
|
||||
/* d >>= 52 */
|
||||
"shrdq $52,%%rcx,%%rbx\n"
|
||||
"xorq %%rcx,%%rcx\n"
|
||||
/* tx = t4 >> 48 (%%rbp, overwrites constant) */
|
||||
"movq %0,%%rbp\n"
|
||||
"shrq $48,%%rbp\n"
|
||||
/* t4 &= (M >> 4) (stack) */
|
||||
"movq $0xffffffffffff,%%rax\n"
|
||||
"andq %%rax,%0\n"
|
||||
"pushq %0\n"
|
||||
/* c = a0 * a0 */
|
||||
"movq %%r10,%%rax\n"
|
||||
"mulq %%r10\n"
|
||||
"movq %%rax,%%r8\n"
|
||||
"movq %%rdx,%%r9\n"
|
||||
/* d += a1 * a4 */
|
||||
"movq %%r11,%%rax\n"
|
||||
"mulq %%r14\n"
|
||||
"addq %%rax,%%rbx\n"
|
||||
"adcq %%rdx,%%rcx\n"
|
||||
/* d += (a2*2) * a3 */
|
||||
"leaq (%%r12,%%r12,1),%%rax\n"
|
||||
"mulq %%r13\n"
|
||||
"addq %%rax,%%rbx\n"
|
||||
"adcq %%rdx,%%rcx\n"
|
||||
/* u0 = d & M (%0) */
|
||||
"movq %%rbx,%0\n"
|
||||
"andq %%r15,%0\n"
|
||||
/* d >>= 52 */
|
||||
"shrdq $52,%%rcx,%%rbx\n"
|
||||
"xorq %%rcx,%%rcx\n"
|
||||
/* u0 = (u0 << 4) | tx (%0) */
|
||||
"shlq $4,%0\n"
|
||||
"orq %%rbp,%0\n"
|
||||
/* c += u0 * (R >> 4) */
|
||||
"movq $0x1000003d1,%%rax\n"
|
||||
"mulq %0\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* r[0] = c & M */
|
||||
"movq %%r8,%%rax\n"
|
||||
"andq %%r15,%%rax\n"
|
||||
"movq %%rax,0(%1)\n"
|
||||
/* c >>= 52 */
|
||||
"shrdq $52,%%r9,%%r8\n"
|
||||
"xorq %%r9,%%r9\n"
|
||||
/* a0 *= 2 */
|
||||
"addq %%r10,%%r10\n"
|
||||
/* c += a0 * a1 */
|
||||
"movq %%r10,%%rax\n"
|
||||
"mulq %%r11\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* d += a2 * a4 */
|
||||
"movq %%r12,%%rax\n"
|
||||
"mulq %%r14\n"
|
||||
"addq %%rax,%%rbx\n"
|
||||
"adcq %%rdx,%%rcx\n"
|
||||
/* d += a3 * a3 */
|
||||
"movq %%r13,%%rax\n"
|
||||
"mulq %%r13\n"
|
||||
"addq %%rax,%%rbx\n"
|
||||
"adcq %%rdx,%%rcx\n"
|
||||
/* load R in %%rbp */
|
||||
"movq $0x1000003d10,%%rbp\n"
|
||||
/* c += (d & M) * R */
|
||||
"movq %%rbx,%%rax\n"
|
||||
"andq %%r15,%%rax\n"
|
||||
"mulq %%rbp\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* d >>= 52 */
|
||||
"shrdq $52,%%rcx,%%rbx\n"
|
||||
"xorq %%rcx,%%rcx\n"
|
||||
/* r[1] = c & M */
|
||||
"movq %%r8,%%rax\n"
|
||||
"andq %%r15,%%rax\n"
|
||||
"movq %%rax,8(%1)\n"
|
||||
/* c >>= 52 */
|
||||
"shrdq $52,%%r9,%%r8\n"
|
||||
"xorq %%r9,%%r9\n"
|
||||
/* c += a0 * a2 (last use of %%r10) */
|
||||
"movq %%r10,%%rax\n"
|
||||
"mulq %%r12\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* fetch t3 (%%r10, overwrites a0),t4 (%0) */
|
||||
"popq %0\n"
|
||||
"popq %%r10\n"
|
||||
/* c += a1 * a1 */
|
||||
"movq %%r11,%%rax\n"
|
||||
"mulq %%r11\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* d += a3 * a4 */
|
||||
"movq %%r13,%%rax\n"
|
||||
"mulq %%r14\n"
|
||||
"addq %%rax,%%rbx\n"
|
||||
"adcq %%rdx,%%rcx\n"
|
||||
/* c += (d & M) * R */
|
||||
"movq %%rbx,%%rax\n"
|
||||
"andq %%r15,%%rax\n"
|
||||
"mulq %%rbp\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* d >>= 52 (%%rbx only) */
|
||||
"shrdq $52,%%rcx,%%rbx\n"
|
||||
/* r[2] = c & M */
|
||||
"movq %%r8,%%rax\n"
|
||||
"andq %%r15,%%rax\n"
|
||||
"movq %%rax,16(%1)\n"
|
||||
/* c >>= 52 */
|
||||
"shrdq $52,%%r9,%%r8\n"
|
||||
"xorq %%r9,%%r9\n"
|
||||
/* c += t3 */
|
||||
"addq %%r10,%%r8\n"
|
||||
/* c += d * R */
|
||||
"movq %%rbx,%%rax\n"
|
||||
"mulq %%rbp\n"
|
||||
"addq %%rax,%%r8\n"
|
||||
"adcq %%rdx,%%r9\n"
|
||||
/* r[3] = c & M */
|
||||
"movq %%r8,%%rax\n"
|
||||
"andq %%r15,%%rax\n"
|
||||
"movq %%rax,24(%1)\n"
|
||||
/* c >>= 52 (%%r8 only) */
|
||||
"shrdq $52,%%r9,%%r8\n"
|
||||
/* c += t4 (%%r8 only) */
|
||||
"addq %0,%%r8\n"
|
||||
/* r[4] = c */
|
||||
"movq %%r8,32(%1)\n"
|
||||
|
||||
"popq %%rbp\n"
|
||||
: "+S"(a)
|
||||
: "D"(r)
|
||||
: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -255,7 +255,7 @@ static void secp256k1_fe_mul(secp256k1_fe_t *r, const secp256k1_fe_t *a, const s
|
|||
secp256k1_fe_verify(b);
|
||||
VERIFY_CHECK(r != b);
|
||||
#endif
|
||||
secp256k1_fe_mul_inner(a->n, b->n, r->n);
|
||||
secp256k1_fe_mul_inner(r->n, a->n, b->n);
|
||||
#ifdef VERIFY
|
||||
r->magnitude = 1;
|
||||
r->normalized = 0;
|
||||
|
@ -268,7 +268,7 @@ static void secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
|
|||
VERIFY_CHECK(a->magnitude <= 8);
|
||||
secp256k1_fe_verify(a);
|
||||
#endif
|
||||
secp256k1_fe_sqr_inner(a->n, r->n);
|
||||
secp256k1_fe_sqr_inner(r->n, a->n);
|
||||
#ifdef VERIFY
|
||||
r->magnitude = 1;
|
||||
r->normalized = 0;
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
#define VERIFY_BITS(x, n) do { } while(0)
|
||||
#endif
|
||||
|
||||
SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b, uint64_t *r) {
|
||||
SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
|
||||
VERIFY_BITS(a[0], 56);
|
||||
VERIFY_BITS(a[1], 56);
|
||||
VERIFY_BITS(a[2], 56);
|
||||
|
@ -152,7 +152,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uin
|
|||
/* [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
|
||||
}
|
||||
|
||||
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r) {
|
||||
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
|
||||
VERIFY_BITS(a[0], 56);
|
||||
VERIFY_BITS(a[1], 56);
|
||||
VERIFY_BITS(a[2], 56);
|
||||
|
|
Loading…
Reference in New Issue