Merge pull request #11 from PSYCPU/master
1st assembly version of 5x64 field code
This commit is contained in:
commit
2ce0e50af3
19
Makefile
19
Makefile
|
@ -8,11 +8,11 @@ JAVA_FILES := src/java/org_bitcoin_NativeSecp256k1.h src/java/org_bitcoin_Native
|
|||
OBJS :=
|
||||
|
||||
ifeq ($(USE_ASM), 1)
|
||||
OBJS := $(OBJS) obj/field_5x52_asm.o
|
||||
OBJS := $(OBJS) obj/field_5x$(HAVE_LIMB)_asm.o
|
||||
endif
|
||||
STD="gnu99"
# Optimisation level used by every compile rule as "-<OPTLEVEL>".
# Default to O2 (the level the rules hard-coded before OPTLEVEL existed) so
# that a config.mk which does not set OPTLEVEL does not leave a bare "-" on
# the compiler command line. An OPTLEVEL= line in config.mk or on the make
# command line still takes precedence.
OPTLEVEL ?= O2
|
||||
|
||||
default: tests libsecp256k1.a libsecp256k1.so
|
||||
./tests
|
||||
|
||||
clean:
|
||||
rm -rf obj/*.o bench tests *.a *.so config.mk
|
||||
|
@ -20,18 +20,21 @@ clean:
|
|||
obj/field_5x52_asm.o: src/field_5x52_asm.asm
|
||||
$(YASM) -f elf64 -o obj/field_5x52_asm.o src/field_5x52_asm.asm
|
||||
|
||||
obj/field_5x64_asm.o: src/field_5x64_asm.asm
|
||||
$(YASM) -f elf64 -o obj/field_5x64_asm.o src/field_5x64_asm.asm
|
||||
|
||||
obj/secp256k1.o: $(FILES) src/secp256k1.c include/secp256k1.h
|
||||
$(CC) -fPIC -std=c99 $(CFLAGS) $(CFLAGS_EXTRA) -DNDEBUG -O2 src/secp256k1.c -c -o obj/secp256k1.o
|
||||
$(CC) -fPIC -std=$(STD) $(CFLAGS) $(CFLAGS_EXTRA) -DNDEBUG -$(OPTLEVEL) src/secp256k1.c -c -o obj/secp256k1.o
|
||||
|
||||
bench: $(FILES) src/bench.c $(OBJS)
|
||||
$(CC) -fPIC -std=c99 $(CFLAGS) $(CFLAGS_EXTRA) $(CFLAGS_TEST_EXTRA) -DNDEBUG -O2 src/bench.c $(OBJS) $(LDFLAGS_EXTRA) $(LDFLAGS_TEST_EXTRA) -o bench
|
||||
$(CC) -fPIC -std=$(STD) $(CFLAGS) $(CFLAGS_EXTRA) $(CFLAGS_TEST_EXTRA) -DNDEBUG -$(OPTLEVEL) src/bench.c $(OBJS) $(LDFLAGS_EXTRA) $(LDFLAGS_TEST_EXTRA) -o bench
|
||||
|
||||
tests: $(FILES) src/tests.c $(OBJS)
|
||||
$(CC) -std=c99 $(CFLAGS) $(CFLAGS_EXTRA) $(CFLAGS_TEST_EXTRA) -DVERIFY -fstack-protector-all -O2 -ggdb3 src/tests.c $(OBJS) $(LDFLAGS_EXTRA) $(LDFLAGS_TEST_EXTRA) -o tests
|
||||
$(CC) -std=$(STD) $(CFLAGS) $(CFLAGS_EXTRA) $(CFLAGS_TEST_EXTRA) -DVERIFY -fstack-protector-all -$(OPTLEVEL) -ggdb3 src/tests.c $(OBJS) $(LDFLAGS_EXTRA) $(LDFLAGS_TEST_EXTRA) -o tests
|
||||
|
||||
coverage: $(FILES) src/tests.c $(OBJS)
|
||||
rm -rf tests.gcno tests.gcda tests_cov
|
||||
$(CC) -std=c99 $(CFLAGS) $(CFLAGS_EXTRA) $(CFLAGS_TEST_EXTRA) -DVERIFY --coverage -O0 -g src/tests.c $(OBJS) $(LDFLAGS_EXTRA) $(LDFLAGS_TEST_EXTRA) -o tests_cov
|
||||
$(CC) -std=$(STD) $(CFLAGS) $(CFLAGS_EXTRA) $(CFLAGS_TEST_EXTRA) -DVERIFY --coverage -$(OPTLEVEL) -g src/tests.c $(OBJS) $(LDFLAGS_EXTRA) $(LDFLAGS_TEST_EXTRA) -o tests_cov
|
||||
rm -rf lcov
|
||||
mkdir -p lcov
|
||||
cd lcov; lcov --directory ../ --zerocounters
|
||||
|
@ -43,7 +46,7 @@ libsecp256k1.a: obj/secp256k1.o $(OBJS)
|
|||
$(AR) -rs $@ $(OBJS) obj/secp256k1.o
|
||||
|
||||
libsecp256k1.so: obj/secp256k1.o $(OBJS)
|
||||
$(CC) -std=c99 $(LDFLAGS_EXTRA) $(OBJS) obj/secp256k1.o -shared -o libsecp256k1.so
|
||||
$(CC) -std=$(STD) $(LDFLAGS_EXTRA) $(OBJS) obj/secp256k1.o -shared -o libsecp256k1.so
|
||||
|
||||
libjavasecp256k1.so: $(OBJS) obj/secp256k1.o $(JAVA_FILES)
|
||||
$(CC) -fPIC -std=c99 $(CFLAGS) $(CFLAGS_EXTRA) -DNDEBUG -O2 -I. src/java/org_bitcoin_NativeSecp256k1.c $(LDFLAGS_EXTRA) $(OBJS) obj/secp256k1.o -shared -o libjavasecp256k1.so
|
||||
$(CC) -fPIC -std=$(STD) $(CFLAGS) $(CFLAGS_EXTRA) -DNDEBUG -$(OPTLEVEL) -I. src/java/org_bitcoin_NativeSecp256k1.c $(LDFLAGS_EXTRA) $(OBJS) obj/secp256k1.o -shared -o libjavasecp256k1.so
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
#!/bin/bash
|
||||
echo "Benchmark Results" >output.txt
|
||||
for j in no-yasm yasm; do
|
||||
echo "5x64 $j:" >>output.txt
|
||||
for i in O0 O1 O2 O3; do
|
||||
make clean
|
||||
# Request the 64-bit-limb field explicitly: configure defaults to the 52-bit
# limb implementation, while the results of this loop are recorded as "5x64".
./configure --use-5x64 --$j
|
||||
echo "OPTLEVEL=$i" >>config.mk
|
||||
make bench
|
||||
echo "OPTLEVEL=$i" >>output.txt
|
||||
(time ./bench) |& grep real >>output.txt
|
||||
done
|
||||
done
|
||||
|
|
@ -97,6 +97,9 @@ if [ "$?" = 0 ]; then
|
|||
HAVE_INT128=1
|
||||
fi
|
||||
|
||||
#default limb size
|
||||
HAVE_LIMB=52
|
||||
|
||||
for arg in "$@"; do
|
||||
case "$arg" in
|
||||
--no-yasm)
|
||||
|
@ -107,6 +110,9 @@ for arg in "$@"; do
|
|||
;;
|
||||
--no-openssl)
|
||||
HAVE_OPENSSL=0
|
||||
;;
|
||||
--use-5x64)
|
||||
HAVE_LIMB=64
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
@ -117,10 +123,10 @@ USE_ASM=0
|
|||
|
||||
# select field implementation
|
||||
if [ "$HAVE_YASM" = "1" ]; then
|
||||
CFLAGS_FIELD="-DUSE_FIELD_5X52 -DUSE_FIELD_5X52_ASM"
|
||||
CFLAGS_FIELD="-DUSE_FIELD_5X$HAVE_LIMB -DUSE_FIELD_5X${HAVE_LIMB}_ASM"
|
||||
USE_ASM=1
|
||||
elif [ "$HAVE_INT128" = "1" ]; then
|
||||
CFLAGS_FIELD="-DUSE_FIELD_5X52 -DUSE_FIELD_5X52_INT128"
|
||||
CFLAGS_FIELD="-DUSE_FIELD_5X$HAVE_LIMB -DUSE_FIELD_5X${HAVE_LIMB}_INT128"
|
||||
elif [ "$HAVE_GMP" = "1" ]; then
|
||||
CFLAGS_FIELD="-DUSE_FIELD_GMP"
|
||||
LINK_GMP=1
|
||||
|
@ -165,3 +171,4 @@ echo "CFLAGS_TEST_EXTRA=$CFLAGS_TEST_EXTRA" >> config.mk
|
|||
echo "LDFLAGS_EXTRA=$LDFLAGS_EXTRA" >> config.mk
|
||||
echo "LDFLAGS_TEST_EXTRA=$LDFLAGS_TEST_EXTRA" >> config.mk
|
||||
echo "USE_ASM=$USE_ASM" >>config.mk
|
||||
echo "HAVE_LIMB=$HAVE_LIMB" >>config.mk
|
||||
|
|
|
@ -314,7 +314,7 @@ common_exit_norm:
|
|||
;; rbx = a.n[2] / t6
|
||||
;; rcx = a.n[3] / t7
|
||||
;; rbp = 0FFFFFFFFFFFFFh / t8
|
||||
;; rsi = a.n[4] / a.n[4] /t9
|
||||
;; rsi = a.n[4] / t9
|
||||
GLOBAL secp256k1_fe_sqr_inner
|
||||
ALIGN 32
|
||||
secp256k1_fe_sqr_inner:
|
||||
|
|
|
@ -0,0 +1,333 @@
|
|||
;; Added by Diederik Huys, March 2013
|
||||
;;
|
||||
;; Provided public procedures:
|
||||
;; secp256k1_fe_mul_inner
|
||||
;; secp256k1_fe_sqr_inner
|
||||
;;
|
||||
;; Needed tools: YASM (http://yasm.tortall.net)
|
||||
;;
|
||||
;;
|
||||
|
||||
BITS 64
|
||||
|
||||
COMP_LIMB EQU 000000001000003D1h
|
||||
|
||||
;; Procedure ExSetMult
|
||||
;; Register Layout:
|
||||
;; INPUT: rdi = a->n
|
||||
;; rsi = b->n
|
||||
;; rdx = r->n
|
||||
;;
|
||||
;; INTERNAL: rdx:rax = multiplication accumulator
|
||||
;; r8-r10 = c0-c2
|
||||
;; r11-r15 = b.n[0]-b.n[4] / r3 - r7
|
||||
;; rbx = r0
|
||||
;; rcx = r1
|
||||
;; rbp = r2
|
||||
;;
|
||||
GLOBAL secp256k1_fe_mul_inner
|
||||
ALIGN 32
|
||||
secp256k1_fe_mul_inner:
|
||||
push rbp
|
||||
push rbx
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
push rdx
|
||||
|
||||
mov r11,[rsi+8*0] ; preload b.n[0]
|
||||
|
||||
;; step 1: mul_c2
|
||||
mov rax,[rdi+0*8] ; load a.n[0]
|
||||
mul r11 ; rdx:rax=a.n[0]*b.n[0]
|
||||
mov r12,[rsi+1*8] ; preload b.n[1]
|
||||
mov rbx,rax ; retire LO qword (r[0])
|
||||
mov r8,rdx ; save overflow
|
||||
xor r9,r9 ; overflow HO qwords
|
||||
xor r10,r10
|
||||
|
||||
;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
|
||||
mov rax,[rdi+0*8]
|
||||
mul r12
|
||||
mov r13,[rsi+2*8] ; preload b.n[2]
|
||||
add r8,rax ; still the same :-)
|
||||
adc r9,rdx ;
|
||||
adc r10,0 ; mmm...
|
||||
|
||||
mov rax,[rdi+1*8]
|
||||
mul r11
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
adc r10,0
|
||||
mov rcx,r8 ; retire r[1]
|
||||
xor r8,r8
|
||||
|
||||
;; c+=a.n[0 1 2] * b.n[2 1 0]
|
||||
mov rax,[rdi+0*8]
|
||||
mul r13
|
||||
mov r14,[rsi+3*8] ; preload b.n[3]
|
||||
add r9,rax
|
||||
adc r10,rdx
|
||||
adc r8,0
|
||||
|
||||
mov rax,[rdi+1*8]
|
||||
mul r12
|
||||
add r9,rax
|
||||
adc r10,rdx
|
||||
adc r8,0
|
||||
|
||||
mov rax,[rdi+2*8]
|
||||
mul r11
|
||||
add r9,rax
|
||||
adc r10,rdx
|
||||
adc r8,0
|
||||
mov rbp,r9 ; retire r[2]
|
||||
xor r9,r9
|
||||
|
||||
;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
|
||||
mov rax,[rdi+0*8]
|
||||
mul r14
|
||||
add r10,rax
|
||||
adc r8,rdx
|
||||
adc r9,0
|
||||
|
||||
mov rax,[rdi+1*8]
|
||||
mul r13
|
||||
add r10,rax
|
||||
adc r8,rdx
|
||||
adc r9,0
|
||||
|
||||
mov rax,[rdi+2*8]
|
||||
mul r12
|
||||
add r10,rax
|
||||
adc r8,rdx
|
||||
adc r9,0
|
||||
|
||||
mov rax,[rdi+3*8]
|
||||
mul r11
|
||||
add r10,rax
|
||||
adc r8,rdx
|
||||
adc r9,0
|
||||
mov r11,r10 ; retire r[3]
|
||||
xor r10,r10
|
||||
|
||||
;; c+=a.n[1 2 3] * b.n[3 2 1]
|
||||
mov rax,[rdi+1*8]
|
||||
mul r14
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
adc r10,0
|
||||
|
||||
mov rax,[rdi+2*8]
|
||||
mul r13
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
adc r10,0
|
||||
|
||||
mov rax,[rdi+3*8]
|
||||
mul r12
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
adc r10,0
|
||||
mov r12,r8 ; retire r[4]
|
||||
xor r8,r8
|
||||
|
||||
;; c+=a.n[2 3] * b.n[3 2]
|
||||
mov rax,[rdi+2*8]
|
||||
mul r14
|
||||
add r9,rax ; still the same :-)
|
||||
adc r10,rdx ;
|
||||
adc r8,0 ; mmm...
|
||||
|
||||
mov rax,[rdi+3*8]
|
||||
mul r13
|
||||
add r9,rax
|
||||
adc r10,rdx
|
||||
adc r8,0
|
||||
mov r13,r9 ; retire r[5]
|
||||
xor r9,r9
|
||||
|
||||
;; c+=a.n[3] * b.n[3]
|
||||
mov rax,[rdi+3*8]
|
||||
mul r14
|
||||
add r10,rax
|
||||
adc r8,rdx
|
||||
adc r9,0
|
||||
mov r14,r10
|
||||
mov r15,r8
|
||||
|
||||
|
||||
;; *******************************************************
|
||||
;; common_exit_norm
;; Shared tail of secp256k1_fe_mul_inner (falls through) and
;; secp256k1_fe_sqr_inner (jumps here). Folds the upper half of the
;; 512-bit product into the lower half:
;;     result = r[0..3] + r[4..7] * COMP_LIMB
;; and stores the five resulting limbs through the result pointer.
;;
;; On entry:
;;   rbx = r0, rcx = r1, rbp = r2, r11 = r3
;;   r12 = r4, r13 = r5, r14 = r6, r15 = r7
;;   [rsp] = result pointer, followed by the saved r15,r14,r13,r12,rbx,rbp
;;
;; The carry between limbs is kept in r8 as a full word. Adding the high
;; half of each product directly into the next limb register (as an
;; earlier version did) silently drops the carry-out whenever that limb
;; is close to 2^64.
common_exit_norm:
mov rdi,COMP_LIMB

mov rax,r12 ; get r4
mul rdi ; rdx:rax = r4*COMP_LIMB
add rax,rbx ; +r0
adc rdx,0 ; rdx = carry word (high half < 2^33, cannot wrap)
mov r8,rdx ; keep carry for limb 1
pop rbx ; rbx = result pointer (r0 is consumed)
mov [rbx],rax

mov rax,r13 ; get r5
mul rdi
add rax,rcx ; +r1
adc rdx,0
add rax,r8 ; +carry from limb 0
adc rdx,0
mov r8,rdx ; keep carry for limb 2
mov [rbx+1*8],rax

mov rax,r14 ; get r6
mul rdi
add rax,rbp ; +r2
adc rdx,0
add rax,r8 ; +carry from limb 1
adc rdx,0
mov r8,rdx ; keep carry for limb 3
mov [rbx+2*8],rax

mov rax,r15 ; get r7
mul rdi
add rax,r11 ; +r3
adc rdx,0
add rax,r8 ; +carry from limb 2
adc rdx,0
mov [rbx+3*8],rax
mov [rbx+4*8],rdx ; overflow limb

pop r15 ; restore callee-saved registers in reverse push order
pop r14
pop r13
pop r12
pop rbx
pop rbp
ret
|
||||
|
||||
|
||||
;; PROC ExSetSquare
|
||||
;; Register Layout:
|
||||
;; INPUT: rdi = a.n
|
||||
;; rsi = r->n
|
||||
;; INTERNAL: rdx:rax = multiplication accumulator
|
||||
;; r8-r10 = c
|
||||
;; r11-r15 = a.n[0]-a.n[4] / r3-r7
|
||||
;; rbx = r0
|
||||
;; rcx = r1
|
||||
;; rbp = r2
|
||||
GLOBAL secp256k1_fe_sqr_inner
|
||||
|
||||
ALIGN 32
|
||||
secp256k1_fe_sqr_inner:
|
||||
push rbp
|
||||
push rbx
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
push rsi ; save result pointer (2nd argument); common_exit_norm pops it
|
||||
|
||||
mov r11,[rdi+8*0] ; preload a.n[0]
|
||||
|
||||
;; step 1: mul_c2
|
||||
mov rax,r11 ; load a.n[0]
|
||||
mul rax ; rdx:rax=a.n[0]²
|
||||
mov r12,[rdi+1*8] ; preload a.n[1]
|
||||
mov rbx,rax ; retire LO qword (r[0])
|
||||
mov r8,rdx ; save overflow
|
||||
xor r9,r9 ; overflow HO qwords
|
||||
xor r10,r10
|
||||
|
||||
;; c+=2*a.n[0] * a.n[1]
|
||||
mov rax,r11 ; load a.n[0]
|
||||
mul r12 ; rdx:rax=a.n[0] * a.n[1]
|
||||
mov r13,[rdi+2*8] ; preload a.n[2]
|
||||
add rax,rax ; rdx:rax*=2
|
||||
adc rdx,rdx
|
||||
adc r10,0
|
||||
add r8,rax ; still the same :-)
|
||||
adc r9,rdx ;
|
||||
adc r10,0 ; mmm...
|
||||
|
||||
mov rcx,r8 ; retire r[1]
|
||||
xor r8,r8
|
||||
|
||||
;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
|
||||
mov rax,r11 ; load a.n[0]
|
||||
mul r13 ; * a.n[2]
|
||||
mov r14,[rdi+3*8] ; preload a.n[3]
|
||||
add rax,rax ; rdx:rax*=2
|
||||
adc rdx,rdx
|
||||
adc r8,0
|
||||
add r9,rax
|
||||
adc r10,rdx
|
||||
adc r8,0
|
||||
|
||||
mov rax,r12
|
||||
mul rax
|
||||
add r9,rax
|
||||
adc r10,rdx
|
||||
adc r8,0
|
||||
|
||||
|
||||
mov rbp,r9
|
||||
xor r9,r9
|
||||
|
||||
;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
|
||||
mov rax,r11 ; load a.n[0]
|
||||
mul r14 ; * a.n[3]
|
||||
add rax,rax ; rdx:rax*=2
|
||||
adc rdx,rdx
|
||||
adc r9,0
|
||||
add r10,rax
|
||||
adc r8,rdx
|
||||
adc r9,0
|
||||
|
||||
mov rax,r12 ; load a.n[1]
|
||||
mul r13 ; * a.n[2]
|
||||
add rax,rax
|
||||
adc rdx,rdx
|
||||
adc r9,0
|
||||
add r10,rax
|
||||
adc r8,rdx
|
||||
adc r9,0
|
||||
|
||||
mov r11,r10
|
||||
xor r10,r10
|
||||
|
||||
;; c+=2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
|
||||
mov rax,r12 ; load a.n[1]
|
||||
mul r14 ; * a.n[3]
|
||||
add rax,rax ; rdx:rax*=2
|
||||
adc rdx,rdx
|
||||
adc r10,0
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
adc r10,0
|
||||
|
||||
mov rax,r13
|
||||
mul rax
|
||||
add r8,rax
|
||||
adc r9,rdx
|
||||
adc r10,0
|
||||
|
||||
mov r12,r8
|
||||
xor r8,r8
|
||||
;; c+=2*a.n[2]*a.n[3]
|
||||
mov rax,r13 ; load a.n[2]
|
||||
mul r14 ; * a.n[3]
|
||||
add rax,rax ; rdx:rax*=2
|
||||
adc rdx,rdx
|
||||
adc r8,0
|
||||
add r9,rax
|
||||
adc r10,rdx
|
||||
adc r8,0
|
||||
|
||||
mov r13,r9
|
||||
xor r9,r9 ; clear r9 for reuse as top accumulator word; r13 must keep r[5]
|
||||
|
||||
;; c+=a.n[3]²
|
||||
mov rax,r14
|
||||
mul rax
|
||||
add r10,rax
|
||||
adc r8,rdx
|
||||
adc r9,0
|
||||
|
||||
mov r14,r10
|
||||
mov r15,r8
|
||||
|
||||
jmp common_exit_norm
|
||||
end
|
||||
|
||||
|
|
@ -11,6 +11,7 @@
|
|||
#include "../field.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include "field_5x64_asm.h"
|
||||
|
||||
/** Implements arithmetic modulo FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F,
|
||||
* represented as 4 uint64_t's in base 2^64, and one overflow uint64_t.
|
||||
|
@ -264,9 +265,14 @@ void static inline secp256k1_fe_add(secp256k1_fe_t *r, const secp256k1_fe_t *a)
|
|||
}
|
||||
|
||||
void static secp256k1_fe_mul(secp256k1_fe_t *r, const secp256k1_fe_t *ac, const secp256k1_fe_t *bc) {
|
||||
|
||||
secp256k1_fe_t a = *ac, b = *bc;
|
||||
secp256k1_fe_reduce(&a);
|
||||
secp256k1_fe_reduce(&b);
|
||||
|
||||
#ifdef USE_FIELD_5X64_ASM
|
||||
secp256k1_fe_mul_inner((&a)->n,(&b)->n,r->n);
|
||||
#else
|
||||
uint64_t c1,c2,c3;
|
||||
c3=0;
|
||||
mul_c2(a.n[0], b.n[0], c1, c2);
|
||||
|
@ -303,6 +309,7 @@ void static secp256k1_fe_mul(secp256k1_fe_t *r, const secp256k1_fe_t *ac, const
|
|||
c = (unsigned __int128)r7 * COMP_LIMB + r3 + (c >> 64);
|
||||
r->n[3] = c;
|
||||
r->n[4] = c >> 64;
|
||||
#endif
|
||||
|
||||
#ifdef VERIFY
|
||||
r->normalized = 0;
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
// Copyright (c) 2013 Pieter Wuille
|
||||
// Distributed under the MIT/X11 software license, see the accompanying
|
||||
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
|
||||
|
||||
#ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_
|
||||
#define _SECP256K1_FIELD_INNER5X52_IMPL_H_
|
||||
|
||||
void __attribute__ ((sysv_abi)) secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r);
|
||||
void __attribute__ ((sysv_abi)) secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r);
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue