secp256k1/lin64.asm

432 lines
6.7 KiB
NASM

.x64
QTEST EQU 1
.code
;; Register Layout:
;; INPUT: rdi = a.n
;; rsi = b.n
;; rdx = this.a
;; OUTPUT: [rbx]
;; INTERNAL: rdx:rax = multiplication accumulator
;; rsi = b.n / t9
;; r8:r9 = c
;; r10-r15 = t0-t5
;; rbx = t6
;; rcx = t7
;; rbp = t8
ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
push rdx
mov r14,[rsi+8*0]
;; c=a.n[0] * b.n[0]
mov rax,[rdi+0*8]
mov rbp,0FFFFFFFFFFFFFh
mul r14 ; rsi=b.n[0]
mov r15,[rsi+1*8]
mov r10,rbp
mov r8,rax
and r10,rax ; only need lower qword
shrd r8,rdx,52
xor r9,r9
;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
mov rax,[rdi+0*8]
mul r15 ; b.n[1]
add r8,rax
adc r9,rdx
mov rax,[rdi+1*8]
mul r14 ; b.n[0]
mov r11,rbp
mov rbx,[rsi+2*8]
add r8,rax
adc r9,rdx
and r11,r8
shrd r8,r9,52
xor r9,r9
;; c+=a.n[0 1 2] * b.n[2 1 0]
mov rax,[rdi+0*8]
mul rbx ; b.n[2]
add r8,rax
adc r9,rdx
mov rax,[rdi+1*8]
mul r15 ; b.n[1]
add r8,rax
adc r9,rdx
mov rax,[rdi+2*8]
mul r14
mov r12,rbp ; modulus
mov rcx,[rsi+3*8]
add r8,rax
adc r9,rdx
and r12,r8 ; only need lower dword
shrd r8,r9,52
xor r9,r9
;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
mov rax,[rdi+0*8]
mul rcx ; b.n[3]
add r8,rax
adc r9,rdx
mov rax,[rdi+1*8]
mul rbx ; b.n[2]
add r8,rax
adc r9,rdx
mov rax,[rdi+2*8]
mul r15 ; b.n[1]
add r8,rax
adc r9,rdx
mov rax,[rdi+3*8]
mul r14 ; b.n[0]
mov r13,rbp ; modulus
mov rsi,[rsi+4*8] ; load b.n[4] and destroy pointer
add r8,rax
adc r9,rdx
and r13,r8
shrd r8,r9,52
xor r9,r9
;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0]
mov rax,[rdi+0*8]
mul rsi
add r8,rax
adc r9,rdx
mov rax,[rdi+1*8]
mul rcx
add r8,rax
adc r9,rdx
mov rax,[rdi+2*8]
mul rbx ; b.n[2]
add r8,rax
adc r9,rdx
mov rax,[rdi+3*8]
mul r15 ; b.n[1]
add r8,rax
adc r9,rdx
mov rax,[rdi+4*8]
mul r14 ; b.n[0]
mov r14,rbp ; modulus
add r8,rax
adc r9,rdx
and r14,r8
shrd r8,r9,52
xor r9,r9
;; c+=a.n[1 2 3 4] * b.n[4 3 2 1]
mov rax,[rdi+1*8]
mul rsi
add r8,rax
adc r9,rdx
mov rax,[rdi+2*8]
mul rcx
add r8,rax
adc r9,rdx
mov rax,[rdi+3*8]
mul rbx
add r8,rax
adc r9,rdx
mov rax,[rdi+4*8]
mul r15
mov r15,rbp ; modulus
add r8,rax
adc r9,rdx
and r15,r8
shrd r8,r9,52
xor r9,r9
;; c+=a.n[2 3 4] * b.n[4 3 2]
mov rax,[rdi+2*8]
mul rsi
add r8,rax
adc r9,rdx
mov rax,[rdi+3*8]
mul rcx
add r8,rax
adc r9,rdx
mov rax,[rdi+4*8]
mul rbx
mov rbx,rbp ; modulus
add r8,rax
adc r9,rdx
and rbx,r8 ; only need lower dword
shrd r8,r9,52
xor r9,r9
;; c+=a.n[3 4] * b.n[4 3]
mov rax,[rdi+3*8]
mul rsi
add r8,rax
adc r9,rdx
mov rax,[rdi+4*8]
mul rcx
mov rcx,rbp ; modulus
add r8,rax
adc r9,rdx
and rcx,r8 ; only need lower dword
shrd r8,r9,52
xor r9,r9
;; c+=a.n[4] * b.n[4]
mov rax,[rdi+4*8]
mul rsi
;; mov rbp,rbp ; modulus already there!
add r8,rax
adc r9,rdx
and rbp,r8
shrd r8,r9,52
xor r9,r9
mov rsi,r8
;; *******************************************************
common_exit_norm::
mov rdi,01000003D10h
mov rax,r15 ; get t5
mul rdi
add rax,r10 ; +t0
adc rdx,0
mov r10,0FFFFFFFFFFFFFh ; modulus
mov r8,rax ; +c
and r10,rax
shrd r8,rdx,52
xor r9,r9
mov rax,rbx ; get t6
mul rdi
add rax,r11 ; +t1
adc rdx,0
mov r11,0FFFFFFFFFFFFFh ; modulus
add r8,rax ; +c
adc r9,rdx
and r11,r8
shrd r8,r9,52
xor r9,r9
mov rax,rcx ; get t7
mul rdi
add rax,r12 ; +t2
adc rdx,0
pop rbx ; retrieve pointer to this.a.n
mov r12,0FFFFFFFFFFFFFh ; modulus
add r8,rax ; +c
adc r9,rdx
and r12,r8
mov [rbx+2*8],r12
shrd r8,r9,52
xor r9,r9
mov rax,rbp ; get t8
mul rdi
add rax,r13 ; +t3
adc rdx,0
mov r13,0FFFFFFFFFFFFFh ; modulus
add r8,rax ; +c
adc r9,rdx
and r13,r8
mov [rbx+3*8],r13
shrd r8,r9,52
xor r9,r9
mov rax,rsi ; get t9
mul rdi
add rax,r14 ; +t4
adc rdx,0
mov r14,0FFFFFFFFFFFFh ; !!!
add r8,rax ; +c
adc r9,rdx
and r14,r8
mov [rbx+4*8],r14
shrd r8,r9,48
xor r9,r9
mov rax,01000003D1h
mul r8
add rax,r10
adc rdx,0
mov r10,0FFFFFFFFFFFFFh ; modulus
mov r8,rax
and rax,r10
shrd r8,rdx,52
mov [rbx+0*8],rax
add r8,r11
mov [rbx+1*8],r8
ret
ExSetMult ENDP
;; Register Layout:
;; INPUT: rdi = a.n
;; rsi = this.a
;; OUTPUT: [rsi]
;; INTERNAL: rdx:rax = multiplication accumulator
;; r8:r9 = c
;; r10-r14 = t0-t4
;; r15 = a.n[0]*2 / t5
;; rbx = a.n[1]*2 / t6
;; rcx = a.n[2]*2 / t7
;; rbp = a.n[3]*2 / t8
;; rsi = a.n[4] / t9
ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
push rsi
mov rbp,0FFFFFFFFFFFFFh
;; c=a.n[0] * a.n[0]
mov r14,[rdi+0*8] ; r14=a.n[0]
mov r10,rbp ; modulus
mov rax,r14
mul rax
mov r15,[rdi+1*8] ; a.n[1]
add r14,r14 ; r14=2*a.n[0]
mov r8,rax
and r10,rax ; only need lower qword
shrd r8,rdx,52
xor r9,r9
;; c+=2*a.n[0] * a.n[1]
mov rax,r14 ; r14=2*a.n[0]
mul r15
mov rbx,[rdi+2*8] ; rbx=a.n[2]
mov r11,rbp ; modulus
add r8,rax
adc r9,rdx
and r11,r8
shrd r8,r9,52
xor r9,r9
;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
mov rax,r14
mul rbx
add r8,rax
adc r9,rdx
mov rax,r15
mov r12,rbp ; modulus
mul rax
mov rcx,[rdi+3*8] ; rcx=a.n[3]
add r15,r15 ; r15=a.n[1]*2
add r8,rax
adc r9,rdx
and r12,r8 ; only need lower dword
shrd r8,r9,52
xor r9,r9
;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
mov rax,r14
mul rcx
add r8,rax
adc r9,rdx
mov rax,r15 ; rax=2*a.n[1]
mov r13,rbp ; modulus
mul rbx
mov rsi,[rdi+4*8] ; rsi=a.n[4]
add r8,rax
adc r9,rdx
and r13,r8
shrd r8,r9,52
xor r9,r9
;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
mov rax,r14 ; last time we need 2*a.n[0]
mul rsi
add r8,rax
adc r9,rdx
mov rax,r15
mul rcx
mov r14,rbp ; modulus
add r8,rax
adc r9,rdx
mov rax,rbx
mul rax
add rbx,rbx ; rcx=2*a.n[2]
add r8,rax
adc r9,rdx
and r14,r8
shrd r8,r9,52
xor r9,r9
;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
mov rax,r15 ; last time we need 2*a.n[1]
mul rsi
add r8,rax
adc r9,rdx
mov rax,rbx
mul rcx
mov r15,rbp ; modulus
add r8,rax
adc r9,rdx
and r15,r8
shrd r8,r9,52
xor r9,r9
;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
mov rax,rbx ; last time we need 2*a.n[2]
mul rsi
add r8,rax
adc r9,rdx
mov rax,rcx ; a.n[3]
mul rax
mov rbx,rbp ; modulus
add r8,rax
adc r9,rdx
and rbx,r8 ; only need lower dword
lea rax,[2*rcx]
shrd r8,r9,52
xor r9,r9
;; c+=2*a.n[3]*a.n[4]
mul rsi
mov rcx,rbp ; modulus
add r8,rax
adc r9,rdx
and rcx,r8 ; only need lower dword
shrd r8,r9,52
xor r9,r9
;; c+=a.n[4]*a.n[4]
mov rax,rsi
mul rax
;; mov rbp,rbp ; modulus is already there!
add r8,rax
adc r9,rdx
and rbp,r8
shrd r8,r9,52
xor r9,r9
mov rsi,r8
;; *******************************************************
jmp common_exit_norm
ExSetSquare ENDP
end