diff --git a/bench_all b/bench_all
index 89c747d..d9f411e 100644
--- a/bench_all
+++ b/bench_all
@@ -1,10 +1,10 @@
 #!/bin/bash
 echo "Benchmark Results" >output.txt
-for j in no-yasm yasm; do
+for j in yasm; do
     echo "5x64 $j:" >>output.txt
     for i in O0 O1 O2 O3; do
         make clean
-	./configure --$j
+	./configure --use-5x64 --$j
 	echo "OPTLEVEL=$i" >>config.mk
 	make bench
 	echo "OPTLEVEL=$i" >>output.txt
diff --git a/configure b/configure
index d68ae5b..cb69239 100755
--- a/configure
+++ b/configure
@@ -172,3 +172,4 @@ echo "LDFLAGS_EXTRA=$LDFLAGS_EXTRA" >> config.mk
 echo "LDFLAGS_TEST_EXTRA=$LDFLAGS_TEST_EXTRA" >> config.mk
 echo "USE_ASM=$USE_ASM" >>config.mk
 echo "HAVE_LIMB=$HAVE_LIMB" >>config.mk
+echo "OPTLEVEL=O2" >>config.mk
diff --git a/src/field_5x64_asm.asm b/src/field_5x64_asm.asm
index 14b0a52..d449185 100644
--- a/src/field_5x64_asm.asm
+++ b/src/field_5x64_asm.asm
@@ -82,7 +82,7 @@ secp256k1_fe_mul_inner:
 	add r9,rax
 	adc r10,rdx
 	adc r8,0
-	mov rbp,r9	; retire r[2]
+	mov rbp,r9		; retire r[2]
 	xor r9,r9
 
 	;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
@@ -153,7 +153,7 @@ secp256k1_fe_mul_inner:
 	mul r14
 	add r10,rax
 	adc r8,rdx
-	adc r9,0
+	
 	mov r14,r10
 	mov r15,r8
 	
@@ -216,7 +216,7 @@ secp256k1_fe_sqr_inner:
 	push r13
 	push r14
 	push r15
-	push rdx
+	push rsi		; save rsi -- presumably the result pointer (2nd argument at the C call site); confirm against the ABI in use
 
 	mov r11,[rdi+8*0]	; preload a.n[0]
 	
@@ -237,7 +237,7 @@ secp256k1_fe_sqr_inner:
 	adc rdx,rdx
 	adc r10,0
 	add r8,rax		; still the same :-)
-	adc r9,rdx		; 
+	adc r9,rdx		
 	adc r10,0		; mmm...
 	
 	mov rcx,r8		; retire r[1]
@@ -315,15 +315,14 @@ secp256k1_fe_sqr_inner:
 	adc r8,0
 
 	mov r13,r9
-	xor r13,r13
+	xor r9,r9		; r9 was just copied to r13, so clear r9 (not r13, which holds the saved value)
 
 	;; c+=a.n[3]²
 	mov rax,r14
 	mul rax
 	add r10,rax
 	adc r8,rdx
-	adc r9,0
-
+	
 	mov r14,r10
 	mov r15,r8
 	
diff --git a/src/impl/field_5x64.h b/src/impl/field_5x64.h
index 3c927fc..1e645cd 100644
--- a/src/impl/field_5x64.h
+++ b/src/impl/field_5x64.h
@@ -325,6 +325,10 @@ void static secp256k1_fe_mul(secp256k1_fe_t *r, const secp256k1_fe_t *ac, const
 void static secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *ac) {
     secp256k1_fe_t a = *ac;
     secp256k1_fe_reduce(&a);
+
+#ifdef USE_FIELD_5X64_ASM
+    secp256k1_fe_sqr_inner((&a)->n,r->n); /* pass the reduced copy's limbs and r's limb array to the assembly squaring routine */
+#else
     uint64_t c1,c2,c3;
     c3=0;
     mul_c2(a.n[0], a.n[0], c1, c2);
@@ -355,6 +359,7 @@ void static secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *ac) {
     c = (unsigned __int128)r7 * COMP_LIMB + r3 + (c >> 64);
     r->n[3] = c;
     r->n[4] = c >> 64;
+#endif
 
 #ifdef VERIFY
     r->normalized = 0;