From cf12fa13cb96797d6ce356a5023051f99f915fe6 Mon Sep 17 00:00:00 2001
From: Peter Dettman
Date: Tue, 18 Apr 2017 11:43:56 +0700
Subject: [PATCH 1/2] Minor optimizations to _scalar_inverse to save 4M

- Precalculate x^5 and use for "01010" patterns during accumulation. (net -2M)
- Further use of x^5 to allow shorter addition chain (net -2M)
---
 src/scalar_impl.h | 126 +++++++++++++++++++++------------------------
 1 file changed, 56 insertions(+), 70 deletions(-)

diff --git a/src/scalar_impl.h b/src/scalar_impl.h
index f5b2376..eb538cb 100644
--- a/src/scalar_impl.h
+++ b/src/scalar_impl.h
@@ -66,76 +66,70 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar
 #else
     secp256k1_scalar *t;
     int i;
-    /* First compute x ^ (2^N - 1) for some values of N. */
-    secp256k1_scalar x2, x3, x4, x6, x7, x8, x15, x30, x60, x120, x127;
+    /* First compute xN as x ^ (2^N - 1) for some values of N,
+     * and uM as x ^ M for some values of M. */
+    secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126;
+    secp256k1_scalar u2, u5;
 
-    secp256k1_scalar_sqr(&x2, x);
-    secp256k1_scalar_mul(&x2, &x2, x);
+    secp256k1_scalar_sqr(&u2, x);
+    secp256k1_scalar_mul(&x2, &u2, x);
+    secp256k1_scalar_mul(&u5, &u2, &x2);
+    secp256k1_scalar_mul(&x3, &u5, &u2);
 
-    secp256k1_scalar_sqr(&x3, &x2);
-    secp256k1_scalar_mul(&x3, &x3, x);
-
-    secp256k1_scalar_sqr(&x4, &x3);
-    secp256k1_scalar_mul(&x4, &x4, x);
-
-    secp256k1_scalar_sqr(&x6, &x4);
-    secp256k1_scalar_sqr(&x6, &x6);
-    secp256k1_scalar_mul(&x6, &x6, &x2);
-
-    secp256k1_scalar_sqr(&x7, &x6);
-    secp256k1_scalar_mul(&x7, &x7, x);
-
-    secp256k1_scalar_sqr(&x8, &x7);
-    secp256k1_scalar_mul(&x8, &x8, x);
-
-    secp256k1_scalar_sqr(&x15, &x8);
-    for (i = 0; i < 6; i++) {
-        secp256k1_scalar_sqr(&x15, &x15);
+    secp256k1_scalar_sqr(&x6, &x3);
+    for (i = 0; i < 2; i++) {
+        secp256k1_scalar_sqr(&x6, &x6);
     }
-    secp256k1_scalar_mul(&x15, &x15, &x7);
+    secp256k1_scalar_mul(&x6, &x6, &x3);
 
-    secp256k1_scalar_sqr(&x30, &x15);
-    for (i = 0; i < 14; i++) {
-        secp256k1_scalar_sqr(&x30, &x30);
+    secp256k1_scalar_sqr(&x8, &x6);
+    secp256k1_scalar_sqr(&x8, &x8);
+    secp256k1_scalar_mul(&x8, &x8, &x2);
+
+    secp256k1_scalar_sqr(&x14, &x8);
+    for (i = 0; i < 5; i++) {
+        secp256k1_scalar_sqr(&x14, &x14);
     }
-    secp256k1_scalar_mul(&x30, &x30, &x15);
+    secp256k1_scalar_mul(&x14, &x14, &x6);
 
-    secp256k1_scalar_sqr(&x60, &x30);
-    for (i = 0; i < 29; i++) {
-        secp256k1_scalar_sqr(&x60, &x60);
+    secp256k1_scalar_sqr(&x28, &x14);
+    for (i = 0; i < 13; i++) {
+        secp256k1_scalar_sqr(&x28, &x28);
     }
-    secp256k1_scalar_mul(&x60, &x60, &x30);
+    secp256k1_scalar_mul(&x28, &x28, &x14);
 
-    secp256k1_scalar_sqr(&x120, &x60);
-    for (i = 0; i < 59; i++) {
-        secp256k1_scalar_sqr(&x120, &x120);
+    secp256k1_scalar_sqr(&x56, &x28);
+    for (i = 0; i < 27; i++) {
+        secp256k1_scalar_sqr(&x56, &x56);
     }
-    secp256k1_scalar_mul(&x120, &x120, &x60);
+    secp256k1_scalar_mul(&x56, &x56, &x28);
 
-    secp256k1_scalar_sqr(&x127, &x120);
-    for (i = 0; i < 6; i++) {
-        secp256k1_scalar_sqr(&x127, &x127);
+    secp256k1_scalar_sqr(&x112, &x56);
+    for (i = 0; i < 55; i++) {
+        secp256k1_scalar_sqr(&x112, &x112);
     }
-    secp256k1_scalar_mul(&x127, &x127, &x7);
+    secp256k1_scalar_mul(&x112, &x112, &x56);
 
-    /* Then accumulate the final result (t starts at x127). */
-    t = &x127;
-    for (i = 0; i < 2; i++) { /* 0 */
+    secp256k1_scalar_sqr(&x126, &x112);
+    for (i = 0; i < 13; i++) {
+        secp256k1_scalar_sqr(&x126, &x126);
+    }
+    secp256k1_scalar_mul(&x126, &x126, &x14);
+
+    /* Then accumulate the final result (t starts at x126). */
+    t = &x126;
+    for (i = 0; i < 3; i++) {
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
+    secp256k1_scalar_mul(t, t, &u5); /* 101 */
     for (i = 0; i < 4; i++) { /* 0 */
         secp256k1_scalar_sqr(t, t);
     }
     secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (i = 0; i < 2; i++) { /* 0 */
+    for (i = 0; i < 4; i++) { /* 0 */
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (i = 0; i < 2; i++) { /* 0 */
-        secp256k1_scalar_sqr(t, t);
-    }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
+    secp256k1_scalar_mul(t, t, &u5); /* 101 */
     for (i = 0; i < 2; i++) { /* 0 */
         secp256k1_scalar_sqr(t, t);
     }
@@ -160,34 +154,26 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar
         secp256k1_scalar_sqr(t, t);
     }
     secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (i = 0; i < 2; i++) { /* 0 */
+    for (i = 0; i < 4; i++) { /* 0 */
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (i = 0; i < 2; i++) { /* 0 */
+    secp256k1_scalar_mul(t, t, &u5); /* 101 */
+    for (i = 0; i < 4; i++) { /* 0 */
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (i = 0; i < 5; i++) { /* 0 */
+    secp256k1_scalar_mul(t, t, &x3); /* 111 */
+    for (i = 0; i < 3; i++) {
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, &x4); /* 1111 */
-    for (i = 0; i < 2; i++) { /* 0 */
-        secp256k1_scalar_sqr(t, t);
-    }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
+    secp256k1_scalar_mul(t, t, &u5); /* 101 */
     for (i = 0; i < 3; i++) { /* 00 */
         secp256k1_scalar_sqr(t, t);
     }
     secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (i = 0; i < 4; i++) { /* 000 */
+    for (i = 0; i < 6; i++) { /* 000 */
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (i = 0; i < 2; i++) { /* 0 */
-        secp256k1_scalar_sqr(t, t);
-    }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
+    secp256k1_scalar_mul(t, t, &u5); /* 101 */
     for (i = 0; i < 10; i++) { /* 0000000 */
         secp256k1_scalar_sqr(t, t);
     }
@@ -212,14 +198,14 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar
         secp256k1_scalar_sqr(t, t);
     }
     secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (i = 0; i < 5; i++) { /* 0 */
+    for (i = 0; i < 4; i++) { /* 0 */
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, &x4); /* 1111 */
-    for (i = 0; i < 2; i++) { /* 0 */
+    secp256k1_scalar_mul(t, t, &x3); /* 111 */
+    for (i = 0; i < 3; i++) {
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
+    secp256k1_scalar_mul(t, t, &u5); /* 101 */
     for (i = 0; i < 5; i++) { /* 000 */
         secp256k1_scalar_sqr(t, t);
     }
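The 4M saving above comes from treating "101" runs in the exponent as a single window: with u5 = x^5 computed once up front, each such run costs its squarings plus one multiplication by u5, where the bit-at-a-time ladder paid a multiplication by x per set bit. A minimal standalone sketch of the idea, using toy uint64_t arithmetic rather than the library's scalar type (P, mulmod and sqrmod are illustrative names, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    #define P 1000003ULL  /* small prime modulus, for illustration only */

    static uint64_t mulmod(uint64_t a, uint64_t b) { return (a * b) % P; }
    static uint64_t sqrmod(uint64_t a) { return mulmod(a, a); }

    int main(void) {
        uint64_t x = 12345;

        /* Bit-at-a-time ladder for x^21 (binary 10101): 4 squarings, 2 muls. */
        uint64_t a = x;                    /* leading 1 -> x^1  */
        a = sqrmod(a);                     /* 0 -> x^2  */
        a = mulmod(sqrmod(a), x);          /* 1 -> x^5  */
        a = sqrmod(a);                     /* 0 -> x^10 */
        a = mulmod(sqrmod(a), x);          /* 1 -> x^21 */

        /* Windowed: u5 = x^5 precomputed (2 squarings + 1 mul, paid once),
         * then x^21 = x^16 * u5 takes 4 squarings + 1 mul. */
        uint64_t u5 = mulmod(sqrmod(sqrmod(x)), x);
        uint64_t b = sqrmod(sqrmod(sqrmod(sqrmod(x))));  /* x^16 */
        b = mulmod(b, u5);                               /* x^21 */

        assert(a == b);
        return 0;
    }

For a single window the one-off cost of u5 is a wash; the net saving comes from the chain above reusing u5 at every "101" window.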
From 465159c278cecc2cf8d934e78f640f345243eb72 Mon Sep 17 00:00:00 2001
From: Brian Smith
Date: Sun, 23 Apr 2017 03:18:53 -1000
Subject: [PATCH 2/2] Further shorten the addition chain for scalar inversion.

Reduce the number of squarings by one and reduce the number of
multiplications by three.
---
 src/scalar_impl.h | 81 +++++++++++++++++------------------
 1 file changed, 29 insertions(+), 52 deletions(-)
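This revision widens the window set: beyond u5, it precomputes u9 = x^9, u11 = x^11 and u13 = x^13, each one multiplication by u2 = x^2 beyond the previous odd power, so four-bit odd windows (1001, 1011, 1101) cost one multiplication per use. A toy sketch of that precomputation ladder, again with uint64_t arithmetic and illustrative names (P, mulmod, powmod) rather than the library API:

    #include <assert.h>
    #include <stdint.h>

    #define P 1000003ULL  /* small prime modulus, for illustration only */

    static uint64_t mulmod(uint64_t a, uint64_t b) { return (a * b) % P; }

    /* Naive reference exponentiation, only to check the ladder below. */
    static uint64_t powmod(uint64_t x, unsigned int e) {
        uint64_t r = 1;
        while (e--) {
            r = mulmod(r, x);
        }
        return r;
    }

    int main(void) {
        uint64_t x = 6789;
        /* Mirrors the patch: each odd power is one mul by u2 = x^2 beyond
         * the previous one, so u9/u11/u13 together cost only 3M up front. */
        uint64_t u2  = mulmod(x, x);     /* x^2              */
        uint64_t x2  = mulmod(u2, x);    /* x^3  = x^(2^2-1) */
        uint64_t u5  = mulmod(u2, x2);   /* x^5              */
        uint64_t x3  = mulmod(u5, u2);   /* x^7  = x^(2^3-1) */
        uint64_t u9  = mulmod(x3, u2);   /* x^9              */
        uint64_t u11 = mulmod(u9, u2);   /* x^11             */
        uint64_t u13 = mulmod(u11, u2);  /* x^13             */

        assert(u5  == powmod(x, 5));
        assert(u9  == powmod(x, 9));
        assert(u11 == powmod(x, 11));
        assert(u13 == powmod(x, 13));
        return 0;
    }

The three extra precomputed values cost 3M once, recovered by deleting window multiplications in the accumulation; the claimed squaring saving comes from building x6 as (u13^4) * u11 = x^63 in two squarings instead of three.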
diff --git a/src/scalar_impl.h b/src/scalar_impl.h
index eb538cb..2690d86 100644
--- a/src/scalar_impl.h
+++ b/src/scalar_impl.h
@@ -69,18 +69,19 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar
     /* First compute xN as x ^ (2^N - 1) for some values of N,
      * and uM as x ^ M for some values of M. */
     secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126;
-    secp256k1_scalar u2, u5;
+    secp256k1_scalar u2, u5, u9, u11, u13;
 
     secp256k1_scalar_sqr(&u2, x);
     secp256k1_scalar_mul(&x2, &u2, x);
     secp256k1_scalar_mul(&u5, &u2, &x2);
     secp256k1_scalar_mul(&x3, &u5, &u2);
+    secp256k1_scalar_mul(&u9, &x3, &u2);
+    secp256k1_scalar_mul(&u11, &u9, &u2);
+    secp256k1_scalar_mul(&u13, &u11, &u2);
 
-    secp256k1_scalar_sqr(&x6, &x3);
-    for (i = 0; i < 2; i++) {
-        secp256k1_scalar_sqr(&x6, &x6);
-    }
-    secp256k1_scalar_mul(&x6, &x6, &x3);
+    secp256k1_scalar_sqr(&x6, &u13);
+    secp256k1_scalar_sqr(&x6, &x6);
+    secp256k1_scalar_mul(&x6, &x6, &u11);
 
     secp256k1_scalar_sqr(&x8, &x6);
     secp256k1_scalar_sqr(&x8, &x8);
@@ -130,18 +131,14 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar
         secp256k1_scalar_sqr(t, t);
     }
     secp256k1_scalar_mul(t, t, &u5); /* 101 */
-    for (i = 0; i < 2; i++) { /* 0 */
+    for (i = 0; i < 5; i++) { /* 0 */
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (i = 0; i < 4; i++) { /* 0 */
+    secp256k1_scalar_mul(t, t, &u11); /* 1011 */
+    for (i = 0; i < 4; i++) {
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (i = 0; i < 3; i++) { /* 0 */
-        secp256k1_scalar_sqr(t, t);
-    }
-    secp256k1_scalar_mul(t, t, &x2); /* 11 */
+    secp256k1_scalar_mul(t, t, &u11); /* 1011 */
     for (i = 0; i < 4; i++) { /* 0 */
         secp256k1_scalar_sqr(t, t);
     }
@@ -150,26 +147,22 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar
         secp256k1_scalar_sqr(t, t);
     }
     secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (i = 0; i < 4; i++) { /* 00 */
+    for (i = 0; i < 6; i++) { /* 00 */
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, &x2); /* 11 */
+    secp256k1_scalar_mul(t, t, &u13); /* 1101 */
     for (i = 0; i < 4; i++) { /* 0 */
         secp256k1_scalar_sqr(t, t);
     }
     secp256k1_scalar_mul(t, t, &u5); /* 101 */
-    for (i = 0; i < 4; i++) { /* 0 */
-        secp256k1_scalar_sqr(t, t);
-    }
-    secp256k1_scalar_mul(t, t, &x3); /* 111 */
     for (i = 0; i < 3; i++) {
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, &u5); /* 101 */
-    for (i = 0; i < 3; i++) { /* 00 */
+    secp256k1_scalar_mul(t, t, &x3); /* 111 */
+    for (i = 0; i < 5; i++) { /* 0 */
        secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
+    secp256k1_scalar_mul(t, t, &u9); /* 1001 */
     for (i = 0; i < 6; i++) { /* 000 */
         secp256k1_scalar_sqr(t, t);
     }
@@ -186,50 +179,34 @@ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar
         secp256k1_scalar_sqr(t, t);
     }
     secp256k1_scalar_mul(t, t, &x8); /* 11111111 */
-    for (i = 0; i < 2; i++) { /* 0 */
+    for (i = 0; i < 5; i++) { /* 0 */
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (i = 0; i < 3; i++) { /* 00 */
+    secp256k1_scalar_mul(t, t, &u9); /* 1001 */
+    for (i = 0; i < 6; i++) { /* 00 */
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (i = 0; i < 3; i++) { /* 00 */
+    secp256k1_scalar_mul(t, t, &u11); /* 1011 */
+    for (i = 0; i < 4; i++) {
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (i = 0; i < 4; i++) { /* 0 */
-        secp256k1_scalar_sqr(t, t);
-    }
-    secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (i = 0; i < 3; i++) {
-        secp256k1_scalar_sqr(t, t);
-    }
-    secp256k1_scalar_mul(t, t, &u5); /* 101 */
-    for (i = 0; i < 5; i++) { /* 000 */
+    secp256k1_scalar_mul(t, t, &u13); /* 1101 */
+    for (i = 0; i < 5; i++) { /* 000 */
         secp256k1_scalar_sqr(t, t);
     }
     secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (i = 0; i < 4; i++) { /* 00 */
+    for (i = 0; i < 6; i++) { /* 00 */
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (i = 0; i < 2; i++) { /* 0 */
+    secp256k1_scalar_mul(t, t, &u13); /* 1101 */
+    for (i = 0; i < 10; i++) { /* 000000 */
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (i = 0; i < 8; i++) { /* 000000 */
+    secp256k1_scalar_mul(t, t, &u13); /* 1101 */
+    for (i = 0; i < 4; i++) {
         secp256k1_scalar_sqr(t, t);
     }
-    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (i = 0; i < 3; i++) { /* 0 */
-        secp256k1_scalar_sqr(t, t);
-    }
-    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (i = 0; i < 3; i++) { /* 00 */
-        secp256k1_scalar_sqr(t, t);
-    }
-    secp256k1_scalar_mul(t, t, x); /* 1 */
+    secp256k1_scalar_mul(t, t, &u9); /* 1001 */
     for (i = 0; i < 6; i++) { /* 00000 */
         secp256k1_scalar_sqr(t, t);
     }
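Changes like these are easy to get wrong silently, since a mistyped chain still produces some scalar. A sanity check of the kind that could sit in the library's test harness (src/tests.c), assuming the internal scalar helpers declared in src/scalar.h (secp256k1_scalar_set_int, _inverse, _mul, _is_one) and the tests' CHECK macro; the test function itself is hypothetical, not part of either patch:

    /* For a range of small nonzero scalars a, verify a * a^-1 == 1
     * under the new addition chain. */
    static void test_scalar_inverse_small(void) {
        unsigned int v;
        for (v = 1; v <= 1000; v++) {
            secp256k1_scalar a, ai, one;
            secp256k1_scalar_set_int(&a, v);
            secp256k1_scalar_inverse(&ai, &a);
            secp256k1_scalar_mul(&one, &a, &ai);
            CHECK(secp256k1_scalar_is_one(&one));
        }
    }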