Merge pull request #103

f8cce95 Add overflow analysis to field_10x26_impl.h (Pieter Wuille)
a518598 Add overflow analysis to field_5x52_int128_impl.h (Pieter Wuille)
fa0d620 Add equalities relating input and output variables (Pieter Wuille)
5dd421b Rewrite mul/sqr for 32bit/64bit (Peter Dettman)
This commit is contained in:
Pieter Wuille 2014-11-15 01:26:15 +01:00
commit 21288f2d05
No known key found for this signature in database
GPG Key ID: 57896D2FF8F0B657
2 changed files with 852 additions and 332 deletions

View File

@ -245,257 +245,607 @@ SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe_t *r, const secp256k1
#endif
}
SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uint32_t *b, uint32_t *r) {
uint64_t c = (uint64_t)a[0] * b[0];
uint32_t t0 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[0] * b[1] +
(uint64_t)a[1] * b[0];
uint32_t t1 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[0] * b[2] +
(uint64_t)a[1] * b[1] +
(uint64_t)a[2] * b[0];
uint32_t t2 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[0] * b[3] +
(uint64_t)a[1] * b[2] +
(uint64_t)a[2] * b[1] +
(uint64_t)a[3] * b[0];
uint32_t t3 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[0] * b[4] +
(uint64_t)a[1] * b[3] +
(uint64_t)a[2] * b[2] +
(uint64_t)a[3] * b[1] +
(uint64_t)a[4] * b[0];
uint32_t t4 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[0] * b[5] +
(uint64_t)a[1] * b[4] +
(uint64_t)a[2] * b[3] +
(uint64_t)a[3] * b[2] +
(uint64_t)a[4] * b[1] +
(uint64_t)a[5] * b[0];
uint32_t t5 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[0] * b[6] +
(uint64_t)a[1] * b[5] +
(uint64_t)a[2] * b[4] +
(uint64_t)a[3] * b[3] +
(uint64_t)a[4] * b[2] +
(uint64_t)a[5] * b[1] +
(uint64_t)a[6] * b[0];
uint32_t t6 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[0] * b[7] +
(uint64_t)a[1] * b[6] +
(uint64_t)a[2] * b[5] +
(uint64_t)a[3] * b[4] +
(uint64_t)a[4] * b[3] +
(uint64_t)a[5] * b[2] +
(uint64_t)a[6] * b[1] +
(uint64_t)a[7] * b[0];
uint32_t t7 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[0] * b[8] +
(uint64_t)a[1] * b[7] +
(uint64_t)a[2] * b[6] +
(uint64_t)a[3] * b[5] +
(uint64_t)a[4] * b[4] +
(uint64_t)a[5] * b[3] +
(uint64_t)a[6] * b[2] +
(uint64_t)a[7] * b[1] +
(uint64_t)a[8] * b[0];
uint32_t t8 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[0] * b[9] +
(uint64_t)a[1] * b[8] +
(uint64_t)a[2] * b[7] +
(uint64_t)a[3] * b[6] +
(uint64_t)a[4] * b[5] +
(uint64_t)a[5] * b[4] +
(uint64_t)a[6] * b[3] +
(uint64_t)a[7] * b[2] +
(uint64_t)a[8] * b[1] +
(uint64_t)a[9] * b[0];
uint32_t t9 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[1] * b[9] +
(uint64_t)a[2] * b[8] +
(uint64_t)a[3] * b[7] +
(uint64_t)a[4] * b[6] +
(uint64_t)a[5] * b[5] +
(uint64_t)a[6] * b[4] +
(uint64_t)a[7] * b[3] +
(uint64_t)a[8] * b[2] +
(uint64_t)a[9] * b[1];
uint32_t t10 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[2] * b[9] +
(uint64_t)a[3] * b[8] +
(uint64_t)a[4] * b[7] +
(uint64_t)a[5] * b[6] +
(uint64_t)a[6] * b[5] +
(uint64_t)a[7] * b[4] +
(uint64_t)a[8] * b[3] +
(uint64_t)a[9] * b[2];
uint32_t t11 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[3] * b[9] +
(uint64_t)a[4] * b[8] +
(uint64_t)a[5] * b[7] +
(uint64_t)a[6] * b[6] +
(uint64_t)a[7] * b[5] +
(uint64_t)a[8] * b[4] +
(uint64_t)a[9] * b[3];
uint32_t t12 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[4] * b[9] +
(uint64_t)a[5] * b[8] +
(uint64_t)a[6] * b[7] +
(uint64_t)a[7] * b[6] +
(uint64_t)a[8] * b[5] +
(uint64_t)a[9] * b[4];
uint32_t t13 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[5] * b[9] +
(uint64_t)a[6] * b[8] +
(uint64_t)a[7] * b[7] +
(uint64_t)a[8] * b[6] +
(uint64_t)a[9] * b[5];
uint32_t t14 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[6] * b[9] +
(uint64_t)a[7] * b[8] +
(uint64_t)a[8] * b[7] +
(uint64_t)a[9] * b[6];
uint32_t t15 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[7] * b[9] +
(uint64_t)a[8] * b[8] +
(uint64_t)a[9] * b[7];
uint32_t t16 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[8] * b[9] +
(uint64_t)a[9] * b[8];
uint32_t t17 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[9] * b[9];
uint32_t t18 = c & 0x3FFFFFFUL; c = c >> 26;
uint32_t t19 = c;
#ifdef VERIFY
#define VERIFY_BITS(x, n) VERIFY_CHECK(((x) >> (n)) == 0)
#else
#define VERIFY_BITS(x, n) do { } while(0)
#endif
c = t0 + (uint64_t)t10 * 0x3D10UL;
t0 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t1 + (uint64_t)t10*0x400UL + (uint64_t)t11 * 0x3D10UL;
t1 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t2 + (uint64_t)t11*0x400UL + (uint64_t)t12 * 0x3D10UL;
t2 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t3 + (uint64_t)t12*0x400UL + (uint64_t)t13 * 0x3D10UL;
r[3] = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t4 + (uint64_t)t13*0x400UL + (uint64_t)t14 * 0x3D10UL;
r[4] = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t5 + (uint64_t)t14*0x400UL + (uint64_t)t15 * 0x3D10UL;
r[5] = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t6 + (uint64_t)t15*0x400UL + (uint64_t)t16 * 0x3D10UL;
r[6] = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t7 + (uint64_t)t16*0x400UL + (uint64_t)t17 * 0x3D10UL;
r[7] = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t8 + (uint64_t)t17*0x400UL + (uint64_t)t18 * 0x3D10UL;
r[8] = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t9 + (uint64_t)t18*0x400UL + (uint64_t)t19 * 0x1000003D10ULL;
r[9] = c & 0x03FFFFFUL; c = c >> 22;
uint64_t d = t0 + c * 0x3D1UL;
r[0] = d & 0x3FFFFFFUL; d = d >> 26;
d = d + t1 + c*0x40;
r[1] = d & 0x3FFFFFFUL; d = d >> 26;
r[2] = t2 + d;
SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uint32_t *b, uint32_t *r) {
VERIFY_BITS(a[0], 30);
VERIFY_BITS(a[1], 30);
VERIFY_BITS(a[2], 30);
VERIFY_BITS(a[3], 30);
VERIFY_BITS(a[4], 30);
VERIFY_BITS(a[5], 30);
VERIFY_BITS(a[6], 30);
VERIFY_BITS(a[7], 30);
VERIFY_BITS(a[8], 30);
VERIFY_BITS(a[9], 26);
VERIFY_BITS(b[0], 30);
VERIFY_BITS(b[1], 30);
VERIFY_BITS(b[2], 30);
VERIFY_BITS(b[3], 30);
VERIFY_BITS(b[4], 30);
VERIFY_BITS(b[5], 30);
VERIFY_BITS(b[6], 30);
VERIFY_BITS(b[7], 30);
VERIFY_BITS(b[8], 30);
VERIFY_BITS(b[9], 26);
const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL;
// [... a b c] is a shorthand for ... + a<<52 + b<<26 + c<<0 mod n.
// px is a shorthand for sum(a[i]*b[x-i], i=0..x).
// Note that [x 0 0 0 0 0 0 0 0 0 0] = [x*R1 x*R0].
uint64_t c, d;
d = (uint64_t)a[0] * b[9]
+ (uint64_t)a[1] * b[8]
+ (uint64_t)a[2] * b[7]
+ (uint64_t)a[3] * b[6]
+ (uint64_t)a[4] * b[5]
+ (uint64_t)a[5] * b[4]
+ (uint64_t)a[6] * b[3]
+ (uint64_t)a[7] * b[2]
+ (uint64_t)a[8] * b[1]
+ (uint64_t)a[9] * b[0];
// VERIFY_BITS(d, 64);
// [d 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0]
uint32_t t9 = d & M; d >>= 26;
VERIFY_BITS(t9, 26);
VERIFY_BITS(d, 38);
// [d t9 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0]
c = (uint64_t)a[0] * b[0];
VERIFY_BITS(c, 60);
// [d t9 0 0 0 0 0 0 0 0 c] = [p9 0 0 0 0 0 0 0 0 p0]
d += (uint64_t)a[1] * b[9]
+ (uint64_t)a[2] * b[8]
+ (uint64_t)a[3] * b[7]
+ (uint64_t)a[4] * b[6]
+ (uint64_t)a[5] * b[5]
+ (uint64_t)a[6] * b[4]
+ (uint64_t)a[7] * b[3]
+ (uint64_t)a[8] * b[2]
+ (uint64_t)a[9] * b[1];
VERIFY_BITS(d, 63);
// [d t9 0 0 0 0 0 0 0 0 c] = [p10 p9 0 0 0 0 0 0 0 0 p0]
uint64_t u0 = d & M; d >>= 26; c += u0 * R0;
VERIFY_BITS(u0, 26);
VERIFY_BITS(d, 37);
VERIFY_BITS(c, 61);
// [d u0 t9 0 0 0 0 0 0 0 0 c-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0]
uint32_t t0 = c & M; c >>= 26; c += u0 * R1;
VERIFY_BITS(t0, 26);
VERIFY_BITS(c, 37);
// [d u0 t9 0 0 0 0 0 0 0 c-u0*R1 t0-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0]
// [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 0 p0]
c += (uint64_t)a[0] * b[1]
+ (uint64_t)a[1] * b[0];
VERIFY_BITS(c, 62);
// [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 p1 p0]
d += (uint64_t)a[2] * b[9]
+ (uint64_t)a[3] * b[8]
+ (uint64_t)a[4] * b[7]
+ (uint64_t)a[5] * b[6]
+ (uint64_t)a[6] * b[5]
+ (uint64_t)a[7] * b[4]
+ (uint64_t)a[8] * b[3]
+ (uint64_t)a[9] * b[2];
VERIFY_BITS(d, 63);
// [d 0 t9 0 0 0 0 0 0 0 c t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0]
uint64_t u1 = d & M; d >>= 26; c += u1 * R0;
VERIFY_BITS(u1, 26);
VERIFY_BITS(d, 37);
VERIFY_BITS(c, 63);
// [d u1 0 t9 0 0 0 0 0 0 0 c-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0]
uint32_t t1 = c & M; c >>= 26; c += u1 * R1;
VERIFY_BITS(t1, 26);
VERIFY_BITS(c, 38);
// [d u1 0 t9 0 0 0 0 0 0 c-u1*R1 t1-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0]
// [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0]
c += (uint64_t)a[0] * b[2]
+ (uint64_t)a[1] * b[1]
+ (uint64_t)a[2] * b[0];
VERIFY_BITS(c, 62);
// [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
d += (uint64_t)a[3] * b[9]
+ (uint64_t)a[4] * b[8]
+ (uint64_t)a[5] * b[7]
+ (uint64_t)a[6] * b[6]
+ (uint64_t)a[7] * b[5]
+ (uint64_t)a[8] * b[4]
+ (uint64_t)a[9] * b[3];
VERIFY_BITS(d, 63);
// [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
uint64_t u2 = d & M; d >>= 26; c += u2 * R0;
VERIFY_BITS(u2, 26);
VERIFY_BITS(d, 37);
VERIFY_BITS(c, 63);
// [d u2 0 0 t9 0 0 0 0 0 0 c-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
uint32_t t2 = c & M; c >>= 26; c += u2 * R1;
VERIFY_BITS(t2, 26);
VERIFY_BITS(c, 38);
// [d u2 0 0 t9 0 0 0 0 0 c-u2*R1 t2-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
// [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
c += (uint64_t)a[0] * b[3]
+ (uint64_t)a[1] * b[2]
+ (uint64_t)a[2] * b[1]
+ (uint64_t)a[3] * b[0];
VERIFY_BITS(c, 63);
// [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
d += (uint64_t)a[4] * b[9]
+ (uint64_t)a[5] * b[8]
+ (uint64_t)a[6] * b[7]
+ (uint64_t)a[7] * b[6]
+ (uint64_t)a[8] * b[5]
+ (uint64_t)a[9] * b[4];
VERIFY_BITS(d, 63);
// [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
uint64_t u3 = d & M; d >>= 26; c += u3 * R0;
VERIFY_BITS(u3, 26);
VERIFY_BITS(d, 37);
// VERIFY_BITS(c, 64);
// [d u3 0 0 0 t9 0 0 0 0 0 c-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
uint32_t t3 = c & M; c >>= 26; c += u3 * R1;
VERIFY_BITS(t3, 26);
VERIFY_BITS(c, 39);
// [d u3 0 0 0 t9 0 0 0 0 c-u3*R1 t3-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
// [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
c += (uint64_t)a[0] * b[4]
+ (uint64_t)a[1] * b[3]
+ (uint64_t)a[2] * b[2]
+ (uint64_t)a[3] * b[1]
+ (uint64_t)a[4] * b[0];
VERIFY_BITS(c, 63);
// [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
d += (uint64_t)a[5] * b[9]
+ (uint64_t)a[6] * b[8]
+ (uint64_t)a[7] * b[7]
+ (uint64_t)a[8] * b[6]
+ (uint64_t)a[9] * b[5];
VERIFY_BITS(d, 62);
// [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
uint64_t u4 = d & M; d >>= 26; c += u4 * R0;
VERIFY_BITS(u4, 26);
VERIFY_BITS(d, 36);
// VERIFY_BITS(c, 64);
// [d u4 0 0 0 0 t9 0 0 0 0 c-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
uint32_t t4 = c & M; c >>= 26; c += u4 * R1;
VERIFY_BITS(t4, 26);
VERIFY_BITS(c, 39);
// [d u4 0 0 0 0 t9 0 0 0 c-u4*R1 t4-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
// [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
c += (uint64_t)a[0] * b[5]
+ (uint64_t)a[1] * b[4]
+ (uint64_t)a[2] * b[3]
+ (uint64_t)a[3] * b[2]
+ (uint64_t)a[4] * b[1]
+ (uint64_t)a[5] * b[0];
VERIFY_BITS(c, 63);
// [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
d += (uint64_t)a[6] * b[9]
+ (uint64_t)a[7] * b[8]
+ (uint64_t)a[8] * b[7]
+ (uint64_t)a[9] * b[6];
VERIFY_BITS(d, 62);
// [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
uint64_t u5 = d & M; d >>= 26; c += u5 * R0;
VERIFY_BITS(u5, 26);
VERIFY_BITS(d, 36);
// VERIFY_BITS(c, 64);
// [d u5 0 0 0 0 0 t9 0 0 0 c-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
uint32_t t5 = c & M; c >>= 26; c += u5 * R1;
VERIFY_BITS(t5, 26);
VERIFY_BITS(c, 39);
// [d u5 0 0 0 0 0 t9 0 0 c-u5*R1 t5-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
// [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
c += (uint64_t)a[0] * b[6]
+ (uint64_t)a[1] * b[5]
+ (uint64_t)a[2] * b[4]
+ (uint64_t)a[3] * b[3]
+ (uint64_t)a[4] * b[2]
+ (uint64_t)a[5] * b[1]
+ (uint64_t)a[6] * b[0];
VERIFY_BITS(c, 63);
// [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
d += (uint64_t)a[7] * b[9]
+ (uint64_t)a[8] * b[8]
+ (uint64_t)a[9] * b[7];
VERIFY_BITS(d, 61);
// [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
uint64_t u6 = d & M; d >>= 26; c += u6 * R0;
VERIFY_BITS(u6, 26);
VERIFY_BITS(d, 35);
// VERIFY_BITS(c, 64);
// [d u6 0 0 0 0 0 0 t9 0 0 c-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
uint32_t t6 = c & M; c >>= 26; c += u6 * R1;
VERIFY_BITS(t6, 26);
VERIFY_BITS(c, 39);
// [d u6 0 0 0 0 0 0 t9 0 c-u6*R1 t6-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
// [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
c += (uint64_t)a[0] * b[7]
+ (uint64_t)a[1] * b[6]
+ (uint64_t)a[2] * b[5]
+ (uint64_t)a[3] * b[4]
+ (uint64_t)a[4] * b[3]
+ (uint64_t)a[5] * b[2]
+ (uint64_t)a[6] * b[1]
+ (uint64_t)a[7] * b[0];
// VERIFY_BITS(c, 64);
VERIFY_CHECK(c <= 0x8000007C00000007ULL);
// [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
d += (uint64_t)a[8] * b[9]
+ (uint64_t)a[9] * b[8];
VERIFY_BITS(d, 58);
// [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
uint64_t u7 = d & M; d >>= 26; c += u7 * R0;
VERIFY_BITS(u7, 26);
VERIFY_BITS(d, 32);
// VERIFY_BITS(c, 64);
VERIFY_CHECK(c <= 0x800001703FFFC2F7ULL);
// [d u7 0 0 0 0 0 0 0 t9 0 c-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
uint32_t t7 = c & M; c >>= 26; c += u7 * R1;
VERIFY_BITS(t7, 26);
VERIFY_BITS(c, 38);
// [d u7 0 0 0 0 0 0 0 t9 c-u7*R1 t7-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
// [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
c += (uint64_t)a[0] * b[8]
+ (uint64_t)a[1] * b[7]
+ (uint64_t)a[2] * b[6]
+ (uint64_t)a[3] * b[5]
+ (uint64_t)a[4] * b[4]
+ (uint64_t)a[5] * b[3]
+ (uint64_t)a[6] * b[2]
+ (uint64_t)a[7] * b[1]
+ (uint64_t)a[8] * b[0];
// VERIFY_BITS(c, 64);
VERIFY_CHECK(c <= 0x9000007B80000008ULL);
// [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
d += (uint64_t)a[9] * b[9];
VERIFY_BITS(d, 57);
// [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
uint64_t u8 = d & M; d >>= 26; c += u8 * R0;
VERIFY_BITS(u8, 26);
VERIFY_BITS(d, 31);
// VERIFY_BITS(c, 64);
VERIFY_CHECK(c <= 0x9000016FBFFFC2F8ULL);
// [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[3] = t3;
VERIFY_BITS(r[3], 26);
// [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[4] = t4;
VERIFY_BITS(r[4], 26);
// [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[5] = t5;
VERIFY_BITS(r[5], 26);
// [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[6] = t6;
VERIFY_BITS(r[6], 26);
// [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[7] = t7;
VERIFY_BITS(r[7], 26);
// [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[8] = c & M; c >>= 26; c += u8 * R1;
VERIFY_BITS(r[8], 26);
VERIFY_BITS(c, 39);
// [d u8 0 0 0 0 0 0 0 0 t9+c-u8*R1 r8-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
// [d 0 0 0 0 0 0 0 0 0 t9+c r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
c += d * R0 + t9;
VERIFY_BITS(c, 45);
// [d 0 0 0 0 0 0 0 0 0 c-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[9] = c & (M >> 4); c >>= 22; c += d * (R1 << 4);
VERIFY_BITS(r[9], 22);
VERIFY_BITS(c, 46);
// [d 0 0 0 0 0 0 0 0 r9+((c-d*R1<<4)<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
// [d 0 0 0 0 0 0 0 -d*R1 r9+(c<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
// [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
d = c * (R0 >> 4) + t0;
VERIFY_BITS(d, 56);
// [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 d-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[0] = d & M; d >>= 26;
VERIFY_BITS(r[0], 26);
VERIFY_BITS(d, 30);
// [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1+d r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
d += c * (R1 >> 4) + t1;
VERIFY_BITS(d, 53);
VERIFY_CHECK(d <= 0x10000003FFFFBFULL);
// [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 d-c*R1>>4 r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
// [r9 r8 r7 r6 r5 r4 r3 t2 d r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[1] = d & M; d >>= 26;
VERIFY_BITS(r[1], 26);
VERIFY_BITS(d, 27);
VERIFY_CHECK(d <= 0x4000000ULL);
// [r9 r8 r7 r6 r5 r4 r3 t2+d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
d += t2;
VERIFY_BITS(d, 27);
// [r9 r8 r7 r6 r5 r4 r3 d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[2] = d;
VERIFY_BITS(r[2], 27);
// [r9 r8 r7 r6 r5 r4 r3 r2 r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
}
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint32_t *a, uint32_t *r) {
uint64_t c = (uint64_t)a[0] * a[0];
uint32_t t0 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[0]*2) * a[1];
uint32_t t1 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[0]*2) * a[2] +
(uint64_t)a[1] * a[1];
uint32_t t2 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[0]*2) * a[3] +
(uint64_t)(a[1]*2) * a[2];
uint32_t t3 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[0]*2) * a[4] +
(uint64_t)(a[1]*2) * a[3] +
(uint64_t)a[2] * a[2];
uint32_t t4 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[0]*2) * a[5] +
(uint64_t)(a[1]*2) * a[4] +
(uint64_t)(a[2]*2) * a[3];
uint32_t t5 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[0]*2) * a[6] +
(uint64_t)(a[1]*2) * a[5] +
(uint64_t)(a[2]*2) * a[4] +
(uint64_t)a[3] * a[3];
uint32_t t6 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[0]*2) * a[7] +
(uint64_t)(a[1]*2) * a[6] +
(uint64_t)(a[2]*2) * a[5] +
(uint64_t)(a[3]*2) * a[4];
uint32_t t7 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[0]*2) * a[8] +
(uint64_t)(a[1]*2) * a[7] +
(uint64_t)(a[2]*2) * a[6] +
(uint64_t)(a[3]*2) * a[5] +
(uint64_t)a[4] * a[4];
uint32_t t8 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[0]*2) * a[9] +
(uint64_t)(a[1]*2) * a[8] +
(uint64_t)(a[2]*2) * a[7] +
(uint64_t)(a[3]*2) * a[6] +
(uint64_t)(a[4]*2) * a[5];
uint32_t t9 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[1]*2) * a[9] +
(uint64_t)(a[2]*2) * a[8] +
(uint64_t)(a[3]*2) * a[7] +
(uint64_t)(a[4]*2) * a[6] +
(uint64_t)a[5] * a[5];
uint32_t t10 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[2]*2) * a[9] +
(uint64_t)(a[3]*2) * a[8] +
(uint64_t)(a[4]*2) * a[7] +
(uint64_t)(a[5]*2) * a[6];
uint32_t t11 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[3]*2) * a[9] +
(uint64_t)(a[4]*2) * a[8] +
(uint64_t)(a[5]*2) * a[7] +
(uint64_t)a[6] * a[6];
uint32_t t12 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[4]*2) * a[9] +
(uint64_t)(a[5]*2) * a[8] +
(uint64_t)(a[6]*2) * a[7];
uint32_t t13 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[5]*2) * a[9] +
(uint64_t)(a[6]*2) * a[8] +
(uint64_t)a[7] * a[7];
uint32_t t14 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[6]*2) * a[9] +
(uint64_t)(a[7]*2) * a[8];
uint32_t t15 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[7]*2) * a[9] +
(uint64_t)a[8] * a[8];
uint32_t t16 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)(a[8]*2) * a[9];
uint32_t t17 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + (uint64_t)a[9] * a[9];
uint32_t t18 = c & 0x3FFFFFFUL; c = c >> 26;
uint32_t t19 = c;
VERIFY_BITS(a[0], 30);
VERIFY_BITS(a[1], 30);
VERIFY_BITS(a[2], 30);
VERIFY_BITS(a[3], 30);
VERIFY_BITS(a[4], 30);
VERIFY_BITS(a[5], 30);
VERIFY_BITS(a[6], 30);
VERIFY_BITS(a[7], 30);
VERIFY_BITS(a[8], 30);
VERIFY_BITS(a[9], 26);
c = t0 + (uint64_t)t10 * 0x3D10UL;
t0 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t1 + (uint64_t)t10*0x400UL + (uint64_t)t11 * 0x3D10UL;
t1 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t2 + (uint64_t)t11*0x400UL + (uint64_t)t12 * 0x3D10UL;
t2 = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t3 + (uint64_t)t12*0x400UL + (uint64_t)t13 * 0x3D10UL;
r[3] = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t4 + (uint64_t)t13*0x400UL + (uint64_t)t14 * 0x3D10UL;
r[4] = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t5 + (uint64_t)t14*0x400UL + (uint64_t)t15 * 0x3D10UL;
r[5] = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t6 + (uint64_t)t15*0x400UL + (uint64_t)t16 * 0x3D10UL;
r[6] = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t7 + (uint64_t)t16*0x400UL + (uint64_t)t17 * 0x3D10UL;
r[7] = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t8 + (uint64_t)t17*0x400UL + (uint64_t)t18 * 0x3D10UL;
r[8] = c & 0x3FFFFFFUL; c = c >> 26;
c = c + t9 + (uint64_t)t18*0x400UL + (uint64_t)t19 * 0x1000003D10ULL;
r[9] = c & 0x03FFFFFUL; c = c >> 22;
uint64_t d = t0 + c * 0x3D1UL;
r[0] = d & 0x3FFFFFFUL; d = d >> 26;
d = d + t1 + c*0x40;
r[1] = d & 0x3FFFFFFUL; d = d >> 26;
r[2] = t2 + d;
const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL;
// [... a b c] is a shorthand for ... + a<<52 + b<<26 + c<<0 mod n.
// px is a shorthand for sum(a[i]*a[x-i], i=0..x).
// Note that [x 0 0 0 0 0 0 0 0 0 0] = [x*R1 x*R0].
uint64_t c, d;
d = (uint64_t)(a[0]*2) * a[9]
+ (uint64_t)(a[1]*2) * a[8]
+ (uint64_t)(a[2]*2) * a[7]
+ (uint64_t)(a[3]*2) * a[6]
+ (uint64_t)(a[4]*2) * a[5];
// VERIFY_BITS(d, 64);
// [d 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0]
uint32_t t9 = d & M; d >>= 26;
VERIFY_BITS(t9, 26);
VERIFY_BITS(d, 38);
// [d t9 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0]
c = (uint64_t)a[0] * a[0];
VERIFY_BITS(c, 60);
// [d t9 0 0 0 0 0 0 0 0 c] = [p9 0 0 0 0 0 0 0 0 p0]
d += (uint64_t)(a[1]*2) * a[9]
+ (uint64_t)(a[2]*2) * a[8]
+ (uint64_t)(a[3]*2) * a[7]
+ (uint64_t)(a[4]*2) * a[6]
+ (uint64_t)a[5] * a[5];
VERIFY_BITS(d, 63);
// [d t9 0 0 0 0 0 0 0 0 c] = [p10 p9 0 0 0 0 0 0 0 0 p0]
uint64_t u0 = d & M; d >>= 26; c += u0 * R0;
VERIFY_BITS(u0, 26);
VERIFY_BITS(d, 37);
VERIFY_BITS(c, 61);
// [d u0 t9 0 0 0 0 0 0 0 0 c-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0]
uint32_t t0 = c & M; c >>= 26; c += u0 * R1;
VERIFY_BITS(t0, 26);
VERIFY_BITS(c, 37);
// [d u0 t9 0 0 0 0 0 0 0 c-u0*R1 t0-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0]
// [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 0 p0]
c += (uint64_t)(a[0]*2) * a[1];
VERIFY_BITS(c, 62);
// [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 p1 p0]
d += (uint64_t)(a[2]*2) * a[9]
+ (uint64_t)(a[3]*2) * a[8]
+ (uint64_t)(a[4]*2) * a[7]
+ (uint64_t)(a[5]*2) * a[6];
VERIFY_BITS(d, 63);
// [d 0 t9 0 0 0 0 0 0 0 c t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0]
uint64_t u1 = d & M; d >>= 26; c += u1 * R0;
VERIFY_BITS(u1, 26);
VERIFY_BITS(d, 37);
VERIFY_BITS(c, 63);
// [d u1 0 t9 0 0 0 0 0 0 0 c-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0]
uint32_t t1 = c & M; c >>= 26; c += u1 * R1;
VERIFY_BITS(t1, 26);
VERIFY_BITS(c, 38);
// [d u1 0 t9 0 0 0 0 0 0 c-u1*R1 t1-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0]
// [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0]
c += (uint64_t)(a[0]*2) * a[2]
+ (uint64_t)a[1] * a[1];
VERIFY_BITS(c, 62);
// [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
d += (uint64_t)(a[3]*2) * a[9]
+ (uint64_t)(a[4]*2) * a[8]
+ (uint64_t)(a[5]*2) * a[7]
+ (uint64_t)a[6] * a[6];
VERIFY_BITS(d, 63);
// [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
uint64_t u2 = d & M; d >>= 26; c += u2 * R0;
VERIFY_BITS(u2, 26);
VERIFY_BITS(d, 37);
VERIFY_BITS(c, 63);
// [d u2 0 0 t9 0 0 0 0 0 0 c-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
uint32_t t2 = c & M; c >>= 26; c += u2 * R1;
VERIFY_BITS(t2, 26);
VERIFY_BITS(c, 38);
// [d u2 0 0 t9 0 0 0 0 0 c-u2*R1 t2-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
// [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
c += (uint64_t)(a[0]*2) * a[3]
+ (uint64_t)(a[1]*2) * a[2];
VERIFY_BITS(c, 63);
// [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
d += (uint64_t)(a[4]*2) * a[9]
+ (uint64_t)(a[5]*2) * a[8]
+ (uint64_t)(a[6]*2) * a[7];
VERIFY_BITS(d, 63);
// [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
uint64_t u3 = d & M; d >>= 26; c += u3 * R0;
VERIFY_BITS(u3, 26);
VERIFY_BITS(d, 37);
// VERIFY_BITS(c, 64);
// [d u3 0 0 0 t9 0 0 0 0 0 c-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
uint32_t t3 = c & M; c >>= 26; c += u3 * R1;
VERIFY_BITS(t3, 26);
VERIFY_BITS(c, 39);
// [d u3 0 0 0 t9 0 0 0 0 c-u3*R1 t3-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
// [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
c += (uint64_t)(a[0]*2) * a[4]
+ (uint64_t)(a[1]*2) * a[3]
+ (uint64_t)a[2] * a[2];
VERIFY_BITS(c, 63);
// [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
d += (uint64_t)(a[5]*2) * a[9]
+ (uint64_t)(a[6]*2) * a[8]
+ (uint64_t)a[7] * a[7];
VERIFY_BITS(d, 62);
// [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
uint64_t u4 = d & M; d >>= 26; c += u4 * R0;
VERIFY_BITS(u4, 26);
VERIFY_BITS(d, 36);
// VERIFY_BITS(c, 64);
// [d u4 0 0 0 0 t9 0 0 0 0 c-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
uint32_t t4 = c & M; c >>= 26; c += u4 * R1;
VERIFY_BITS(t4, 26);
VERIFY_BITS(c, 39);
// [d u4 0 0 0 0 t9 0 0 0 c-u4*R1 t4-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
// [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
c += (uint64_t)(a[0]*2) * a[5]
+ (uint64_t)(a[1]*2) * a[4]
+ (uint64_t)(a[2]*2) * a[3];
VERIFY_BITS(c, 63);
// [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
d += (uint64_t)(a[6]*2) * a[9]
+ (uint64_t)(a[7]*2) * a[8];
VERIFY_BITS(d, 62);
// [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
uint64_t u5 = d & M; d >>= 26; c += u5 * R0;
VERIFY_BITS(u5, 26);
VERIFY_BITS(d, 36);
// VERIFY_BITS(c, 64);
// [d u5 0 0 0 0 0 t9 0 0 0 c-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
uint32_t t5 = c & M; c >>= 26; c += u5 * R1;
VERIFY_BITS(t5, 26);
VERIFY_BITS(c, 39);
// [d u5 0 0 0 0 0 t9 0 0 c-u5*R1 t5-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
// [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
c += (uint64_t)(a[0]*2) * a[6]
+ (uint64_t)(a[1]*2) * a[5]
+ (uint64_t)(a[2]*2) * a[4]
+ (uint64_t)a[3] * a[3];
VERIFY_BITS(c, 63);
// [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
d += (uint64_t)(a[7]*2) * a[9]
+ (uint64_t)a[8] * a[8];
VERIFY_BITS(d, 61);
// [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
uint64_t u6 = d & M; d >>= 26; c += u6 * R0;
VERIFY_BITS(u6, 26);
VERIFY_BITS(d, 35);
// VERIFY_BITS(c, 64);
// [d u6 0 0 0 0 0 0 t9 0 0 c-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
uint32_t t6 = c & M; c >>= 26; c += u6 * R1;
VERIFY_BITS(t6, 26);
VERIFY_BITS(c, 39);
// [d u6 0 0 0 0 0 0 t9 0 c-u6*R1 t6-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
// [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
c += (uint64_t)(a[0]*2) * a[7]
+ (uint64_t)(a[1]*2) * a[6]
+ (uint64_t)(a[2]*2) * a[5]
+ (uint64_t)(a[3]*2) * a[4];
// VERIFY_BITS(c, 64);
VERIFY_CHECK(c <= 0x8000007C00000007ULL);
// [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
d += (uint64_t)(a[8]*2) * a[9];
VERIFY_BITS(d, 58);
// [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
uint64_t u7 = d & M; d >>= 26; c += u7 * R0;
VERIFY_BITS(u7, 26);
VERIFY_BITS(d, 32);
// VERIFY_BITS(c, 64);
VERIFY_CHECK(c <= 0x800001703FFFC2F7ULL);
// [d u7 0 0 0 0 0 0 0 t9 0 c-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
uint32_t t7 = c & M; c >>= 26; c += u7 * R1;
VERIFY_BITS(t7, 26);
VERIFY_BITS(c, 38);
// [d u7 0 0 0 0 0 0 0 t9 c-u7*R1 t7-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
// [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
c += (uint64_t)(a[0]*2) * a[8]
+ (uint64_t)(a[1]*2) * a[7]
+ (uint64_t)(a[2]*2) * a[6]
+ (uint64_t)(a[3]*2) * a[5]
+ (uint64_t)a[4] * a[4];
// VERIFY_BITS(c, 64);
VERIFY_CHECK(c <= 0x9000007B80000008ULL);
// [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
d += (uint64_t)a[9] * a[9];
VERIFY_BITS(d, 57);
// [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
uint64_t u8 = d & M; d >>= 26; c += u8 * R0;
VERIFY_BITS(u8, 26);
VERIFY_BITS(d, 31);
// VERIFY_BITS(c, 64);
VERIFY_CHECK(c <= 0x9000016FBFFFC2F8ULL);
// [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[3] = t3;
VERIFY_BITS(r[3], 26);
// [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[4] = t4;
VERIFY_BITS(r[4], 26);
// [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[5] = t5;
VERIFY_BITS(r[5], 26);
// [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[6] = t6;
VERIFY_BITS(r[6], 26);
// [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[7] = t7;
VERIFY_BITS(r[7], 26);
// [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[8] = c & M; c >>= 26; c += u8 * R1;
VERIFY_BITS(r[8], 26);
VERIFY_BITS(c, 39);
// [d u8 0 0 0 0 0 0 0 0 t9+c-u8*R1 r8-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
// [d 0 0 0 0 0 0 0 0 0 t9+c r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
c += d * R0 + t9;
VERIFY_BITS(c, 45);
// [d 0 0 0 0 0 0 0 0 0 c-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[9] = c & (M >> 4); c >>= 22; c += d * (R1 << 4);
VERIFY_BITS(r[9], 22);
VERIFY_BITS(c, 46);
// [d 0 0 0 0 0 0 0 0 r9+((c-d*R1<<4)<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
// [d 0 0 0 0 0 0 0 -d*R1 r9+(c<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
// [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
d = c * (R0 >> 4) + t0;
VERIFY_BITS(d, 56);
// [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 d-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[0] = d & M; d >>= 26;
VERIFY_BITS(r[0], 26);
VERIFY_BITS(d, 30);
// [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1+d r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
d += c * (R1 >> 4) + t1;
VERIFY_BITS(d, 53);
VERIFY_CHECK(d <= 0x10000003FFFFBFULL);
// [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 d-c*R1>>4 r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
// [r9 r8 r7 r6 r5 r4 r3 t2 d r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[1] = d & M; d >>= 26;
VERIFY_BITS(r[1], 26);
VERIFY_BITS(d, 27);
VERIFY_CHECK(d <= 0x4000000ULL);
// [r9 r8 r7 r6 r5 r4 r3 t2+d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
d += t2;
VERIFY_BITS(d, 27);
// [r9 r8 r7 r6 r5 r4 r3 d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[2] = d;
VERIFY_BITS(r[2], 27);
// [r9 r8 r7 r6 r5 r4 r3 r2 r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
}

View File

@ -7,99 +7,269 @@
#include <stdint.h>
#ifdef VERIFY
#define VERIFY_BITS(x, n) VERIFY_CHECK(((x) >> (n)) == 0)
#else
#define VERIFY_BITS(x, n) do { } while(0)
#endif
SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r) {
__int128 c = (__int128)a[0] * b[0];
uint64_t t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0FFFFFFFFFFFFFE0
c = c + (__int128)a[0] * b[1] +
(__int128)a[1] * b[0];
uint64_t t1 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 20000000000000BF
c = c + (__int128)a[0] * b[2] +
(__int128)a[1] * b[1] +
(__int128)a[2] * b[0];
uint64_t t2 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 30000000000001A0
c = c + (__int128)a[0] * b[3] +
(__int128)a[1] * b[2] +
(__int128)a[2] * b[1] +
(__int128)a[3] * b[0];
uint64_t t3 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 4000000000000280
c = c + (__int128)a[0] * b[4] +
(__int128)a[1] * b[3] +
(__int128)a[2] * b[2] +
(__int128)a[3] * b[1] +
(__int128)a[4] * b[0];
uint64_t t4 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 320000000000037E
c = c + (__int128)a[1] * b[4] +
(__int128)a[2] * b[3] +
(__int128)a[3] * b[2] +
(__int128)a[4] * b[1];
uint64_t t5 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 22000000000002BE
c = c + (__int128)a[2] * b[4] +
(__int128)a[3] * b[3] +
(__int128)a[4] * b[2];
uint64_t t6 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 12000000000001DE
c = c + (__int128)a[3] * b[4] +
(__int128)a[4] * b[3];
uint64_t t7 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 02000000000000FE
c = c + (__int128)a[4] * b[4];
uint64_t t8 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 001000000000001E
uint64_t t9 = c;
VERIFY_BITS(a[0], 56);
VERIFY_BITS(a[1], 56);
VERIFY_BITS(a[2], 56);
VERIFY_BITS(a[3], 56);
VERIFY_BITS(a[4], 52);
VERIFY_BITS(b[0], 56);
VERIFY_BITS(b[1], 56);
VERIFY_BITS(b[2], 56);
VERIFY_BITS(b[3], 56);
VERIFY_BITS(b[4], 52);
c = t0 + (__int128)t5 * 0x1000003D10ULL;
t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10
c = c + t1 + (__int128)t6 * 0x1000003D10ULL;
t1 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10
c = c + t2 + (__int128)t7 * 0x1000003D10ULL;
r[2] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10
c = c + t3 + (__int128)t8 * 0x1000003D10ULL;
r[3] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10
c = c + t4 + (__int128)t9 * 0x1000003D10ULL;
r[4] = c & 0x0FFFFFFFFFFFFULL; c = c >> 48; // c max 000001000003D110
c = t0 + (__int128)c * 0x1000003D1ULL;
r[0] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 1000008
r[1] = t1 + c;
const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
// [... a b c] is a shorthand for ... + a<<104 + b<<52 + c<<0 mod n.
// px is a shorthand for sum(a[i]*b[x-i], i=0..x).
// Note that [x 0 0 0 0 0] = [x*R].
__int128 c, d;
d = (__int128)a[0] * b[3]
+ (__int128)a[1] * b[2]
+ (__int128)a[2] * b[1]
+ (__int128)a[3] * b[0];
VERIFY_BITS(d, 114);
// [d 0 0 0] = [p3 0 0 0]
c = (__int128)a[4] * b[4];
VERIFY_BITS(c, 112);
// [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0]
d += (c & M) * R; c >>= 52;
VERIFY_BITS(d, 115);
VERIFY_BITS(c, 60);
// [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0]
uint64_t t3 = d & M; d >>= 52;
VERIFY_BITS(t3, 52);
VERIFY_BITS(d, 63);
// [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0]
d += (__int128)a[0] * b[4]
+ (__int128)a[1] * b[3]
+ (__int128)a[2] * b[2]
+ (__int128)a[3] * b[1]
+ (__int128)a[4] * b[0];
VERIFY_BITS(d, 115);
// [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0]
d += c * R;
VERIFY_BITS(d, 116);
// [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0]
uint64_t t4 = d & M; d >>= 52;
VERIFY_BITS(t4, 52);
VERIFY_BITS(d, 64);
// [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0]
uint64_t tx = (t4 >> 48); t4 &= (M >> 4);
VERIFY_BITS(tx, 4);
VERIFY_BITS(t4, 48);
// [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0]
c = (__int128)a[0] * b[0];
VERIFY_BITS(c, 112);
// [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0]
d += (__int128)a[1] * b[4]
+ (__int128)a[2] * b[3]
+ (__int128)a[3] * b[2]
+ (__int128)a[4] * b[1];
VERIFY_BITS(d, 115);
// [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0]
uint64_t u0 = d & M; d >>= 52;
VERIFY_BITS(u0, 52);
VERIFY_BITS(d, 63);
// [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0]
// [d 0 t4+(tx<<48)+(u0<<52) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0]
u0 = (u0 << 4) | tx;
VERIFY_BITS(u0, 56);
// [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0]
c += (__int128)u0 * (R >> 4);
VERIFY_BITS(c, 115);
// [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0]
uint64_t t0 = c & M; c >>= 52;
VERIFY_BITS(t0, 52);
VERIFY_BITS(c, 61);
// [d 0 t4 t3 0 c t0] = [p8 0 0 p5 p4 p3 0 0 p0]
c += (__int128)a[0] * b[1]
+ (__int128)a[1] * b[0];
VERIFY_BITS(c, 114);
// [d 0 t4 t3 0 c t0] = [p8 0 0 p5 p4 p3 0 p1 p0]
d += (__int128)a[2] * b[4]
+ (__int128)a[3] * b[3]
+ (__int128)a[4] * b[2];
VERIFY_BITS(d, 114);
// [d 0 t4 t3 0 c t0] = [p8 0 p6 p5 p4 p3 0 p1 p0]
c += (d & M) * R; d >>= 52;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 62);
// [d 0 0 t4 t3 0 c t0] = [p8 0 p6 p5 p4 p3 0 p1 p0]
uint64_t t1 = c & M; c >>= 52;
VERIFY_BITS(t1, 52);
VERIFY_BITS(c, 63);
// [d 0 0 t4 t3 c t1 t0] = [p8 0 p6 p5 p4 p3 0 p1 p0]
c += (__int128)a[0] * b[2]
+ (__int128)a[1] * b[1]
+ (__int128)a[2] * b[0];
VERIFY_BITS(c, 114);
// [d 0 0 t4 t3 c t1 t0] = [p8 0 p6 p5 p4 p3 p2 p1 p0]
d += (__int128)a[3] * b[4]
+ (__int128)a[4] * b[3];
VERIFY_BITS(d, 114);
// [d 0 0 t4 t3 c t1 t0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
c += (d & M) * R; d >>= 52;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 62);
// [d 0 0 0 t4 t3 c t1 t0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[0] = t0;
VERIFY_BITS(r[0], 52);
// [d 0 0 0 t4 t3 c t1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[1] = t1;
VERIFY_BITS(r[1], 52);
// [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[2] = c & M; c >>= 52;
VERIFY_BITS(r[2], 52);
VERIFY_BITS(c, 63);
// [d 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
c += d * R + t3;;
VERIFY_BITS(c, 100);
// [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[3] = c & M; c >>= 52;
VERIFY_BITS(r[3], 52);
VERIFY_BITS(c, 48);
// [t4+c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
c += t4;
VERIFY_BITS(c, 49);
// [c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[4] = c;
VERIFY_BITS(r[4], 49);
// [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
}
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r) {
__int128 c = (__int128)a[0] * a[0];
uint64_t t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0FFFFFFFFFFFFFE0
c = c + (__int128)(a[0]*2) * a[1];
uint64_t t1 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 20000000000000BF
c = c + (__int128)(a[0]*2) * a[2] +
(__int128)a[1] * a[1];
uint64_t t2 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 30000000000001A0
c = c + (__int128)(a[0]*2) * a[3] +
(__int128)(a[1]*2) * a[2];
uint64_t t3 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 4000000000000280
c = c + (__int128)(a[0]*2) * a[4] +
(__int128)(a[1]*2) * a[3] +
(__int128)a[2] * a[2];
uint64_t t4 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 320000000000037E
c = c + (__int128)(a[1]*2) * a[4] +
(__int128)(a[2]*2) * a[3];
uint64_t t5 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 22000000000002BE
c = c + (__int128)(a[2]*2) * a[4] +
(__int128)a[3] * a[3];
uint64_t t6 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 12000000000001DE
c = c + (__int128)(a[3]*2) * a[4];
uint64_t t7 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 02000000000000FE
c = c + (__int128)a[4] * a[4];
uint64_t t8 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 001000000000001E
uint64_t t9 = c;
c = t0 + (__int128)t5 * 0x1000003D10ULL;
t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10
c = c + t1 + (__int128)t6 * 0x1000003D10ULL;
t1 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10
c = c + t2 + (__int128)t7 * 0x1000003D10ULL;
r[2] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10
c = c + t3 + (__int128)t8 * 0x1000003D10ULL;
r[3] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10
c = c + t4 + (__int128)t9 * 0x1000003D10ULL;
r[4] = c & 0x0FFFFFFFFFFFFULL; c = c >> 48; // c max 000001000003D110
c = t0 + (__int128)c * 0x1000003D1ULL;
r[0] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 1000008
r[1] = t1 + c;
VERIFY_BITS(a[0], 56);
VERIFY_BITS(a[1], 56);
VERIFY_BITS(a[2], 56);
VERIFY_BITS(a[3], 56);
VERIFY_BITS(a[4], 52);
const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
// [... a b c] is a shorthand for ... + a<<104 + b<<52 + c<<0 mod n.
// px is a shorthand for sum(a[i]*a[x-i], i=0..x).
// Note that [x 0 0 0 0 0] = [x*R].
__int128 c, d;
uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
d = (__int128)(a0*2) * a3
+ (__int128)(a1*2) * a2;
VERIFY_BITS(d, 114);
// [d 0 0 0] = [p3 0 0 0]
c = (__int128)a4 * a4;
VERIFY_BITS(c, 112);
// [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0]
d += (c & M) * R; c >>= 52;
VERIFY_BITS(d, 115);
VERIFY_BITS(c, 60);
// [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0]
uint64_t t3 = d & M; d >>= 52;
VERIFY_BITS(t3, 52);
VERIFY_BITS(d, 63);
// [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0]
a4 *= 2;
d += (__int128)a0 * a4
+ (__int128)(a1*2) * a3
+ (__int128)a2 * a2;
VERIFY_BITS(d, 115);
// [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0]
d += c * R;
VERIFY_BITS(d, 116);
// [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0]
uint64_t t4 = d & M; d >>= 52;
VERIFY_BITS(t4, 52);
VERIFY_BITS(d, 64);
// [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0]
uint64_t tx = (t4 >> 48); t4 &= (M >> 4);
VERIFY_BITS(tx, 4);
VERIFY_BITS(t4, 48);
// [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0]
c = (__int128)a0 * a0;
VERIFY_BITS(c, 112);
// [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0]
d += (__int128)a1 * a4
+ (__int128)(a2*2) * a3;
VERIFY_BITS(d, 114);
// [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0]
uint64_t u0 = d & M; d >>= 52;
VERIFY_BITS(u0, 52);
VERIFY_BITS(d, 62);
// [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0]
// [d 0 t4+(tx<<48)+(u0<<52) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0]
u0 = (u0 << 4) | tx;
VERIFY_BITS(u0, 56);
// [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0]
c += (__int128)u0 * (R >> 4);
VERIFY_BITS(c, 113);
// [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0]
r[0] = c & M; c >>= 52;
VERIFY_BITS(r[0], 52);
VERIFY_BITS(c, 61);
// [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0]
a0 *= 2;
c += (__int128)a0 * a1;
VERIFY_BITS(c, 114);
// [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0]
d += (__int128)a2 * a4
+ (__int128)a3 * a3;
VERIFY_BITS(d, 114);
// [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0]
c += (d & M) * R; d >>= 52;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 62);
// [d 0 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0]
r[1] = c & M; c >>= 52;
VERIFY_BITS(r[1], 52);
VERIFY_BITS(c, 63);
// [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0]
c += (__int128)a0 * a2
+ (__int128)a1 * a1;
VERIFY_BITS(c, 114);
// [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0]
d += (__int128)a3 * a4;
VERIFY_BITS(d, 114);
// [d 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
c += (d & M) * R; d >>= 52;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 62);
// [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[2] = c & M; c >>= 52;
VERIFY_BITS(r[2], 52);
VERIFY_BITS(c, 63);
// [d 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
c += d * R + t3;;
VERIFY_BITS(c, 100);
// [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[3] = c & M; c >>= 52;
VERIFY_BITS(r[3], 52);
VERIFY_BITS(c, 48);
// [t4+c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
c += t4;
VERIFY_BITS(c, 49);
// [c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
r[4] = c;
VERIFY_BITS(r[4], 49);
// [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0]
}
#endif