Improve normalization performance for 32bit
- Use a similar approach to the latest 64-bit _normalize.
- Add one useful optimization back into the 64-bit _normalize too.
Performance of 'bench' improved by around 0.5% for the 32-bit field (measured on a 64-bit machine, however).
This commit is contained in:
parent
f33793fb99
commit
42822baaa8
|
@ -15,77 +15,52 @@ void static secp256k1_fe_inner_start(void) {}
|
|||
void static secp256k1_fe_inner_stop(void) {}
|
||||
|
||||
void static secp256k1_fe_normalize(secp256k1_fe_t *r) {
|
||||
// fog("normalize in: ", r);
|
||||
uint32_t c;
|
||||
c = r->n[0];
|
||||
uint32_t t0 = c & 0x3FFFFFFUL;
|
||||
c = (c >> 26) + r->n[1];
|
||||
uint32_t t1 = c & 0x3FFFFFFUL;
|
||||
c = (c >> 26) + r->n[2];
|
||||
uint32_t t2 = c & 0x3FFFFFFUL;
|
||||
c = (c >> 26) + r->n[3];
|
||||
uint32_t t3 = c & 0x3FFFFFFUL;
|
||||
c = (c >> 26) + r->n[4];
|
||||
uint32_t t4 = c & 0x3FFFFFFUL;
|
||||
c = (c >> 26) + r->n[5];
|
||||
uint32_t t5 = c & 0x3FFFFFFUL;
|
||||
c = (c >> 26) + r->n[6];
|
||||
uint32_t t6 = c & 0x3FFFFFFUL;
|
||||
c = (c >> 26) + r->n[7];
|
||||
uint32_t t7 = c & 0x3FFFFFFUL;
|
||||
c = (c >> 26) + r->n[8];
|
||||
uint32_t t8 = c & 0x3FFFFFFUL;
|
||||
c = (c >> 26) + r->n[9];
|
||||
uint32_t t9 = c & 0x03FFFFFUL;
|
||||
c >>= 22;
|
||||
/* r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4;
|
||||
r->n[5] = t5; r->n[6] = t6; r->n[7] = t7; r->n[8] = t8; r->n[9] = t9;
|
||||
fog(" tm1: ", r);
|
||||
fprintf(stderr, "out c= %08lx\n", (unsigned long)c);*/
|
||||
uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4],
|
||||
t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];
|
||||
|
||||
// The following code will not modify the t's if c is initially 0.
|
||||
uint32_t d = c * 0x3D1UL + t0;
|
||||
t0 = d & 0x3FFFFFFULL;
|
||||
d = (d >> 26) + t1 + c*0x40;
|
||||
t1 = d & 0x3FFFFFFULL;
|
||||
d = (d >> 26) + t2;
|
||||
t2 = d & 0x3FFFFFFULL;
|
||||
d = (d >> 26) + t3;
|
||||
t3 = d & 0x3FFFFFFULL;
|
||||
d = (d >> 26) + t4;
|
||||
t4 = d & 0x3FFFFFFULL;
|
||||
d = (d >> 26) + t5;
|
||||
t5 = d & 0x3FFFFFFULL;
|
||||
d = (d >> 26) + t6;
|
||||
t6 = d & 0x3FFFFFFULL;
|
||||
d = (d >> 26) + t7;
|
||||
t7 = d & 0x3FFFFFFULL;
|
||||
d = (d >> 26) + t8;
|
||||
t8 = d & 0x3FFFFFFULL;
|
||||
d = (d >> 26) + t9;
|
||||
t9 = d & 0x03FFFFFULL;
|
||||
assert((d >> 22) == 0);
|
||||
/* r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4;
|
||||
r->n[5] = t5; r->n[6] = t6; r->n[7] = t7; r->n[8] = t8; r->n[9] = t9;
|
||||
fog(" tm2: ", r); */
|
||||
// Reduce t9 at the start so there will be at most a single carry from the first pass
|
||||
uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;
|
||||
uint32_t m;
|
||||
|
||||
// Subtract p if result >= p
|
||||
uint64_t low = ((uint64_t)t1 << 26) | t0;
|
||||
uint64_t mask = -(int64_t)((t9 < 0x03FFFFFUL) | (t8 < 0x3FFFFFFUL) | (t7 < 0x3FFFFFFUL) | (t6 < 0x3FFFFFFUL) | (t5 < 0x3FFFFFFUL) | (t4 < 0x3FFFFFFUL) | (t3 < 0x3FFFFFFUL) | (t2 < 0x3FFFFFFUL) | (low < 0xFFFFEFFFFFC2FULL));
|
||||
t9 &= mask;
|
||||
t8 &= mask;
|
||||
t7 &= mask;
|
||||
t6 &= mask;
|
||||
t5 &= mask;
|
||||
t4 &= mask;
|
||||
t3 &= mask;
|
||||
t2 &= mask;
|
||||
low -= (~mask & 0xFFFFEFFFFFC2FULL);
|
||||
// The first pass ensures the magnitude is 1, ...
|
||||
t0 += x * 0x3D1UL; t1 += (x << 6);
|
||||
t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL;
|
||||
t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL;
|
||||
t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL; m = t2;
|
||||
t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL; m &= t3;
|
||||
t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL; m &= t4;
|
||||
t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL; m &= t5;
|
||||
t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL; m &= t6;
|
||||
t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL; m &= t7;
|
||||
t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL; m &= t8;
|
||||
|
||||
// push internal variables back
|
||||
r->n[0] = low & 0x3FFFFFFUL; r->n[1] = (low >> 26) & 0x3FFFFFFUL; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4;
|
||||
// ... except for a possible carry at bit 22 of t9 (i.e. bit 256 of the field element)
|
||||
assert(t9 >> 23 == 0);
|
||||
|
||||
// At most a single final reduction is needed; check if the value is >= the field characteristic
|
||||
x = (t9 >> 22) | ((t9 == 0x03FFFFFULL) & (m == 0x3FFFFFFULL)
|
||||
& ((t1 + 0x40UL + ((t0 + 0x3D1UL) >> 26)) > 0x3FFFFFFULL));
|
||||
|
||||
// Apply the final reduction (for constant-time behaviour, we do it always)
|
||||
t0 += x * 0x3D1UL; t1 += (x << 6);
|
||||
t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL;
|
||||
t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL;
|
||||
t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL;
|
||||
t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL;
|
||||
t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL;
|
||||
t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL;
|
||||
t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL;
|
||||
t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL;
|
||||
t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL;
|
||||
|
||||
// If t9 didn't carry to bit 22 already, then it should have after any final reduction
|
||||
assert(t9 >> 22 == x);
|
||||
|
||||
// Mask off the possible multiple of 2^256 from the final reduction
|
||||
t9 &= 0x03FFFFFUL;
|
||||
|
||||
r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4;
|
||||
r->n[5] = t5; r->n[6] = t6; r->n[7] = t7; r->n[8] = t8; r->n[9] = t9;
|
||||
/* fog(" out: ", r);*/
|
||||
|
||||
#ifdef VERIFY
|
||||
r->magnitude = 1;
|
||||
|
|
|
@ -38,20 +38,20 @@ void static secp256k1_fe_normalize(secp256k1_fe_t *r) {
|
|||
|
||||
// Reduce t4 at the start so there will be at most a single carry from the first pass
|
||||
uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;
|
||||
uint64_t m;
|
||||
|
||||
// The first pass ensures the magnitude is 1, ...
|
||||
t0 += x * 0x1000003D1ULL;
|
||||
t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL;
|
||||
t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL;
|
||||
t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL;
|
||||
t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL;
|
||||
t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; m = t1;
|
||||
t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; m &= t2;
|
||||
t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; m &= t3;
|
||||
|
||||
// ... except for a possible carry at bit 48 of t4 (i.e. bit 256 of the field element)
|
||||
assert(t4 >> 49 == 0);
|
||||
|
||||
// At most a single final reduction is needed; check if the value is >= the field characteristic
|
||||
x = (t4 >> 48) | ((t4 == 0x0FFFFFFFFFFFFULL)
|
||||
& ((t3 & t2 & t1) == 0xFFFFFFFFFFFFFULL)
|
||||
x = (t4 >> 48) | ((t4 == 0x0FFFFFFFFFFFFULL) & (m == 0xFFFFFFFFFFFFFULL)
|
||||
& (t0 >= 0xFFFFEFFFFFC2FULL));
|
||||
|
||||
// Apply the final reduction (for constant-time behaviour, we do it always)
|
||||
|
|
Loading…
Reference in New Issue