Added optimised implementation of P-256 that uses 32->64 multiplications (MUL31).

This commit is contained in:
Thomas Pornin 2017-01-16 18:04:40 +01:00
parent 89ea3b1876
commit bd3036844b
9 changed files with 1700 additions and 27 deletions

View File

@ -47,7 +47,7 @@ TESTX509 = testx509
TESTMATH = testmath
OBJCODEC = $(BUILD)/ccopy.o $(BUILD)/dec16be.o $(BUILD)/dec16le.o $(BUILD)/dec32be.o $(BUILD)/dec32le.o $(BUILD)/dec64be.o $(BUILD)/dec64le.o $(BUILD)/enc16be.o $(BUILD)/enc16le.o $(BUILD)/enc32be.o $(BUILD)/enc32le.o $(BUILD)/enc64be.o $(BUILD)/enc64le.o $(BUILD)/pemdec.o
OBJEC = $(BUILD)/ec_all_m15.o $(BUILD)/ec_c25519_i15.o $(BUILD)/ec_c25519_i31.o $(BUILD)/ec_c25519_m15.o $(BUILD)/ec_c25519_m31.o $(BUILD)/ec_curve25519.o $(BUILD)/ec_p256_m15.o $(BUILD)/ec_prime_i15.o $(BUILD)/ec_prime_i31.o $(BUILD)/ec_secp256r1.o $(BUILD)/ec_secp384r1.o $(BUILD)/ec_secp521r1.o $(BUILD)/ecdsa_atr.o $(BUILD)/ecdsa_i15_bits.o $(BUILD)/ecdsa_i15_sign_asn1.o $(BUILD)/ecdsa_i15_sign_raw.o $(BUILD)/ecdsa_i15_vrfy_asn1.o $(BUILD)/ecdsa_i15_vrfy_raw.o $(BUILD)/ecdsa_i31_bits.o $(BUILD)/ecdsa_i31_sign_asn1.o $(BUILD)/ecdsa_i31_sign_raw.o $(BUILD)/ecdsa_i31_vrfy_asn1.o $(BUILD)/ecdsa_i31_vrfy_raw.o $(BUILD)/ecdsa_rta.o
OBJEC = $(BUILD)/ec_all_m15.o $(BUILD)/ec_all_m31.o $(BUILD)/ec_c25519_i15.o $(BUILD)/ec_c25519_i31.o $(BUILD)/ec_c25519_m15.o $(BUILD)/ec_c25519_m31.o $(BUILD)/ec_curve25519.o $(BUILD)/ec_p256_m15.o $(BUILD)/ec_p256_m31.o $(BUILD)/ec_prime_i15.o $(BUILD)/ec_prime_i31.o $(BUILD)/ec_secp256r1.o $(BUILD)/ec_secp384r1.o $(BUILD)/ec_secp521r1.o $(BUILD)/ecdsa_atr.o $(BUILD)/ecdsa_i15_bits.o $(BUILD)/ecdsa_i15_sign_asn1.o $(BUILD)/ecdsa_i15_sign_raw.o $(BUILD)/ecdsa_i15_vrfy_asn1.o $(BUILD)/ecdsa_i15_vrfy_raw.o $(BUILD)/ecdsa_i31_bits.o $(BUILD)/ecdsa_i31_sign_asn1.o $(BUILD)/ecdsa_i31_sign_raw.o $(BUILD)/ecdsa_i31_vrfy_asn1.o $(BUILD)/ecdsa_i31_vrfy_raw.o $(BUILD)/ecdsa_rta.o
# $(BUILD)/ec_prime_i31_secp256r1.o $(BUILD)/ec_prime_i31_secp384r1.o $(BUILD)/ec_prime_i31_secp521r1.o
OBJHASH = $(BUILD)/dig_oid.o $(BUILD)/dig_size.o $(BUILD)/ghash_ctmul.o $(BUILD)/ghash_ctmul32.o $(BUILD)/ghash_ctmul64.o $(BUILD)/md5.o $(BUILD)/md5sha1.o $(BUILD)/multihash.o $(BUILD)/sha1.o $(BUILD)/sha2big.o $(BUILD)/sha2small.o
OBJINT15 = $(BUILD)/i15_core.o $(BUILD)/i15_ext1.o $(BUILD)/i15_ext2.o
@ -165,6 +165,9 @@ $(BUILD)/ec_g_secp521r1.o: src/ec/ec_g_secp521r1.c $(HEADERS)
$(BUILD)/ec_all_m15.o: src/ec/ec_all_m15.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $(BUILD)/ec_all_m15.o src/ec/ec_all_m15.c
$(BUILD)/ec_all_m31.o: src/ec/ec_all_m31.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $(BUILD)/ec_all_m31.o src/ec/ec_all_m31.c
$(BUILD)/ec_c25519_i15.o: src/ec/ec_c25519_i15.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $(BUILD)/ec_c25519_i15.o src/ec/ec_c25519_i15.c
@ -183,6 +186,9 @@ $(BUILD)/ec_curve25519.o: src/ec/ec_curve25519.c $(HEADERS)
$(BUILD)/ec_p256_m15.o: src/ec/ec_p256_m15.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $(BUILD)/ec_p256_m15.o src/ec/ec_p256_m15.c
$(BUILD)/ec_p256_m31.o: src/ec/ec_p256_m31.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $(BUILD)/ec_p256_m31.o src/ec/ec_p256_m31.c
$(BUILD)/ec_prime_i15.o: src/ec/ec_prime_i15.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $(BUILD)/ec_prime_i15.o src/ec/ec_prime_i15.c

View File

@ -436,6 +436,15 @@ extern const br_ec_impl br_ec_prime_i15;
*/
extern const br_ec_impl br_ec_p256_m15;
/**
* \brief EC implementation "m31" for P-256.
*
* This implementation uses specialised code for curve secp256r1 (also
* known as NIST P-256). It relies on multiplications of 31-bit values
* that yield 64-bit results, i.e. 32->64 multiplications (MUL31).
*/
extern const br_ec_impl br_ec_p256_m31;
/**
* \brief EC implementation "i15" (generic code) for Curve25519.
*

121
src/ec/ec_all_m31.c Normal file
View File

@ -0,0 +1,121 @@
/*
* Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "inner.h"
/*
 * Return the generator for the requested curve by forwarding to the
 * matching backend: br_ec_p256_m31 for BR_EC_secp256r1,
 * br_ec_c25519_m31 for BR_EC_curve25519, and br_ec_prime_i31 for any
 * other curve identifier. The 'curve' and 'len' arguments are passed
 * through unchanged.
 */
static const unsigned char *
api_generator(int curve, size_t *len)
{
	if (curve == BR_EC_secp256r1) {
		return br_ec_p256_m31.generator(curve, len);
	}
	if (curve == BR_EC_curve25519) {
		return br_ec_c25519_m31.generator(curve, len);
	}
	return br_ec_prime_i31.generator(curve, len);
}
/*
 * Return the order for the requested curve by forwarding to the
 * matching backend: br_ec_p256_m31 for BR_EC_secp256r1,
 * br_ec_c25519_m31 for BR_EC_curve25519, and br_ec_prime_i31 for any
 * other curve identifier. The 'curve' and 'len' arguments are passed
 * through unchanged.
 */
static const unsigned char *
api_order(int curve, size_t *len)
{
	if (curve == BR_EC_secp256r1) {
		return br_ec_p256_m31.order(curve, len);
	}
	if (curve == BR_EC_curve25519) {
		return br_ec_c25519_m31.order(curve, len);
	}
	return br_ec_prime_i31.order(curve, len);
}
/*
 * Forward the xoff() query to the backend that handles the requested
 * curve: br_ec_p256_m31 for BR_EC_secp256r1, br_ec_c25519_m31 for
 * BR_EC_curve25519, and br_ec_prime_i31 for any other curve
 * identifier. The backend's return value is returned as is.
 */
static size_t
api_xoff(int curve, size_t *len)
{
	if (curve == BR_EC_secp256r1) {
		return br_ec_p256_m31.xoff(curve, len);
	}
	if (curve == BR_EC_curve25519) {
		return br_ec_c25519_m31.xoff(curve, len);
	}
	return br_ec_prime_i31.xoff(curve, len);
}
/*
 * Forward the mul() operation to the backend that handles the
 * requested curve: br_ec_p256_m31 for BR_EC_secp256r1,
 * br_ec_c25519_m31 for BR_EC_curve25519, and br_ec_prime_i31 for any
 * other curve identifier. All arguments are passed through unchanged
 * and the backend's status value is returned as is.
 */
static uint32_t
api_mul(unsigned char *G, size_t Glen,
	const unsigned char *kb, size_t kblen, int curve)
{
	if (curve == BR_EC_secp256r1) {
		return br_ec_p256_m31.mul(G, Glen, kb, kblen, curve);
	}
	if (curve == BR_EC_curve25519) {
		return br_ec_c25519_m31.mul(G, Glen, kb, kblen, curve);
	}
	return br_ec_prime_i31.mul(G, Glen, kb, kblen, curve);
}
/*
 * Forward the mulgen() operation to the backend that handles the
 * requested curve: br_ec_p256_m31 for BR_EC_secp256r1,
 * br_ec_c25519_m31 for BR_EC_curve25519, and br_ec_prime_i31 for any
 * other curve identifier. All arguments are passed through unchanged
 * and the backend's return value is returned as is.
 */
static size_t
api_mulgen(unsigned char *R,
	const unsigned char *x, size_t xlen, int curve)
{
	if (curve == BR_EC_secp256r1) {
		return br_ec_p256_m31.mulgen(R, x, xlen, curve);
	}
	if (curve == BR_EC_curve25519) {
		return br_ec_c25519_m31.mulgen(R, x, xlen, curve);
	}
	return br_ec_prime_i31.mulgen(R, x, xlen, curve);
}
/*
 * Forward the muladd() operation to the backend that handles the
 * requested curve: br_ec_p256_m31 for BR_EC_secp256r1,
 * br_ec_c25519_m31 for BR_EC_curve25519, and br_ec_prime_i31 for any
 * other curve identifier. All arguments are passed through unchanged
 * and the backend's status value is returned as is.
 */
static uint32_t
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
	const unsigned char *x, size_t xlen,
	const unsigned char *y, size_t ylen, int curve)
{
	if (curve == BR_EC_secp256r1) {
		return br_ec_p256_m31.muladd(A, B, len,
			x, xlen, y, ylen, curve);
	}
	if (curve == BR_EC_curve25519) {
		return br_ec_c25519_m31.muladd(A, B, len,
			x, xlen, y, ylen, curve);
	}
	return br_ec_prime_i31.muladd(A, B, len,
		x, xlen, y, ylen, curve);
}
/*
 * Aggregate "all_m31" EC implementation; see bearssl_ec.h.
 *
 * Each function pointer below refers to a dispatcher in this file that
 * routes BR_EC_secp256r1 to br_ec_p256_m31, BR_EC_curve25519 to
 * br_ec_c25519_m31, and every other curve to br_ec_prime_i31.
 */
const br_ec_impl br_ec_all_m31 = {
/*
 * NOTE(review): presumably the supported-curves bit mask (bit N set
 * for curve identifier N) -- confirm against the br_ec_impl
 * declaration and the BR_EC_* constants.
 */
(uint32_t)0x23800000,
/* Per-operation dispatchers, in the field order of br_ec_impl. */
&api_generator,
&api_order,
&api_xoff,
&api_mul,
&api_mulgen,
&api_muladd
};

View File

@ -179,8 +179,14 @@ static uint32_t
api_mul(unsigned char *G, size_t Glen,
const unsigned char *kb, size_t kblen, int curve)
{
#define ILEN (18 * sizeof(uint16_t))
/*
* The a[] and b[] arrays have an extra word to allow for
* decoding without using br_i15_decode_reduce().
*/
uint16_t x1[18], x2[18], x3[18], z2[18], z3[18];
uint16_t a[18], aa[18], b[18], bb[18];
uint16_t a[19], aa[18], b[19], bb[18];
uint16_t c[18], d[18], e[18], da[18], cb[18];
unsigned char k[32];
uint32_t swap;
@ -205,17 +211,33 @@ api_mul(unsigned char *G, size_t Glen,
*/
byteswap(G);
/*
* Decode the point ('u' coordinate). This should be reduced
* modulo p, but we prefer to avoid the dependency on
* br_i15_decode_reduce(). Instead, we use br_i15_decode_mod()
* with a synthetic modulus of value 2^255 (this must work
* since G was truncated to 255 bits), then use a conditional
* subtraction. We use br_i15_decode_mod() and not
* br_i15_decode(), because the ec_prime_i15 implementation uses
* the former but not the latter.
* br_i15_decode_reduce(a, G, 32, C255_P);
*/
br_i15_zero(b, 0x111);
b[18] = 1;
br_i15_decode_mod(a, G, 32, b);
a[0] = 0x110;
br_i15_sub(a, C255_P, NOT(br_i15_sub(a, C255_P, 0)));
/*
* Initialise variables x1, x2, z2, x3 and z3. We set all of them
* into Montgomery representation.
*/
br_i15_decode_reduce(a, G, 32, C255_P);
br_i15_montymul(x1, a, C255_R2, C255_P, P0I);
memcpy(x3, x1, sizeof x1);
memcpy(x3, x1, ILEN);
br_i15_zero(z2, C255_P[0]);
memcpy(x2, z2, sizeof z2);
memcpy(x2, z2, ILEN);
x2[1] = 19;
memcpy(z3, x2, sizeof x2);
memcpy(z3, x2, ILEN);
memcpy(k, kb, kblen);
memset(k + kblen, 0, (sizeof k) - kblen);
@ -291,12 +313,12 @@ api_mul(unsigned char *G, size_t Glen,
* square-and-multiply algorithm; we mutualise most non-squarings
* since the exponent contains almost only ones.
*/
memcpy(a, z2, sizeof z2);
memcpy(a, z2, ILEN);
for (i = 0; i < 15; i ++) {
c255_mul(a, a, a);
c255_mul(a, a, z2);
}
memcpy(b, a, sizeof a);
memcpy(b, a, ILEN);
for (i = 0; i < 14; i ++) {
int j;
@ -311,11 +333,23 @@ api_mul(unsigned char *G, size_t Glen,
c255_mul(b, z2, b);
}
}
c255_mul(x2, x2, b);
br_i15_from_monty(x2, C255_P, P0I);
c255_mul(b, x2, b);
/*
* To avoid a dependency on br_i15_from_monty(), we use a
* Montgomery multiplication with 1.
* memcpy(x2, b, ILEN);
* br_i15_from_monty(x2, C255_P, P0I);
*/
br_i15_zero(a, C255_P[0]);
a[1] = 1;
br_i15_montymul(x2, a, b, C255_P, P0I);
br_i15_encode(G, 32, x2);
byteswap(G);
return 1;
#undef ILEN
}
static size_t

View File

@ -202,11 +202,27 @@ api_mul(unsigned char *G, size_t Glen,
*/
byteswap(G);
/*
* Decode the point ('u' coordinate). This should be reduced
* modulo p, but we prefer to avoid the dependency on
* br_i31_decode_reduce(). Instead, we use br_i31_decode_mod()
* with a synthetic modulus of value 2^255 (this must work
* since G was truncated to 255 bits), then use a conditional
* subtraction. We use br_i31_decode_mod() and not
* br_i31_decode(), because the ec_prime_i31 implementation uses
* the former but not the latter.
* br_i31_decode_reduce(a, G, 32, C255_P);
*/
br_i31_zero(b, 0x108);
b[9] = 0x0100;
br_i31_decode_mod(a, G, 32, b);
a[0] = 0x107;
br_i31_sub(a, C255_P, NOT(br_i31_sub(a, C255_P, 0)));
/*
* Initialise variables x1, x2, z2, x3 and z3. We set all of them
* into Montgomery representation.
*/
br_i31_decode_reduce(a, G, 32, C255_P);
br_i31_montymul(x1, a, C255_R2, C255_P, P0I);
memcpy(x3, x1, sizeof x1);
br_i31_zero(z2, C255_P[0]);
@ -308,8 +324,18 @@ api_mul(unsigned char *G, size_t Glen,
c255_mul(b, z2, b);
}
}
c255_mul(x2, x2, b);
br_i31_from_monty(x2, C255_P, P0I);
c255_mul(b, x2, b);
/*
* To avoid a dependency on br_i31_from_monty(), we use
* a Montgomery multiplication with 1.
* memcpy(x2, b, sizeof b);
* br_i31_from_monty(x2, C255_P, P0I);
*/
br_i31_zero(a, C255_P[0]);
a[1] = 1;
br_i31_montymul(x2, a, b, C255_P, P0I);
br_i31_encode(G, 32, x2);
byteswap(G);
return 1;

View File

@ -1101,18 +1101,20 @@ mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
}
/*
* Propagate carries. Since the operation above really is a
* truncature, followed by the addition of nonnegative values,
* the result will be positive. Moreover, the carry cannot
* exceed 5 bits (we performed 20 additions with values smaller
* than 256 bits).
* Propagate carries. This is a signed propagation, and the
* result may be negative. The loop above may enlarge values,
* but not too much: worst case is the chain involving t[i - 3],
* in which a value may be added to itself up to 7 times. Since
* starting values are 13-bit each, all words fit on 20 bits
* (21 to account for the sign bit).
*/
cc = norm13(t, t, 20);
/*
* Perform modular reduction again for the bits beyond 256 (the carry
* and the bits 256..259). This time, we can simply inject full
* word values.
* and the bits 256..259). Since the largest shift below is by 10
* bits, and the values fit on 21 bits, values fit in 32-bit words,
* thereby allowing injecting full word values.
*/
cc = (cc << 4) | (t[19] >> 9);
t[19] &= 0x01FF;
@ -1172,18 +1174,20 @@ square_f256(uint32_t *d, const uint32_t *a)
}
/*
* Propagate carries. Since the operation above really is a
* truncature, followed by the addition of nonnegative values,
* the result will be positive. Moreover, the carry cannot
* exceed 5 bits (we performed 20 additions with values smaller
* than 256 bits).
* Propagate carries. This is a signed propagation, and the
* result may be negative. The loop above may enlarge values,
* but not too much: worst case is the chain involving t[i - 3],
* in which a value may be added to itself up to 7 times. Since
* starting values are 13-bit each, all words fit on 20 bits
* (21 to account for the sign bit).
*/
cc = norm13(t, t, 20);
/*
* Perform modular reduction again for the bits beyond 256 (the carry
* and the bits 256..259). This time, we can simply inject full
* word values.
* and the bits 256..259). Since the largest shift below is by 10
* bits, and the values fit on 21 bits, values fit in 32-bit words,
* thereby allowing injecting full word values.
*/
cc = (cc << 4) | (t[19] >> 9);
t[19] &= 0x01FF;

1447
src/ec/ec_p256_m31.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -4918,6 +4918,13 @@ test_EC_p256_m15(void)
(uint32_t)1 << BR_EC_secp256r1);
}
/*
 * Run test_EC_KAT() against br_ec_p256_m31, with a curve mask in which
 * only the bit for BR_EC_secp256r1 is set.
 */
static void
test_EC_p256_m31(void)
{
	uint32_t curve_mask;

	curve_mask = (uint32_t)1 << BR_EC_secp256r1;
	test_EC_KAT("EC_p256_m31", &br_ec_p256_m31, curve_mask);
}
const struct {
const char *scalar;
const char *u_in;
@ -5567,6 +5574,7 @@ static const struct {
STU(EC_prime_i15),
STU(EC_prime_i31),
STU(EC_p256_m15),
STU(EC_p256_m31),
STU(EC_c25519_i15),
STU(EC_c25519_i31),
STU(EC_c25519_m15),

View File

@ -691,6 +691,13 @@ test_speed_ec_p256_m15(void)
&br_ec_p256_m15, &br_secp256r1);
}
/*
 * Invoke test_speed_ec_inner() for br_ec_p256_m31 with br_secp256r1,
 * under the label "EC p256_m31".
 */
static void
test_speed_ec_p256_m31(void)
{
	test_speed_ec_inner("EC p256_m31", &br_ec_p256_m31, &br_secp256r1);
}
static void
test_speed_ec_prime_i15(void)
{
@ -835,6 +842,15 @@ test_speed_ecdsa_p256_m15(void)
&br_ecdsa_i15_vrfy_asn1);
}
/*
 * Invoke test_speed_ecdsa_inner() for br_ec_p256_m31 with br_secp256r1,
 * paired with the i31 ASN.1 sign and verify functions, under the label
 * "ECDSA m31 P-256".
 */
static void
test_speed_ecdsa_p256_m31(void)
{
	test_speed_ecdsa_inner("ECDSA m31 P-256", &br_ec_p256_m31,
		&br_secp256r1, &br_ecdsa_i31_sign_asn1, &br_ecdsa_i31_vrfy_asn1);
}
static void
test_speed_ecdsa_i15(void)
{
@ -1282,11 +1298,13 @@ static const struct {
STU(ec_prime_i15),
STU(ec_prime_i31),
STU(ec_p256_m15),
STU(ec_p256_m31),
STU(ec_c25519_i15),
STU(ec_c25519_i31),
STU(ec_c25519_m15),
STU(ec_c25519_m31),
STU(ecdsa_p256_m15),
STU(ecdsa_p256_m31),
STU(ecdsa_i15),
STU(ecdsa_i31),