Added optimised implementation of P-256 that uses 32->64 multiplications (MUL31).

2025-02-23 07:58:08 +00:00 · 2017-01-16 18:04:40 +01:00 · 2017-01-16 18:04:40 +01:00 · bd3036844b
commit bd3036844b
parent 89ea3b1876
9 changed files with 1700 additions and 27 deletions
--- a/8
+++ b/8
@ -47,7 +47,7 @@ TESTX509 = testx509
 TESTMATH = testmath

 OBJCODEC = $(BUILD)/ccopy.o $(BUILD)/dec16be.o $(BUILD)/dec16le.o $(BUILD)/dec32be.o $(BUILD)/dec32le.o $(BUILD)/dec64be.o $(BUILD)/dec64le.o $(BUILD)/enc16be.o $(BUILD)/enc16le.o $(BUILD)/enc32be.o $(BUILD)/enc32le.o $(BUILD)/enc64be.o $(BUILD)/enc64le.o $(BUILD)/pemdec.o
-OBJEC = $(BUILD)/ec_all_m15.o $(BUILD)/ec_c25519_i15.o $(BUILD)/ec_c25519_i31.o $(BUILD)/ec_c25519_m15.o $(BUILD)/ec_c25519_m31.o $(BUILD)/ec_curve25519.o $(BUILD)/ec_p256_m15.o $(BUILD)/ec_prime_i15.o $(BUILD)/ec_prime_i31.o $(BUILD)/ec_secp256r1.o $(BUILD)/ec_secp384r1.o $(BUILD)/ec_secp521r1.o $(BUILD)/ecdsa_atr.o $(BUILD)/ecdsa_i15_bits.o $(BUILD)/ecdsa_i15_sign_asn1.o $(BUILD)/ecdsa_i15_sign_raw.o $(BUILD)/ecdsa_i15_vrfy_asn1.o $(BUILD)/ecdsa_i15_vrfy_raw.o $(BUILD)/ecdsa_i31_bits.o $(BUILD)/ecdsa_i31_sign_asn1.o $(BUILD)/ecdsa_i31_sign_raw.o $(BUILD)/ecdsa_i31_vrfy_asn1.o $(BUILD)/ecdsa_i31_vrfy_raw.o $(BUILD)/ecdsa_rta.o
+OBJEC = $(BUILD)/ec_all_m15.o $(BUILD)/ec_all_m31.o $(BUILD)/ec_c25519_i15.o $(BUILD)/ec_c25519_i31.o $(BUILD)/ec_c25519_m15.o $(BUILD)/ec_c25519_m31.o $(BUILD)/ec_curve25519.o $(BUILD)/ec_p256_m15.o $(BUILD)/ec_p256_m31.o $(BUILD)/ec_prime_i15.o $(BUILD)/ec_prime_i31.o $(BUILD)/ec_secp256r1.o $(BUILD)/ec_secp384r1.o $(BUILD)/ec_secp521r1.o $(BUILD)/ecdsa_atr.o $(BUILD)/ecdsa_i15_bits.o $(BUILD)/ecdsa_i15_sign_asn1.o $(BUILD)/ecdsa_i15_sign_raw.o $(BUILD)/ecdsa_i15_vrfy_asn1.o $(BUILD)/ecdsa_i15_vrfy_raw.o $(BUILD)/ecdsa_i31_bits.o $(BUILD)/ecdsa_i31_sign_asn1.o $(BUILD)/ecdsa_i31_sign_raw.o $(BUILD)/ecdsa_i31_vrfy_asn1.o $(BUILD)/ecdsa_i31_vrfy_raw.o $(BUILD)/ecdsa_rta.o
 # $(BUILD)/ec_prime_i31_secp256r1.o $(BUILD)/ec_prime_i31_secp384r1.o $(BUILD)/ec_prime_i31_secp521r1.o
 OBJHASH = $(BUILD)/dig_oid.o $(BUILD)/dig_size.o $(BUILD)/ghash_ctmul.o $(BUILD)/ghash_ctmul32.o $(BUILD)/ghash_ctmul64.o $(BUILD)/md5.o $(BUILD)/md5sha1.o $(BUILD)/multihash.o $(BUILD)/sha1.o $(BUILD)/sha2big.o $(BUILD)/sha2small.o
 OBJINT15 = $(BUILD)/i15_core.o $(BUILD)/i15_ext1.o $(BUILD)/i15_ext2.o
@ -165,6 +165,9 @@ $(BUILD)/ec_g_secp521r1.o: src/ec/ec_g_secp521r1.c $(HEADERS)
 $(BUILD)/ec_all_m15.o: src/ec/ec_all_m15.c $(HEADERS)
 	$(CC) $(CFLAGS) -c -o $(BUILD)/ec_all_m15.o src/ec/ec_all_m15.c

+$(BUILD)/ec_all_m31.o: src/ec/ec_all_m31.c $(HEADERS)
+	$(CC) $(CFLAGS) -c -o $(BUILD)/ec_all_m31.o src/ec/ec_all_m31.c
+
 $(BUILD)/ec_c25519_i15.o: src/ec/ec_c25519_i15.c $(HEADERS)
 	$(CC) $(CFLAGS) -c -o $(BUILD)/ec_c25519_i15.o src/ec/ec_c25519_i15.c

@ -183,6 +186,9 @@ $(BUILD)/ec_curve25519.o: src/ec/ec_curve25519.c $(HEADERS)
 $(BUILD)/ec_p256_m15.o: src/ec/ec_p256_m15.c $(HEADERS)
 	$(CC) $(CFLAGS) -c -o $(BUILD)/ec_p256_m15.o src/ec/ec_p256_m15.c

+$(BUILD)/ec_p256_m31.o: src/ec/ec_p256_m31.c $(HEADERS)
+	$(CC) $(CFLAGS) -c -o $(BUILD)/ec_p256_m31.o src/ec/ec_p256_m31.c
+
 $(BUILD)/ec_prime_i15.o: src/ec/ec_prime_i15.c $(HEADERS)
 	$(CC) $(CFLAGS) -c -o $(BUILD)/ec_prime_i15.o src/ec/ec_prime_i15.c

--- a/inc/bearssl_ec.h
+++ b/inc/bearssl_ec.h
@ -436,6 +436,15 @@ extern const br_ec_impl br_ec_prime_i15;
 */
 extern const br_ec_impl br_ec_p256_m15;

+/**
+ * \brief EC implementation "m31" for P-256.
+ *
+ * This implementation uses specialised code for curve secp256r1 (also
+ * known as NIST P-256), relying on multiplications of 31-bit values
+ * (MUL31).
+ */
+extern const br_ec_impl br_ec_p256_m31;
+
 /**
 * \brief EC implementation "i15" (generic code) for Curve25519.
 *
--- a/src/ec/ec_all_m31.c
+++ b/src/ec/ec_all_m31.c
@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	switch (curve) {
+	case BR_EC_secp256r1:
+		return br_ec_p256_m31.generator(curve, len);
+	case BR_EC_curve25519:
+		return br_ec_c25519_m31.generator(curve, len);
+	default:
+		return br_ec_prime_i31.generator(curve, len);
+	}
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	switch (curve) {
+	case BR_EC_secp256r1:
+		return br_ec_p256_m31.order(curve, len);
+	case BR_EC_curve25519:
+		return br_ec_c25519_m31.order(curve, len);
+	default:
+		return br_ec_prime_i31.order(curve, len);
+	}
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	switch (curve) {
+	case BR_EC_secp256r1:
+		return br_ec_p256_m31.xoff(curve, len);
+	case BR_EC_curve25519:
+		return br_ec_c25519_m31.xoff(curve, len);
+	default:
+		return br_ec_prime_i31.xoff(curve, len);
+	}
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+	switch (curve) {
+	case BR_EC_secp256r1:
+		return br_ec_p256_m31.mul(G, Glen, kb, kblen, curve);
+	case BR_EC_curve25519:
+		return br_ec_c25519_m31.mul(G, Glen, kb, kblen, curve);
+	default:
+		return br_ec_prime_i31.mul(G, Glen, kb, kblen, curve);
+	}
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	switch (curve) {
+	case BR_EC_secp256r1:
+		return br_ec_p256_m31.mulgen(R, x, xlen, curve);
+	case BR_EC_curve25519:
+		return br_ec_c25519_m31.mulgen(R, x, xlen, curve);
+	default:
+		return br_ec_prime_i31.mulgen(R, x, xlen, curve);
+	}
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	switch (curve) {
+	case BR_EC_secp256r1:
+		return br_ec_p256_m31.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+	case BR_EC_curve25519:
+		return br_ec_c25519_m31.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+	default:
+		return br_ec_prime_i31.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+	}
+}
+
+/* see bearssl_ec.h */
+const br_ec_impl br_ec_all_m31 = {
+	(uint32_t)0x23800000,
+	&api_generator,
+	&api_order,
+	&api_xoff,
+	&api_mul,
+	&api_mulgen,
+	&api_muladd
+};
--- a/src/ec/ec_c25519_i15.c
+++ b/src/ec/ec_c25519_i15.c
@ -179,8 +179,14 @@ static uint32_t
 api_mul(unsigned char *G, size_t Glen,
 	const unsigned char *kb, size_t kblen, int curve)
 {
+#define ILEN   (18 * sizeof(uint16_t))
+
+	/*
+	 * The a[] and b[] arrays have an extra word to allow for
+	 * decoding without using br_i15_decode_reduce().
+	 */
 	uint16_t x1[18], x2[18], x3[18], z2[18], z3[18];
-	uint16_t a[18], aa[18], b[18], bb[18];
+	uint16_t a[19], aa[18], b[19], bb[18];
 	uint16_t c[18], d[18], e[18], da[18], cb[18];
 	unsigned char k[32];
 	uint32_t swap;
@ -205,17 +211,33 @@ api_mul(unsigned char *G, size_t Glen,
 	 */
 	byteswap(G);

+	/*
+	 * Decode the point ('u' coordinate). This should be reduced
+	 * modulo p, but we prefer to avoid the dependency on
+	 * br_i15_decode_reduce(). Instead, we use br_i15_decode_mod()
+	 * with a synthetic modulus of value 2^255 (this must work
+	 * since G was truncated to 255 bits), then use a conditional
+	 * subtraction. We use br_i15_decode_mod() and not
+	 * br_i15_decode(), because the ec_prime_i15 implementation uses
+	 * the former but not the latter.
+	 *    br_i15_decode_reduce(a, G, 32, C255_P);
+	 */
+	br_i15_zero(b, 0x111);
+	b[18] = 1;
+	br_i15_decode_mod(a, G, 32, b);
+	a[0] = 0x110;
+	br_i15_sub(a, C255_P, NOT(br_i15_sub(a, C255_P, 0)));
+
 	/*
 	 * Initialise variables x1, x2, z2, x3 and z3. We set all of them
 	 * into Montgomery representation.
 	 */
-	br_i15_decode_reduce(a, G, 32, C255_P);
 	br_i15_montymul(x1, a, C255_R2, C255_P, P0I);
-	memcpy(x3, x1, sizeof x1);
+	memcpy(x3, x1, ILEN);
 	br_i15_zero(z2, C255_P[0]);
-	memcpy(x2, z2, sizeof z2);
+	memcpy(x2, z2, ILEN);
 	x2[1] = 19;
-	memcpy(z3, x2, sizeof x2);
+	memcpy(z3, x2, ILEN);

 	memcpy(k, kb, kblen);
 	memset(k + kblen, 0, (sizeof k) - kblen);
@ -291,12 +313,12 @@ api_mul(unsigned char *G, size_t Glen,
 	 * square-and-multiply algorithm; we mutualise most non-squarings
 	 * since the exponent contains almost only ones.
 	 */
-	memcpy(a, z2, sizeof z2);
+	memcpy(a, z2, ILEN);
 	for (i = 0; i < 15; i ++) {
 		c255_mul(a, a, a);
 		c255_mul(a, a, z2);
 	}
-	memcpy(b, a, sizeof a);
+	memcpy(b, a, ILEN);
 	for (i = 0; i < 14; i ++) {
 		int j;

@ -311,11 +333,23 @@ api_mul(unsigned char *G, size_t Glen,
 			c255_mul(b, z2, b);
 		}
 	}
-	c255_mul(x2, x2, b);
-	br_i15_from_monty(x2, C255_P, P0I);
+	c255_mul(b, x2, b);
+
+	/*
+	 * To avoid a dependency on br_i15_from_monty(), we use a
+	 * Montgomery multiplication with 1.
+	 *    memcpy(x2, b, ILEN);
+	 *    br_i15_from_monty(x2, C255_P, P0I);
+	 */
+	br_i15_zero(a, C255_P[0]);
+	a[1] = 1;
+	br_i15_montymul(x2, a, b, C255_P, P0I);
+
 	br_i15_encode(G, 32, x2);
 	byteswap(G);
 	return 1;
+
+#undef ILEN
 }

 static size_t
--- a/src/ec/ec_c25519_i31.c
+++ b/src/ec/ec_c25519_i31.c
@ -202,11 +202,27 @@ api_mul(unsigned char *G, size_t Glen,
 	 */
 	byteswap(G);

+	/*
+	 * Decode the point ('u' coordinate). This should be reduced
+	 * modulo p, but we prefer to avoid the dependency on
+	 * br_i31_decode_reduce(). Instead, we use br_i31_decode_mod()
+	 * with a synthetic modulus of value 2^255 = 2^7 * 2^(8*31)
+	 * (this must work since G was truncated to 255 bits), then use
+	 * a conditional subtraction. We use br_i31_decode_mod() and not
+	 * br_i31_decode(), because the ec_prime_i31 implementation uses
+	 * the former but not the latter.
+	 *    br_i31_decode_reduce(a, G, 32, C255_P);
+	 */
+	br_i31_zero(b, 0x108);
+	b[9] = 0x0080;
+	br_i31_decode_mod(a, G, 32, b);
+	a[0] = 0x107;
+	br_i31_sub(a, C255_P, NOT(br_i31_sub(a, C255_P, 0)));
+
 	/*
 	 * Initialise variables x1, x2, z2, x3 and z3. We set all of them
 	 * into Montgomery representation.
 	 */
-	br_i31_decode_reduce(a, G, 32, C255_P);
 	br_i31_montymul(x1, a, C255_R2, C255_P, P0I);
 	memcpy(x3, x1, sizeof x1);
 	br_i31_zero(z2, C255_P[0]);
@ -308,8 +324,18 @@ api_mul(unsigned char *G, size_t Glen,
 			c255_mul(b, z2, b);
 		}
 	}
-	c255_mul(x2, x2, b);
-	br_i31_from_monty(x2, C255_P, P0I);
+	c255_mul(b, x2, b);
+
+	/*
+	 * To avoid a dependency on br_i31_from_monty(), we use
+	 * a Montgomery multiplication with 1.
+	 *    memcpy(x2, b, sizeof b);
+	 *    br_i31_from_monty(x2, C255_P, P0I);
+	 */
+	br_i31_zero(a, C255_P[0]);
+	a[1] = 1;
+	br_i31_montymul(x2, a, b, C255_P, P0I);
+
 	br_i31_encode(G, 32, x2);
 	byteswap(G);
 	return 1;
--- a/src/ec/ec_p256_m15.c
+++ b/src/ec/ec_p256_m15.c
@ -1101,18 +1101,20 @@ mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
 	}

 	/*
-	 * Propagate carries. Since the operation above really is a
-	 * truncature, followed by the addition of nonnegative values,
-	 * the result will be positive. Moreover, the carry cannot
-	 * exceed 5 bits (we performed 20 additions with values smaller
-	 * than 256 bits).
+	 * Propagate carries. This is a signed propagation, and the
+	 * result may be negative. The loop above may enlarge values,
+	 * but not too much: worst case is the chain involving t[i - 3],
+	 * in which a value may be added to itself up to 7 times. Since
+	 * starting values are 13-bit each, all words fit on 20 bits
+	 * (21 to account for the sign bit).
 	 */
 	cc = norm13(t, t, 20);

 	/*
 	 * Perform modular reduction again for the bits beyond 256 (the carry
-	 * and the bits 256..259). This time, we can simply inject full
-	 * word values.
+	 * and the bits 256..259). Since the largest shift below is by 10
+	 * bits, and the values fit on 21 bits, values fit in 32-bit words,
+	 * thereby allowing injecting full word values.
 	 */
 	cc = (cc << 4) | (t[19] >> 9);
 	t[19] &= 0x01FF;
@ -1172,18 +1174,20 @@ square_f256(uint32_t *d, const uint32_t *a)
 	}

 	/*
-	 * Propagate carries. Since the operation above really is a
-	 * truncature, followed by the addition of nonnegative values,
-	 * the result will be positive. Moreover, the carry cannot
-	 * exceed 5 bits (we performed 20 additions with values smaller
-	 * than 256 bits).
+	 * Propagate carries. This is a signed propagation, and the
+	 * result may be negative. The loop above may enlarge values,
+	 * but not too much: worst case is the chain involving t[i - 3],
+	 * in which a value may be added to itself up to 7 times. Since
+	 * starting values are 13-bit each, all words fit on 20 bits
+	 * (21 to account for the sign bit).
 	 */
 	cc = norm13(t, t, 20);

 	/*
 	 * Perform modular reduction again for the bits beyond 256 (the carry
-	 * and the bits 256..259). This time, we can simply inject full
-	 * word values.
+	 * and the bits 256..259). Since the largest shift below is by 10
+	 * bits, and the values fit on 21 bits, values fit in 32-bit words,
+	 * thereby allowing injecting full word values.
 	 */
 	cc = (cc << 4) | (t[19] >> 9);
 	t[19] &= 0x01FF;
--- a/src/ec/ec_p256_m31.c
+++ b/src/ec/ec_p256_m31.c
--- a/test/test_crypto.c
+++ b/test/test_crypto.c
@ -4918,6 +4918,13 @@ test_EC_p256_m15(void)
 		(uint32_t)1 << BR_EC_secp256r1);
 }

+static void
+test_EC_p256_m31(void)
+{
+	test_EC_KAT("EC_p256_m31", &br_ec_p256_m31,
+		(uint32_t)1 << BR_EC_secp256r1);
+}
+
 const struct {
 	const char *scalar;
 	const char *u_in;
@ -5567,6 +5574,7 @@ static const struct {
 	STU(EC_prime_i15),
 	STU(EC_prime_i31),
 	STU(EC_p256_m15),
+	STU(EC_p256_m31),
 	STU(EC_c25519_i15),
 	STU(EC_c25519_i31),
 	STU(EC_c25519_m15),
--- a/test/test_speed.c
+++ b/test/test_speed.c
@ -691,6 +691,13 @@ test_speed_ec_p256_m15(void)
 		&br_ec_p256_m15, &br_secp256r1);
 }

+static void
+test_speed_ec_p256_m31(void)
+{
+	test_speed_ec_inner("EC p256_m31",
+		&br_ec_p256_m31, &br_secp256r1);
+}
+
 static void
 test_speed_ec_prime_i15(void)
 {
@ -835,6 +842,15 @@ test_speed_ecdsa_p256_m15(void)
 		&br_ecdsa_i15_vrfy_asn1);
 }

+static void
+test_speed_ecdsa_p256_m31(void)
+{
+	test_speed_ecdsa_inner("ECDSA m31 P-256",
+		&br_ec_p256_m31, &br_secp256r1,
+		&br_ecdsa_i31_sign_asn1,
+		&br_ecdsa_i31_vrfy_asn1);
+}
+
 static void
 test_speed_ecdsa_i15(void)
 {
@ -1282,11 +1298,13 @@ static const struct {
 	STU(ec_prime_i15),
 	STU(ec_prime_i31),
 	STU(ec_p256_m15),
+	STU(ec_p256_m31),
 	STU(ec_c25519_i15),
 	STU(ec_c25519_i31),
 	STU(ec_c25519_m15),
 	STU(ec_c25519_m31),
 	STU(ecdsa_p256_m15),
+	STU(ecdsa_p256_m31),
 	STU(ecdsa_i15),
 	STU(ecdsa_i31),