diff --git a/.travis.yml b/.travis.yml
index 4a6202a..d172ad6 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,21 +11,23 @@ matrix:
       # Constantine only works with Nim devel
       # Build and test using both gcc and clang
       # Build and test on both x86-64 and ARM64
-    - os: linux
+      # Ubuntu Bionic (18.04) is needed: it includes
+      # the GCC 7 codegen fixes to addcarry_u64.
+    - dist: bionic
       arch: amd64
       env:
         - ARCH=amd64
         - CHANNEL=devel
       compiler: gcc
-    - os: linux
+    - dist: bionic
       arch: arm64
       env:
         - ARCH=arm64
         - CHANNEL=devel
       compiler: gcc
-    - os: linux
+    - dist: bionic
       arch: amd64
       env:
         - ARCH=amd64
diff --git a/README.md b/README.md
index 59c183d..db7ac10 100644
--- a/README.md
+++ b/README.md
@@ -18,14 +18,20 @@ You can install the development version of the library through nimble with the
 nimble install https://github.com/mratsim/constantine@#master
 ```

+For speed it is recommended to prefer Clang, MSVC or ICC over GCC.
+GCC does not properly optimize add-with-carry and sub-with-borrow loops (see [Compiler-caveats](#Compiler-caveats)).
+
+Furthermore, if using GCC, GCC 7 at minimum is required: previous versions
+generated incorrect add-with-carry code.
+
 ## Target audience

 The library aims to be a portable, compact and hardened library for elliptic curve cryptography needs, in particular for blockchain protocols and zero-knowledge proof systems.

 The library focuses on the following properties:
 - constant-time (not leaking secret data via side-channels)
-- generated code size, datatype size and stack usage
 - performance
+- generated code size, datatype size and stack usage

 in this order
@@ -54,6 +60,128 @@ actively hinder you by:
 A growing number of attack vectors is being collected for your viewing pleasure
 at https://github.com/mratsim/constantine/wiki/Constant-time-arithmetics

+## Performance
+
+High performance is a sought-after property.
+Note that security and side-channel resistance take priority over performance.
+
+New applications of elliptic curve cryptography like zero-knowledge proofs or
+proof-of-stake based blockchain protocols are bottlenecked by cryptography.
+
+### In blockchain
+
+Ethereum 2 clients spend, or used to spend, anywhere between 30% and 99% of their processing time verifying the signatures of block validators on R&D testnets.
+Assuming we want nodes to handle a thousand peers, if a cryptographic pairing takes 1ms, that represents 1s of cryptography per block, with a target
+block frequency of 1 every 6 seconds.
+
+### In zero-knowledge proofs
+
+According to https://medium.com/loopring-protocol/zksnark-prover-optimizations-3e9a3e5578c0
+a 16-core CPU can prove 20 transfers/second or 10 transactions/second.
+The previous implementation was 15x slower, and one of the key optimizations
+was changing the elliptic curve cryptography backend.
+This has a direct impact on hardware cost and/or the cloud computing resources required.
+
+### Compiler caveats
+
+Unfortunately compilers, and GCC in particular, are not very good at optimizing big-integer and/or cryptographic code, even when using intrinsics like `addcarry_u64`.
+
+Compilers with proper support of `addcarry_u64` like Clang, MSVC and ICC
+may generate code up to 20~25% faster than GCC.
+
+This is explained by the GMP team: https://gmplib.org/manual/Assembly-Carry-Propagation.html
+and can be reproduced with the following C code.
+
+See https://gcc.godbolt.org/z/2h768y
+
+```C
+#include <stdint.h>
+#include <x86intrin.h>
+
+void add256(uint64_t a[4], uint64_t b[4]){
+  uint8_t carry = 0;
+  for (int i = 0; i < 4; ++i)
+    carry = _addcarry_u64(carry, a[i], b[i], &a[i]);
+}
+```
+
+GCC
+```asm
+add256:
+        movq    (%rsi), %rax
+        addq    (%rdi), %rax
+        setc    %dl
+        movq    %rax, (%rdi)
+        movq    8(%rdi), %rax
+        addb    $-1, %dl
+        adcq    8(%rsi), %rax
+        setc    %dl
+        movq    %rax, 8(%rdi)
+        movq    16(%rdi), %rax
+        addb    $-1, %dl
+        adcq    16(%rsi), %rax
+        setc    %dl
+        movq    %rax, 16(%rdi)
+        movq    24(%rsi), %rax
+        addb    $-1, %dl
+        adcq    %rax, 24(%rdi)
+        ret
+```
+
+Clang
+```asm
+add256:
+        movq    (%rsi), %rax
+        addq    %rax, (%rdi)
+        movq    8(%rsi), %rax
+        adcq    %rax, 8(%rdi)
+        movq    16(%rsi), %rax
+        adcq    %rax, 16(%rdi)
+        movq    24(%rsi), %rax
+        adcq    %rax, 24(%rdi)
+        retq
+```
+
+### Inline assembly
+
+Constantine uses inline assembly for a very restricted use-case, "conditional mov",
+and a temporary use-case, "hardware 128-bit division", that will be replaced ASAP (as hardware division is not constant-time).
+
+Using intrinsics otherwise significantly improves code readability, portability, auditability and maintainability.
+
+#### Future optimizations
+
+In the future more inline assembly primitives might be added, provided the performance benefit outweighs the significant complexity.
+In particular, multiprecision multiplication and squaring on x86 can use the instructions MULX, ADCX and ADOX
+to multiply-accumulate on 2 carry chains in parallel (with instruction-level parallelism)
+and improve performance by 15~20% over a uint128-based implementation.
+As no compiler is able to generate such code even when using the `_mulx_u64` and `_addcarryx_u64` intrinsics,
+either the assembly for each supported bigint size must be hardcoded
+or a "compiler" must be implemented in macros that will generate the required inline assembly at compile-time.
+
+Such a compiler can also be used to overcome GCC codegen deficiencies; here is an example for add-with-carry:
+https://github.com/mratsim/finite-fields/blob/d7f6d8bb/macro_add_carry.nim
+
+## Sizes: code size, stack usage
+
+Thanks to 10x smaller key sizes for the same security level as RSA, elliptic curve cryptography
+is widely used on resource-constrained devices.
+
+Constantine is actively optimized for code size and stack usage.
+Constantine does not use heap allocation.
+
+At the moment Constantine is optimized for 32-bit and 64-bit CPUs.
+
+When performance and code size conflict, a careful and informed default is chosen.
+In the future, a compile-time flag that goes beyond the compiler's `-Os` might be provided.
+
+### Example tradeoff
+
+Unrolling Montgomery multiplication brings about a 15% performance improvement,
+which translates to ~15% on all operations in Constantine, as field multiplication bottlenecks
+all cryptographic primitives.
+This is considered a worthwhile tradeoff on all but the most constrained CPUs,
+those CPUs likely being 8-bit or 16-bit.
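+
+As a rough illustration (a sketch, not Constantine's actual code), the benefit comes from the limb count being a compile-time constant: with a static `N`, the compiler is free to fully unroll and inline a carry-propagating loop like the one below. The `Limb` type and the `add` signature here are assumptions for the example only.
+
+```Nim
+type Limb = uint64
+
+func add[N: static int](a: var array[N, Limb], b: array[N, Limb]): Limb =
+  ## a <- a + b, returns the outgoing carry (0 or 1).
+  ## `N` is static, so the compiler can fully unroll this loop.
+  for i in 0 ..< N:
+    let s = a[i] + b[i] + result   # unsigned addition wraps modulo 2^64
+    # Carry out iff this limb's sum wrapped past 2^64.
+    # (A hardened version would also avoid short-circuiting `or`/`and`.)
+    result = Limb(ord(s < a[i] or (s == a[i] and result == 1)))
+    a[i] = s
+```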
+ ## License Licensed and distributed under either of diff --git a/azure-pipelines.yml b/azure-pipelines.yml index bb936fc..c379eaa 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -19,12 +19,12 @@ strategy: CHANNEL: devel TEST_LANG: cpp Linux_devel_64bit: - VM: 'ubuntu-16.04' + VM: 'ubuntu-18.04' UCPU: amd64 CHANNEL: devel TEST_LANG: c Linux_cpp_devel_64bit: - VM: 'ubuntu-16.04' + VM: 'ubuntu-18.04' UCPU: amd64 CHANNEL: devel WEAVE_TEST_LANG: cpp diff --git a/bench_eth_curves_clang_old b/bench_eth_curves_clang_old deleted file mode 100755 index 7d2fb66..0000000 Binary files a/bench_eth_curves_clang_old and /dev/null differ diff --git a/bench_eth_curves_gcc_old b/bench_eth_curves_gcc_old deleted file mode 100755 index ff68a11..0000000 Binary files a/bench_eth_curves_gcc_old and /dev/null differ diff --git a/benchmarks/bls12_381_fp.nim b/benchmarks/bls12_381_fp.nim index b61c218..553f53f 100644 --- a/benchmarks/bls12_381_fp.nim +++ b/benchmarks/bls12_381_fp.nim @@ -23,7 +23,7 @@ import ../constantine/config/[common, curves], - ../constantine/arithmetic/[bigints_checked, finite_fields], + ../constantine/arithmetic/[bigints, finite_fields], ../constantine/io/[io_bigints, io_fields], random, std/monotimes, times, strformat, ./timers diff --git a/benchmarks/bn254_fp.nim b/benchmarks/bn254_fp.nim index e159c9b..1321482 100644 --- a/benchmarks/bn254_fp.nim +++ b/benchmarks/bn254_fp.nim @@ -23,7 +23,7 @@ import ../constantine/config/[common, curves], - ../constantine/arithmetic/[bigints_checked, finite_fields], + ../constantine/arithmetic/[bigints, finite_fields], ../constantine/io/[io_bigints, io_fields], random, std/monotimes, times, strformat, ./timers diff --git a/benchmarks/secp256k1_fp.nim b/benchmarks/secp256k1_fp.nim index ac15316..5191c16 100644 --- a/benchmarks/secp256k1_fp.nim +++ b/benchmarks/secp256k1_fp.nim @@ -23,7 +23,7 @@ import ../constantine/config/[common, curves], - ../constantine/arithmetic/[bigints_checked, finite_fields], + ../constantine/arithmetic/[bigints, finite_fields], ../constantine/io/[io_bigints, io_fields], random, std/monotimes, times, strformat, ./timers diff --git a/constantine.nimble b/constantine.nimble index 8efa80a..b9638b7 100644 --- a/constantine.nimble +++ b/constantine.nimble @@ -9,7 +9,7 @@ srcDir = "src" requires "nim >= 1.1.0" ### Helper functions -proc test(path: string) = +proc test(flags, path: string) = if not dirExists "build": mkDir "build" # Compilation language is controlled by WEAVE_TEST_LANG @@ -17,35 +17,64 @@ proc test(path: string) = if existsEnv"TEST_LANG": lang = getEnv"TEST_LANG" + var cc = "" + if existsEnv"CC": + cc = " --cc:" & getEnv"CC" + echo "\n========================================================================================" - echo "Running ", path + echo "Running [flags: ", flags, "] ", path echo "========================================================================================" - exec "nim " & lang & " --verbosity:0 --outdir:build -r --hints:off --warnings:off " & path + exec "nim " & lang & cc & " " & flags & " --verbosity:0 --outdir:build -r --hints:off --warnings:off " & path ### tasks task test, "Run all tests": # -d:testingCurves is configured in a *.nim.cfg for convenience - test "tests/test_primitives.nim" + test "", "tests/test_primitives.nim" - test "tests/test_io_bigints.nim" - test "tests/test_bigints.nim" - test "tests/test_bigints_multimod.nim" + test "", "tests/test_io_bigints.nim" + test "", "tests/test_bigints.nim" + test "", "tests/test_bigints_multimod.nim" - test 
"tests/test_io_fields" - test "tests/test_finite_fields.nim" - test "tests/test_finite_fields_powinv.nim" + test "", "tests/test_io_fields" + test "", "tests/test_finite_fields.nim" + test "", "tests/test_finite_fields_powinv.nim" - test "tests/test_bigints_vs_gmp.nim" - test "tests/test_finite_fields_vs_gmp.nim" + test "", "tests/test_bigints_vs_gmp.nim" + test "", "tests/test_finite_fields_vs_gmp.nim" + + if sizeof(int) == 8: # 32-bit tests + test "-d:Constantine32", "tests/test_primitives.nim" + + test "-d:Constantine32", "tests/test_io_bigints.nim" + test "-d:Constantine32", "tests/test_bigints.nim" + test "-d:Constantine32", "tests/test_bigints_multimod.nim" + + test "-d:Constantine32", "tests/test_io_fields" + test "-d:Constantine32", "tests/test_finite_fields.nim" + test "-d:Constantine32", "tests/test_finite_fields_powinv.nim" + + test "-d:Constantine32", "tests/test_bigints_vs_gmp.nim" + test "-d:Constantine32", "tests/test_finite_fields_vs_gmp.nim" task test_no_gmp, "Run tests that don't require GMP": # -d:testingCurves is configured in a *.nim.cfg for convenience - test "tests/test_primitives.nim" + test "", "tests/test_primitives.nim" - test "tests/test_io_bigints.nim" - test "tests/test_bigints.nim" - test "tests/test_bigints_multimod.nim" + test "", "tests/test_io_bigints.nim" + test "", "tests/test_bigints.nim" + test "", "tests/test_bigints_multimod.nim" - test "tests/test_io_fields" - test "tests/test_finite_fields.nim" - test "tests/test_finite_fields_powinv.nim" + test "", "tests/test_io_fields" + test "", "tests/test_finite_fields.nim" + test "", "tests/test_finite_fields_powinv.nim" + + if sizeof(int) == 8: # 32-bit tests + test "-d:Constantine32", "tests/test_primitives.nim" + + test "-d:Constantine32", "tests/test_io_bigints.nim" + test "-d:Constantine32", "tests/test_bigints.nim" + test "-d:Constantine32", "tests/test_bigints_multimod.nim" + + test "-d:Constantine32", "tests/test_io_fields" + test "-d:Constantine32", "tests/test_finite_fields.nim" + test "-d:Constantine32", "tests/test_finite_fields_powinv.nim" diff --git a/constantine/arithmetic/README.md b/constantine/arithmetic/README.md index 9ce12e1..b1c20f9 100644 --- a/constantine/arithmetic/README.md +++ b/constantine/arithmetic/README.md @@ -4,6 +4,17 @@ This folder contains the implementation of - big integers - finite field arithmetic (i.e. modular arithmetic) +As a tradeoff between speed, code size and compiler-enforced dependent type checking, the library is structured the following way: +- Finite Field: statically parametrized by an elliptic curve +- Big Integers: statically parametrized by the bit width of the field modulus +- Limbs: statically parametrized by the number of words to handle the bitwidth + +This allows to reuse the same implementation at the limbs-level for +curves that required the same number of words to save on code size, +for example secp256k1 and BN254. +It also enables compiler unrolling, inlining and register optimization, +where code size is not an issue for example for multi-precision addition. + ## References - Analyzing and Comparing Montgomery Multiplication Algorithms @@ -18,3 +29,7 @@ This folder contains the implementation of Chapter 5 of Guide to Pairing-Based Cryptography\ Jean Luc Beuchat, Luis J. 
Dominguez Perez, Sylvain Duquesne, Nadia El Mrabet, Laura Fuentes-Castañeda, Francisco Rodríguez-Henríquez, 2017\
  https://www.researchgate.net/publication/319538235_Arithmetic_of_Finite_Fields
+
+- Faster big-integer modular multiplication for most moduli\
+  Gautam Botrel, Gus Gutoski, and Thomas Piellard, 2020\
+  https://hackmd.io/@zkteam/modular_multiplication
diff --git a/constantine/arithmetic/bigints_checked.nim b/constantine/arithmetic/bigints.nim
similarity index 65%
rename from constantine/arithmetic/bigints_checked.nim
rename to constantine/arithmetic/bigints.nim
index f496bdc..9399ed9 100644
--- a/constantine/arithmetic/bigints_checked.nim
+++ b/constantine/arithmetic/bigints.nim
@@ -7,32 +7,59 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

 import
-  ./bigints_raw,
-  ../primitives/constant_time,
-  ../config/common
+  ../config/common,
+  ../primitives,
+  ./limbs,
+  ./montgomery

 # ############################################################
 #
-#                BigInts type-checked API
+#                        BigInts
 #
 # ############################################################

-# The "checked" API is exported as a building blocks
-# with enforced compile-time checking of BigInt bitsize
+# The API is exported as a building block
+# with enforced compile-time checking of BigInt bitwidth
 # and memory ownership.
+
+# ############################################################
+# Design
 #
-# The "raw" compute API uses views to avoid code duplication
-# due to generic/static monomorphization.
+# Control flow should only depend on the static maximum number of bits.
+# This number is defined per Finite Field/Prime/Elliptic Curve.
 #
-# The "checked" API is a thin wrapper above the "raw" API to get the best of both world:
-# - small code footprint
-# - compiler enforced checks: types, bitsizes (dependant types)
-# - compiler enforced memory: stack allocation and buffer ownership
+# Data Layout
+#
+# The previous implementation of Constantine used type-erased views
+# to optimize code-size (1).
+# Also, instead of using the full 64 bits of a uint64, it used
+# 63 bits, with the last bit used to handle carries (2).
+#
+# (1) brought an advantage in terms of code-size if multiple curves
+# were supported.
+# However it prevented unrolling for some performance-critical routines
+# like addition and Montgomery multiplication. Furthermore, addition
+# is only 1 or 2 instructions per limb, meaning unrolling+inlining
+# is probably smaller in code-size than a function call.
+#
+# (2) Not using the full 64 bits eased carry and borrow handling.
+# Also, on older x86 architectures, the add-with-carry "ADC" instruction
+# may be up to 6x slower than plain "ADD" with a memory operand in a carry-chain.
+#
+# However, recent CPUs (less than 5 years old) have reasonable or lower ADC latencies
+# compared to the shifting and masking required when using 63 bits.
+# Also we save a word to iterate over (1 word saved for BN254, secp256k1 and BLS12-381).
+#
+# Furthermore, pairing curves are not fast-reduction friendly,
+# meaning that lazy reductions and lazy carries are impractical,
+# and so it's simpler to always carry additions instead of
+# having redundant representations that force costly reductions before multiplications.
+# https://github.com/mratsim/constantine/issues/15 func wordsRequired(bits: int): int {.compileTime.} = ## Compute the number of limbs required # from the **announced** bit length - (bits + WordBitSize - 1) div WordBitSize + (bits + WordBitWidth - 1) div WordBitWidth type BigInt*[bits: static int] = object @@ -41,28 +68,18 @@ type ## - "bits" is the announced bit-length of the BigInt ## This is public data, usually equal to the curve prime bitlength. ## - ## - "bitLength" is the internal bitlength of the integer - ## This differs from the canonical bit-length as - ## Constantine word-size is smaller than a machine word. - ## This value should never be used as-is to prevent leaking secret data. - ## Computing this value requires constant-time operations. - ## Using this value requires converting it to the # of limbs in constant-time - ## ## - "limbs" is an internal field that holds the internal representation ## of the big integer. Least-significant limb first. Within limbs words are native-endian. ## ## This internal representation can be changed ## without notice and should not be used by external applications or libraries. - bitLength*: uint32 limbs*: array[bits.wordsRequired, Word] -template view*(a: BigInt): BigIntViewConst = - ## Returns a borrowed type-erased immutable view to a bigint - BigIntViewConst(cast[BigIntView](a.unsafeAddr)) - -template view*(a: var BigInt): BigIntViewMut = - ## Returns a borrowed type-erased mutable view to a mutable bigint - BigIntViewMut(cast[BigIntView](a.addr)) +# For unknown reason, `bits` doesn't semcheck if +# `limbs: Limbs[bits.wordsRequired]` +# with +# `Limbs[N: static int] = distinct array[N, Word]` +# so we don't set Limbs as a distinct type debug: import strutils @@ -70,9 +87,7 @@ debug: func `$`*(a: BigInt): string = result = "BigInt[" result.add $BigInt.bits - result.add "](bitLength: " - result.add $a.bitLength - result.add ", limbs: [" + result.add "](limbs: [" result.add $BaseType(a.limbs[0]) & " (0x" & toHex(BaseType(a.limbs[0])) & ')' for i in 1 ..< a.limbs.len: result.add ", " @@ -83,54 +98,40 @@ debug: {.push raises: [].} {.push inline.} -func setInternalBitLength*(a: var BigInt) = - ## Derive the actual bitsize used internally of a BigInt - ## from the announced BigInt bitsize - ## and set the bitLength field of that BigInt - ## to that computed value. - a.bitLength = uint32 static(a.bits + a.bits div WordBitSize) - func `==`*(a, b: BigInt): CTBool[Word] = ## Returns true if 2 big ints are equal ## Comparison is constant-time - var accum: Word - for i in static(0 ..< a.limbs.len): - accum = accum or (a.limbs[i] xor b.limbs[i]) - result = accum.isZero + a.limbs == b.limbs func isZero*(a: BigInt): CTBool[Word] = ## Returns true if a big int is equal to zero - a.view.isZero + a.limbs.isZero func setZero*(a: var BigInt) = ## Set a BigInt to 0 - a.setInternalBitLength() - zeroMem(a.limbs[0].unsafeAddr, a.limbs.len * sizeof(Word)) + a.limbs.setZero() func setOne*(a: var BigInt) = ## Set a BigInt to 1 - a.setInternalBitLength() - a.limbs[0] = Word(1) - when a.limbs.len > 1: - zeroMem(a.limbs[1].unsafeAddr, (a.limbs.len-1) * sizeof(Word)) + a.limbs.setOne() func cadd*(a: var BigInt, b: BigInt, ctl: CTBool[Word]): CTBool[Word] = ## Constant-time in-place conditional addition ## The addition is only performed if ctl is "true" ## The result carry is always computed. 
-  cadd(a.view, b.view, ctl)
+  (CTBool[Word]) cadd(a.limbs, b.limbs, ctl)

 func csub*(a: var BigInt, b: BigInt, ctl: CTBool[Word]): CTBool[Word] =
   ## Constant-time in-place conditional subtraction
   ## The subtraction is only performed if ctl is "true"
   ## The result borrow is always computed.
-  csub(a.view, b.view, ctl)
+  (CTBool[Word]) csub(a.limbs, b.limbs, ctl)

 func cdouble*(a: var BigInt, ctl: CTBool[Word]): CTBool[Word] =
   ## Constant-time in-place conditional doubling
   ## The doubling is only performed if ctl is "true"
   ## The result carry is always computed.
-  cadd(a.view, a.view, ctl)
+  (CTBool[Word]) cadd(a.limbs, a.limbs, ctl)

 # ############################################################
 #
@@ -143,38 +144,38 @@ func cdouble*(a: var BigInt, ctl: CTBool[Word]): CTBool[Word] =
 func add*(a: var BigInt, b: BigInt): CTBool[Word] =
   ## Constant-time in-place addition
   ## Returns the carry
-  add(a.view, b.view)
+  (CTBool[Word]) add(a.limbs, b.limbs)

 func sub*(a: var BigInt, b: BigInt): CTBool[Word] =
   ## Constant-time in-place subtraction
   ## Returns the borrow
-  sub(a.view, b.view)
+  (CTBool[Word]) sub(a.limbs, b.limbs)

 func double*(a: var BigInt): CTBool[Word] =
   ## Constant-time in-place doubling
   ## Returns the carry
-  add(a.view, a.view)
+  (CTBool[Word]) add(a.limbs, a.limbs)

 func sum*(r: var BigInt, a, b: BigInt): CTBool[Word] =
   ## Sum `a` and `b` into `r`.
   ## `r` is initialized/overwritten
   ##
   ## Returns the carry
-  sum(r.view, a.view, b.view)
+  (CTBool[Word]) sum(r.limbs, a.limbs, b.limbs)

 func diff*(r: var BigInt, a, b: BigInt): CTBool[Word] =
   ## Subtract `b` from `a` and store the result into `r`.
   ## `r` is initialized/overwritten
   ##
   ## Returns the borrow
-  diff(r.view, a.view, b.view)
+  (CTBool[Word]) diff(r.limbs, a.limbs, b.limbs)

 func double*(r: var BigInt, a: BigInt): CTBool[Word] =
   ## Double `a` into `r`.
   ## `r` is initialized/overwritten
   ##
   ## Returns the carry
-  sum(r.view, a.view, a.view)
+  (CTBool[Word]) sum(r.limbs, a.limbs, a.limbs)

 # ############################################################
 #
@@ -182,9 +183,13 @@ func double*(r: var BigInt, a: BigInt): CTBool[Word] =
 #
 # ############################################################

-# Use "csub", which unfortunately requires the first operand to be mutable.
-# for example for a <= b, we now that if a-b borrows then b > a and so a<=b is false
-# This can be tested with "not csub(a, b, CtFalse)"
+func `<`*(a, b: BigInt): CTBool[Word] =
+  ## Returns true if a < b
+  a.limbs < b.limbs
+
+func `<=`*(a, b: BigInt): CTBool[Word] =
+  ## Returns true if a <= b
+  a.limbs <= b.limbs

 # ############################################################
 #
@@ -202,9 +207,15 @@ func reduce*[aBits, mBits](r: var BigInt[mBits], a: BigInt[aBits], M: BigInt[mBi
   # Note: for all cryptographic intents and purposes the modulus is known at compile-time
   # but we don't want to inline it as it would increase codesize, better have Nim
   # pass a pointer+length to a fixed section of the BSS.
-  reduce(r.view, a.view, M.view)
+  reduce(r.limbs, a.limbs, aBits, M.limbs, mBits)

-func montyResidue*(mres: var BigInt, a, N, r2modN: BigInt, negInvModWord: static BaseType) =
+# ############################################################
+#
+#                 Montgomery Arithmetic
+#
+# ############################################################
+
+func montyResidue*(mres: var BigInt, a, N, r2modM: BigInt, m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) =
   ## Convert a BigInt from its natural representation
   ## to the Montgomery n-residue form
   ##
@@ -213,9 +224,15 @@ func montyResidue*(mres: var BigInt, a, N, r2modN: BigInt, negInvModWord: static
   ## Caller must take care of properly switching between
   ## the natural and montgomery domain.
   ## Nesting Montgomery form is possible by applying this function twice.
-  montyResidue(mres.view, a.view, N.view, r2modN.view, Word(negInvModWord))
+  ##
+  ## The Montgomery Magic Constants:
+  ## - `m0ninv` is µ = -1/N (mod 2^WordBitSize)
+  ## - `r2modM` is R² (mod N)
+  ##   with W = N.len
+  ##   and R = (2^WordBitSize)^W
+  montyResidue(mres.limbs, a.limbs, N.limbs, r2modM.limbs, m0ninv, canUseNoCarryMontyMul)

-func redc*[mBits](r: var BigInt[mBits], a, N: BigInt[mBits], negInvModWord: static BaseType) =
+func redc*[mBits](r: var BigInt[mBits], a, M: BigInt[mBits], m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) =
   ## Convert a BigInt from its Montgomery n-residue form
   ## to the natural representation
   ##
@@ -227,31 +244,27 @@ func redc*[mBits](r: var BigInt[mBits], a, N: BigInt[mBits], negInvModWord: stat
   var one {.noInit.}: BigInt[mBits]
   one.setOne()
-  redc(r.view, a.view, one.view, N.view, Word(negInvModWord))
+  redc(r.limbs, a.limbs, one.limbs, M.limbs, m0ninv, canUseNoCarryMontyMul)

-# ############################################################
-#
-#                 Montgomery Arithmetic
-#
-# ############################################################
-
-func montyMul*(r: var BigInt, a, b, M: BigInt, negInvModWord: static BaseType) =
+func montyMul*(r: var BigInt, a, b, M: BigInt, negInvModWord: static BaseType, canUseNoCarryMontyMul: static bool) =
   ## Compute r <- a*b (mod M) in the Montgomery domain
   ##
   ## This resets r to zero before processing.
Use {.noInit.} ## to avoid duplicating with Nim zero-init policy - montySquare(r.view, a.view, M.view, Word(negInvModWord)) + montySquare(r.limbs, a.limbs, M.limbs, negInvModWord, canUseNoCarryMontyMul) func montyPow*[mBits, eBits: static int]( a: var BigInt[mBits], exponent: BigInt[eBits], - M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int) = + M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int, + canUseNoCarryMontyMul: static bool + ) = ## Compute a <- a^exponent (mod M) ## ``a`` in the Montgomery domain ## ``exponent`` is any BigInt, in the canonical domain @@ -268,16 +281,14 @@ func montyPow*[mBits, eBits: static int]( const scratchLen = if windowSize == 1: 2 else: (1 shl windowSize) + 1 - var scratchSpace {.noInit.}: array[scratchLen, BigInt[mBits]] - var scratchPtrs {.noInit.}: array[scratchLen, BigIntViewMut] - for i in 0 ..< scratchLen: - scratchPtrs[i] = scratchSpace[i].view() - - montyPow(a.view, expBE, M.view, one.view, Word(negInvModWord), scratchPtrs) + var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]] + montyPow(a.limbs, expBE, M.limbs, one.limbs, negInvModWord, scratchSpace, canUseNoCarryMontyMul) func montyPowUnsafeExponent*[mBits, eBits: static int]( a: var BigInt[mBits], exponent: BigInt[eBits], - M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int) = + M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int, + canUseNoCarryMontyMul: static bool + ) = ## Compute a <- a^exponent (mod M) ## ``a`` in the Montgomery domain ## ``exponent`` is any BigInt, in the canonical domain @@ -298,16 +309,14 @@ func montyPowUnsafeExponent*[mBits, eBits: static int]( const scratchLen = if windowSize == 1: 2 else: (1 shl windowSize) + 1 - var scratchSpace {.noInit.}: array[scratchLen, BigInt[mBits]] - var scratchPtrs {.noInit.}: array[scratchLen, BigIntViewMut] - for i in 0 ..< scratchLen: - scratchPtrs[i] = scratchSpace[i].view() - - montyPowUnsafeExponent(a.view, expBE, M.view, one.view, Word(negInvModWord), scratchPtrs) + var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]] + montyPowUnsafeExponent(a.limbs, expBE, M.limbs, one.limbs, negInvModWord, scratchSpace, canUseNoCarryMontyMul) func montyPowUnsafeExponent*[mBits: static int]( a: var BigInt[mBits], exponent: openarray[byte], - M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int) = + M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int, + canUseNoCarryMontyMul: static bool + ) = ## Compute a <- a^exponent (mod M) ## ``a`` in the Montgomery domain ## ``exponent`` is a BigInt in canonical representation @@ -324,9 +333,5 @@ func montyPowUnsafeExponent*[mBits: static int]( const scratchLen = if windowSize == 1: 2 else: (1 shl windowSize) + 1 - var scratchSpace {.noInit.}: array[scratchLen, BigInt[mBits]] - var scratchPtrs {.noInit.}: array[scratchLen, BigIntViewMut] - for i in 0 ..< scratchLen: - scratchPtrs[i] = scratchSpace[i].view() - - montyPowUnsafeExponent(a.view, exponent, M.view, one.view, Word(negInvModWord), scratchPtrs) + var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]] + montyPowUnsafeExponent(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, canUseNoCarryMontyMul) diff --git a/constantine/arithmetic/bigints_raw.nim b/constantine/arithmetic/bigints_raw.nim deleted file mode 100644 index b475c0f..0000000 --- a/constantine/arithmetic/bigints_raw.nim +++ /dev/null @@ -1,830 +0,0 @@ -# 
Constantine -# Copyright (c) 2018-2019 Status Research & Development GmbH -# Copyright (c) 2020-Present Mamy André-Ratsimbazafy -# Licensed and distributed under either of -# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). -# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -# ############################################################ -# -# BigInt Raw representation and operations -# -# ############################################################ -# -# This file holds the raw operations done on big ints -# The representation is optimized for: -# - constant-time (not leaking secret data via side-channel) -# - generated code size, datatype size and stack usage -# - performance -# in this order - -# ############################################################ -# Design - -# To avoid carry issues we don't use the -# most significant bit of each machine word. -# i.e. for a uint64 base we only use 63-bit. -# More info: https://github.com/status-im/nim-constantine/wiki/Constant-time-arithmetics#guidelines -# Especially: -# - https://bearssl.org/bigint.html -# - https://cryptojedi.org/peter/data/pairing-20131122.pdf -# - http://docs.milagro.io/en/amcl/milagro-crypto-library-white-paper.html -# -# Note that this might also be beneficial in terms of performance. -# Due to opcode latency, on Nehalem ADC is 6x times slower than ADD -# if it has dependencies (i.e the ADC depends on a previous ADC result) -# -# Control flow should only depends on the static maximum number of bits -# This number is defined per Finite Field/Prime/Elliptic Curve -# -# We internally order the limbs in little-endian -# So the least significant limb is limb[0] -# This is independent from the base type endianness. -# -# Constantine uses Nim generic integer to prevent mixing -# BigInts of different bitlength at compile-time and -# properly statically size the BigInt buffers. -# -# To avoid code-bloat due to monomorphization (i.e. duplicating code per announced bitlength) -# actual computation is deferred to type-erased routines. - -import - ../primitives/constant_time, - ../primitives/extended_precision, - ../config/common -from typetraits import distinctBase - -# ############################################################ -# -# BigInts type-erased API -# -# ############################################################ - -# The "checked" API is exported as a building blocks -# with enforced compile-time checking of BigInt bitsize -# and memory ownership. -# -# The "raw" compute API uses views to avoid code duplication -# due to generic/static monomorphization. -# -# The "checked" API is a thin wrapper above the "raw" API to get the best of both world: -# - small code footprint -# - compiler enforced checks: types, bitsizes -# - compiler enforced memory: stack allocation and buffer ownership - -type - BigIntView* = ptr object - ## Type-erased fixed-precision big integer - ## - ## This type mirrors the BigInt type and is used - ## for the low-level computation API - ## This design - ## - avoids code bloat due to generic monomorphization - ## otherwise each bigint routines would have an instantiation for - ## each static `bits` parameter. 
- ## - while not forcing the caller to preallocate computation buffers - ## for the high-level API and enforcing bitsizes - ## - avoids runtime bound-checks on the view - ## for performance - ## and to ensure exception-free code - ## even when compiled in non "-d:danger" mode - ## - ## As with the BigInt type: - ## - "bitLength" is the internal bitlength of the integer - ## This differs from the canonical bit-length as - ## Constantine word-size is smaller than a machine word. - ## This value should never be used as-is to prevent leaking secret data. - ## Computing this value requires constant-time operations. - ## Using this value requires converting it to the # of limbs in constant-time - ## - ## - "limbs" is an internal field that holds the internal representation - ## of the big integer. Least-significant limb first. Within limbs words are native-endian. - ## - ## This internal representation can be changed - ## without notice and should not be used by external applications or libraries. - ## - ## Accesses should be done via BigIntViewConst / BigIntViewConst - ## to have the compiler check for mutability - bitLength: uint32 - limbs: UncheckedArray[Word] - - # "Indirection" to enforce pointer types deep immutability - BigIntViewConst* = distinct BigIntView - ## Immutable view into a BigInt - BigIntViewMut* = distinct BigIntView - ## Mutable view into a BigInt - BigIntViewAny* = BigIntViewConst or BigIntViewMut - -# No exceptions allowed -{.push raises: [].} - -# ############################################################ -# -# Deep Mutability safety -# -# ############################################################ - -template `[]`*(v: BigIntViewConst, limbIdx: int): Word = - distinctBase(type v)(v).limbs[limbIdx] - -template `[]`*(v: BigIntViewMut, limbIdx: int): var Word = - distinctBase(type v)(v).limbs[limbIdx] - -template `[]=`*(v: BigIntViewMut, limbIdx: int, val: Word) = - distinctBase(type v)(v).limbs[limbIdx] = val - -template bitSizeof(v: BigIntViewAny): uint32 = - bind BigIntView - distinctBase(type v)(v).bitLength - -const divShiftor = log2(uint32(WordPhysBitSize)) -template numLimbs*(v: BigIntViewAny): int = - ## Compute the number of limbs from - ## the **internal** bitlength - (bitSizeof(v).int + WordPhysBitSize - 1) shr divShiftor - -template setBitLength(v: BigIntViewMut, internalBitLength: uint32) = - distinctBase(type v)(v).bitLength = internalBitLength - -# TODO: Check if repeated v.numLimbs calls are optimized away - -template `[]`*(v: BigIntViewConst, limbIdxFromEnd: BackwardsIndex): Word = - distinctBase(type v)(v).limbs[numLimbs(v).int - int limbIdxFromEnd] - -template `[]`*(v: BigIntViewMut, limbIdxFromEnd: BackwardsIndex): var Word = - distinctBase(type v)(v).limbs[numLimbs(v).int - int limbIdxFromEnd] - -template `[]=`*(v: BigIntViewMut, limbIdxFromEnd: BackwardsIndex, val: Word) = - distinctBase(type v)(v).limbs[numLimbs(v).int - int limbIdxFromEnd] = val - -# ############################################################ -# -# Checks and debug/test only primitives -# -# ############################################################ - -template checkMatchingBitlengths(a, b: distinct BigIntViewAny) = - ## Check that bitlengths of bigints match - ## This is only checked - ## with "-d:debugConstantine" and when assertions are on. 
- debug: - assert distinctBase(type a)(a).bitLength == - distinctBase(type b)(b).bitLength, "Internal Error: operands bitlength do not match" - -template checkValidModulus(m: BigIntViewConst) = - ## Check that the modulus is valid - ## The check is approximate, it only checks that - ## the most-significant words is non-zero instead of - ## checking that the last announced bit is 1. - ## This is only checked - ## with "-d:debugConstantine" and when assertions are on. - debug: - assert not isZero(m[^1]).bool, "Internal Error: the modulus must use all declared bits" - -template checkOddModulus(m: BigIntViewConst) = - ## CHeck that the modulus is odd - ## and valid for use in the Montgomery n-residue representation - debug: - assert bool(BaseType(m[0]) and 1), "Internal Error: the modulus must be odd to use the Montgomery representation." - -template checkWordShift(k: int) = - ## Checks that the shift is less than the word bit size - debug: - assert k <= WordBitSize, "Internal Error: the shift must be less than the word bit size" - -template checkPowScratchSpaceLen(len: int) = - ## Checks that there is a minimum of scratchspace to hold the temporaries - debug: - assert len >= 2, "Internal Error: the scratchspace for powmod should be equal or greater than 2" - -debug: - func `$`*(a: BigIntViewAny): string = - let len = a.numLimbs() - result = "[" - for i in 0 ..< len - 1: - result.add $a[i] - result.add ", " - result.add $a[len-1] - result.add "] (" - result.add $a.bitSizeof - result.add " bits)" - -# ############################################################ -# -# BigInt primitives -# -# ############################################################ - -func `==`*(a, b: distinct BigIntViewAny): CTBool[Word] = - ## Returns true if 2 big ints are equal - ## Comparison is constant-time - checkMatchingBitlengths(a, b) - var accum: Word - for i in 0 ..< a.numLimbs(): - accum = accum or (a[i] xor b[i]) - result = accum.isZero - -func isZero*(a: BigIntViewAny): CTBool[Word] = - ## Returns true if a big int is equal to zero - var accum: Word - for i in 0 ..< a.numLimbs(): - accum = accum or a[i] - result = accum.isZero() - -func setZero(a: BigIntViewMut) = - ## Set a BigInt to 0 - ## It's bit size is unchanged - zeroMem(a[0].unsafeAddr, a.numLimbs() * sizeof(Word)) - -func ccopy*(a: BigIntViewMut, b: BigIntViewAny, ctl: CTBool[Word]) = - ## Constant-time conditional copy - ## If ctl is true: b is copied into a - ## if ctl is false: b is not copied and a is untouched - ## Time and memory accesses are the same whether a copy occurs or not - checkMatchingBitlengths(a, b) - for i in 0 ..< a.numLimbs(): - a[i] = ctl.mux(b[i], a[i]) - -# The arithmetic primitives all accept a control input that indicates -# if it is a placebo operation. It stills performs the -# same memory accesses to be side-channel attack resistant. - -func cadd*(a: BigIntViewMut, b: BigIntViewAny, ctl: CTBool[Word]): CTBool[Word] = - ## Constant-time in-place conditional addition - ## The addition is only performed if ctl is "true" - ## The result carry is always computed. - ## - ## a and b MAY be the same buffer - ## a and b MUST have the same announced bitlength (i.e. 
`bits` static parameters) - checkMatchingBitlengths(a, b) - - for i in 0 ..< a.numLimbs(): - let new_a = a[i] + b[i] + Word(result) - result = new_a.isMsbSet() - a[i] = ctl.mux(new_a.mask(), a[i]) - -func csub*(a: BigIntViewMut, b: BigIntViewAny, ctl: CTBool[Word]): CTBool[Word] = - ## Constant-time in-place conditional substraction - ## The substraction is only performed if ctl is "true" - ## The result carry is always computed. - ## - ## a and b MAY be the same buffer - ## a and b MUST have the same announced bitlength (i.e. `bits` static parameters) - checkMatchingBitlengths(a, b) - - for i in 0 ..< a.numLimbs(): - let new_a = a[i] - b[i] - Word(result) - result = new_a.isMsbSet() - a[i] = ctl.mux(new_a.mask(), a[i]) - -func dec*(a: BigIntViewMut, w: Word): CTBool[Word] = - ## Decrement a big int by a small word - ## Returns the result carry - - a[0] -= w - result = a[0].isMsbSet() - a[0] = a[0].mask() - for i in 1 ..< a.numLimbs(): - a[i] -= Word(result) - result = a[i].isMsbSet() - a[i] = a[i].mask() - -func shiftRight*(a: BigIntViewMut, k: int) = - ## Shift right by k. - ## - ## k MUST be less than the base word size (2^31 or 2^63) - # We don't reuse shr for this in-place operation - # Do we need to return the shifted out part? - # - # Note: for speed, loading a[i] and a[i+1] - # instead of a[i-1] and a[i] - # is probably easier to parallelize for the compiler - # (antidependence WAR vs loop-carried dependence RAW) - checkWordShift(k) - - let len = a.numLimbs() - for i in 0 ..< len-1: - a[i] = (a[i] shr k) or mask(a[i+1] shl (WordBitSize - k)) - a[len-1] = a[len-1] shr k - -# ############################################################ -# -# BigInt Primitives Optimized for speed -# -# ############################################################ -# -# This section implements primitives that improve the speed -# of common use-cases at the expense of a slight increase in code-size. -# Where code size is a concern, the high-level API should use -# copy and/or the conditional operations. - -func add*(a: BigIntViewMut, b: BigIntViewAny): CTBool[Word] = - ## Constant-time in-place addition - ## Returns the carry - ## - ## a and b MAY be the same buffer - ## a and b MUST have the same announced bitlength (i.e. `bits` static parameters) - checkMatchingBitlengths(a, b) - - for i in 0 ..< a.numLimbs(): - a[i] = a[i] + b[i] + Word(result) - result = a[i].isMsbSet() - a[i] = a[i].mask() - -func sub*(a: BigIntViewMut, b: BigIntViewAny): CTBool[Word] = - ## Constant-time in-place substraction - ## Returns the borrow - ## - ## a and b MAY be the same buffer - ## a and b MUST have the same announced bitlength (i.e. `bits` static parameters) - checkMatchingBitlengths(a, b) - - for i in 0 ..< a.numLimbs(): - a[i] = a[i] - b[i] - Word(result) - result = a[i].isMsbSet() - a[i] = a[i].mask() - -func sum*(r: BigIntViewMut, a, b: distinct BigIntViewAny): CTBool[Word] = - ## Sum `a` and `b` into `r`. - ## `r` is initialized/overwritten - ## - ## Returns the carry - checkMatchingBitlengths(a, b) - - r.setBitLength(bitSizeof(a)) - - for i in 0 ..< a.numLimbs(): - r[i] = a[i] + b[i] + Word(result) - result = r[i].isMsbSet() - r[i] = r[i].mask() - -func diff*(r: BigIntViewMut, a, b: distinct BigIntViewAny): CTBool[Word] = - ## Substract `b` from `a` and store the result into `r`. 
- ## `r` is initialized/overwritten - ## - ## Returns the borrow - checkMatchingBitlengths(a, b) - - r.setBitLength(bitSizeof(a)) - - for i in 0 ..< a.numLimbs(): - r[i] = a[i] - b[i] - Word(result) - result = r[i].isMsbSet() - r[i] = r[i].mask() - -# ############################################################ -# -# Modular BigInt -# -# ############################################################ - -func shlAddMod(a: BigIntViewMut, c: Word, M: BigIntViewConst) = - ## Fused modular left-shift + add - ## Shift input `a` by a word and add `c` modulo `M` - ## - ## With a word W = 2^WordBitSize and a modulus M - ## Does a <- a * W + c (mod M) - ## - ## The modulus `M` MUST announced most-significant bit must be set. - checkValidModulus(M) - - let aLen = a.numLimbs() - let mBits = bitSizeof(M) - - if mBits <= WordBitSize: - # If M fits in a single limb - var q: Word - - # (hi, lo) = a * 2^63 + c - let hi = a[0] shr 1 # 64 - 63 = 1 - let lo = (a[0] shl WordBitSize) or c # Assumes most-significant bit in c is not set - unsafeDiv2n1n(q, a[0], hi, lo, M[0]) # (hi, lo) mod M - return - - else: - ## Multiple limbs - let hi = a[^1] # Save the high word to detect carries - let R = mBits and WordBitSize # R = mBits mod 64 - - var a0, a1, m0: Word - if R == 0: # If the number of mBits is a multiple of 64 - a0 = a[^1] # - moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof) # we can just shift words - a[0] = c # and replace the first one by c - a1 = a[^1] - m0 = M[^1] - else: # Else: need to deal with partial word shifts at the edge. - a0 = mask((a[^1] shl (WordBitSize-R)) or (a[^2] shr R)) - moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof) - a[0] = c - a1 = mask((a[^1] shl (WordBitSize-R)) or (a[^2] shr R)) - m0 = mask((M[^1] shl (WordBitSize-R)) or (M[^2] shr R)) - - # m0 has its high bit set. (a0, a1)/p0 fits in a limb. - # Get a quotient q, at most we will be 2 iterations off - # from the true quotient - - let - a_hi = a0 shr 1 # 64 - 63 = 1 - a_lo = (a0 shl WordBitSize) or a1 - var q, r: Word - unsafeDiv2n1n(q, r, a_hi, a_lo, m0) # Estimate quotient - q = mux( # If n_hi == divisor - a0 == m0, MaxWord, # Quotient == MaxWord (0b0111...1111) - mux( - q.isZero, Zero, # elif q == 0, true quotient = 0 - q - One # else instead of being of by 0, 1 or 2 - ) # we returning q-1 to be off by -1, 0 or 1 - ) - - # Now substract a*2^63 - q*p - var carry = Zero - var over_p = CtTrue # Track if quotient greater than the modulus - - for i in 0 ..< M.numLimbs(): - var qp_lo: Word - - block: # q*p - # q * p + carry (doubleword) carry from previous limb - unsafeFMA(carry, qp_lo, q, M[i], carry) - - block: # a*2^63 - q*p - a[i] -= qp_lo - carry += Word(a[i].isMsbSet) # Adjust if borrow - a[i] = a[i].mask() # Normalize to u63 - - over_p = mux( - a[i] == M[i], over_p, - a[i] > M[i] - ) - - # Fix quotient, the true quotient is either q-1, q or q+1 - # - # if carry < q or carry == q and over_p we must do "a -= p" - # if carry > hi (negative result) we must do "a += p" - - let neg = carry > hi - let tooBig = not neg and (over_p or (carry < hi)) - - discard a.cadd(M, ctl = neg) - discard a.csub(M, ctl = tooBig) - return - -func reduce*(r: BigIntViewMut, a: BigIntViewAny, M: BigIntViewConst) = - ## Reduce `a` modulo `M` and store the result in `r` - ## - ## The modulus `M` MUST announced most-significant bit must be set. 
- ## The result `r` buffer size MUST be at least the size of `M` buffer - ## - ## CT: Depends only on the bitlength of `a` and the modulus `M` - - # Note: for all cryptographic intents and purposes the modulus is known at compile-time - # but we don't want to inline it as it would increase codesize, better have Nim - # pass a pointer+length to a fixed session of the BSS. - checkValidModulus(M) - - let aBits = bitSizeof(a) - let mBits = bitSizeof(M) - let aLen = a.numLimbs() - - r.setBitLength(bitSizeof(M)) - - if aBits < mBits: - # if a uses less bits than the modulus, - # it is guaranteed < modulus. - # This relies on the precondition that the modulus uses all declared bits - copyMem(r[0].addr, a[0].unsafeAddr, aLen * sizeof(Word)) - for i in aLen ..< r.numLimbs(): - r[i] = Zero - else: - # a length i at least equal to the modulus. - # we can copy modulus.limbs-1 words - # and modular shift-left-add the rest - let mLen = M.numLimbs() - let aOffset = aLen - mLen - copyMem(r[0].addr, a[aOffset+1].unsafeAddr, (mLen-1) * sizeof(Word)) - r[^1] = Zero - # Now shift-left the copied words while adding the new word modulo M - for i in countdown(aOffset, 0): - r.shlAddMod(a[i], M) - -# ############################################################ -# -# Montgomery Arithmetic -# -# ############################################################ - -template wordMul(a, b: Word): Word = - mask(a * b) - -func montyMul*( - r: BigIntViewMut, a, b: distinct BigIntViewAny, - M: BigIntViewConst, negInvModWord: Word) = - ## Compute r <- a*b (mod M) in the Montgomery domain - ## `negInvModWord` = -1/M (mod Word). Our words are 2^31 or 2^63 - ## - ## This resets r to zero before processing. Use {.noInit.} - ## to avoid duplicating with Nim zero-init policy - ## The result `r` buffer size MUST be at least the size of `M` buffer - ## - ## - ## Assuming 63-bit wors, the magic constant should be: - ## - ## - µ ≡ -1/M[0] (mod 2^63) for a general multiplication - ## This can be precomputed with `negInvModWord` - ## - 1 for conversion from Montgomery to canonical representation - ## The library implements a faster `redc` primitive for that use-case - ## - R^2 (mod M) for conversion from canonical to Montgomery representation - ## - # i.e. c'R <- a'R b'R * R^-1 (mod M) in the natural domain - # as in the Montgomery domain all numbers are scaled by R - - checkValidModulus(M) - checkOddModulus(M) - checkMatchingBitlengths(a, M) - checkMatchingBitlengths(b, M) - - let nLen = M.numLimbs() - r.setBitLength(bitSizeof(M)) - setZero(r) - - var r_hi = Zero # represents the high word that is used in intermediate computation before reduction mod M - for i in 0 ..< nLen: - - let zi = (r[0] + wordMul(a[i], b[0])).wordMul(negInvModWord) - var carry: Word - # (carry, _) <- a[i] * b[0] + zi * M[0] + r[0] - unsafeFMA2_hi(carry, a[i], b[0], zi, M[0], r[0]) - - for j in 1 ..< nLen: - # (carry, r[j-1]) <- a[i] * b[j] + zi * M[j] + r[j] + carry - unsafeFMA2(carry, r[j-1], a[i], b[j], zi, M[j], r[j], carry) - - r_hi += carry - r[^1] = r_hi.mask() - r_hi = r_hi shr WordBitSize - - # If the extra word is not zero or if r-M does not borrow (i.e. 
r > M) - # Then substract M - discard r.csub(M, r_hi.isNonZero() or not r.csub(M, CtFalse)) - -func redc*(r: BigIntViewMut, a: BigIntViewAny, one, N: BigIntViewConst, negInvModWord: Word) {.inline.} = - ## Transform a bigint ``a`` from it's Montgomery N-residue representation (mod N) - ## to the regular natural representation (mod N) - ## - ## with W = N.numLimbs() - ## and R = (2^WordBitSize)^W - ## - ## Does "a * R^-1 (mod N)" - ## - ## This is called a Montgomery Reduction - ## The Montgomery Magic Constant is µ = -1/N mod N - ## is used internally and can be precomputed with negInvModWord(Curve) - # References: - # - https://eprint.iacr.org/2017/1057.pdf (Montgomery) - # page: Radix-r interleaved multiplication algorithm - # - https://en.wikipedia.org/wiki/Montgomery_modular_multiplication#Montgomery_arithmetic_on_multiprecision_(variable-radix)_integers - # - http://langevin.univ-tln.fr/cours/MLC/extra/montgomery.pdf - # Montgomery original paper - # - montyMul(r, a, one, N, negInvModWord) - -func montyResidue*( - r: BigIntViewMut, a: BigIntViewAny, - N, r2modN: BigIntViewConst, negInvModWord: Word) {.inline.} = - ## Transform a bigint ``a`` from it's natural representation (mod N) - ## to a the Montgomery n-residue representation - ## - ## Montgomery-Multiplication - based - ## - ## with W = N.numLimbs() - ## and R = (2^WordBitSize)^W - ## - ## Does "a * R (mod N)" - ## - ## `a`: The source BigInt in the natural representation. `a` in [0, N) range - ## `N`: The field modulus. N must be odd. - ## `r2modN`: 2^WordBitSize mod `N`. Can be precomputed with `r2mod` function - ## - ## Important: `r` is overwritten - ## The result `r` buffer size MUST be at least the size of `M` buffer - # Reference: https://eprint.iacr.org/2017/1057.pdf - montyMul(r, a, r2ModN, N, negInvModWord) - -func montySquare*( - r: BigIntViewMut, a: BigIntViewAny, - M: BigIntViewConst, negInvModWord: Word) {.inline.} = - ## Compute r <- a^2 (mod M) in the Montgomery domain - ## `negInvModWord` = -1/M (mod Word). Our words are 2^31 or 2^63 - montyMul(r, a, a, M, negInvModWord) - -# Montgomery Modular Exponentiation -# ------------------------------------------ -# We use fixed-window based exponentiation -# that is constant-time: i.e. the number of multiplications -# does not depend on the number of set bits in the exponents -# those are always done and conditionally copied. -# -# The exponent MUST NOT be private data (until audited otherwise) -# - Power attack on RSA, https://www.di.ens.fr/~fouque/pub/ches06.pdf -# - Flush-and-reload on Sliding window exponentiation: https://tutcris.tut.fi/portal/files/8966761/p1639_pereida_garcia.pdf -# - Sliding right into disaster, https://eprint.iacr.org/2017/627.pdf -# - Fixed window leak: https://www.scirp.org/pdf/JCC_2019102810331929.pdf -# - Constructing sliding-windows leak, https://easychair.org/publications/open/fBNC -# -# For pairing curves, this is the case since exponentiation is only -# used for inversion via the Little Fermat theorem. -# For RSA, some exponentiations uses private exponents. 
-# -# Note: -# - Implementation closely follows Thomas Pornin's BearSSL -# - Apache Milagro Crypto has an alternative implementation -# that is more straightforward however: -# - the exponent hamming weight is used as loop bounds -# - the base^k is stored at each index of a temp table of size k -# - the base^k to use is indexed by the hamming weight -# of the exponent, leaking this to cache attacks -# - in contrast BearSSL touches the whole table to -# hide the actual selection - -func getWindowLen(bufLen: int): uint = - ## Compute the maximum window size that fits in the scratchspace buffer - checkPowScratchSpaceLen(bufLen) - result = 5 - while (1 shl result) + 1 > bufLen: - dec result - -func montyPowPrologue( - a: BigIntViewMut, M, one: BigIntViewConst, - negInvModWord: Word, - scratchspace: openarray[BigIntViewMut] - ): tuple[window: uint, bigIntSize: int] {.inline.}= - # Due to the high number of parameters, - # forcing this inline actually reduces the code size - - result.window = scratchspace.len.getWindowLen() - result.bigIntSize = a.numLimbs() * sizeof(Word) + - offsetof(BigIntView, limbs) + - sizeof(BigIntView.bitLength) - - # Precompute window content, special case for window = 1 - # (i.e scratchspace has only space for 2 temporaries) - # The content scratchspace[2+k] is set at a^k - # with scratchspace[0] untouched - if result.window == 1: - copyMem(pointer scratchspace[1], pointer a, result.bigIntSize) - else: - scratchspace[1].setBitLength(bitSizeof(M)) - copyMem(pointer scratchspace[2], pointer a, result.bigIntSize) - for k in 2 ..< 1 shl result.window: - scratchspace[k+1].montyMul(scratchspace[k], a, M, negInvModWord) - - # Set a to one - copyMem(pointer a, pointer one, result.bigIntSize) - -func montyPowSquarings( - a: BigIntViewMut, - exponent: openarray[byte], - M: BigIntViewConst, - negInvModWord: Word, - tmp: BigIntViewMut, - window: uint, - bigIntSize: int, - acc, acc_len: var uint, - e: var int, - ): tuple[k, bits: uint] {.inline.}= - ## Squaring step of exponentiation by squaring - ## Get the next k bits in range [1, window) - ## Square k times - ## Returns the number of squarings done and the corresponding bits - ## - ## Updates iteration variables and accumulators - # Due to the high number of parameters, - # forcing this inline actually reduces the code size - - # Get the next bits - var k = window - if acc_len < window: - if e < exponent.len: - acc = (acc shl 8) or exponent[e].uint - inc e - acc_len += 8 - else: # Drained all exponent bits - k = acc_len - - let bits = (acc shr (acc_len - k)) and ((1'u32 shl k) - 1) - acc_len -= k - - # We have k bits and can do k squaring - for i in 0 ..< k: - tmp.montySquare(a, M, negInvModWord) - copyMem(pointer a, pointer tmp, bigIntSize) - - return (k, bits) - -func montyPow*( - a: BigIntViewMut, - exponent: openarray[byte], - M, one: BigIntViewConst, - negInvModWord: Word, - scratchspace: openarray[BigIntViewMut] - ) = - ## Modular exponentiation r = a^exponent mod M - ## in the Montgomery domain - ## - ## This uses fixed-window optimization if possible - ## - ## - On input ``a`` is the base, on ``output`` a = a^exponent (mod M) - ## ``a`` is in the Montgomery domain - ## - ``exponent`` is the exponent in big-endian canonical format (octet-string) - ## Use ``exportRawUint`` for conversion - ## - ``M`` is the modulus - ## - ``one`` is 1 (mod M) in montgomery representation - ## - ``negInvModWord`` is the montgomery magic constant "-1/M[0] mod 2^WordBitSize" - ## - ``scratchspace`` with k the window bitsize of size up to 5 - 
## This is a buffer that can hold between 2^k + 1 big-ints - ## A window of of 1-bit (no window optimization) requires only 2 big-ints - ## - ## Note that the best window size require benchmarking and is a tradeoff between - ## - performance - ## - stack usage - ## - precomputation - ## - ## For example BLS12-381 window size of 5 is 30% faster than no window, - ## but windows of size 2, 3, 4 bring no performance benefit, only increased stack space. - ## A window of size 5 requires (2^5 + 1)*(381 + 7)/8 = 33 * 48 bytes = 1584 bytes - ## of scratchspace (on the stack). - - let (window, bigIntSize) = montyPowPrologue(a, M, one, negInvModWord, scratchspace) - - # We process bits with from most to least significant. - # At each loop iteration with have acc_len bits in acc. - # To maintain constant-time the number of iterations - # or the number of operations or memory accesses should be the same - # regardless of acc & acc_len - var - acc, acc_len: uint - e = 0 - while acc_len > 0 or e < exponent.len: - let (k, bits) = montyPowSquarings( - a, exponent, M, negInvModWord, - scratchspace[0], window, bigIntSize, - acc, acc_len, e - ) - - # Window lookup: we set scratchspace[1] to the lookup value. - # If the window length is 1, then it's already set. - if window > 1: - # otherwise we need a constant-time lookup - # in particular we need the same memory accesses, we can't - # just index the openarray with the bits to avoid cache attacks. - for i in 1 ..< 1 shl k: - let ctl = Word(i) == Word(bits) - scratchspace[1].ccopy(scratchspace[1+i], ctl) - - # Multiply with the looked-up value - # we keep the product only if the exponent bits are not all zero - scratchspace[0].montyMul(a, scratchspace[1], M, negInvModWord) - a.ccopy(scratchspace[0], Word(bits) != Zero) - -func montyPowUnsafeExponent*( - a: BigIntViewMut, - exponent: openarray[byte], - M, one: BigIntViewConst, - negInvModWord: Word, - scratchspace: openarray[BigIntViewMut] - ) = - ## Modular exponentiation r = a^exponent mod M - ## in the Montgomery domain - ## - ## Warning ⚠️ : - ## This is an optimization for public exponent - ## Otherwise bits of the exponent can be retrieved with: - ## - memory access analysis - ## - power analysis - ## - timing analysis - - # TODO: scratchspace[1] is unused when window > 1 - - let (window, bigIntSize) = montyPowPrologue( - a, M, one, negInvModWord, scratchspace) - - var - acc, acc_len: uint - e = 0 - while acc_len > 0 or e < exponent.len: - let (k, bits) = montyPowSquarings( - a, exponent, M, negInvModWord, - scratchspace[0], window, bigIntSize, - acc, acc_len, e - ) - - ## Warning ⚠️: Exposes the exponent bits - if bits != 0: - if window > 1: - scratchspace[0].montyMul(a, scratchspace[1+bits], M, negInvModWord) - else: - # scratchspace[1] holds the original `a` - scratchspace[0].montyMul(a, scratchspace[1], M, negInvModWord) - copyMem(pointer a, pointer scratchspace[0], bigIntSize) diff --git a/constantine/arithmetic/finite_fields.nim b/constantine/arithmetic/finite_fields.nim index 4049035..fe7ceda 100644 --- a/constantine/arithmetic/finite_fields.nim +++ b/constantine/arithmetic/finite_fields.nim @@ -25,9 +25,9 @@ # which requires a prime import - ../primitives/constant_time, + ../primitives, ../config/[common, curves], - ./bigints_checked + ./bigints, ./montgomery # type # `Fp`*[C: static Curve] = object @@ -57,16 +57,16 @@ debug: func fromBig*[C: static Curve](T: type Fp[C], src: BigInt): Fp[C] {.noInit.} = ## Convert a BigInt to its Montgomery form - result.mres.montyResidue(src, C.Mod.mres, 
C.getR2modP(), C.getNegInvModWord())
+  result.mres.montyResidue(src, C.Mod.mres, C.getR2modP(), C.getNegInvModWord(), C.canUseNoCarryMontyMul())

func fromBig*[C: static Curve](dst: var Fp[C], src: BigInt) {.noInit.} =
  ## Convert a BigInt to its Montgomery form
-  dst.mres.montyResidue(src, C.Mod.mres, C.getR2modP(), C.getNegInvModWord())
+  dst.mres.montyResidue(src, C.Mod.mres, C.getR2modP(), C.getNegInvModWord(), C.canUseNoCarryMontyMul())

func toBig*(src: Fp): auto {.noInit.} =
  ## Convert a finite-field element to a BigInt in natural representation
  var r {.noInit.}: typeof(src.mres)
-  r.redc(src.mres, Fp.C.Mod.mres, Fp.C.getNegInvModWord())
+  r.redc(src.mres, Fp.C.Mod.mres, Fp.C.getNegInvModWord(), Fp.C.canUseNoCarryMontyMul())
  return r

# ############################################################
@@ -83,6 +83,12 @@ func toBig*(src: Fp): auto {.noInit.} =
#   - Golden Primes (φ^2 - φ - 1 with φ = 2^k for example Ed448-Goldilocks: 2^448 - 2^224 - 1)
# exist and can be implemented with compile-time specialization.

+# Note: for `+=`, double, sum
+#       not(a.mres < Fp.C.Mod.mres) is unnecessary if the prime has the form
+#       (2^64)^w - 1 (if using uint64 words).
+# In practice I'm not aware of such a prime being used in elliptic curves.
+# 2^127 - 1 and 2^521 - 1 are used but 127 and 521 are not multiples of 32/64
+
func `==`*(a, b: Fp): CTBool[Word] =
  ## Constant-time equality check
  a.mres == b.mres
@@ -101,7 +107,7 @@ func setOne*(a: var Fp) =
func `+=`*(a: var Fp, b: Fp) =
  ## In-place addition modulo p
  var overflowed = add(a.mres, b.mres)
-  overflowed = overflowed or not csub(a.mres, Fp.C.Mod.mres, CtFalse) # a >= P
+  overflowed = overflowed or not(a.mres < Fp.C.Mod.mres)
  discard csub(a.mres, Fp.C.Mod.mres, overflowed)

func `-=`*(a: var Fp, b: Fp) =
@@ -112,14 +118,14 @@ func `-=`*(a: var Fp, b: Fp) =
func double*(a: var Fp) =
  ## Double ``a`` modulo p
  var overflowed = double(a.mres)
-  overflowed = overflowed or not csub(a.mres, Fp.C.Mod.mres, CtFalse) # a >= P
+  overflowed = overflowed or not(a.mres < Fp.C.Mod.mres)
  discard csub(a.mres, Fp.C.Mod.mres, overflowed)

func sum*(r: var Fp, a, b: Fp) =
  ## Sum ``a`` and ``b`` into ``r`` modulo p
  ## r is initialized/overwritten
  var overflowed = r.mres.sum(a.mres, b.mres)
-  overflowed = overflowed or not csub(r.mres, Fp.C.Mod.mres, CtFalse) # r >= P
+  overflowed = overflowed or not(r.mres < Fp.C.Mod.mres)
  discard csub(r.mres, Fp.C.Mod.mres, overflowed)

func diff*(r: var Fp, a, b: Fp) =
@@ -132,17 +138,17 @@ func double*(r: var Fp, a: Fp) =
  ## Double ``a`` into ``r``
  ## `r` is initialized/overwritten
  var overflowed = r.mres.double(a.mres)
-  overflowed = overflowed or not csub(r.mres, Fp.C.Mod.mres, CtFalse) # r >= P
+  overflowed = overflowed or not(r.mres < Fp.C.Mod.mres)
  discard csub(r.mres, Fp.C.Mod.mres, overflowed)

func prod*(r: var Fp, a, b: Fp) =
  ## Store the product of ``a`` by ``b`` modulo p into ``r``
  ## ``r`` is initialized / overwritten
-  r.mres.montyMul(a.mres, b.mres, Fp.C.Mod.mres, Fp.C.getNegInvModWord())
+  r.mres.montyMul(a.mres, b.mres, Fp.C.Mod.mres, Fp.C.getNegInvModWord(), Fp.C.canUseNoCarryMontyMul())

func square*(r: var Fp, a: Fp) =
  ## Squaring modulo p
-  r.mres.montySquare(a.mres, Fp.C.Mod.mres, Fp.C.getNegInvModWord())
+  r.mres.montySquare(a.mres, Fp.C.Mod.mres, Fp.C.getNegInvModWord(), Fp.C.canUseNoCarryMontyMul())

func neg*(r: var Fp, a: Fp) =
  ## Negate modulo p
@@ -164,7 +170,8 @@ func pow*(a: var Fp, exponent: BigInt) =
  a.mres.montyPow(
    exponent,
    Fp.C.Mod.mres, Fp.C.getMontyOne(),
-    Fp.C.getNegInvModWord(), windowSize
+    Fp.C.getNegInvModWord(), windowSize,
+    Fp.C.canUseNoCarryMontyMul()
  )

func powUnsafeExponent*(a: var Fp, exponent: BigInt) =
@@ -182,7 +189,8 @@ func powUnsafeExponent*(a: var Fp, exponent: BigInt) =
  a.mres.montyPowUnsafeExponent(
    exponent,
    Fp.C.Mod.mres, Fp.C.getMontyOne(),
-    Fp.C.getNegInvModWord(), windowSize
+    Fp.C.getNegInvModWord(), windowSize,
+    Fp.C.canUseNoCarryMontyMul()
  )

func inv*(a: var Fp) =
@@ -193,7 +201,8 @@ func inv*(a: var Fp) =
  a.mres.montyPowUnsafeExponent(
    Fp.C.getInvModExponent(),
    Fp.C.Mod.mres, Fp.C.getMontyOne(),
-    Fp.C.getNegInvModWord(), windowSize
+    Fp.C.getNegInvModWord(), windowSize,
+    Fp.C.canUseNoCarryMontyMul()
  )

# ############################################################
diff --git a/constantine/arithmetic/limbs.nim b/constantine/arithmetic/limbs.nim
new file mode 100644
index 0000000..ae8c7c2
--- /dev/null
+++ b/constantine/arithmetic/limbs.nim
@@ -0,0 +1,445 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/common,
+  ../primitives
+
+# ############################################################
+#
+#         Limbs raw representation and operations
+#
+# ############################################################
+#
+# This file holds the raw operations done on big ints
+# The representation is optimized for:
+# - constant-time (not leaking secret data via side-channel)
+# - performance
+# - generated code size, datatype size and stack usage
+# in this order
+#
+# The "limbs" API limits code duplication
+# due to generic/static monomorphization for bit-widths
+# that are represented with the same number of words.
+#
+# It also exposes the number of words to the compiler
+# to allow aggressive unrolling and inlining, for example
+# of multi-precision addition, which is so small (2 instructions per word)
+# that inlining it improves both performance and code-size
+# even for 2 curves (secp256k1 and BN254) that could share the code.
+#
+# The limb-endianness is little-endian: the least significant limb is at index 0.
+# The word-endianness is native-endian.
+
+type Limbs*[N: static int] = array[N, Word]
+  ## Limbs-type
+  ## Should be a distinct type to prevent builtins from using
+  ## non-constant-time implementations, for example for comparison.
+  ##
+  ## but for an unknown reason, it prevents semchecking `bits`

+# No exceptions allowed
+{.push raises: [].}
+
+# ############################################################
+#
+#                        Accessors
+#
+# ############################################################
+#
+# Commented out since we don't use a distinct type

+# template `[]`[N](v: Limbs[N], idx: int): Word =
+#   (array[N, Word])(v)[idx]
+#
+# template `[]`[N](v: var Limbs[N], idx: int): var Word =
+#   (array[N, Word])(v)[idx]
+#
+# template `[]=`[N](v: Limbs[N], idx: int, val: Word) =
+#   (array[N, Word])(v)[idx] = val

+# ############################################################
+#
+#           Checks and debug/test only primitives
+#
+# ############################################################

+# ############################################################
+#
+#                   Limbs Primitives
+#
+# ############################################################

+{.push inline.}
+# The following primitives are small enough on regular limb sizes
+# (BN254 and secp256k1 -> 4 limbs, BLS12-381 -> 6 limbs)
+# that inlining them both decreases the code size and increases speed
+# as we avoid the parameter packing/unpacking ceremony at function entry/exit
+# and the unrolling overhead is minimal.

+func `==`*(a, b: Limbs): CTBool[Word] =
+  ## Returns true if 2 limbs are equal
+  ## Comparison is constant-time
+  var accum = Zero
+  for i in 0 ..< a.len:
+    accum = accum or (a[i] xor b[i])
+  result = accum.isZero()

+func isZero*(a: Limbs): CTBool[Word] =
+  ## Returns true if ``a`` is equal to zero
+  var accum = Zero
+  for i in 0 ..< a.len:
+    accum = accum or a[i]
+  result = accum.isZero()

+func setZero*(a: var Limbs) =
+  ## Set ``a`` to 0
+  zeroMem(a[0].addr, sizeof(a))

+func setOne*(a: var Limbs) =
+  ## Set ``a`` to 1
+  a[0] = Word(1)
+  when a.len > 1:
+    zeroMem(a[1].addr, (a.len - 1) * sizeof(Word))

+func ccopy*(a: var Limbs, b: Limbs, ctl: CTBool[Word]) =
+  ## Constant-time conditional copy
+  ## If ctl is true: b is copied into a
+  ## if ctl is false: b is not copied and a is untouched
+  ## Time and memory accesses are the same whether a copy occurs or not
+  # TODO: on x86, we use inline assembly for CMOV
+  #       the codegen is a bit inefficient as the condition `ctl`
+  #       is tested for each limb.
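+  # Illustration only (not the actual codegen): without the CMOV assembly,
+  # a portable constant-time selection of one word can be sketched as
+  #   let mask = -Word(ctl)                         # 0b111...1 if ctl else 0
+  #   a[i] = (a[i] and not mask) or (b[i] and mask)
+  # Both operands are always read, so memory access patterns stay uniform.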
+ for i in 0 ..< a.len: + ctl.ccopy(a[i], b[i]) + +func add*(a: var Limbs, b: Limbs): Carry = + ## Limbs addition + ## Returns the carry + result = Carry(0) + for i in 0 ..< a.len: + addC(result, a[i], a[i], b[i], result) + +func cadd*(a: var Limbs, b: Limbs, ctl: CTBool[Word]): Carry = + ## Limbs conditional addition + ## Returns the carry + ## + ## if ctl is true: a <- a + b + ## if ctl is false: a <- a + ## The carry is always computed whether ctl is true or false + ## + ## Time and memory accesses are the same whether a copy occurs or not + result = Carry(0) + var sum: Word + for i in 0 ..< a.len: + addC(result, sum, a[i], b[i], result) + ctl.ccopy(a[i], sum) + +func sum*(r: var Limbs, a, b: Limbs): Carry = + ## Sum `a` and `b` into `r` + ## `r` is initialized/overwritten + ## + ## Returns the carry + result = Carry(0) + for i in 0 ..< a.len: + addC(result, r[i], a[i], b[i], result) + +func sub*(a: var Limbs, b: Limbs): Borrow = + ## Limbs substraction + ## Returns the borrow + result = Borrow(0) + for i in 0 ..< a.len: + subB(result, a[i], a[i], b[i], result) + +func csub*(a: var Limbs, b: Limbs, ctl: CTBool[Word]): Borrow = + ## Limbs conditional substraction + ## Returns the borrow + ## + ## if ctl is true: a <- a - b + ## if ctl is false: a <- a + ## The borrow is always computed whether ctl is true or false + ## + ## Time and memory accesses are the same whether a copy occurs or not + result = Borrow(0) + var diff: Word + for i in 0 ..< a.len: + subB(result, diff, a[i], b[i], result) + ctl.ccopy(a[i], diff) + +func diff*(r: var Limbs, a, b: Limbs): Borrow = + ## Diff `a` and `b` into `r` + ## `r` is initialized/overwritten + ## + ## Returns the borrow + result = Borrow(0) + for i in 0 ..< a.len: + subB(result, r[i], a[i], b[i], result) + +func `<`*(a, b: Limbs): CTBool[Word] = + ## Returns true if a < b + ## Comparison is constant-time + var diff: Word + var borrow: Borrow + for i in 0 ..< a.len: + subB(borrow, diff, a[i], b[i], borrow) + + result = (CTBool[Word])(borrow) + +func `<=`*(a, b: Limbs): CTBool[Word] = + ## Returns true if a <= b + ## Comparison is constant-time + not(b < a) + +{.pop.} # inline + +# ############################################################ +# +# Modular BigInt +# +# ############################################################ +# +# To avoid code-size explosion due to monomorphization +# and given that reductions are not in hot path in Constantine +# we use type-erased procedures, instead of instantiating +# one per number of limbs combination + +# Type-erasure +# ------------------------------------------------------------ + +type + LimbsView = ptr UncheckedArray[Word] + ## Type-erased fixed-precision limbs + ## + ## This type mirrors the Limb type and is used + ## for some low-level computation API + ## This design + ## - avoids code bloat due to generic monomorphization + ## otherwise limbs routines would have an instantiation for + ## each number of words. 
+  ##
+  ## Accesses should be done via LimbsViewConst / LimbsViewMut
+  ## to have the compiler check for mutability

+  # "Indirection" to enforce pointer types deep immutability
+  LimbsViewConst = distinct LimbsView
+    ## Immutable view into the limbs of a BigInt
+  LimbsViewMut = distinct LimbsView
+    ## Mutable view into a BigInt
+  LimbsViewAny = LimbsViewConst or LimbsViewMut

+# Deep Mutability safety
+# ------------------------------------------------------------

+template view(a: Limbs): LimbsViewConst =
+  ## Returns a borrowed type-erased immutable view to a bigint
+  LimbsViewConst(cast[LimbsView](a.unsafeAddr))

+template view(a: var Limbs): LimbsViewMut =
+  ## Returns a borrowed type-erased mutable view to a mutable bigint
+  LimbsViewMut(cast[LimbsView](a.addr))

+template `[]`*(v: LimbsViewConst, limbIdx: int): Word =
+  LimbsView(v)[limbIdx]

+template `[]`*(v: LimbsViewMut, limbIdx: int): var Word =
+  LimbsView(v)[limbIdx]

+template `[]=`*(v: LimbsViewMut, limbIdx: int, val: Word) =
+  LimbsView(v)[limbIdx] = val

+# Type-erased add-sub
+# ------------------------------------------------------------

+func cadd(a: LimbsViewMut, b: LimbsViewAny, ctl: CTBool[Word], len: int): Carry =
+  ## Type-erased conditional addition
+  ## Returns the carry
+  ##
+  ## if ctl is true: a <- a + b
+  ## if ctl is false: a <- a
+  ## The carry is always computed whether ctl is true or false
+  ##
+  ## Time and memory accesses are the same whether a copy occurs or not
+  result = Carry(0)
+  var sum: Word
+  for i in 0 ..< len:
+    addC(result, sum, a[i], b[i], result)
+    ctl.ccopy(a[i], sum)

+func csub(a: LimbsViewMut, b: LimbsViewAny, ctl: CTBool[Word], len: int): Borrow =
+  ## Type-erased conditional subtraction
+  ## Returns the borrow
+  ##
+  ## if ctl is true: a <- a - b
+  ## if ctl is false: a <- a
+  ## The borrow is always computed whether ctl is true or false
+  ##
+  ## Time and memory accesses are the same whether a copy occurs or not
+  result = Borrow(0)
+  var diff: Word
+  for i in 0 ..< len:
+    subB(result, diff, a[i], b[i], result)
+    ctl.ccopy(a[i], diff)

+# Modular reduction
+# ------------------------------------------------------------

+func numWordsFromBits(bits: int): int {.inline.} =
+  const divShiftor = log2(uint32(WordBitWidth))
+  result = (bits + WordBitWidth - 1) shr divShiftor

+func shlAddMod_estimate(a: LimbsViewMut, aLen: int,
+                        c: Word, M: LimbsViewConst, mBits: int
+                       ): tuple[neg, tooBig: CTBool[Word]] =
+  ## Estimate a <- a * 2^w + c (mod M)
+  ##
+  ## with w the base word width, usually 32 on 32-bit platforms and 64 on 64-bit platforms
+  ##
+  ## Updates ``a`` and returns ``neg`` and ``tooBig``
+  ## If ``neg``, the estimate in ``a`` is negative and ``M`` must be added to it.
+  ## If ``tooBig``, the estimate in ``a`` overflowed and ``M`` must be subtracted from it.
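+  # Illustrative walkthrough (assuming 64-bit words): a * 2^64 + c can exceed
+  # M by roughly a factor 2^64, so a single conditional subtraction is not
+  # enough. We instead divide the two highest words by the top word of M to
+  # estimate a quotient q, subtract q*M, and the `neg`/`tooBig` flags let the
+  # caller fix the result with at most one addition or subtraction of M.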
+
+  # Aliases
+  # ----------------------------------------------------------------------
+  let MLen = numWordsFromBits(mBits)

+  # Captures aLen and MLen
+  template `[]`(v: untyped, limbIdxFromEnd: BackwardsIndex): Word {.dirty.}=
+    v[`v Len` - limbIdxFromEnd.int]

+  # ----------------------------------------------------------------------
+  # Assuming 64-bit words
+  let hi = a[^1]                                            # Save the high word to detect carries
+  let R = mBits and (WordBitWidth - 1)                      # R = mBits mod 64

+  var a0, a1, m0: Word
+  if R == 0:                                                # If the number of mBits is a multiple of 64
+    a0 = a[^1]                                              #
+    moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof)   # we can just shift words
+    a[0] = c                                                # and replace the first one by c
+    a1 = a[^1]
+    m0 = M[^1]
+  else:                                                     # Else: need to deal with partial word shifts at the edge.
+    a0 = (a[^1] shl (WordBitWidth-R)) or (a[^2] shr R)
+    moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof)
+    a[0] = c
+    a1 = (a[^1] shl (WordBitWidth-R)) or (a[^2] shr R)
+    m0 = (M[^1] shl (WordBitWidth-R)) or (M[^2] shr R)

+  # m0 has its high bit set. (a0, a1)/m0 fits in a limb.
+  # Get a quotient q, at most we will be 2 iterations off
+  # from the true quotient
+  var q, r: Word
+  unsafeDiv2n1n(q, r, a0, a1, m0)   # Estimate quotient
+  q = mux(                          # If n_hi == divisor
+        a0 == m0, MaxWord,          #   Quotient == MaxWord (0b1111...1111)
+        mux(
+          q.isZero, Zero,           # elif q == 0, true quotient = 0
+          q - One                   # else, instead of being off by 0, 1 or 2
+        )                           #       we return q-1 to be off by -1, 0 or 1
+      )

+  # Now subtract a*2^64 - q*p
+  var carry = Zero
+  var over_p = CtTrue               # Track if the remainder is greater than the modulus

+  for i in 0 ..< MLen:
+    var qp_lo: Word

+    block: # q*p
+      # q * p + carry (doubleword) carry from previous limb
+      muladd1(carry, qp_lo, q, M[i], Word carry)

+    block: # a*2^64 - q*p
+      var borrow: Borrow
+      subB(borrow, a[i], a[i], qp_lo, Borrow(0))
+      carry += Word(borrow) # Adjust if borrow

+    over_p = mux(
+               a[i] == M[i], over_p,
+               a[i] > M[i]
+             )

+  # Fix quotient, the true quotient is either q-1, q or q+1
+  #
+  # if carry < hi, or carry == hi and over_p, we must do "a -= p"
+  # if carry > hi (negative result) we must do "a += p"

+  result.neg = Word(carry) > hi
+  result.tooBig = not(result.neg) and (over_p or (Word(carry) < hi))

+func shlAddMod(a: LimbsViewMut, aLen: int,
+               c: Word, M: LimbsViewConst, mBits: int) =
+  ## Fused modular left-shift + add
+  ## Shift input `a` by a word and add `c` modulo `M`
+  ##
+  ## With a word W = 2^WordBitWidth and a modulus M
+  ## Does a <- a * W + c (mod M)
+  ##
+  ## The most significant bit of the modulus `M` (bit `mBits-1`) MUST be set.
+  if mBits <= WordBitWidth:
+    # If M fits in a single limb

+    # We normalize M with R so that the MSB is set
+    # And normalize (a * 2^64 + c) by R as well to maintain the result
+    # This ensures that (a0, a1)/m0 fits in a limb.
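+    # Illustrative example (assuming 64-bit words): for mBits = 61,
+    # R = 61 and WordBitWidth-R = 3, so M[0], a and c are shifted left
+    # by 3 bits; m0 then has its top bit set, and the remainder of the
+    # single-word division below is shifted back right by 3.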
+    let R = mBits and (WordBitWidth - 1)

+    # (hi, lo) = a * 2^64 + c
+    let hi = (a[0] shl (WordBitWidth-R)) or (c shr R)
+    let lo = c shl (WordBitWidth-R)
+    let m0 = M[0] shl (WordBitWidth-R)

+    var q, r: Word
+    unsafeDiv2n1n(q, r, hi, lo, m0)   # (hi, lo) mod M

+    a[0] = r shr (WordBitWidth-R)

+  else:
+    ## Multiple limbs
+    let (neg, tooBig) = shlAddMod_estimate(a, aLen, c, M, mBits)
+    discard a.cadd(M, ctl = neg, aLen)
+    discard a.csub(M, ctl = tooBig, aLen)

+func reduce(r: LimbsViewMut,
+            a: LimbsViewAny, aBits: int,
+            M: LimbsViewConst, mBits: int) =
+  ## Reduce `a` modulo `M` and store the result in `r`
+  let aLen = numWordsFromBits(aBits)
+  let mLen = numWordsFromBits(mBits)
+  let rLen = mLen

+  if aBits < mBits:
+    # if a uses fewer bits than the modulus,
+    # it is guaranteed < modulus.
+    # This relies on the precondition that the modulus uses all declared bits
+    copyMem(r[0].addr, a[0].unsafeAddr, aLen * sizeof(Word))
+    for i in aLen ..< mLen:
+      r[i] = Zero
+  else:
+    # a's length is at least equal to the modulus length.
+    # we can copy modulus.limbs-1 words
+    # and modular shift-left-add the rest
+    let aOffset = aLen - mLen
+    copyMem(r[0].addr, a[aOffset+1].unsafeAddr, (mLen-1) * sizeof(Word))
+    r[rLen - 1] = Zero
+    # Now shift-left the copied words while adding the new word modulo M
+    for i in countdown(aOffset, 0):
+      shlAddMod(r, rLen, a[i], M, mBits)

+func reduce*[aLen, mLen](r: var Limbs[mLen],
+                         a: Limbs[aLen], aBits: static int,
+                         M: Limbs[mLen], mBits: static int
+                        ) {.inline.} =
+  ## Reduce `a` modulo `M` and store the result in `r`
+  ##
+  ## Warning ⚠: At the moment this is NOT constant-time
+  ##            as it relies on hardware division.
+  # This is implemented via type-erased indirection to avoid
+  # a significant amount of code duplication if instantiated for
+  # varying bitwidth.
+  reduce(r.view(), a.view(), aBits, M.view(), mBits)
diff --git a/constantine/arithmetic/montgomery.nim b/constantine/arithmetic/montgomery.nim
new file mode 100644
index 0000000..4bf4100
--- /dev/null
+++ b/constantine/arithmetic/montgomery.nim
@@ -0,0 +1,473 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/common,
+  ../primitives,
+  ./limbs,
+  macros

+# ############################################################
+#
+#         Multiprecision Montgomery Arithmetic
+#
+# ############################################################
+#
+# Note: Montgomery multiplications and squarings are the biggest bottlenecks
+#       of an elliptic curve library, asymptotically 100% of the costly algorithms:
+#       - field exponentiation
+#       - field inversion
+#       - extension towers multiplication, squarings, inversion
+#       - elliptic curve point addition
+#       - elliptic curve point doubling
+#       - elliptic curve point multiplication
+#       - pairing Miller Loop
+#       - pairing final exponentiation
+#       are bottlenecked by Montgomery multiplications or squarings
+#
+# Unfortunately, the fastest implementation of Montgomery Multiplication
+# on x86 is impossible without resorting to assembly (which is probably 15~30% faster)
+#
+# It requires implementing 2 parallel pipelines of carry-chains (via instruction-level parallelism)
+# of MULX, ADCX and ADOX instructions, according to Intel paper:
+# https://www.intel.cn/content/dam/www/public/us/en/documents/white-papers/ia-large-integer-arithmetic-paper.pdf
+# and the code generation of MCL
+# https://github.com/herumi/mcl
+#
+# A generic implementation would require implementing a mini-compiler as a macro,
+# significantly sacrificing code readability, portability, auditability and maintainability.
+#
+# This would however save significant hardware or cloud resources.
+# An example inline assembly compiler for add-with-carry is available in
+# primitives/research/addcarry_subborrow_compiler.nim
+#
+# Instead we follow the optimized high-level implementation of Goff
+# which skips a significant amount of additions for moduli
+# that have their most significant bit unset.

+# Loop unroller
+# ------------------------------------------------------------

+proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode =
+  # Replace "what" ident node by "by"
+  proc inspect(node: NimNode): NimNode =
+    case node.kind:
+    of {nnkIdent, nnkSym}:
+      if node.eqIdent(what):
+        return by
+      return node
+    of nnkEmpty:
+      return node
+    of nnkLiterals:
+      return node
+    else:
+      var rTree = node.kind.newTree()
+      for child in node:
+        rTree.add inspect(child)
+      return rTree
+  result = inspect(ast)

+macro staticFor(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped): untyped =
+  result = newStmtList()
+  for i in start ..< stopEx:
+    result.add nnkBlockStmt.newTree(
+      ident("unrolledIter_" & $idx & $i),
+      body.replaceNodes(idx, newLit i)
+    )

+# Implementation
+# ------------------------------------------------------------
+# Note: the low-level implementations should not use static parameters:
+#       the code generated is already big enough for curves with different
+#       limb sizes, and we want to use the same codepath when limb lengths are compatible.

+func montyMul_CIOS_nocarry_unrolled(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
+  ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
+  ## and no-carry optimization.
+  ## This requires the most significant word of the Modulus
+  ##   M[^1] < high(Word) shr 1 (i.e. less than 0b01111...1111)
+  ## https://hackmd.io/@zkteam/modular_multiplication

+  # We want all the computation to be kept in registers
+  # hence we use a temporary `t`, hoping that the compiler does it.
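+  #
+  # Sketch of one outer iteration i (illustrative, mirrors the loops below):
+  #   (A, t[0]) <- a[0]*b[i] + t[0]
+  #    m        <- t[0] * m0ninv mod 2^w
+  #   (C, _)    <- m*M[0] + t[0]
+  #   for j in 1 ..< N: accumulate a[j]*b[i] and m*M[j] into t, shifted down by one
+  #   t[N-1]    <- C + A
+  # Because M[^1] < 2^(w-1), the final C + A cannot overflow a word,
+  # which is what lets this variant skip the extra carry word.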
+  var t: typeof(M)   # zero-init
+  const N = t.len
+  staticFor i, 0, N:
+    # (A, t[0]) <- a[0] * b[i] + t[0]
+    #  m        <- (t[0] * m0ninv) mod 2^w
+    # (C, _)    <- m * M[0] + t[0]
+    var A: Word
+    muladd1(A, t[0], a[0], b[i], t[0])
+    let m = t[0] * Word(m0ninv)
+    var C, lo: Word
+    muladd1(C, lo, m, M[0], t[0])

+    staticFor j, 1, N:
+      # (A, t[j])   <- a[j] * b[i] + A + t[j]
+      # (C, t[j-1]) <- m * M[j] + C + t[j]
+      muladd2(A, t[j], a[j], b[i], A, t[j])
+      muladd2(C, t[j-1], m, M[j], C, t[j])

+    t[N-1] = C + A

+  discard t.csub(M, not(t < M))
+  r = t

+func montyMul_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
+  ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
+  # - Analyzing and Comparing Montgomery Multiplication Algorithms
+  #   Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
+  #   http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf
+  #
+  # - Montgomery Arithmetic from a Software Perspective\
+  #   Joppe W. Bos and Peter L. Montgomery, 2017\
+  #   https://eprint.iacr.org/2017/1057

+  # We want all the computation to be kept in registers
+  # hence we use a temporary `t`, hoping that the compiler does it.
+  var t: typeof(M)   # zero-init
+  const N = t.len
+  # Extra words to handle up to 2 carries t[N] and t[N+1]
+  var tN: Word
+  var tNp1: Carry

+  staticFor i, 0, N:
+    var C = Zero

+    # Multiplication
+    staticFor j, 0, N:
+      # (C, t[j]) <- a[j] * b[i] + t[j] + C
+      muladd2(C, t[j], a[j], b[i], t[j], C)
+    addC(tNp1, tN, tN, C, Carry(0))

+    # Reduction
+    #  m        <- (t[0] * m0ninv) mod 2^w
+    # (C, _)    <- m * M[0] + t[0]
+    var lo: Word
+    C = Zero
+    let m = t[0] * Word(m0ninv)
+    muladd1(C, lo, m, M[0], t[0])
+    staticFor j, 1, N:
+      # (C, t[j-1]) <- m * M[j] + t[j] + C
+      muladd2(C, t[j-1], m, M[j], t[j], C)

+    # (C, t[N-1]) <- t[N] + C
+    # (_, t[N])   <- t[N+1] + C
+    var carry: Carry
+    addC(carry, t[N-1], tN, C, Carry(0))
+    addC(carry, tN, Word(tNp1), Zero, carry)

+  # t[N+1] can only be non-zero in the intermediate computation
+  # since it is immediately reduced to t[N] at the end of each "i" iteration
+  # However if t[N] is non-zero we have t > M
+  discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for primes of the form (2^64)^w
+  r = t

+# Exported API
+# ------------------------------------------------------------

+func montyMul*(
+       r: var Limbs, a, b, M: Limbs,
+       m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) {.inline.} =
+  ## Compute r <- a*b (mod M) in the Montgomery domain
+  ## `m0ninv` = -1/M (mod Word). Our words are 2^32 or 2^64
+  ##
+  ## This resets r to zero before processing. Use {.noInit.}
+  ## to avoid duplicating with Nim zero-init policy
+  ## The result `r` buffer size MUST be at least the size of `M` buffer
+  ##
+  ##
+  ## Assuming 64-bit words, the magic constant should be:
+  ##
+  ## - µ ≡ -1/M[0] (mod 2^64) for a general multiplication
+  ##   This can be precomputed with `negInvModWord`
+  ## - 1 for conversion from Montgomery to canonical representation
+  ##   The library implements a faster `redc` primitive for that use-case
+  ## - R^2 (mod M) for conversion from canonical to Montgomery representation
+  ##
+  # i.e. c'R <- a'R b'R * R^-1 (mod M) in the natural domain
+  # as in the Montgomery domain all numbers are scaled by R

+  # Nim doesn't like static Word, so we pass static BaseType up to here.
+  # Then we pass them as Word again for the final processing.
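+
+  # For reference, field multiplication in finite_fields.nim drives this as:
+  #   r.mres.montyMul(a.mres, b.mres, Fp.C.Mod.mres,
+  #                   Fp.C.getNegInvModWord(), Fp.C.canUseNoCarryMontyMul())
+  # so the no-carry vs generic CIOS choice is resolved at compile-time per curve.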
+
+  # Many curve moduli are "Montgomery-friendly" which means that m0ninv is 1
+  # This saves N basic type multiplications and potentially many register movs.
+  # Moreover, unless the "mulx" instruction is used, x86 "mul" requires very specific registers.
+  # Compilers should be able to constant-propagate, but this prevents reusing code
+  # for example between secp256k1 (friendly) and BN254 (non-friendly).
+  # Here, as "montyMul" is inlined at the call site, the compiler shouldn't constant-fold, saving size.
+  # Inlining the implementation instead (and no-inlining this "montyMul" proc) would allow constant propagation
+  # of Montgomery-friendly m0ninv if the compiler deems it interesting,
+  # or we use `when m0ninv == 1` and enforce the inlining.
+  when canUseNoCarryMontyMul:
+    montyMul_CIOS_nocarry_unrolled(r, a, b, M, m0ninv)
+  else:
+    montyMul_CIOS(r, a, b, M, m0ninv)

+func montySquare*(r: var Limbs, a, M: Limbs,
+                  m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) {.inline.} =
+  ## Compute r <- a^2 (mod M) in the Montgomery domain
+  ## `m0ninv` = -1/M (mod Word). Our words are 2^32 or 2^64
+  montyMul(r, a, a, M, m0ninv, canUseNoCarryMontyMul)

+func redc*(r: var Limbs, a, one, M: Limbs,
+           m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) {.inline.} =
+  ## Transform a bigint ``a`` from its Montgomery N-residue representation (mod N)
+  ## to the regular natural representation (mod N)
+  ##
+  ## with W = M.len
+  ## and R = (2^WordBitWidth)^W
+  ##
+  ## Does "a * R^-1 (mod M)"
+  ##
+  ## This is called a Montgomery Reduction
+  ## The Montgomery Magic Constant µ = -1/M[0] (mod 2^WordBitWidth)
+  ## is used internally and can be precomputed with negInvModWord(Curve)
+  # References:
+  #   - https://eprint.iacr.org/2017/1057.pdf (Montgomery)
+  #     page: Radix-r interleaved multiplication algorithm
+  #   - https://en.wikipedia.org/wiki/Montgomery_modular_multiplication#Montgomery_arithmetic_on_multiprecision_(variable-radix)_integers
+  #   - http://langevin.univ-tln.fr/cours/MLC/extra/montgomery.pdf
+  #     Montgomery original paper
+  #
+  montyMul(r, a, one, M, m0ninv, canUseNoCarryMontyMul)

+func montyResidue*(r: var Limbs, a, M, r2modM: Limbs,
+                   m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) {.inline.} =
+  ## Transform a bigint ``a`` from its natural representation (mod N)
+  ## to the Montgomery n-residue representation
+  ##
+  ## Montgomery-multiplication-based
+  ##
+  ## with W = M.len
+  ## and R = (2^WordBitWidth)^W
+  ##
+  ## Does "a * R (mod M)"
+  ##
+  ## `a`: The source BigInt in the natural representation. `a` in [0, N) range
+  ## `M`: The field modulus. M must be odd.
+  ## `r2modM`: R² (mod M). Can be precomputed with the `r2mod` function
+  ##
+  ## Important: `r` is overwritten
+  ## The result `r` buffer size MUST be at least the size of `M` buffer
+  # Reference: https://eprint.iacr.org/2017/1057.pdf
+  montyMul(r, a, r2ModM, M, m0ninv, canUseNoCarryMontyMul)

+# Montgomery Modular Exponentiation
+# ------------------------------------------
+# We use fixed-window based exponentiation
+# that is constant-time: i.e. the number of multiplications
+# does not depend on the number of set bits in the exponent:
+# multiplications are always done and conditionally copied.
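+#
+# For intuition (illustrative): with an n-bit exponent and window size k,
+# the loop performs n squarings and about n/k multiplications,
+# at the cost of precomputing 2^k - 1 powers of the base.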
+# +# The exponent MUST NOT be private data (until audited otherwise) +# - Power attack on RSA, https://www.di.ens.fr/~fouque/pub/ches06.pdf +# - Flush-and-reload on Sliding window exponentiation: https://tutcris.tut.fi/portal/files/8966761/p1639_pereida_garcia.pdf +# - Sliding right into disaster, https://eprint.iacr.org/2017/627.pdf +# - Fixed window leak: https://www.scirp.org/pdf/JCC_2019102810331929.pdf +# - Constructing sliding-windows leak, https://easychair.org/publications/open/fBNC +# +# For pairing curves, this is the case since exponentiation is only +# used for inversion via the Little Fermat theorem. +# For RSA, some exponentiations uses private exponents. +# +# Note: +# - Implementation closely follows Thomas Pornin's BearSSL +# - Apache Milagro Crypto has an alternative implementation +# that is more straightforward however: +# - the exponent hamming weight is used as loop bounds +# - the base^k is stored at each index of a temp table of size k +# - the base^k to use is indexed by the hamming weight +# of the exponent, leaking this to cache attacks +# - in contrast BearSSL touches the whole table to +# hide the actual selection + +template checkPowScratchSpaceLen(len: int) = + ## Checks that there is a minimum of scratchspace to hold the temporaries + debug: + assert len >= 2, "Internal Error: the scratchspace for powmod should be equal or greater than 2" + +func getWindowLen(bufLen: int): uint = + ## Compute the maximum window size that fits in the scratchspace buffer + checkPowScratchSpaceLen(bufLen) + result = 5 + while (1 shl result) + 1 > bufLen: + dec result + +func montyPowPrologue( + a: var Limbs, M, one: Limbs, + m0ninv: static BaseType, + scratchspace: var openarray[Limbs], + canUseNoCarryMontyMul: static bool + ): uint = + ## Setup the scratchspace + ## Returns the fixed-window size for exponentiation with window optimization. 
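+  # e.g. (illustrative) a scratchspace of 33 big-ints allows window = 5,
+  # since 2^5 + 1 = 33 temporaries are needed for the precomputed powers.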
+  result = scratchspace.len.getWindowLen()
+  # Precompute window content, special case for window = 1
+  # (i.e. scratchspace has only space for 2 temporaries)
+  # The content scratchspace[k+1] is set to a^k
+  # with scratchspace[0] untouched
+  if result == 1:
+    scratchspace[1] = a
+  else:
+    scratchspace[2] = a
+    for k in 2 ..< 1 shl result:
+      scratchspace[k+1].montyMul(scratchspace[k], a, M, m0ninv, canUseNoCarryMontyMul)

+  # Set a to one
+  a = one

+func montyPowSquarings(
+       a: var Limbs,
+       exponent: openarray[byte],
+       M: Limbs,
+       negInvModWord: static BaseType,
+       tmp: var Limbs,
+       window: uint,
+       acc, acc_len: var uint,
+       e: var int,
+       canUseNoCarryMontyMul: static bool
+     ): tuple[k, bits: uint] {.inline.}=
+  ## Squaring step of exponentiation by squaring
+  ## Get the next k bits in range [1, window)
+  ## Square k times
+  ## Returns the number of squarings done and the corresponding bits
+  ##
+  ## Updates iteration variables and accumulators
+  # Due to the high number of parameters,
+  # forcing this inline actually reduces the code size

+  # Get the next bits
+  var k = window
+  if acc_len < window:
+    if e < exponent.len:
+      acc = (acc shl 8) or exponent[e].uint
+      inc e
+      acc_len += 8
+    else: # Drained all exponent bits
+      k = acc_len

+  let bits = (acc shr (acc_len - k)) and ((1'u32 shl k) - 1)
+  acc_len -= k

+  # We have k bits and can do k squarings
+  for i in 0 ..< k:
+    tmp.montySquare(a, M, negInvModWord, canUseNoCarryMontyMul)
+    a = tmp

+  return (k, bits)

+func montyPow*(
+       a: var Limbs,
+       exponent: openarray[byte],
+       M, one: Limbs,
+       negInvModWord: static BaseType,
+       scratchspace: var openarray[Limbs],
+       canUseNoCarryMontyMul: static bool
+     ) =
+  ## Modular exponentiation r = a^exponent mod M
+  ## in the Montgomery domain
+  ##
+  ## This uses fixed-window optimization if possible
+  ##
+  ## - On input ``a`` is the base, on ``output`` a = a^exponent (mod M)
+  ##   ``a`` is in the Montgomery domain
+  ## - ``exponent`` is the exponent in big-endian canonical format (octet-string)
+  ##   Use ``exportRawUint`` for conversion
+  ## - ``M`` is the modulus
+  ## - ``one`` is 1 (mod M) in Montgomery representation
+  ## - ``negInvModWord`` is the Montgomery magic constant "-1/M[0] mod 2^WordBitWidth"
+  ## - ``scratchspace`` is a buffer for the fixed-window optimization,
+  ##   with k the window bit-size (up to 5).
+  ##   It must be able to hold 2^k + 1 big-ints;
+  ##   a window of 1 bit (no window optimization) requires only 2 big-ints
+  ##
+  ## Note that the best window size requires benchmarking and is a tradeoff between
+  ## - performance
+  ## - stack usage
+  ## - precomputation
+  ##
+  ## For example for BLS12-381 a window size of 5 is 30% faster than no window,
+  ## but windows of size 2, 3, 4 bring no performance benefit, only increased stack space.
+  ## A window of size 5 requires (2^5 + 1)*(381 + 7)/8 = 33 * 48 bytes = 1584 bytes
+  ## of scratchspace (on the stack).

+  let window = montyPowPrologue(a, M, one, negInvModWord, scratchspace, canUseNoCarryMontyMul)

+  # We process bits from most to least significant.
+  # At each loop iteration we have acc_len bits in acc.
+  # To maintain constant-time, the number of iterations,
+  # operations and memory accesses should be the same
+  # regardless of acc & acc_len
+  var
+    acc, acc_len: uint
+    e = 0
+  while acc_len > 0 or e < exponent.len:
+    let (k, bits) = montyPowSquarings(
+      a, exponent, M, negInvModWord,
+      scratchspace[0], window,
+      acc, acc_len, e,
+      canUseNoCarryMontyMul
+    )

+    # Window lookup: we set scratchspace[1] to the lookup value.
+ # If the window length is 1, then it's already set. + if window > 1: + # otherwise we need a constant-time lookup + # in particular we need the same memory accesses, we can't + # just index the openarray with the bits to avoid cache attacks. + for i in 1 ..< 1 shl k: + let ctl = Word(i) == Word(bits) + scratchspace[1].ccopy(scratchspace[1+i], ctl) + + # Multiply with the looked-up value + # we keep the product only if the exponent bits are not all zero + scratchspace[0].montyMul(a, scratchspace[1], M, negInvModWord, canUseNoCarryMontyMul) + a.ccopy(scratchspace[0], Word(bits).isNonZero()) + +func montyPowUnsafeExponent*( + a: var Limbs, + exponent: openarray[byte], + M, one: Limbs, + negInvModWord: static BaseType, + scratchspace: var openarray[Limbs], + canUseNoCarryMontyMul: static bool + ) = + ## Modular exponentiation r = a^exponent mod M + ## in the Montgomery domain + ## + ## Warning ⚠️ : + ## This is an optimization for public exponent + ## Otherwise bits of the exponent can be retrieved with: + ## - memory access analysis + ## - power analysis + ## - timing analysis + + # TODO: scratchspace[1] is unused when window > 1 + + let window = montyPowPrologue(a, M, one, negInvModWord, scratchspace, canUseNoCarryMontyMul) + + var + acc, acc_len: uint + e = 0 + while acc_len > 0 or e < exponent.len: + let (k, bits) = montyPowSquarings( + a, exponent, M, negInvModWord, + scratchspace[0], window, + acc, acc_len, e, + canUseNoCarryMontyMul + ) + + ## Warning ⚠️: Exposes the exponent bits + if bits != 0: + if window > 1: + scratchspace[0].montyMul(a, scratchspace[1+bits], M, negInvModWord, canUseNoCarryMontyMul) + else: + # scratchspace[1] holds the original `a` + scratchspace[0].montyMul(a, scratchspace[1], M, negInvModWord, canUseNoCarryMontyMul) + a = scratchspace[0] diff --git a/constantine/arithmetic/precomputed.nim b/constantine/arithmetic/precomputed.nim index 4af0a75..150db9d 100644 --- a/constantine/arithmetic/precomputed.nim +++ b/constantine/arithmetic/precomputed.nim @@ -7,7 +7,7 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import - ./bigints_checked, + ./bigints, ../primitives/constant_time, ../config/common, ../io/io_bigints @@ -22,37 +22,77 @@ import # ############################################################ # # Those primitives are intended to be compile-time only -# They are generic over the bitsize: enabling them at runtime -# would create a copy for each bitsize used (monomorphization) -# leading to code-bloat. -# Thos are NOT compile-time, using CTBool seems to confuse the VM +# Those are NOT tagged compile-time, using CTBool seems to confuse the VM # We don't use distinct types here, they confuse the VM -# Similarly, isMsbSet causes trouble with distinct type in the VM +# Similarly, using addC / subB confuses the VM -func isMsbSet(x: BaseType): bool = - const msb_pos = BaseType.sizeof * 8 - 1 - bool(x shr msb_pos) +# As we choose to use the full 32/64 bits of the integers and there is no carry flag +# in the compile-time VM we need a portable (and slow) "adc" and "sbb". +# Hopefully compilation time stays decent. 
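+#
+# Illustration (assuming 64-bit words): to add 0xFFFF_FFFF_FFFF_FFFF + 1,
+# each word is split into two 32-bit halves, the halves are added with
+# the running carry, and the carry is read from bit 32 of each half-sum:
+#   lo: 0xFFFF_FFFF + 1 = 0x1_0000_0000 -> half-carry 1, lo half 0
+#   hi: 0xFFFF_FFFF + 0 + 1              -> half-carry 1, hi half 0
+# giving sum = 0 and carry-out = 1, without needing a hardware carry flag.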
+
+const
+  HalfWidth = WordBitWidth shr 1
+  HalfBase = (BaseType(1) shl HalfWidth)
+  HalfMask = HalfBase - 1

+func split(n: BaseType): tuple[hi, lo: BaseType] =
+  result.hi = n shr HalfWidth
+  result.lo = n and HalfMask

+func merge(hi, lo: BaseType): BaseType =
+  (hi shl HalfWidth) or lo

+func addC(cOut, sum: var BaseType, a, b, cIn: BaseType) =
+  # Add with carry, fallback for the Compile-Time VM
+  # (CarryOut, Sum) <- a + b + CarryIn
+  let (aHi, aLo) = split(a)
+  let (bHi, bLo) = split(b)
+  let tLo = aLo + bLo + cIn
+  let (cLo, rLo) = split(tLo)
+  let tHi = aHi + bHi + cLo
+  let (cHi, rHi) = split(tHi)
+  cOut = cHi
+  sum = merge(rHi, rLo)

+func subB(bOut, diff: var BaseType, a, b, bIn: BaseType) =
+  # Subtract with borrow, fallback for the Compile-Time VM
+  # (BorrowOut, Diff) <- a - b - BorrowIn
+  let (aHi, aLo) = split(a)
+  let (bHi, bLo) = split(b)
+  let tLo = HalfBase + aLo - bLo - bIn
+  let (noBorrowLo, rLo) = split(tLo)
+  let tHi = HalfBase + aHi - bHi - BaseType(noBorrowLo == 0)
+  let (noBorrowHi, rHi) = split(tHi)
+  bOut = BaseType(noBorrowHi == 0)
+  diff = merge(rHi, rLo)

func dbl(a: var BigInt): bool =
  ## In-place multiprecision double
  ## a -> 2a
+  var carry, sum: BaseType
  for i in 0 ..< a.limbs.len:
-    var z = BaseType(a.limbs[i]) * 2 + BaseType(result)
-    result = z.isMsbSet()
-    a.limbs[i] = mask(Word(z))
+    let ai = BaseType(a.limbs[i])
+    addC(carry, sum, ai, ai, carry)
+    a.limbs[i] = Word(sum)

-func sub(a: var BigInt, b: BigInt, ctl: bool): bool =
+  result = bool(carry)
+
+func csub(a: var BigInt, b: BigInt, ctl: bool): bool =
  ## In-place optional subtraction
  ##
  ## It is NOT constant-time and is intended
  ## only for compile-time precomputation
  ## of non-secret data.
+  var borrow, diff: BaseType
  for i in 0 ..< a.limbs.len:
-    let new_a = BaseType(a.limbs[i]) - BaseType(b.limbs[i]) - BaseType(result)
-    result = new_a.isMsbSet()
-    a.limbs[i] = if ctl: new_a.Word.mask()
-                 else: a.limbs[i]
+    let ai = BaseType(a.limbs[i])
+    let bi = BaseType(b.limbs[i])
+    subB(borrow, diff, ai, bi, borrow)
+    if ctl:
+      a.limbs[i] = Word(diff)
+
+  result = bool(borrow)

func doubleMod(a: var BigInt, M: BigInt) =
  ## In-place modular double
@@ -62,8 +102,8 @@ func doubleMod(a: var BigInt, M: BigInt) =
  ## only for compile-time precomputation
  ## of non-secret data.
  var ctl = dbl(a)
-  ctl = ctl or not sub(a, M, false)
-  discard sub(a, M, ctl)
+  ctl = ctl or not a.csub(M, false)
+  discard csub(a, M, ctl)

# ############################################################
#
@@ -75,11 +115,19 @@ func checkOddModulus(M: BigInt) =
  doAssert bool(BaseType(M.limbs[0]) and 1), "Internal Error: the modulus must be odd to use the Montgomery representation."
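+
+# e.g. (illustrative) a 254-bit modulus like BN254's stored in 4 64-bit limbs
+# must have its most significant bit at position 253 - 64*3 = 61 of limbs[^1],
+# which is what checkValidModulus verifies below.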
func checkValidModulus(M: BigInt) =
-  const expectedMsb = M.bits-1 - WordBitSize * (M.limbs.len - 1)
+  const expectedMsb = M.bits-1 - WordBitWidth * (M.limbs.len - 1)
  let msb = log2(BaseType(M.limbs[^1]))

  doAssert msb == expectedMsb, "Internal Error: the modulus must use all declared bits and only those"

+func useNoCarryMontyMul*(M: BigInt): bool =
+  ## Returns if the modulus is compatible
+  ## with the no-carry Montgomery Multiplication
+  ## from https://hackmd.io/@zkteam/modular_multiplication
+  # Indirection needed because static object are buggy
+  # https://github.com/nim-lang/Nim/issues/9679
+  BaseType(M.limbs[^1]) < high(BaseType) shr 1
+
func negInvModWord*(M: BigInt): BaseType =
  ## Returns the Montgomery domain magic constant for the input modulus:
  ##
@@ -88,9 +136,9 @@ func negInvModWord*(M: BigInt): BaseType =
  ## M[0] is the least significant limb of M
  ## M must be odd and greater than 2.
  ##
-  ## Assuming 63-bit words:
+  ## Assuming 64-bit words:
  ##
-  ## µ ≡ -1/M[0] (mod 2^63)
+  ## µ ≡ -1/M[0] (mod 2^64)

  # We use BaseType for return value because static distinct type
  # confuses Nim semchecks [UPSTREAM BUG]
@@ -108,17 +156,17 @@ func negInvModWord*(M: BigInt): BaseType =
  #   - http://marc-b-reynolds.github.io/math/2017/09/18/ModInverse.html

  # For Montgomery magic number, we are in a special case
-  # where a = M and m = 2^WordBitsize.
+  # where a = M and m = 2^WordBitWidth.
  # For a and m to be coprimes, a must be odd.

  # We have the following relation
  # ax ≡ 1 (mod 2^k) <=> ax(2 - ax) ≡ 1 (mod 2^(2k))
  #
  # To get -1/M0 mod LimbSize
-  # we can either negate the resulting x of `ax(2 - ax) ≡ 1 (mod 2^(2k))`
-  # or do ax(2 + ax) ≡ 1 (mod 2^(2k))
+  # we can negate the result x of `ax(2 - ax) ≡ 1 (mod 2^(2k))`
+  # or if k is odd: do ax(2 + ax) ≡ 1 (mod 2^(2k))
  #
-  # To get the the modular inverse of 2^k' with arbitrary k' (like k=63 in our case)
+  # To get the modular inverse of 2^k' with arbitrary k'
  # we can do modInv(a, 2^64) mod 2^63 as mentioned in the Koç paper.
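+  # Illustration: starting from x = M0 (its own inverse mod 2),
+  # each x <- x(2 - M0*x) step doubles the number of correct low bits:
+  # 1 -> 2 -> 4 -> 8 -> 16 -> 32 -> 64, hence the log2(64) = 6 iterations
+  # below for 64-bit words, followed by a final negation.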
checkOddModulus(M) @@ -126,21 +174,21 @@ func negInvModWord*(M: BigInt): BaseType = let M0 = BaseType(M.limbs[0]) - k = log2(uint32(WordPhysBitSize)) + k = log2(WordBitWidth.uint32) result = M0 # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse for _ in 0 ..< k: # at each iteration we get the inverse mod(2^2k) - result *= 2 + M0 * result # x' = x(2 + ax) (`+` to avoid negating at the end) + result *= 2 - M0 * result # x' = x(2 - ax) - # Our actual word size is 2^63 not 2^64 - result = result and BaseType(MaxWord) + # negate to obtain the negative inverse + result = not(result) + 1 func r_powmod(n: static int, M: BigInt): BigInt = ## Returns the Montgomery domain magic constant for the input modulus: ## - ## R ≡ R (mod M) with R = (2^WordBitSize)^numWords + ## R ≡ R (mod M) with R = (2^WordBitWidth)^numWords ## or - ## R² ≡ R² (mod M) with R = (2^WordBitSize)^numWords + ## R² ≡ R² (mod M) with R = (2^WordBitWidth)^numWords ## ## Assuming a field modulus of size 256-bit with 63-bit words, we require 5 words ## R² ≡ ((2^63)^5)^2 (mod M) = 2^630 (mod M) @@ -161,22 +209,20 @@ func r_powmod(n: static int, M: BigInt): BigInt = checkOddModulus(M) checkValidModulus(M) - result.setInternalBitLength() - const w = M.limbs.len - msb = M.bits-1 - WordBitSize * (w - 1) - start = (w-1)*WordBitSize + msb - stop = n*WordBitSize*w + msb = M.bits-1 - WordBitWidth * (w - 1) + start = (w-1)*WordBitWidth + msb + stop = n*WordBitWidth*w - result.limbs[^1] = Word(1 shl msb) # C0 = 2^(wn-1), the power of 2 immediatly less than the modulus + result.limbs[^1] = Word(BaseType(1) shl msb) # C0 = 2^(wn-1), the power of 2 immediatly less than the modulus for _ in start ..< stop: result.doubleMod(M) func r2mod*(M: BigInt): BigInt = ## Returns the Montgomery domain magic constant for the input modulus: ## - ## R² ≡ R² (mod M) with R = (2^WordBitSize)^numWords + ## R² ≡ R² (mod M) with R = (2^WordBitWidth)^numWords ## ## Assuming a field modulus of size 256-bit with 63-bit words, we require 5 words ## R² ≡ ((2^63)^5)^2 (mod M) = 2^630 (mod M) @@ -195,6 +241,6 @@ func primeMinus2_BE*[bits: static int]( ## For use to precompute modular inverse exponent. var tmp = P - discard tmp.sub(BigInt[bits].fromRawUint([byte 2], bigEndian), true) + discard tmp.csub(BigInt[bits].fromRawUint([byte 2], bigEndian), true) result.exportRawUint(tmp, bigEndian) diff --git a/constantine/config/common.nim b/constantine/config/common.nim index 66b0372..d726e28 100644 --- a/constantine/config/common.nim +++ b/constantine/config/common.nim @@ -12,9 +12,9 @@ # # ############################################################ -import ../primitives/constant_time +import ../primitives -when sizeof(int) == 8: +when sizeof(int) == 8 and not defined(Constantine32): type BaseType* = uint64 ## Physical BigInt for conversion in "normal integers" @@ -29,23 +29,15 @@ type ## A logical BigInt word is of size physical MachineWord-1 const - ExcessBits = 1 - WordPhysBitSize* = sizeof(Word) * 8 - WordBitSize* = WordPhysBitSize - ExcessBits + WordBitWidth* = sizeof(Word) * 8 + ## Logical word size CtTrue* = ctrue(Word) CtFalse* = cfalse(Word) Zero* = Word(0) One* = Word(1) - MaxWord* = (not Zero) shr (WordPhysBitSize - WordBitSize) - ## This represents 0x7F_FF_FF_FF__FF_FF_FF_FF - ## also 0b0111...1111 - ## This biggest representable number in our limbs. - ## i.e. 
The most significant bit is never set at the end of each function - -template mask*(w: Word): Word = - w and MaxWord + MaxWord* = Word(high(BaseType)) # ############################################################ # diff --git a/constantine/config/curves.nim b/constantine/config/curves.nim index 6a1424e..a627454 100644 --- a/constantine/config/curves.nim +++ b/constantine/config/curves.nim @@ -11,7 +11,7 @@ import macros, # Internal ./curves_parser, ./common, - ../arithmetic/[precomputed, bigints_checked] + ../arithmetic/[precomputed, bigints] # ############################################################ @@ -109,6 +109,17 @@ macro genMontyMagics(T: typed): untyped = let E = T.getImpl[2] for i in 1 ..< E.len: let curve = E[i] + # const MyCurve_CanUseNoCarryMontyMul = useNoCarryMontyMul(MyCurve_Modulus) + result.add newConstStmt( + ident($curve & "_CanUseNoCarryMontyMul"), newCall( + bindSym"useNoCarryMontyMul", + nnkDotExpr.newTree( + bindSym($curve & "_Modulus"), + ident"mres" + ) + ) + ) + # const MyCurve_R2modP = r2mod(MyCurve_Modulus) result.add newConstStmt( ident($curve & "_R2modP"), newCall( @@ -154,6 +165,11 @@ macro genMontyMagics(T: typed): untyped = genMontyMagics(Curve) +macro canUseNoCarryMontyMul*(C: static Curve): untyped = + ## Returns true if the Modulus is compatible with a fast + ## Montgomery multiplication that avoids many carries + result = bindSym($C & "_CanUseNoCarryMontyMul") + macro getR2modP*(C: static Curve): untyped = ## Get the Montgomery "R^2 mod P" constant associated to a curve field modulus result = bindSym($C & "_R2modP") @@ -192,7 +208,7 @@ macro debugConsts(): untyped = echo "Curve ", `curveName`,':' echo " Field Modulus: ", `modulus` echo " Montgomery R² (mod P): ", `r2modp` - echo " Montgomery -1/P[0] (mod 2^", WordBitSize, "): ", `negInvModWord` + echo " Montgomery -1/P[0] (mod 2^", WordBitWidth, "): ", `negInvModWord` result.add quote do: echo "----------------------------------------------------------------------------" diff --git a/constantine/config/curves_parser.nim b/constantine/config/curves_parser.nim index c0bbcb7..6254ab4 100644 --- a/constantine/config/curves_parser.nim +++ b/constantine/config/curves_parser.nim @@ -10,7 +10,7 @@ import # Standard library macros, # Internal - ../io/io_bigints, ../arithmetic/bigints_checked + ../io/io_bigints, ../arithmetic/bigints # Macro to parse declarative curves configuration. diff --git a/constantine/io/io_bigints.nim b/constantine/io/io_bigints.nim index dc85fd7..ecdefd5 100644 --- a/constantine/io/io_bigints.nim +++ b/constantine/io/io_bigints.nim @@ -12,7 +12,7 @@ import ../primitives/constant_time, - ../arithmetic/bigints_checked, + ../arithmetic/bigints, ../config/common # ############################################################ @@ -21,6 +21,10 @@ import # # ############################################################ +# Note: the parsing/serialization routines were initially developed +# with an internal representation that used 31 bits out of a uint32 +# or 63-bits out of an uint64 + # TODO: tag/remove exceptions raised. 
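+# e.g. (illustrative, with full 64-bit words) the little-endian bytes
+# [0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09]
+# deserialize to limbs[0] = 0x0807060504030201 and limbs[1] = 0x09.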
func fromRawUintLE( @@ -49,10 +53,10 @@ func fromRawUintLE( acc_len += 8 # We count bit by bit # if full, dump - if acc_len >= WordBitSize: - dst.limbs[dst_idx] = mask(acc) + if acc_len >= WordBitWidth: + dst.limbs[dst_idx] = acc inc dst_idx - acc_len -= WordBitSize + acc_len -= WordBitWidth acc = src_byte shr (8 - acc_len) if dst_idx < dst.limbs.len: @@ -86,10 +90,10 @@ func fromRawUintBE( acc_len += 8 # We count bit by bit # if full, dump - if acc_len >= WordBitSize: - dst.limbs[dst_idx] = mask(acc) + if acc_len >= WordBitWidth: + dst.limbs[dst_idx] = acc inc dst_idx - acc_len -= WordBitSize + acc_len -= WordBitWidth acc = src_byte shr (8 - acc_len) if dst_idx < dst.limbs.len: @@ -113,7 +117,6 @@ func fromRawUint*( dst.fromRawUintLE(src) else: dst.fromRawUintBE(src) - dst.setInternalBitLength() func fromRawUint*( T: type BigInt, @@ -187,14 +190,17 @@ func exportRawUintLE( inc src_idx if acc_len == 0: - # Edge case, we need to refill the buffer to output 64-bit - # as we can only read 63-bit per word + # We need to refill the buffer to output 64-bit acc = w - acc_len = WordBitSize + acc_len = WordBitWidth else: - let lo = (w shl acc_len) or acc - dec acc_len - acc = w shr (WordBitSize - acc_len) + when WordBitWidth == sizeof(Word) * 8: + let lo = acc + acc = w + else: # If using 63-bit (or less) out of uint64 + let lo = (w shl acc_len) or acc + dec acc_len + acc = w shr (WordBitWidth - acc_len) if tail >= sizeof(Word): # Unrolled copy @@ -237,14 +243,17 @@ func exportRawUintBE( inc src_idx if acc_len == 0: - # Edge case, we need to refill the buffer to output 64-bit - # as we can only read 63-bit per word + # We need to refill the buffer to output 64-bit acc = w - acc_len = WordBitSize + acc_len = WordBitWidth else: - let lo = (w shl acc_len) or acc - dec acc_len - acc = w shr (WordBitSize - acc_len) + when WordBitWidth == sizeof(Word) * 8: + let lo = acc + acc = w + else: # If using 63-bit (or less) out of uint64 + let lo = (w shl acc_len) or acc + dec acc_len + acc = w shr (WordBitWidth - acc_len) if tail >= sizeof(Word): # Unrolled copy diff --git a/constantine/io/io_fields.nim b/constantine/io/io_fields.nim index b9973fe..dbc1c43 100644 --- a/constantine/io/io_fields.nim +++ b/constantine/io/io_fields.nim @@ -9,7 +9,7 @@ import ./io_bigints, ../config/curves, - ../arithmetic/[bigints_checked, finite_fields] + ../arithmetic/[bigints, finite_fields] # No exceptions allowed {.push raises: [].} diff --git a/constantine/primitives.nim b/constantine/primitives.nim new file mode 100644 index 0000000..d311f84 --- /dev/null +++ b/constantine/primitives.nim @@ -0,0 +1,21 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+
+import
+  primitives/constant_time_types,
+  primitives/constant_time,
+  primitives/multiplexers,
+  primitives/addcarry_subborrow,
+  primitives/extended_precision
+
+export
+  constant_time_types,
+  constant_time,
+  multiplexers,
+  addcarry_subborrow,
+  extended_precision
diff --git a/constantine/primitives/README.md b/constantine/primitives/README.md
index c4374f8..921ff6b 100644
--- a/constantine/primitives/README.md
+++ b/constantine/primitives/README.md
@@ -6,3 +6,86 @@ This folder holds:
  to have the compiler enforce proper usage
- extended precision multiplication and division primitives
- assembly primitives
+- intrinsics
+
+## Security
+
+⚠: **Hardware assumptions**
+
+  Constantine assumes that multiplication is implemented
+  constant-time in hardware.
+
+  If this is not the case,
+  you SHOULD **strongly reconsider** your hardware choice or
+  reimplement multiplication with constant-time guarantees
+  (at the cost of speed and code-size)
+
+⚠: Currently division and modulo operations are `unsafe`
+   and use hardware division.
+   No known CPU implements division in constant-time.
+   A constant-time alternative will be provided.
+
+While extremely slow, division and modulo are only used
+on random or user inputs to constrain them to the prime field
+of the elliptic curves.
+Constantine internals are built to avoid costly constant-time divisions.
+
+## Performance and code size
+
+It is recommended to prefer Clang, MSVC or ICC over GCC if possible.
+GCC code is significantly slower and bigger for multiprecision arithmetic
+even when using dedicated intrinsics.
+
+See https://gcc.godbolt.org/z/2h768y
+```C
+#include <stdint.h>
+#include <x86intrin.h>
+
+void add256(uint64_t a[4], uint64_t b[4]){
+  uint8_t carry = 0;
+  for (int i = 0; i < 4; ++i)
+    carry = _addcarry_u64(carry, a[i], b[i], &a[i]);
+}
+```
+
+GCC
+```asm
+add256:
+  movq (%rsi), %rax
+  addq (%rdi), %rax
+  setc %dl
+  movq %rax, (%rdi)
+  movq 8(%rdi), %rax
+  addb $-1, %dl
+  adcq 8(%rsi), %rax
+  setc %dl
+  movq %rax, 8(%rdi)
+  movq 16(%rdi), %rax
+  addb $-1, %dl
+  adcq 16(%rsi), %rax
+  setc %dl
+  movq %rax, 16(%rdi)
+  movq 24(%rsi), %rax
+  addb $-1, %dl
+  adcq %rax, 24(%rdi)
+  ret
+```
+
+Clang
+```asm
+add256:
+  movq (%rsi), %rax
+  addq %rax, (%rdi)
+  movq 8(%rsi), %rax
+  adcq %rax, 8(%rdi)
+  movq 16(%rsi), %rax
+  adcq %rax, 16(%rdi)
+  movq 24(%rsi), %rax
+  adcq %rax, 24(%rdi)
+  retq
+```
+
+### Inline assembly
+
+Using inline assembly sacrifices code readability, portability, auditability and maintainability.
+That said, the performance might be worth it.
diff --git a/constantine/primitives/addcarry_subborrow.nim b/constantine/primitives/addcarry_subborrow.nim
new file mode 100644
index 0000000..03bec55
--- /dev/null
+++ b/constantine/primitives/addcarry_subborrow.nim
@@ -0,0 +1,168 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import ./constant_time_types
+
+# ############################################################
+#
+#        Add-with-carry and Sub-with-borrow
+#
+# ############################################################
+#
+# This file implements add-with-carry and sub-with-borrow
+#
+# It is currently (Mar 2020) impossible to have the compiler
+# generate optimal code in a generic way.
+#
+# On x86, the addcarry_u64 intrinsic will generate optimal code
+# except on GCC.
+#
+# On other CPU architectures inline assembly might be desirable.
+# A compiler proof-of-concept is available in the "research" folder.
+#
+# See https://gcc.godbolt.org/z/2h768y
+# ```C
+# #include <stdint.h>
+# #include <x86intrin.h>
+#
+# void add256(uint64_t a[4], uint64_t b[4]){
+#   uint8_t carry = 0;
+#   for (int i = 0; i < 4; ++i)
+#     carry = _addcarry_u64(carry, a[i], b[i], &a[i]);
+# }
+# ```
+#
+# GCC
+# ```asm
+# add256:
+#   movq (%rsi), %rax
+#   addq (%rdi), %rax
+#   setc %dl
+#   movq %rax, (%rdi)
+#   movq 8(%rdi), %rax
+#   addb $-1, %dl
+#   adcq 8(%rsi), %rax
+#   setc %dl
+#   movq %rax, 8(%rdi)
+#   movq 16(%rdi), %rax
+#   addb $-1, %dl
+#   adcq 16(%rsi), %rax
+#   setc %dl
+#   movq %rax, 16(%rdi)
+#   movq 24(%rsi), %rax
+#   addb $-1, %dl
+#   adcq %rax, 24(%rdi)
+#   ret
+# ```
+#
+# Clang
+# ```asm
+# add256:
+#   movq (%rsi), %rax
+#   addq %rax, (%rdi)
+#   movq 8(%rsi), %rax
+#   adcq %rax, 8(%rdi)
+#   movq 16(%rsi), %rax
+#   adcq %rax, 16(%rdi)
+#   movq 24(%rsi), %rax
+#   adcq %rax, 24(%rdi)
+#   retq
+# ```

+# ############################################################
+#
+#                      Intrinsics
+#
+# ############################################################

+# Note: GCC before 2017 had incorrect codegen in some cases:
+# - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81300

+when X86:
+  when defined(windows):
+    {.pragma: intrinsics, header:"<intrin.h>", nodecl.}
+  else:
+    {.pragma: intrinsics, header:"<x86intrin.h>", nodecl.}
+
+  func addcarry_u32(carryIn: Carry, a, b: Ct[uint32], sum: var Ct[uint32]): Carry {.importc: "_addcarry_u32", intrinsics.}
+  func subborrow_u32(borrowIn: Borrow, a, b: Ct[uint32], diff: var Ct[uint32]): Borrow {.importc: "_subborrow_u32", intrinsics.}
+
+  func addcarry_u64(carryIn: Carry, a, b: Ct[uint64], sum: var Ct[uint64]): Carry {.importc: "_addcarry_u64", intrinsics.}
+  func subborrow_u64(borrowIn: Borrow, a, b: Ct[uint64], diff: var Ct[uint64]): Borrow {.importc: "_subborrow_u64", intrinsics.}

+# ############################################################
+#
+#                        Public
+#
+# ############################################################

+func addC*(cOut: var Carry, sum: var Ct[uint32], a, b: Ct[uint32], cIn: Carry) {.inline.} =
+  ## Addition with carry
+  ## (CarryOut, Sum) <- a + b + CarryIn
+  when X86:
+    cOut = addcarry_u32(cIn, a, b, sum)
+  else:
+    let dblPrec = uint64(cIn) + uint64(a) + uint64(b)
+    sum = (Ct[uint32])(dblPrec)
+    cOut = Carry(dblPrec shr 32)

+func subB*(bOut: var Borrow, diff: var Ct[uint32], a, b: Ct[uint32], bIn: Borrow) {.inline.} =
+  ## Subtraction with borrow
+  ## (BorrowOut, Diff) <- a - b - borrowIn
+  when X86:
+    bOut = subborrow_u32(bIn, a, b, diff)
+  else:
+    let dblPrec = uint64(a) - uint64(b) - uint64(bIn)
+    diff = (Ct[uint32])(dblPrec)
+    # On borrow the high word will be 0b1111...1111 and needs to be masked
+    bOut = Borrow((dblPrec shr 32) and 1)

+func addC*(cOut: var Carry, sum: var Ct[uint64], a, b: Ct[uint64], cIn: Carry) {.inline.} =
+  ## Addition with carry
+  ## (CarryOut, Sum) <- a + b + CarryIn
+  when X86:
+    cOut = addcarry_u64(cIn, a, b, sum)
+  else:
+    block:
+      static:
+        doAssert GCC_Compatible
+        doAssert sizeof(int) == 8
+
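+      # Fallback when no intrinsic is available (e.g. non-x86 targets):
+      # the C compiler widens to unsigned __int128 below, so the carry out
+      # is simply the high 64 bits of the double-precision sum.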
+      var dblPrec {.noInit.}: uint128
+      {.emit:[dblPrec, " = (unsigned __int128)", a," + (unsigned __int128)", b, " + (unsigned __int128)",cIn,";"].}
+
+      # Don't forget to dereference the var param in C mode
+      when defined(cpp):
+        {.emit:[cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
+        {.emit:[sum, " = (NU64)", dblPrec,";"].}
+      else:
+        {.emit:["*",cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
+        {.emit:["*",sum, " = (NU64)", dblPrec,";"].}
+
+func subB*(bOut: var Borrow, diff: var Ct[uint64], a, b: Ct[uint64], bIn: Borrow) {.inline.} =
+  ## Subtraction with borrow
+  ## (BorrowOut, Diff) <- a - b - borrowIn
+  when X86:
+    bOut = subborrow_u64(bIn, a, b, diff)
+  else:
+    block:
+      static:
+        doAssert GCC_Compatible
+        doAssert sizeof(int) == 8
+
+      var dblPrec {.noInit.}: uint128
+      {.emit:[dblPrec, " = (unsigned __int128)", a," - (unsigned __int128)", b, " - (unsigned __int128)",bIn,";"].}
+
+      # Don't forget to dereference the var param in C mode
+      # On borrow the high word will be 0b1111...1111 and needs to be masked
+      when defined(cpp):
+        {.emit:[bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].}
+        {.emit:[diff, " = (NU64)", dblPrec,";"].}
+      else:
+        {.emit:["*",bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].}
+        {.emit:["*",diff, " = (NU64)", dblPrec,";"].}
diff --git a/constantine/primitives/constant_time.nim b/constantine/primitives/constant_time.nim
index 41bd118..66586ab 100644
--- a/constantine/primitives/constant_time.nim
+++ b/constantine/primitives/constant_time.nim
@@ -6,27 +6,7 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-# ############################################################
-#
-#                  Constant-time primitives
-#
-# ############################################################
-type
-  BaseUint* = SomeUnsignedInt or byte
-
-  Ct*[T: BaseUint] = distinct T
-
-  CTBool*[T: Ct] = distinct T # range[T(0)..T(1)]
-    ## To avoid the compiler replacing bitwise boolean operations
-    ## by conditional branches, we don't use booleans.
-    ## We use an int to prevent compiler "optimization" and introduction of branches
-    # Note, we could use "range" but then the codegen
-    # uses machine-sized signed integer types.
-    # signed types and machine-dependent words are undesired
-    # - we don't want compiler optimizing signed "undefined behavior"
-    # - Basic functions like BIgInt add/sub
-    #   return and/or accept CTBool, we don't want them
-    #   to require unnecessarily 8 bytes instead of 4 bytes
+import ./constant_time_types
 
 # ############################################################
 #
@@ -227,79 +207,6 @@ template `<=`*[T: Ct](x, y: T): CTBool[T] =
 template `xor`*[T: Ct](x, y: CTBool[T]): CTBool[T] =
   CTBool[T](noteq(T(x), T(y)))
 
-func mux*[T](ctl: CTBool[T], x, y: T): T {.inline.}=
-  ## Multiplexer / selector
-  ## Returns x if ctl is true
-  ## else returns y
-  ## So equivalent to ctl?
x: y - # - # TODO verify assembly generated - # Alternatives: - # - https://cryptocoding.net/index.php/Coding_rules - # - https://www.cl.cam.ac.uk/~rja14/Papers/whatyouc.pdf - when defined(amd64) or defined(i386): - when sizeof(T) == 8: - var muxed = x - asm """ - testq %[ctl], %[ctl] - cmovzq %[y], %[muxed] - : [muxed] "+r" (`muxed`) - : [ctl] "r" (`ctl`), [y] "r" (`y`) - : "cc" - """ - muxed - elif sizeof(T) == 4: - var muxed = x - asm """ - testl %[ctl], %[ctl] - cmovzl %[y], %[muxed] - : [muxed] "+r" (`muxed`) - : [ctl] "r" (`ctl`), [y] "r" (`y`) - : "cc" - """ - muxed - else: - {.error: "Unsupported word size".} - else: - let # Templates duplicate input params code - x_Mux = x - y_Mux = y - y_Mux xor (-T(ctl) and (x_Mux xor y_Mux)) - -func mux*[T: CTBool](ctl: CTBool, x, y: T): T {.inline.}= - ## Multiplexer / selector - ## Returns x if ctl is true - ## else returns y - ## So equivalent to ctl? x: y - when defined(amd64) or defined(i386): - when sizeof(T) == 8: - var muxed = x - asm """ - testq %[ctl], %[ctl] - cmovzq %[y], %[muxed] - : [muxed] "+r" (`muxed`) - : [ctl] "r" (`ctl`), [y] "r" (`y`) - : "cc" - """ - muxed - elif sizeof(T) == 4: - var muxed = x - asm """ - testl %[ctl], %[ctl] - cmovzl %[y], %[muxed] - : [muxed] "+r" (`muxed`) - : [ctl] "r" (`ctl`), [y] "r" (`y`) - : "cc" - """ - muxed - else: - {.error: "Unsupported word size".} - else: - let # Templates duplicate input params code - x_Mux = x - y_Mux = y - T(T.T(y_Mux) xor (-T.T(ctl) and T.T(x_Mux xor y_Mux))) - # ############################################################ # # Workaround system.nim `!=` template diff --git a/constantine/primitives/constant_time_types.nim b/constantine/primitives/constant_time_types.nim new file mode 100644 index 0000000..2cab28f --- /dev/null +++ b/constantine/primitives/constant_time_types.nim @@ -0,0 +1,41 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +type + BaseUint* = SomeUnsignedInt or byte + + Ct*[T: BaseUint] = distinct T + + CTBool*[T: Ct] = distinct T # range[T(0)..T(1)] + ## To avoid the compiler replacing bitwise boolean operations + ## by conditional branches, we don't use booleans. + ## We use an int to prevent compiler "optimization" and introduction of branches + # Note, we could use "range" but then the codegen + # uses machine-sized signed integer types. + # signed types and machine-dependent words are undesired + # - we don't want compiler optimizing signed "undefined behavior" + # - Basic functions like BigInt add/sub + # return and/or accept CTBool, we don't want them + # to require unnecessarily 8 bytes instead of 4 bytes + # + # Also Nim adds tests everywhere a range type is used which is great + # except in a crypto library: + # - We don't want exceptions + # - Nim will be helpful and return the offending value, which might be secret data + # - This will hint the underlying C compiler about the value range + # and seeing 0/1 it might want to use branches again. + + Carry* = Ct[uint8] # distinct range[0'u8 .. 1] + Borrow* = Ct[uint8] # distinct range[0'u8 .. 
1] + +const GCC_Compatible* = defined(gcc) or defined(clang) or defined(llvm_gcc) +const X86* = defined(amd64) or defined(i386) + +when sizeof(int) == 8 and GCC_Compatible: + type + uint128*{.importc: "unsigned __int128".} = object diff --git a/constantine/primitives/extended_precision.nim b/constantine/primitives/extended_precision.nim index 963909c..87fe11a 100644 --- a/constantine/primitives/extended_precision.nim +++ b/constantine/primitives/extended_precision.nim @@ -8,11 +8,11 @@ # ############################################################ # -# Unsafe constant-time primitives with specific restrictions +# Extended precision primitives # # ############################################################ -import ./constant_time +import ./constant_time_types # ############################################################ # @@ -37,37 +37,30 @@ func unsafeDiv2n1n*(q, r: var Ct[uint32], n_hi, n_lo, d: Ct[uint32]) {.inline.}= q = (Ct[uint32])(dividend div divisor) r = (Ct[uint32])(dividend mod divisor) -func unsafeFMA*(hi, lo: var Ct[uint32], a, b, c: Ct[uint32]) {.inline.} = +func muladd1*(hi, lo: var Ct[uint32], a, b, c: Ct[uint32]) {.inline.} = ## Extended precision multiplication + addition - ## This is constant-time on most hardware except some specific one like Cortex M0 ## (hi, lo) <- a*b + c - block: - # Note: since a and b use 31-bit, - # the result is 62-bit and carrying cannot overflow - let dblPrec = uint64(a) * uint64(b) + uint64(c) - hi = Ct[uint32](dblPrec shr 31) - lo = Ct[uint32](dblPrec) and Ct[uint32](1'u32 shl 31 - 1) + ## + ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001) + ## so adding any c cannot overflow + ## + ## This is constant-time on most hardware + ## See: https://www.bearssl.org/ctmul.html + let dblPrec = uint64(a) * uint64(b) + uint64(c) + lo = (Ct[uint32])(dblPrec) + hi = (Ct[uint32])(dblPrec shr 32) -func unsafeFMA2*(hi, lo: var Ct[uint32], a1, b1, a2, b2, c1, c2: Ct[uint32]) {.inline.}= - ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2 - block: - # TODO: Can this overflow? - let dblPrec = uint64(a1) * uint64(b1) + - uint64(a2) * uint64(b2) + - uint64(c1) + - uint64(c2) - hi = Ct[uint32](dblPrec shr 31) - lo = Ct[uint32](dblPrec) and Ct[uint32](1'u32 shl 31 - 1) - -func unsafeFMA2_hi*(hi: var Ct[uint32], a1, b1, a2, b2, c1: Ct[uint32]) {.inline.}= - ## Returns the high word of the sum of extended precision multiply-adds - ## (hi, _) <- a1 * b1 + a2 * b2 + c - block: - # TODO: Can this overflow? 
- let dblPrec = uint64(a1) * uint64(b1) + - uint64(a2) * uint64(b2) + - uint64(c1) - hi = Ct[uint32](dblPrec shr 31) +func muladd2*(hi, lo: var Ct[uint32], a, b, c1, c2: Ct[uint32]) {.inline.}= + ## Extended precision multiplication + addition + addition + ## This is constant-time on most hardware except some specific one like Cortex M0 + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001) + ## so adding 0xFFFFFFFF leads to (hi: 0xFFFFFFFF, lo: 0x00000000) + ## and we have enough space to add again 0xFFFFFFFF without overflowing + let dblPrec = uint64(a) * uint64(b) + uint64(c1) + uint64(c2) + lo = (Ct[uint32])(dblPrec) + hi = (Ct[uint32])(dblPrec shr 32) # ############################################################ # @@ -76,191 +69,14 @@ func unsafeFMA2_hi*(hi: var Ct[uint32], a1, b1, a2, b2, c1: Ct[uint32]) {.inline # ############################################################ when sizeof(int) == 8: - const GccCompatible = defined(gcc) or defined(clang) or defined(llvm_gcc) + when defined(vcc): + from ./extended_precision_x86_64_msvc import unsafeDiv2n1n, muladd1, muladd2 + elif GCCCompatible: + # TODO: constant-time div2n1n + when X86: + from ./extended_precision_x86_64_gcc import unsafeDiv2n1n + from ./extended_precision_64bit_uint128 import muladd1, muladd2 + else: + from ./extended_precision_64bit_uint128 import unsafeDiv2n1n, muladd1, muladd2 - when GccCompatible: - type - uint128*{.importc: "unsigned __int128".} = object - - func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}= - ## Division uint128 by uint64 - ## Warning ⚠️ : - ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE - ## - if n_hi > d result is undefined - {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".} - - # TODO !!! - Replace by constant-time, portable, non-assembly version - # -> use uint128? Compiler might add unwanted branches - - # DIV r/m64 - # Divide RDX:RAX (n_hi:n_lo) by r/m64 - # - # Inputs - # - numerator high word in RDX, - # - numerator low word in RAX, - # - divisor as r/m parameter (register or memory at the compiler discretion) - # Result - # - Quotient in RAX - # - Remainder in RDX - - # 1. name the register/memory "divisor" - # 2. don't forget to dereference the var hidden pointer - # 3. - - # 4. 
no clobbered registers beside explectly used RAX and RDX - when defined(amd64): - when defined(cpp): - asm """ - divq %[divisor] - : "=a" (`q`), "=d" (`r`) - : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`) - : - """ - else: - asm """ - divq %[divisor] - : "=a" (`*q`), "=d" (`*r`) - : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`) - : - """ - else: - var dblPrec {.noInit.}: uint128 - {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].} - - # Don't forget to dereference the var param in C mode - when defined(cpp): - {.emit:[q, " = (NU64)(", dblPrec," / ", d, ");"].} - {.emit:[r, " = (NU64)(", dblPrec," % ", d, ");"].} - else: - {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].} - {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].} - - func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}= - ## Extended precision multiplication + addition - ## This is constant-time on most hardware except some specific one like Cortex M0 - ## (hi, lo) <- a*b + c - block: - # Note: since a and b use 63-bit, - # the result is 126-bit and carrying cannot overflow - var dblPrec {.noInit.}: uint128 - {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].} - - # Don't forget to dereference the var param in C mode - when defined(cpp): - {.emit:[hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].} - {.emit:[lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].} - else: - {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].} - {.emit:["*",lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].} - - func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}= - ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2 - block: - # TODO: Can this overflow? 
- var dblPrec: uint128 - {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1, - " + (unsigned __int128)", a2," * (unsigned __int128)", b2, - " + (unsigned __int128)", c1, - " + (unsigned __int128)", c2, ";"].} - # Don't forget to dereference the var param in C mode - when defined(cpp): - {.emit:[hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].} - {.emit:[lo, " = (NU64)", dblPrec," & ", (1'u64 shl 63 - 1), ";"].} - else: - {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].} - {.emit:["*",lo, " = (NU64)", dblPrec," & ", (1'u64 shl 63 - 1), ";"].} - - func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}= - ## Returns the high word of the sum of extended precision multiply-adds - ## (hi, _) <- a1 * b1 + a2 * b2 + c - block: - var dblPrec: uint128 - {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1, - " + (unsigned __int128)", a2," * (unsigned __int128)", b2, - " + (unsigned __int128)", c, ";"].} - # Don't forget to dereference the var param in C mode - when defined(cpp): - {.emit:[hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].} - else: - {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].} - - elif defined(vcc): - func udiv128(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header: "", nodecl.} - ## Division 128 by 64, Microsoft only, 64-bit only, - ## returns quotient as return value remainder as var parameter - ## Warning ⚠️ : - ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE - ## - if n_hi > d result is undefined - - func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}= - ## Division uint128 by uint64 - ## Warning ⚠️ : - ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE - ## - if n_hi > d result is undefined - {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".} - - # TODO !!! - Replace by constant-time, portable, non-assembly version - # -> use uint128? 
Compiler might add unwanted branches - q = udiv128(n_hi, n_lo, d, r) - - func addcarry_u64(carryIn: cuchar, a, b: uint64, sum: var uint64): cuchar {.importc:"_addcarry_u64", header:"", nodecl.} - ## (CarryOut, Sum) <-- a + b - ## Available on MSVC and ICC (Clang and GCC have very bad codegen, use uint128 instead) - ## Return value is the carry-out - - func umul128(a, b: uint64, hi: var uint64): uint64 {.importc:"_umul128", header:"", nodecl.} - ## (hi, lo) <-- a * b - ## Return value is the low word - - func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}= - ## Extended precision multiplication + addition - ## This is constant-time on most hardware except some specific one like Cortex M0 - ## (hi, lo) <- a*b + c - var carry: cuchar - var hi, lo: uint64 - lo = umul128(uint64(a), uint64(b), hi) - carry = addcarry_u64(cuchar(0), lo, uint64(c), lo) - discard addcarry_u64(carry, hi, 0, hi) - - func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}= - ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2 - var f1_lo, f1_hi, f2_lo, f2_hi: uint64 - var carry: cuchar - - f1_lo = umul128(uint64(a1), uint64(b1), f1_hi) - f2_lo = umul128(uint64(a2), uint64(b2), f2_hi) - - # On CPU with ADX: we can use addcarryx_u64 (adcx/adox) to have - # separate carry chains that can be processed in parallel by CPU - - # Carry chain 1 - carry = addcarry_u64(cuchar(0), f1_lo, uint64(c1), f1_lo) - discard addcarry_u64(carry, f1_hi, 0, f1_hi) - - # Carry chain 2 - carry = addcarry_u64(cuchar(0), f2_lo, uint64(c2), f2_lo) - discard addcarry_u64(carry, f2_hi, 0, f2_hi) - - # Merge - carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo) - discard addcarry_u64(carry, f1_hi, f2_hi, hi) - - func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}= - ## Returns the high word of the sum of extended precision multiply-adds - ## (hi, _) <- a1 * b1 + a2 * b2 + c - - var f1_lo, f1_hi, f2_lo, f2_hi: uint64 - var carry: cuchar - - f1_lo = umul128(uint64(a1), uint64(b1), f1_hi) - f2_lo = umul128(uint64(a2), uint64(b2), f2_hi) - - carry = addcarry_u64(cuchar(0), f1_lo, uint64(c), f1_lo) - discard addcarry_u64(carry, f1_hi, 0, f1_hi) - - # Merge - var lo: uint64 - carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo) - discard addcarry_u64(carry, f1_hi, f2_hi, hi) - - else: - {.error: "Compiler not implemented".} + export unsafeDiv2n1n, muladd1, muladd2 diff --git a/constantine/primitives/extended_precision_64bit_uint128.nim b/constantine/primitives/extended_precision_64bit_uint128.nim new file mode 100644 index 0000000..7e1e70f --- /dev/null +++ b/constantine/primitives/extended_precision_64bit_uint128.nim @@ -0,0 +1,81 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
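+
+# Overflow-safety sketch for the muladd primitives below, spelling out
+# the bounds quoted in their doc comments. With M = 2^64 - 1 (the
+# all-ones word):
+#   M*M         = 2^128 - 2^65 + 1    (hi: M-1, lo: 1)
+#   M*M + M     = 2^128 - 2^64        (hi: M,   lo: 0)
+#   M*M + M + M = 2^128 - 1           (hi: M,   lo: M)
+# so a*b + c, and even a*b + c1 + c2, always fit in the uint128
+# double-precision word and the emitted C below cannot overflow.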
+ +import ./constant_time_types + +# ############################################################ +# +# Extended precision primitives on GCC & Clang (all CPU archs) +# +# ############################################################ + +static: + doAssert GCC_Compatible + doAssert sizeof(int) == 8 + +func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE on some platforms + ## - if n_hi > d result is undefined + {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".} + + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[q, " = (NU64)(", dblPrec," / ", d, ");"].} + {.emit:[r, " = (NU64)(", dblPrec," % ", d, ");"].} + else: + {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].} + {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].} + +func muladd1*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding any c cannot overflow + ## + ## This is constant-time on most hardware + ## See: https://www.bearssl.org/ctmul.html + block: + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:[lo, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:["*",lo, " = (NU64)", dblPrec,";"].} + +func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}= + ## Extended precision multiplication + addition + addition + ## This is constant-time on most hardware except some specific one like Cortex M0 + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000) + ## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing + block: + var dblPrec {.noInit.}: uint128 + {.emit:[ + dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, + " + (unsigned __int128)",c1," + (unsigned __int128)",c2,";" + ].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:[lo, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:["*",lo, " = (NU64)", dblPrec,";"].} diff --git a/constantine/primitives/extended_precision_x86_64_gcc.nim b/constantine/primitives/extended_precision_x86_64_gcc.nim new file mode 100644 index 0000000..3a4ec61 --- /dev/null +++ b/constantine/primitives/extended_precision_x86_64_gcc.nim @@ -0,0 +1,60 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 
+# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import ./constant_time_types + +# ############################################################ +# +# Extended precision primitives for X86-64 on GCC & Clang +# +# ############################################################ + +static: + doAssert(defined(gcc) or defined(clang) or defined(llvm_gcc)) + doAssert sizeof(int) == 8 + doAssert X86 + +func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".} + + # TODO !!! - Replace by constant-time, portable, non-assembly version + # -> use uint128? Compiler might add unwanted branches + + # DIV r/m64 + # Divide RDX:RAX (n_hi:n_lo) by r/m64 + # + # Inputs + # - numerator high word in RDX, + # - numerator low word in RAX, + # - divisor as r/m parameter (register or memory at the compiler discretion) + # Result + # - Quotient in RAX + # - Remainder in RDX + + # 1. name the register/memory "divisor" + # 2. don't forget to dereference the var hidden pointer + # 3. - + # 4. no clobbered registers beside explicitly used RAX and RDX + when defined(cpp): + asm """ + divq %[divisor] + : "=a" (`q`), "=d" (`r`) + : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`) + : + """ + else: + asm """ + divq %[divisor] + : "=a" (`*q`), "=d" (`*r`) + : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`) + : + """ diff --git a/constantine/primitives/extended_precision_x86_64_msvc.nim b/constantine/primitives/extended_precision_x86_64_msvc.nim new file mode 100644 index 0000000..2be7d34 --- /dev/null +++ b/constantine/primitives/extended_precision_x86_64_msvc.nim @@ -0,0 +1,78 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
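+
+# Worked precondition check for unsafeDiv2n1n below (illustrative values):
+# the quotient fits in 64 bits iff n_hi < d.
+#   n_hi = 1, n_lo = 0, d = 2  =>  n = 2^64, q = 2^63, r = 0  (fits)
+#   n_hi = 1, n_lo = 0, d = 1  =>  q = 2^64, does not fit     (SIGFPE)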
+
+import
+  ./constant_time_types,
+  ./addcarry_subborrow
+
+# ############################################################
+#
+#        Extended precision primitives for X86-64 on MSVC
+#
+# ############################################################
+
+static:
+  doAssert defined(vcc)
+  doAssert sizeof(int) == 8
+  doAssert X86
+
+func udiv128(highDividend, lowDividend, divisor: Ct[uint64], remainder: var Ct[uint64]): Ct[uint64] {.importc:"_udiv128", header: "<immintrin.h>", nodecl.}
+  ## Division 128 by 64, Microsoft only, 64-bit only,
+  ## returns quotient as return value, remainder as var parameter
+  ## Warning ⚠️ :
+  ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
+  ##   - if n_hi > d result is undefined
+
+func umul128(a, b: Ct[uint64], hi: var Ct[uint64]): Ct[uint64] {.importc:"_umul128", header:"<intrin.h>", nodecl.}
+  ## (hi, lo) <-- a * b
+  ## Return value is the low word
+
+func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
+  ## Division uint128 by uint64
+  ## Warning ⚠️ :
+  ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
+  ##   - if n_hi > d result is undefined
+  {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
+
+  # TODO !!! - Replace by constant-time, portable, non-assembly version
+  #          -> use uint128? Compiler might add unwanted branches
+  q = udiv128(n_hi, n_lo, d, r)
+
+func muladd1*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} =
+  ## Extended precision multiplication + addition
+  ## (hi, lo) <- a*b + c
+  ##
+  ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
+  ##       so adding any c cannot overflow
+  ##
+  ## This is constant-time on most hardware
+  ## See: https://www.bearssl.org/ctmul.html
+  var carry: Carry
+  lo = umul128(a, b, hi)
+  addC(carry, lo, lo, c, Carry(0))
+  addC(carry, hi, hi, 0, carry)
+
+func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}=
+  ## Extended precision multiplication + addition + addition
+  ## This is constant-time on most hardware except some specific ones like the Cortex-M0
+  ## (hi, lo) <- a*b + c1 + c2
+  ##
+  ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
+  ##       so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
+  ##       and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
+  # For speed this could be implemented with parallel pipelined carry chains
+  # via MULX + ADCX + ADOX
+  var carry1, carry2: Carry
+
+  lo = umul128(a, b, hi)
+  # Carry chain 1
+  addC(carry1, lo, lo, c1, Carry(0))
+  addC(carry1, hi, hi, 0, carry1)
+  # Carry chain 2
+  addC(carry2, lo, lo, c2, Carry(0))
+  addC(carry2, hi, hi, 0, carry2)
diff --git a/constantine/primitives/multiplexers.nim b/constantine/primitives/multiplexers.nim
new file mode 100644
index 0000000..077bc28
--- /dev/null
+++ b/constantine/primitives/multiplexers.nim
@@ -0,0 +1,161 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
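+
+# How the branchless selection below works, worked out on the generic
+# fallback (illustrative, assuming ctl holds 0 or 1): -T(ctl) broadcasts
+# ctl into an all-zeroes or all-ones mask, so
+#   ctl = 1:  y xor (0xFF...FF and (x xor y)) = y xor x xor y = x
+#   ctl = 0:  y xor (0x00...00 and (x xor y)) = y xor 0       = y
+# i.e. mux(ctl, x, y) == (if ctl: x else: y) without any branch.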
+
+import ./constant_time_types
+
+# ############################################################
+#
+#   Constant-time multiplexers/selectors/conditional moves
+#
+# ############################################################
+
+# For efficiency, these are implemented in inline assembly if possible
+# API:
+# - mux(CTBool, Word, Word)
+# - mux(CTBool, CTBool, CTBool)
+# - ccopy(CTBool, var Word, Word)
+#
+# These prevent the compiler from introducing branches and leaking secret data:
+# - https://www.cl.cam.ac.uk/~rja14/Papers/whatyouc.pdf
+# - https://github.com/veorq/cryptocoding
+
+# Generic implementation
+# ------------------------------------------------------------
+
+func mux_fallback[T](ctl: CTBool[T], x, y: T): T {.inline.}=
+  ## result = if ctl: x else: y
+  ## This is a constant-time operation
+  y xor (-T(ctl) and (x xor y))
+
+func mux_fallback[T: CTBool](ctl: CTBool, x, y: T): T {.inline.}=
+  ## result = if ctl: x else: y
+  ## This is a constant-time operation
+  T(T.T(y) xor (-T.T(ctl) and T.T(x xor y)))
+
+func ccopy_fallback[T](ctl: CTBool[T], x: var T, y: T) {.inline.}=
+  ## Conditional copy
+  ## Copy ``y`` into ``x`` if ``ctl`` is true
+  x = ctl.mux_fallback(y, x)
+
+# x86 and x86-64
+# ------------------------------------------------------------
+
+template mux_x86_impl() {.dirty.} =
+  static: doAssert(X86)
+  static: doAssert(GCC_Compatible)
+
+  when sizeof(T) == 8:
+    var muxed = x
+    asm """
+      testq %[ctl], %[ctl]
+      cmovzq %[y], %[muxed]
+      : [muxed] "+r" (`muxed`)
+      : [ctl] "r" (`ctl`), [y] "r" (`y`)
+      : "cc"
+    """
+    muxed
+  elif sizeof(T) == 4:
+    var muxed = x
+    asm """
+      testl %[ctl], %[ctl]
+      cmovzl %[y], %[muxed]
+      : [muxed] "+r" (`muxed`)
+      : [ctl] "r" (`ctl`), [y] "r" (`y`)
+      : "cc"
+    """
+    muxed
+  else:
+    {.error: "Unsupported word size".}
+
+func mux_x86[T](ctl: CTBool[T], x, y: T): T {.inline.}=
+  ## Multiplexer / selector
+  ## Returns x if ctl is true
+  ## else returns y
+  ## So equivalent to ctl? x: y
+  mux_x86_impl()
+
+func mux_x86[T: CTBool](ctl: CTBool, x, y: T): T {.inline.}=
+  ## Multiplexer / selector
+  ## Returns x if ctl is true
+  ## else returns y
+  ## So equivalent to ctl? x: y
+  mux_x86_impl()
+
+func ccopy_x86[T](ctl: CTBool[T], x: var T, y: T) {.inline.}=
+  ## Conditional copy
+  ## Copy ``y`` into ``x`` if ``ctl`` is true
+  static: doAssert(X86)
+  static: doAssert(GCC_Compatible)
+
+  when sizeof(T) == 8:
+    when defined(cpp):
+      asm """
+        testq %[ctl], %[ctl]
+        cmovnzq %[y], %[x]
+        : [x] "+r" (`x`)
+        : [ctl] "r" (`ctl`), [y] "r" (`y`)
+        : "cc"
+      """
+    else:
+      asm """
+        testq %[ctl], %[ctl]
+        cmovnzq %[y], %[x]
+        : [x] "+r" (`*x`)
+        : [ctl] "r" (`ctl`), [y] "r" (`y`)
+        : "cc"
+      """
+  elif sizeof(T) == 4:
+    when defined(cpp):
+      asm """
+        testl %[ctl], %[ctl]
+        cmovnzl %[y], %[x]
+        : [x] "+r" (`x`)
+        : [ctl] "r" (`ctl`), [y] "r" (`y`)
+        : "cc"
+      """
+    else:
+      asm """
+        testl %[ctl], %[ctl]
+        cmovnzl %[y], %[x]
+        : [x] "+r" (`*x`)
+        : [ctl] "r" (`ctl`), [y] "r" (`y`)
+        : "cc"
+      """
+  else:
+    {.error: "Unsupported word size".}
+
+# Public functions
+# ------------------------------------------------------------
+
+func mux*[T](ctl: CTBool[T], x, y: T): T {.inline.}=
+  ## Multiplexer / selector
+  ## Returns x if ctl is true
+  ## else returns y
+  ## So equivalent to ctl? x: y
+  when X86 and GCC_Compatible:
+    mux_x86(ctl, x, y)
+  else:
+    mux_fallback(ctl, x, y)
+
+func mux*[T: CTBool](ctl: CTBool, x, y: T): T {.inline.}=
+  ## Multiplexer / selector
+  ## Returns x if ctl is true
+  ## else returns y
+  ## So equivalent to ctl? x: y
+  when X86 and GCC_Compatible:
+    mux_x86(ctl, x, y)
+  else:
+    mux_fallback(ctl, x, y)
+
+func ccopy*[T](ctl: CTBool[T], x: var T, y: T) {.inline.}=
+  ## Conditional copy
+  ## Copy ``y`` into ``x`` if ``ctl`` is true
+  when X86 and GCC_Compatible:
+    ccopy_x86(ctl, x, y)
+  else:
+    ccopy_fallback(ctl, x, y)
diff --git a/constantine/primitives/research/README.md b/constantine/primitives/research/README.md
new file mode 100644
index 0000000..32cc175
--- /dev/null
+++ b/constantine/primitives/research/README.md
@@ -0,0 +1,45 @@
+# Compiler for generic inline assembly code-generation
+
+This folder holds alternative implementations of primitives
+that use inline assembly.
+
+This avoids the pitfalls of poor code generation by traditional compilers
+for multiprecision arithmetic (see GCC https://gcc.godbolt.org/z/2h768y)
+or unsupported features like handling 2 carry chains for
+multiplication using MULX/ADOX/ADCX.
+
+To be generic over multiple curves,
+for example BN254 requires 4 words and BLS12-381 requires 6 words of size 64 bits,
+the compiler is implemented as a set of macros that generate inline assembly.
+
+⚠⚠⚠ Warning! Warning! Warning!
+
+This is a significant sacrifice of code readability, portability, auditability and maintainability in favor of performance.
+
+This combines 2 of the most notorious ways to obfuscate your code:
+* metaprogramming and macros
+* inline assembly
+
+Adventurers beware: not for the faint of heart.
+
+This is unfinished, untested, unused, unfuzzed and just a proof-of-concept at the moment.*
+
+_* I take no responsibility if this smashes your stack, eats your cat, hides a skeleton in your closet, warps a pink elephant in the room, summons untold eldritch horrors or causes the heat death of the universe. You have been warned._
+
+_The road to debugging hell is paved with metaprogrammed assembly optimizations._
+
+_For my defence, OpenSSL assembly is generated by a Perl script and neither Perl nor the generated Assembly are type-checked by a dependently-typed compiler._
+
+## References
+
+Multiprecision (Montgomery) Multiplication & Squaring in Assembly
+
+- Intel MULX/ADCX/ADOX Table 2 p13: https://www.intel.cn/content/dam/www/public/us/en/documents/white-papers/ia-large-integer-arithmetic-paper.pdf
+- Squaring: https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/large-integer-squaring-ia-paper.pdf
+- https://eprint.iacr.org/eprint-bin/getfile.pl?entry=2017/558&version=20170608:200345&file=558.pdf
+- https://github.com/intel/ipp-crypto
+- https://github.com/herumi/mcl
+
+Experimentations in Nim
+
+- https://github.com/mratsim/finite-fields
diff --git a/constantine/primitives/research/addcarry_subborrow_compiler.nim b/constantine/primitives/research/addcarry_subborrow_compiler.nim
new file mode 100644
index 0000000..63d2523
--- /dev/null
+++ b/constantine/primitives/research/addcarry_subborrow_compiler.nim
@@ -0,0 +1,133 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
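+
+# For reference, on GCC the macro below emits an add/adc chain of this
+# shape for a 2-limb (128-bit) addition (offsets illustrative):
+#
+#       movq %[b], %[tmp]
+#       addq %[tmp], %[a]
+#
+#       movq 8+%[b], %[tmp]
+#       adcq %[tmp], 8+%[a]
+#
+# i.e. movq/addq for the first limb, then one movq/adcq pair per extra
+# limb, stepping 8 bytes at a time so the carry flag is never clobbered.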
+
+# ############################################################
+#
+#        Add-with-carry and Sub-with-borrow
+#
+# ############################################################
+#
+# This is a proof-of-concept optimal add-with-carry
+# compiler implemented as Nim macros.
+#
+# This overcomes the bad GCC codegen even with the addcarry_u64 intrinsic.
+
+import macros
+
+func wordsRequired(bits: int): int {.compileTime.} =
+  ## Compute the number of limbs required
+  ## from the announced bit length
+  (bits + 64 - 1) div 64
+
+type
+  BigInt[bits: static int] {.byref.} = object
+    ## BigInt
+    ## Enforce-passing by reference otherwise uint128 are passed by stack
+    ## which causes issue with the inline assembly
+    limbs: array[bits.wordsRequired, uint64]
+
+macro addCarryGen_u64(a, b: untyped, bits: static int): untyped =
+  var asmStmt = (block:
+    "      movq %[b], %[tmp]\n" &
+    "      addq %[tmp], %[a]\n"
+  )
+
+  let maxByteOffset = bits div 8
+  const wsize = sizeof(uint64)
+
+  when defined(gcc):
+    for byteOffset in countup(wsize, maxByteOffset-1, wsize):
+      asmStmt.add (block:
+        "\n" &
+        # movq 8+%[b], %[tmp]
+        "      movq " & $byteOffset & "+%[b], %[tmp]\n" &
+        # adcq %[tmp], 8+%[a]
+        "      adcq %[tmp], " & $byteOffset & "+%[a]\n"
+      )
+  elif defined(clang):
+    # https://lists.llvm.org/pipermail/llvm-dev/2017-August/116202.html
+    for byteOffset in countup(wsize, maxByteOffset-1, wsize):
+      asmStmt.add (block:
+        "\n" &
+        # movq 8+%[b], %[tmp]
+        "      movq " & $byteOffset & "%[b], %[tmp]\n" &
+        # adcq %[tmp], 8+%[a]
+        "      adcq %[tmp], " & $byteOffset & "%[a]\n"
+      )
+
+  let tmp = ident("tmp")
+  asmStmt.add (block:
+    ": [tmp] \"+r\" (`" & $tmp & "`), [a] \"+m\" (`" & $a & "->limbs[0]`)\n" &
+    ": [b] \"m\"(`" & $b & "->limbs[0]`)\n" &
+    ": \"cc\""
+  )
+
+  result = newStmtList()
+  result.add quote do:
+    var `tmp`{.noinit.}: uint64
+
+  result.add nnkAsmStmt.newTree(
+    newEmptyNode(),
+    newLit asmStmt
+  )
+
+  echo result.toStrLit
+
+func `+=`(a: var BigInt, b: BigInt) {.noinline.}=
+  # Depending on inline or noinline
+  # the generated ASM addressing must be tweaked for Clang
+  # https://lists.llvm.org/pipermail/llvm-dev/2017-August/116202.html
+  addCarryGen_u64(a, b, BigInt.bits)
+
+# #############################################
+when isMainModule:
+  import random
+  proc rand(T: typedesc[BigInt]): T =
+    for i in 0 ..< result.limbs.len:
+      result.limbs[i] = uint64(rand(high(int)))
+
+  proc main() =
+    block:
+      let a = BigInt[128](limbs: [high(uint64), 0])
+      let b = BigInt[128](limbs: [1'u64, 0])
+
+      echo "a:        ", a
+      echo "b:        ", b
+      echo "------------------------------------------------------"
+
+      var a1 = a
+      a1 += b
+      echo a1
+      echo "======================================================"
+
+    block:
+      let a = rand(BigInt[256])
+      let b = rand(BigInt[256])
+
+      echo "a:        ", a
+      echo "b:        ", b
+      echo "------------------------------------------------------"
+
+      var a1 = a
+      a1 += b
+      echo a1
+      echo "======================================================"
+
+    block:
+      let a = rand(BigInt[384])
+      let b = rand(BigInt[384])
+
+      echo "a:        ", a
+      echo "b:        ", b
+      echo "------------------------------------------------------"
+
+      var a1 = a
+      a1 += b
+      echo a1
+
+  main()
diff --git a/constantine/tower_field_extensions/abelian_groups.nim b/constantine/tower_field_extensions/abelian_groups.nim
index d827f8b..0ee83b7 100644
--- a/constantine/tower_field_extensions/abelian_groups.nim
+++ b/constantine/tower_field_extensions/abelian_groups.nim
@@ -9,7 +9,7 @@
 import
   ../arithmetic/finite_fields,
   ../config/common,
-  ../primitives/constant_time
+  ../primitives
 
 #
############################################################ # diff --git a/tests/prng.nim b/tests/prng.nim index 37c4967..43d68af 100644 --- a/tests/prng.nim +++ b/tests/prng.nim @@ -7,7 +7,7 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import - ../constantine/arithmetic/bigints_checked, + ../constantine/arithmetic/bigints, ../constantine/config/[common, curves] # ############################################################ @@ -86,13 +86,12 @@ func random[T](rng: var RngState, a: var T, C: static Curve) {.noInit.}= when T is BigInt: var reduced, unreduced{.noInit.}: T - unreduced.setInternalBitLength() for i in 0 ..< unreduced.limbs.len: unreduced.limbs[i] = Word(rng.next()) # Note: a simple modulo will be biaised but it's simple and "fast" reduced.reduce(unreduced, C.Mod.mres) - a.montyResidue(reduced, C.Mod.mres, C.getR2modP(), C.getNegInvModWord()) + a.montyResidue(reduced, C.Mod.mres, C.getR2modP(), C.getNegInvModWord(), C.canUseNoCarryMontyMul()) else: for field in fields(a): diff --git a/tests/test_bigints.nim b/tests/test_bigints.nim index 591b720..c747ce3 100644 --- a/tests/test_bigints.nim +++ b/tests/test_bigints.nim @@ -8,9 +8,9 @@ import unittest, ../constantine/io/io_bigints, - ../constantine/arithmetic/bigints_checked, + ../constantine/arithmetic/bigints, ../constantine/config/common, - ../constantine/primitives/constant_time + ../constantine/primitives proc main() = suite "isZero": diff --git a/tests/test_bigints_multimod.nim b/tests/test_bigints_multimod.nim index ff6553c..0a9fd12 100644 --- a/tests/test_bigints_multimod.nim +++ b/tests/test_bigints_multimod.nim @@ -11,8 +11,8 @@ import unittest, # Third-party ../constantine/io/io_bigints, - ../constantine/arithmetic/[bigints_raw, bigints_checked], - ../constantine/primitives/constant_time + ../constantine/arithmetic/bigints, + ../constantine/primitives proc main() = suite "Bigints - Multiprecision modulo": @@ -89,4 +89,52 @@ proc main() = check: bool(r == expected) + test "bitsize 1882 mod bitsize 312": + let a = BigInt[1882].fromHex("0x3feb1d432a950d856fc121c5057671cf81bf9d283a30b69128e84d57900aba486136b9e93f96293dbf7e280b8a641d970748b27ba0986411c7359f32f37447e34ae9e9189336269326fb62fd4d0891bf2383548e8ada92517cf5001e449dd5b4c6501b361636c13f3d5db5ed40f7048f8b1b8db65e9a34a08992e19527ded175fd6b4c4559c25c384691f0567ad27cf5df2b4192d94dc3bf596216067fd02a3790c048bc4bff16e70f84c395ff1243d4b92b514d0c22fc35a82611b77137f09ec8bc31df58fbea2b532ef38ed9078bd2982893326833a20daf2792bdf1ac75ca80e2ffd063f49bb173e7b100") + let m = BigInt[312].fromHex("0x8bad37615c65cb40b592525aeb19de0b8a3f9db87f3c77050a77050ebe81712d78253cdc0eafec") + + let expected = BigInt[312].fromHex("0x1d79fa2f576827a70b38b303036884b346fc52941b2df0863e8f635c467ea1aec04520e6feb614") + + var r: BigInt[312] + r.reduce(a, m) + + check: + bool(r == expected) + + test "bitsize 5276 mod bitsize 337": + let a = 
BigInt[5276].fromHex("0xdcc610304e437c91df568effa736e9ec472d921d2e32f0123f59f8a0e7a639a84db3d6e91c4ce9164e2183aeb9efdfdf5b179b1e5b8074602193b9ba0f5cf547ce31c6c6d33317c40fdcc66090d13034a8ed82b1244cd9e82ec43b08a4a8cd7aaa4937b72b19b01c942427db3e630e70f6823f36a4d0db17b0515ab1582672f613c22f43b2743929d92a924b2d7529a08fa2950ac90fd529207d3dd55a65f80f77715b340755f545424375a1f6dfe3eea1309365036d924226297ecd1296c5938a7b18fe36c3126f54161818ff8e29d69c25b7a47a47061f6e76b6ffbe0c2dbcbf83f49b0bd24cb6f2de460e6c6540e15e23e23573a04dff7d18f88e266a1e36627181dd18a9a182182b1c4e1ec8123b916d18a82139c6b2f5cc7206681b21ec3b14f4da44337892a90db21e070c8799a5cd7e81c03b901ade08021401d6a4cd27bef1e1215c65c2e8abadf44cc455383b37c12fe1f25774bbb0552ca54699c8d38cd88b56ff80c130734dbd231f8f2d15e62effe7bfedde43c4d06f06115befbafabcb1128b3c80f8c6395696f28b6d32c12cc74ba7fcef95e97bd854c98716b6c079d971199a4d3fa4f6d7f901f5370b3f0a4fa6dddff81820ca012bb821560b86701d25a3c99f0daae5824bc5d4731c1e5e879b94bb0a5a862ac79d22fc42d20d3d8963a49997627d4d246088a21531e58174e55eed8007c7e05bece76c64a368c42a7e178b0ba0ce3b54f1d9a568755c71f3518e5d10caa2eda8edd74f13c41b70c6ff0a75f6b821b38cb6148acf6890fc79d508cfa741c8514498b81aaf1698420bf844742d325afe8fce3e85c1d2aefc6bc254e3628f19116643a538c6657a937d62069dfe7217a9e9138e8a12f9857c9eb671c2adb3b3129d0653eb62296bdcfe51335b966e39838a4b18fce380af1f00") + let m = BigInt[337].fromHex("0x016255c2e37f1b1405f9f195040e80778b896b23a1487a40ece792894025590800bcadc343fcef4e2d01b8") + + let expected = BigInt[337].fromHex("0x66bb36adf84b9024f97100688cc66be2f412fd91e9bae3623e810dcae86166a52bdb4c889fa0e5d128d8") + + var r: BigInt[337] + r.reduce(a, m) + + check: + bool(r == expected) + + test "bitsize 4793 mod bitsize 190": + let a = BigInt[4793].fromHex("0x20924645cabc04f0213ba42961e10dedd2c6bd0c9625d04949037b15f9546001551651049038285b441824ef5540a174da0d5ab5f6f07750b9d6ea21a8dd127b467cd1ff0d547d7c86705402bfb8efee231c8385d14666d4e5fdd4e4e6c230ed61b631a6387c57823578139db306c1687bb950985e608c2792694e895e97039c0c155c79b3d595b391f5f8217feeebc20b093658d3e7612449ef575da3d0cde0d3726c58ca9302952deaee8b44a31029086db65838767c60b63f68f9c207ca128574ff9023fc29de264c8e4df20b7764064f9228a2481d5936cc840e107f73b04fcf31f8060c38ea5fb9c8f165e4bbdd1c7b8f0cfb950be57d87678a0a3d45eb1ccbef1a977e881de4f4f95ef0e144a0486ca47084a565242a2baab7a5383e85d51c466d7b03e1f06285bfe04cbb4b90e829a50af103ab8a812cfdad100344b3ae0ab3b96e26a0d97cf16d1910212471f9b3f5e3d0133360387ca3a52682d68447e7ac454e321bc5381a24ff5348baad68d3609a7dfa2118275f2620cf30b1ebb21d98b1d783b45c2acf4a9a9b1cfeba21b2fe1d93fda3234ee90bdba1b23e3a514c7e2189f7bf07236397e1efc5cb5b3a3e748ba130272d880b9d74fc6c2386f19c9e51093ce885ad60493a3d4d0c84154e6fb6d4bb222207eb9f3a2136cebe883a5a89b95eba5363c113f330636d00dda40f3445afb651a56a1d00e5d3815b3c06f123e5eb6b8ce5621ab8f05765fe803e94a12998c249cb1e84c9c4785c8631454283e0471149bc541eebc691b3231e4969b433b9c8195db915cb3baef8db7b3ab0dff2aa7f284e5b86e8055ab95bf45086a216138000") + let m = BigInt[190].fromHex("0x3cf10d948e00a135ab10a6d073b8289e8465d5798d06891c") + + let expected = BigInt[190].fromHex("0x1154587f8cfac96bc146790bc49262ad32e1560a0bf734a4") + + var r: BigInt[190] + r.reduce(a, m) + + check: + bool(r == expected) + + test "bitsize 5240 mod bitsize 3072": + let a = 
BigInt[5240].fromHex("0xfd76093ad413df93dddded94a16c17ffbdd1d8ce9377f5940af54293410603fd381822bb9cec0074005f68dd7bc33f879fb0613b9cd0bac50c27e5e40a0c3948e5b4a07e6c9b1795f2f60647d67b9fd5025d82deffcbb62209a921eb766f7a2335d1b6ca0a4877c948d5cf68aaa2d5bdbea3f991eb027b91d03c91712d739cf6522279007add5febb85fe8f5ef661641fc2dc36b37e709d77f3a0f016f1b421527d2a28c9e8734f243a81e985a26e6ec5650ae81f10381869a9a78e5dac5e6d65783c40f8c232398425e96da2c6d94290ee6463580c826c609691a860f8ceb233a072fca384a9a74d15beaad7df8f1dcde437a02db6218b0c0bca43f9f936bdede271b730b098cf4dd97a84a10feb7eb04841ac01e4728a12a1f96b88ac91bef33818095777893813635cc918480f255a45bf9ebc740e3992877879d4ee64b1aad22439dc9872e5ff13a25dcb32669dab77e05982adbfb06073d5b9bd2b0dcdc7a515296b13e7251d3fb6ca132492f66d312e2610011284b6a2f2a1c26a873959ff5935ddd1e229d4ec7cdf3fa1ce1f55bc549481adfee5ec8aa8b4eddad88c74298d50c2d310be6c21e92067bf8b5ae2330f750f60122251d1fcf84c58a3abee3bed8715dda1c016eb58672faed0a5c678806028195586a349702eeff0738d14ac9a2ced66a50e894cf0a3d546608e6666443d5ea4bc6e7078ae356257ce12fc3d6cf84fe52f13fe27ad89038d041698ed615c856d326d2f1fc5cc916a5176d44a965cf5247b81b901212eb3f35912e82d68b28fe438b3f9cb43362794a53976dcc6ec21aa097813e47f7cb02af3e2bdd9f4323e3ffb577a800cdb5fe925b832263db0332a235c6d8de3df97d8a963c8062f1d39aa302f8db2e2ac292fb6f66d4e2e074e8ce4c77ca0a311bcd3455") + let m = BigInt[3072].fromHex("0x82a3b927471854fe51e4246c36e83a110ab5fec30a5f26fca0316b8ab42784bc17015ef9d0217769e695b92bfe4f30ffcfef881179edc6623dcaf305ee4424da00d8c49a873535e095ac64a8cc66767c26ffa7f2f1acdaedd82b09b62d297f951cd3af83e7023b4eafea8056fc9e4f53f03eb9ee93613d58219214f8d884f51d4e09b336a4bf53fb29a4394fc9b8d4004f4ab04cdfda43441e63846e3dbd02c46ab521b85a16d4a063c33be63e88c9b3fec486f9eda4958a167cb4dd64dd44c7047e4f1372e6ce6f29bbc4a6cc0f498c0428dbc35daaa81abedd937e602ce3eb38666f0ccd603955949e068dd005e2e2bf6d423fd183fcbf61c504eeffa589c3482251b1191e7d71b8e31fc05979b4ebb6ab57ce810d6e34144a8417ab2ca45709b3841bb08cbf38658d2f4129adee121933369deb238db2f74df4490ea5486685554cc4dac015f4d09ded70a4fc808b080142eb7c865fe8e89046f3c0de448f1442258d2cd565dfd457cfb49ab0c0a735196e6cb06a962f29e53060576327b8") + + let expected = BigInt[3072].fromHex("0x75be9187192dccf08bedcb06c7fba60830840cb8a5c3a5895e63ffd78073f2f7e0ccc72ae2f91c2be9fe51e48373bf4426e6e1babb9bc5374747a0e24b982a27359cf403a6bb900800b6dd52b309788df7f599f3db6f5b5ba5fbe88b8d03ab32fbe8d75dbbad0178f70dc4dfbc39008e5c8a4975f08060f4af1718e1a8811b0b73daabf67bf971c1fa79d678e3e2bf878a844004d1ab5b11a2c3e4fa8abbbe15b75a4a15a4c0eecd128ad7b13571545a967cac88d1b1e88c3b09723849c54adede6b36dd21000f41bc404083bf01902d2d3591c2e51fe0cc26d691cbc9ba6ea3137bd977745cc8761c828f7d54899841701faeca7ff5fc975968693284c2dcaf68e9852a67b5782810834f2eed0ba8e69d18c2a9d8aa1d81528110f0156610febe5ee2db65add65006a9f91370828e356c7751fa50bb49f43b408cd2f4767a43bc57888afe01d2a85d457c68a3eb60de713b79c318b92cb1b2837cf78f9e6e5ec0091d2810a34a1c75400190f8582a8b42f436b799db088689f8187b6db8530d") + + var r: BigInt[3072] + r.reduce(a, m) + + check: + bool(r == expected) + main() diff --git a/tests/test_bigints_vs_gmp.nim b/tests/test_bigints_vs_gmp.nim index f11840b..2bfe2b4 100644 --- a/tests/test_bigints_vs_gmp.nim +++ b/tests/test_bigints_vs_gmp.nim @@ -13,7 +13,7 @@ import gmp, stew/byteutils, # Internal ../constantine/io/io_bigints, - ../constantine/arithmetic/[bigints_raw, bigints_checked], + ../constantine/arithmetic/bigints, ../constantine/primitives/constant_time # We test up to 1024-bit, more is really slow @@ -113,16 +113,16 @@ proc main() = var aW, mW: csize # Word written 
by GMP - discard mpz_export(aBuf[0].addr, aW.addr, GMP_LeastSignificantWordFirst, 1, GMP_WordNativeEndian, 0, a) - discard mpz_export(mBuf[0].addr, mW.addr, GMP_LeastSignificantWordFirst, 1, GMP_WordNativeEndian, 0, m) + discard mpz_export(aBuf[0].addr, aW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, a) + discard mpz_export(mBuf[0].addr, mW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, m) # Since the modulus is using all bits, it's we can test for exact amount copy - doAssert aLen >= aW, "Expected at most " & $aLen & " bytes but wrote " & $aW & " for " & toHex(aBuf) & " (little-endian)" - doAssert mLen == mW, "Expected " & $mLen & " bytes but wrote " & $mW & " for " & toHex(mBuf) & " (little-endian)" + doAssert aLen >= aW, "Expected at most " & $aLen & " bytes but wrote " & $aW & " for " & toHex(aBuf) & " (big-endian)" + doAssert mLen == mW, "Expected " & $mLen & " bytes but wrote " & $mW & " for " & toHex(mBuf) & " (big-endian)" # Build the bigint - let aTest = BigInt[aBits].fromRawUint(aBuf, littleEndian) - let mTest = BigInt[mBits].fromRawUint(mBuf, littleEndian) + let aTest = BigInt[aBits].fromRawUint(aBuf.toOpenArray(0, aW-1), bigEndian) + let mTest = BigInt[mBits].fromRawUint(mBuf.toOpenArray(0, mW-1), bigEndian) ######################################################### # Modulus @@ -135,15 +135,16 @@ proc main() = # Check var rGMP: array[mLen, byte] var rW: csize # Word written by GMP - discard mpz_export(rGMP[0].addr, rW.addr, GMP_LeastSignificantWordFirst, 1, GMP_WordNativeEndian, 0, r) + discard mpz_export(rGMP[0].addr, rW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, r) var rConstantine: array[mLen, byte] - exportRawUint(rConstantine, rTest, littleEndian) + exportRawUint(rConstantine, rTest, bigEndian) # echo "rGMP: ", rGMP.toHex() # echo "rConstantine: ", rConstantine.toHex() - doAssert rGMP == rConstantine, block: + # Note: in bigEndian, GMP aligns left while constantine aligns right + doAssert rGMP.toOpenArray(0, rW-1) == rConstantine.toOpenArray(mLen-rW, mLen-1), block: # Reexport as bigEndian for debugging discard mpz_export(aBuf[0].addr, aW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, a) discard mpz_export(mBuf[0].addr, mW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, m) @@ -152,6 +153,7 @@ proc main() = " m (" & align($mBits, 4) & "-bit): " & mBuf.toHex & "\n" & "failed:" & "\n" & " GMP: " & rGMP.toHex() & "\n" & - " Constantine: " & rConstantine.toHex() + " Constantine: " & rConstantine.toHex() & "\n" & + "(Note that GMP aligns bytes left while constantine aligns bytes right)" main() diff --git a/tests/test_finite_fields_powinv.nim b/tests/test_finite_fields_powinv.nim index 122e87b..20430f8 100644 --- a/tests/test_finite_fields_powinv.nim +++ b/tests/test_finite_fields_powinv.nim @@ -7,7 +7,7 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
import unittest, - ../constantine/arithmetic/[bigints_checked, finite_fields], + ../constantine/arithmetic/[bigints, finite_fields], ../constantine/io/io_fields, ../constantine/config/curves diff --git a/tests/test_finite_fields_vs_gmp.nim b/tests/test_finite_fields_vs_gmp.nim index 426affa..0c29b0d 100644 --- a/tests/test_finite_fields_vs_gmp.nim +++ b/tests/test_finite_fields_vs_gmp.nim @@ -13,7 +13,7 @@ import gmp, stew/byteutils, # Internal ../constantine/io/[io_bigints, io_fields], - ../constantine/arithmetic/[finite_fields, bigints_checked], + ../constantine/arithmetic/[finite_fields, bigints], ../constantine/primitives/constant_time, ../constantine/config/curves @@ -100,7 +100,7 @@ proc binary_epilogue[C: static Curve, N: static int]( " b: " & bBuf.toHex & "\n" & "failed:" & "\n" & " GMP: " & rGMP.toHex() & "\n" & - " Constantine: " & rConstantine.toHex() & + " Constantine: " & rConstantine.toHex() & "\n" & "(Note that GMP aligns bytes left while constantine aligns bytes right)" # ############################################################ diff --git a/tests/test_fp2.nim b/tests/test_fp2.nim index 7dfae9f..86e96c3 100644 --- a/tests/test_fp2.nim +++ b/tests/test_fp2.nim @@ -12,7 +12,7 @@ import # Internals ../constantine/tower_field_extensions/[abelian_groups, fp2_complex], ../constantine/config/[common, curves], - ../constantine/arithmetic/bigints_checked, + ../constantine/arithmetic/bigints, # Test utilities ./prng @@ -45,7 +45,7 @@ suite "𝔽p2 = 𝔽p[𝑖] (irreducible polynomial x²+1)": O var r: typeof(C.Mod.mres) - r.redc(oneFp2.c0.mres, C.Mod.mres, C.getNegInvModWord()) + r.redc(oneFp2.c0.mres, C.Mod.mres, C.getNegInvModWord(), canUseNoCarryMontyMul = false) check: bool(r == oneBig) diff --git a/tests/test_io_bigints.nim b/tests/test_io_bigints.nim index 0ae021d..f5a87e1 100644 --- a/tests/test_io_bigints.nim +++ b/tests/test_io_bigints.nim @@ -9,7 +9,7 @@ import unittest, random, ../constantine/io/io_bigints, ../constantine/config/common, - ../constantine/arithmetic/bigints_checked + ../constantine/arithmetic/bigints randomize(0xDEADBEEF) # Random seed for reproducibility type T = BaseType @@ -24,7 +24,6 @@ proc main() = check: T(big.limbs[0]) == 0 - T(big.limbs[1]) == 0 test "Parsing and dumping round-trip on uint64": block: @@ -85,4 +84,11 @@ proc main() = check: p == hex + test "Round trip on 3072-bit integer": + const n = "0x75be9187192dccf08bedcb06c7fba60830840cb8a5c3a5895e63ffd78073f2f7e0ccc72ae2f91c2be9fe51e48373bf4426e6e1babb9bc5374747a0e24b982a27359cf403a6bb900800b6dd52b309788df7f599f3db6f5b5ba5fbe88b8d03ab32fbe8d75dbbad0178f70dc4dfbc39008e5c8a4975f08060f4af1718e1a8811b0b73daabf67bf971c1fa79d678e3e2bf878a844004d1ab5b11a2c3e4fa8abbbe15b75a4a15a4c0eecd128ad7b13571545a967cac88d1b1e88c3b09723849c54adede6b36dd21000f41bc404083bf01902d2d3591c2e51fe0cc26d691cbc9ba6ea3137bd977745cc8761c828f7d54899841701faeca7ff5fc975968693284c2dcaf68e9852a67b5782810834f2eed0ba8e69d18c2a9d8aa1d81528110f0156610febe5ee2db65add65006a9f91370828e356c7751fa50bb49f43b408cd2f4767a43bc57888afe01d2a85d457c68a3eb60de713b79c318b92cb1b2837cf78f9e6e5ec0091d2810a34a1c75400190f8582a8b42f436b799db088689f8187b6db8530d" + let x = BigInt[3072].fromHex(n) + let h = x.toHex(bigEndian) + + check: n == h + main() diff --git a/tests/test_io_fields.nim b/tests/test_io_fields.nim index 1e06d7c..177f488 100644 --- a/tests/test_io_fields.nim +++ b/tests/test_io_fields.nim @@ -10,7 +10,7 @@ import unittest, random, ../constantine/io/[io_bigints, io_fields], ../constantine/config/curves, ../constantine/config/common, - 
../constantine/arithmetic/[bigints_checked, finite_fields] + ../constantine/arithmetic/[bigints, finite_fields] randomize(0xDEADBEEF) # Random seed for reproducibility type T = BaseType @@ -18,6 +18,30 @@ type T = BaseType proc main() = suite "IO - Finite fields": test "Parsing and serializing round-trip on uint64": + # 101 --------------------------------- + block: + # "Little-endian" - 0 + let x = BaseType(0) + let x_bytes = cast[array[sizeof(BaseType), byte]](x) + var f: Fp[Fake101] + f.fromUint(x) + + var r_bytes: array[sizeof(BaseType), byte] + exportRawUint(r_bytes, f, littleEndian) + check: x_bytes == r_bytes + + block: + # "Little-endian" - 1 + let x = BaseType(1) + let x_bytes = cast[array[sizeof(BaseType), byte]](x) + var f: Fp[Fake101] + f.fromUint(x) + + var r_bytes: array[sizeof(BaseType), byte] + exportRawUint(r_bytes, f, littleEndian) + check: x_bytes == r_bytes + + # Mersenne 61 --------------------------------- block: # "Little-endian" - 0 let x = 0'u64 @@ -103,4 +127,20 @@ proc main() = check: p == hex + test "Round trip on prime field of NIST P256 (secp256r1) curve": + block: # 2^126 + const p = "0x0000000000000000000000000000000040000000000000000000000000000000" + let x = Fp[P256].fromBig BigInt[256].fromHex(p) + let hex = x.toHex(bigEndian) + + check: p == hex + + test "Round trip on prime field of BLS12_381 curve": + block: # 2^126 + const p = "0x000000000000000000000000000000000000000000000000000000000000000040000000000000000000000000000000" + let x = Fp[BLS12_381].fromBig BigInt[381].fromHex(p) + let hex = x.toHex(bigEndian) + + check: p == hex + main() diff --git a/tests/test_primitives.nim b/tests/test_primitives.nim index ee8ef31..a0892c5 100644 --- a/tests/test_primitives.nim +++ b/tests/test_primitives.nim @@ -7,7 +7,7 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import unittest, random, math, - ../constantine/primitives/constant_time + ../constantine/primitives # Random seed for reproducibility randomize(0xDEADBEEF)