diff --git a/benchmarks/bench_blueprint.nim b/benchmarks/bench_blueprint.nim
new file mode 100644
index 0000000..ce3c10e
--- /dev/null
+++ b/benchmarks/bench_blueprint.nim
@@ -0,0 +1,108 @@
+# Constantine
+# Copyright (c) 2018-2019 Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#                  Benchmark blueprint
+#
+# ############################################################
+
+import
+  # Internal
+  ../constantine/config/common,
+  # Helpers
+  ../helpers/[prng_unsafe, static_for],
+  ./platforms,
+  # Standard library
+  std/[monotimes, times, strformat, strutils, macros]
+
+export strformat, platforms, times, monotimes, macros
+
+var rng*: RngState
+let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+rng.seed(seed)
+echo "bench xoshiro512** seed: ", seed
+
+# warmup
+proc warmup*() =
+  # Warmup - make sure the CPU is at max performance
+  let start = cpuTime()
+  var foo = 123
+  for i in 0 ..< 300_000_000:
+    foo += i*i mod 456
+    foo = foo mod 789
+
+  # The compiler shouldn't optimize away the results as cpuTime relies on side effects
+  let stop = cpuTime()
+  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
+
+warmup()
+
+when defined(gcc):
+  echo "\nCompiled with GCC"
+elif defined(clang):
+  echo "\nCompiled with Clang"
+elif defined(vcc):
+  echo "\nCompiled with MSVC"
+elif defined(icc):
+  echo "\nCompiled with ICC"
+else:
+  echo "\nCompiled with an unknown compiler"
+
+echo "Optimization level => "
+echo "  no optimization: ", not defined(release)
+echo "  release: ", defined(release)
+echo "  danger: ", defined(danger)
+echo "  inline assembly: ", UseASM_X86_64
+
+when (sizeof(int) == 4) or defined(Constantine32):
+  echo "⚠️ Warning: using Constantine with 32-bit limbs"
+else:
+  echo "Using Constantine with 64-bit limbs"
+
+when SupportsCPUName:
+  echo "Running on ", cpuName(), ""
+
+when SupportsGetTicks:
+  echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
+  echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"
+
+echo "\n=================================================================================================================\n"
+
+proc separator*(length: int) =
+  echo "-".repeat(length)
+
+proc notes*() =
+  echo "Notes:"
+  echo "  - Compilers:"
+  echo "    Compilers are severely limited on multiprecision arithmetic."
+  echo "    Constantine compile-time assembler is used by default (nimble bench_fp)."
+  echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
+  echo "    GCC also seems to have issues with large temporaries and register spilling."
+  echo "    This is somewhat alleviated by Constantine compile-time assembler."
+  echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
+  echo "    Bench on specific compiler without assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
+ echo " - The simplest operations might be optimized away by the compiler." + echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)" + +template measure*(iters: int, + startTime, stopTime: untyped, + startClk, stopClk: untyped, + body: untyped): untyped = + let startTime = getMonotime() + when SupportsGetTicks: + let startClk = getTicks() + for _ in 0 ..< iters: + body + when SupportsGetTicks: + let stopClk = getTicks() + let stopTime = getMonotime() + + when not SupportsGetTicks: + let startClk = -1'i64 + let stopClk = -1'i64 diff --git a/benchmarks/bench_elliptic_template.nim b/benchmarks/bench_elliptic_template.nim index 92b2fc9..cd60a7b 100644 --- a/benchmarks/bench_elliptic_template.nim +++ b/benchmarks/bench_elliptic_template.nim @@ -21,85 +21,12 @@ import # Helpers ../helpers/[prng_unsafe, static_for], ./platforms, - # Standard library - std/[monotimes, times, strformat, strutils, macros], + ./bench_blueprint, # Reference unsafe scalar multiplication ../tests/support/ec_reference_scalar_mult -var rng: RngState -let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 -rng.seed(seed) -echo "bench xoshiro512** seed: ", seed - -# warmup -proc warmup*() = - # Warmup - make sure cpu is on max perf - let start = cpuTime() - var foo = 123 - for i in 0 ..< 300_000_000: - foo += i*i mod 456 - foo = foo mod 789 - - # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects - let stop = cpuTime() - echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n" - -warmup() - -when defined(gcc): - echo "\nCompiled with GCC" -elif defined(clang): - echo "\nCompiled with Clang" -elif defined(vcc): - echo "\nCompiled with MSVC" -elif defined(icc): - echo "\nCompiled with ICC" -else: - echo "\nCompiled with an unknown compiler" - -echo "Optimization level => " -echo " no optimization: ", not defined(release) -echo " release: ", defined(release) -echo " danger: ", defined(danger) -echo " inline assembly: ", UseASM_X86_64 - -when (sizeof(int) == 4) or defined(Constantine32): - echo "⚠️ Warning: using Constantine with 32-bit limbs" -else: - echo "Using Constantine with 64-bit limbs" - -when SupportsCPUName: - echo "Running on ", cpuName(), "" - -when SupportsGetTicks: - echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them." - echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)" - -echo "\n=================================================================================================================\n" - -proc separator*() = - echo "-".repeat(177) - -proc report(op, elliptic: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) = - let ns = inNanoseconds((stop-start) div iters) - let throughput = 1e9 / float64(ns) - when SupportsGetTicks: - echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)" - else: - echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op" - -proc notes*() = - echo "Notes:" - echo " - Compilers:" - echo " Compilers are severely limited on multiprecision arithmetic." - echo " Constantine compile-time assembler is used by default (nimble bench_fp)." - echo " GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries." 
- echo " GCC also seems to have issues with large temporaries and register spilling." - echo " This is somewhat alleviated by Constantine compile-time assembler." - echo " Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"." - echo " Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"." - echo " - The simplest operations might be optimized away by the compiler." - echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)" +export notes +proc separator*() = separator(177) macro fixEllipticDisplay(T: typedesc): untyped = # At compile-time, enums are integers and their display is buggy @@ -111,21 +38,17 @@ macro fixEllipticDisplay(T: typedesc): untyped = name.add "[" & fieldName & "[" & curveName & "]]" result = newLit name +proc report(op, elliptic: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) = + let ns = inNanoseconds((stop-start) div iters) + let throughput = 1e9 / float64(ns) + when SupportsGetTicks: + echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)" + else: + echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op" + template bench(op: string, T: typedesc, iters: int, body: untyped): untyped = - let start = getMonotime() - when SupportsGetTicks: - let startClk = getTicks() - for _ in 0 ..< iters: - body - when SupportsGetTicks: - let stopClk = getTicks() - let stop = getMonotime() - - when not SupportsGetTicks: - let startClk = -1'i64 - let stopClk = -1'i64 - - report(op, fixEllipticDisplay(T), start, stop, startClk, stopClk, iters) + measure(iters, startTime, stopTime, startClk, stopClk, body) + report(op, fixEllipticDisplay(T), startTime, stopTime, startClk, stopClk, iters) proc addBench*(T: typedesc, iters: int) = const G1_or_G2 = when T.F is Fp: "G1" else: "G2" diff --git a/benchmarks/bench_fields_template.nim b/benchmarks/bench_fields_template.nim index f4821f7..312723d 100644 --- a/benchmarks/bench_fields_template.nim +++ b/benchmarks/bench_fields_template.nim @@ -19,63 +19,10 @@ import ../constantine/towers, # Helpers ../helpers/[prng_unsafe, static_for], - ./platforms, - # Standard library - std/[monotimes, times, strformat, strutils, macros] + ./bench_blueprint -var rng: RngState -let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 -rng.seed(seed) -echo "bench xoshiro512** seed: ", seed - -# warmup -proc warmup*() = - # Warmup - make sure cpu is on max perf - let start = cpuTime() - var foo = 123 - for i in 0 ..< 300_000_000: - foo += i*i mod 456 - foo = foo mod 789 - - # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects - let stop = cpuTime() - echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n" - -warmup() - -when defined(gcc): - echo "\nCompiled with GCC" -elif defined(clang): - echo "\nCompiled with Clang" -elif defined(vcc): - echo "\nCompiled with MSVC" -elif defined(icc): - echo "\nCompiled with ICC" -else: - echo "\nCompiled with an unknown compiler" - -echo "Optimization level => " -echo " no optimization: ", not defined(release) -echo " release: ", defined(release) -echo " danger: ", defined(danger) -echo " inline assembly: ", UseASM_X86_64 - -when (sizeof(int) == 4) or defined(Constantine32): - echo "⚠️ Warning: using Constantine 
with 32-bit limbs" -else: - echo "Using Constantine with 64-bit limbs" - -when SupportsCPUName: - echo "Running on ", cpuName(), "" - -when SupportsGetTicks: - echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them." - echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)" - -echo "\n=================================================================================================================\n" - -proc separator*() = - echo "-".repeat(145) +export notes +proc separator*() = separator(145) proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) = let ns = inNanoseconds((stop-start) div iters) @@ -85,19 +32,6 @@ proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, else: echo &"{op:<50} {field:<18} {throughput:>15.3f} ops/s {ns:>9} ns/op" -proc notes*() = - echo "Notes:" - echo " - Compilers:" - echo " Compilers are severely limited on multiprecision arithmetic." - echo " Constantine compile-time assembler is used by default (nimble bench_fp)." - echo " GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries." - echo " GCC also seems to have issues with large temporaries and register spilling." - echo " This is somewhat alleviated by Constantine compile-time assembler." - echo " Bench on specific compiler with assembler: \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"." - echo " Bench on specific compiler with assembler: \"nimble bench_fp_gcc_noasm\" or \"nimble bench_fp_clang_noasm\"." - echo " - The simplest operations might be optimized away by the compiler." - echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)" - macro fixFieldDisplay(T: typedesc): untyped = # At compile-time, enums are integers and their display is buggy # we get the Curve ID instead of the curve name. 
@@ -107,20 +41,8 @@ macro fixFieldDisplay(T: typedesc): untyped = result = newLit name template bench(op: string, T: typedesc, iters: int, body: untyped): untyped = - let start = getMonotime() - when SupportsGetTicks: - let startClk = getTicks() - for _ in 0 ..< iters: - body - when SupportsGetTicks: - let stopClk = getTicks() - let stop = getMonotime() - - when not SupportsGetTicks: - let startClk = -1'i64 - let stopClk = -1'i64 - - report(op, fixFieldDisplay(T), start, stop, startClk, stopClk, iters) + measure(iters, startTime, stopTime, startClk, stopClk, body) + report(op, fixFieldDisplay(T), startTime, stopTime, startClk, stopClk, iters) proc addBench*(T: typedesc, iters: int) = var x = rng.random_unsafe(T) diff --git a/benchmarks/bench_pairing_template.nim b/benchmarks/bench_pairing_template.nim index 94b0817..50ad36e 100644 --- a/benchmarks/bench_pairing_template.nim +++ b/benchmarks/bench_pairing_template.nim @@ -28,101 +28,23 @@ import pairing_bn ], # Helpers - ../helpers/[prng_unsafe, static_for], - ./platforms, - # Standard library - std/[monotimes, times, strformat, strutils, macros] + ../helpers/prng_unsafe, + ./bench_blueprint -var rng: RngState -let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 -rng.seed(seed) -echo "bench xoshiro512** seed: ", seed +export notes +proc separator*() = separator(177) -# warmup -proc warmup*() = - # Warmup - make sure cpu is on max perf - let start = cpuTime() - var foo = 123 - for i in 0 ..< 300_000_000: - foo += i*i mod 456 - foo = foo mod 789 - - # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects - let stop = cpuTime() - echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n" - -warmup() - -when defined(gcc): - echo "\nCompiled with GCC" -elif defined(clang): - echo "\nCompiled with Clang" -elif defined(vcc): - echo "\nCompiled with MSVC" -elif defined(icc): - echo "\nCompiled with ICC" -else: - echo "\nCompiled with an unknown compiler" - -echo "Optimization level => " -echo " no optimization: ", not defined(release) -echo " release: ", defined(release) -echo " danger: ", defined(danger) -echo " inline assembly: ", UseASM_X86_64 - -when (sizeof(int) == 4) or defined(Constantine32): - echo "⚠️ Warning: using Constantine with 32-bit limbs" -else: - echo "Using Constantine with 64-bit limbs" - -when SupportsCPUName: - echo "Running on ", cpuName(), "" - -when SupportsGetTicks: - echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them." - echo "i.e. 
a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)" - -echo "\n=================================================================================================================\n" - -proc separator*() = - echo "-".repeat(177) - -proc report(op, curve: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) = - let ns = inNanoseconds((stop-start) div iters) +proc report(op, curve: string, startTime, stopTime: MonoTime, startClk, stopClk: int64, iters: int) = + let ns = inNanoseconds((stopTime-startTime) div iters) let throughput = 1e9 / float64(ns) when SupportsGetTicks: echo &"{op:<60} {curve:<15} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)" else: echo &"{op:<60} {curve:<15} {throughput:>15.3f} ops/s {ns:>9} ns/op" -proc notes*() = - echo "Notes:" - echo " - Compilers:" - echo " Compilers are severely limited on multiprecision arithmetic." - echo " Constantine compile-time assembler is used by default (nimble bench_fp)." - echo " GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries." - echo " GCC also seems to have issues with large temporaries and register spilling." - echo " This is somewhat alleviated by Constantine compile-time assembler." - echo " Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"." - echo " Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"." - echo " - The simplest operations might be optimized away by the compiler." - echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)" - template bench(op: string, C: static Curve, iters: int, body: untyped): untyped = - let start = getMonotime() - when SupportsGetTicks: - let startClk = getTicks() - for _ in 0 ..< iters: - body - when SupportsGetTicks: - let stopClk = getTicks() - let stop = getMonotime() - - when not SupportsGetTicks: - let startClk = -1'i64 - let stopClk = -1'i64 - - report(op, $C, start, stop, startClk, stopClk, iters) + measure(iters, startTime, stopTime, startClk, stopClk, body) + report(op, $C, startTime, stopTime, startClk, stopClk, iters) func random_point*(rng: var RngState, EC: typedesc): EC {.noInit.} = result = rng.random_unsafe(EC) diff --git a/benchmarks/bench_sha256.nim b/benchmarks/bench_sha256.nim new file mode 100644 index 0000000..d125580 --- /dev/null +++ b/benchmarks/bench_sha256.nim @@ -0,0 +1,58 @@ +import + # Internals + ../constantine/hashes/h_sha256, + # Helpers + ../helpers/prng_unsafe, + ./bench_blueprint + +proc separator*() = separator(69) + +proc SHA256[T: byte|char]( + msg: openarray[T], + digest: ptr array[32, byte] = nil + ): ptr array[32, byte] {.cdecl, dynlib: "libssl.so", importc.} + +proc SHA256_OpenSSL[T: byte|char]( + digest: var array[32, byte], + s: openarray[T]) = + discard SHA256(s, digest.addr) + +proc report(op: string, bytes: int, startTime, stopTime: MonoTime, startClk, stopClk: int64, iters: int) = + let ns = inNanoseconds((stopTime-startTime) div iters) + let throughput = 1e9 / float64(ns) + when SupportsGetTicks: + let cycles = (stopClk - startClk) div iters + let cyclePerByte = cycles.float64 / bytes.float64 + echo &"{op:<30} {throughput:>15.3f} ops/s {ns:>9} ns/op {cycles:>10} cycles {cyclePerByte:>5.2f} cycles/byte" + else: + echo &"{op:<30} {throughput:>15.3f} ops/s {ns:>9} ns/op" + +template 
bench(op: string, bytes: int, iters: int, body: untyped): untyped = + measure(iters, startTime, stopTime, startClk, stopClk, body) + report(op, bytes, startTime, stopTime, startClk, stopClk, iters) + +proc benchSHA256_constantine[T](msg: openarray[T], msgComment: string, iters: int) = + var digest: array[32, byte] + bench("SHA256 - Constantine - " & msgComment, msg.len, iters): + sha256.hash(digest, msg) + +proc benchSHA256_openssl[T](msg: openarray[T], msgComment: string, iters: int) = + var digest: array[32, byte] + bench("SHA256 - OpenSSL - " & msgComment, msg.len, iters): + SHA256_OpenSSL(digest, msg) + +when isMainModule: + proc main() = + block: + let msg128B = rng.random_byte_seq(128) + benchSHA256_constantine(msg128B, "128B", 128) + benchSHA256_openssl(msg128B, "128B", 128) + block: + let msg5MB = rng.random_byte_seq(5_000_000) + benchSHA256_constantine(msg5MB, "5MB", 16) + benchSHA256_openssl(msg5MB, "5MB", 16) + block: + let msg100MB = rng.random_byte_seq(100_000_000) + benchSHA256_constantine(msg100MB, "100MB", 3) + benchSHA256_openssl(msg100MB, "100MB", 3) + main() diff --git a/constantine.nimble b/constantine.nimble index b381b8f..099c7a5 100644 --- a/constantine.nimble +++ b/constantine.nimble @@ -129,17 +129,40 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[ ("tests/t_pairing_bn254_snarks_optate.nim", false), ("tests/t_pairing_bls12_377_optate.nim", false), ("tests/t_pairing_bls12_381_optate.nim", false), + + # Hashing vs OpenSSL + ("tests/t_hash_sha256_vs_openssl.nim", true), ] # For temporary (hopefully) investigation that can only be reproduced in CI const useDebug = [ - "tests/t_bigints.nim" + "tests/t_bigints.nim", + "tests/t_hash_sha256_vs_openssl.nim", ] +# Tests that uses sequences require Nim GC, stack scanning and nil pointer passed to openarray +# In particular the tests that uses the json test vectors, don't sanitize them. +# we do use gc:none to help +const skipSanitizers = [ + "tests/t_ec_sage_bn254_nogami.nim", + "tests/t_ec_sage_bn254_snarks.nim", + "tests/t_ec_sage_bls12_377.nim", + "tests/t_ec_sage_bls12_381.nim", +] + +const sanitizers = + " --passC:-fsanitize=undefined --passL:-fsanitize=undefined" & + " --passC:-fno-sanitize-recover" & # Enforce crash on undefined behaviour + " --gc:none" # The conservative stack scanning of Nim default GC triggers, alignment UB and stack-buffer-overflow check. 
+ # " --passC:-fsanitize=address --passL:-fsanitize=address" & # Requires too much stack for the inline assembly + # Helper functions # ---------------------------------------------------------------- +proc clearParallelBuild() = + exec "> " & buildParallel + proc test(flags, path: string, commandFile = false) = # commandFile should be a "file" but Nimscript doesn't support IO if not dirExists "build": @@ -153,6 +176,7 @@ proc test(flags, path: string, commandFile = false) = if existsEnv"CC": cc = " --cc:" & getEnv"CC" + var flags = flags & " --passC:-fstack-protector-all" let command = "nim " & lang & cc & " " & flags & " --verbosity:0 --outdir:build/testsuite -r --hints:off --warnings:off " & " --nimcache:nimcache/" & path & " " & @@ -160,11 +184,10 @@ proc test(flags, path: string, commandFile = false) = if not commandFile: echo "\n==============================================================================================" - echo "Running [flags: ", flags, "] ", path + echo "Running [flags:", flags, "] ", path echo "==============================================================================================" exec command else: - # commandFile.writeLine command exec "echo \'" & command & "\' >> " & buildParallel proc runBench(benchName: string, compiler = "", useAsm = true) = @@ -181,24 +204,29 @@ proc runBench(benchName: string, compiler = "", useAsm = true) = " --nimcache:nimcache/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") & " -r --hints:off --warnings:off benchmarks/" & benchName & ".nim" +proc runTests(requireGMP: bool, dumpCmdFile = false, test32bit = false, testASM = true) = + for td in testDesc: + if not(td.useGMP and not requireGMP): + var flags = "" + if not testASM: + flags &= " -d:ConstantineASM=false" + if test32bit: + flags &= " -d:Constantine32" + if td.path in useDebug: + flags &= " -d:debugConstantine" + if td.path notin skipSanitizers: + flags &= sanitizers + test flags, td.path, dumpCmdFile + # Tasks # ---------------------------------------------------------------- task test, "Run all tests": # -d:testingCurves is configured in a *.nim.cfg for convenience - - for td in testDesc: - if td.path in useDebug: - test "-d:debugConstantine", td.path - else: - test "", td.path + runTests(requireGMP = true) # if sizeof(int) == 8: # 32-bit tests on 64-bit arch - # for td in testDesc: - # if td.path in useDebug: - # test "-d:Constantine32 -d:debugConstantine", td.path - # else: - # test "-d:Constantine32", td.path + # runTests(requireGMP = true, test32bit = true) # Ensure benchmarks stay relevant. Ignore Windows 32-bit at the moment if not defined(windows) or not (existsEnv"UCPU" or getEnv"UCPU" == "i686"): @@ -213,23 +241,14 @@ task test, "Run all tests": runBench("bench_pairing_bls12_381") runBench("bench_pairing_bn254_nogami") runBench("bench_pairing_bn254_snarks") + runBench("bench_sha256") task test_no_gmp, "Run tests that don't require GMP": # -d:testingCurves is configured in a *.nim.cfg for convenience - for td in testDesc: - if not td.useGMP: - if td.path in useDebug: - test "-d:debugConstantine", td.path - else: - test "", td.path + runTests(requireGMP = false) - if sizeof(int) == 8: # 32-bit tests on 64-bit arch - for td in testDesc: - if not td.useGMP: - if td.path in useDebug: - test "-d:Constantine32 -d:debugConstantine", td.path - else: - test "-d:Constantine32", td.path + # if sizeof(int) == 8: # 32-bit tests on 64-bit arch + # runTests(requireGMP = true, test32bit = true) # Ensure benchmarks stay relevant. 
Ignore Windows 32-bit at the moment if not defined(windows) or not (existsEnv"UCPU" or getEnv"UCPU" == "i686"): @@ -243,31 +262,17 @@ task test_no_gmp, "Run tests that don't require GMP": runBench("bench_pairing_bls12_381") runBench("bench_pairing_bn254_nogami") runBench("bench_pairing_bn254_snarks") + runBench("bench_sha256") task test_parallel, "Run all tests in parallel (via GNU parallel)": # -d:testingCurves is configured in a *.nim.cfg for convenience - let cmdFile = true # open(buildParallel, mode = fmWrite) # Nimscript doesn't support IO :/ - exec "> " & buildParallel - - for td in testDesc: - if td.path in useDebug: - test "-d:debugConstantine", td.path, cmdFile - else: - test "", td.path, cmdFile - - # cmdFile.close() - # Execute everything in parallel with GNU parallel + clearParallelBuild() + runTests(requireGMP = true, dumpCmdFile = true) exec "parallel --keep-order --group < " & buildParallel - exec "> " & buildParallel if sizeof(int) == 8: # 32-bit tests on 64-bit arch - for td in testDesc: - if td.path in useDebug: - test "-d:Constantine32 -d:debugConstantine", td.path, cmdFile - else: - test "-d:Constantine32", td.path, cmdFile - # cmdFile.close() - # Execute everything in parallel with GNU parallel + clearParallelBuild() + runTests(requireGMP = true, dumpCmdFile = true, test32bit = true) exec "parallel --keep-order --group < " & buildParallel # Now run the benchmarks @@ -286,31 +291,18 @@ task test_parallel, "Run all tests in parallel (via GNU parallel)": runBench("bench_pairing_bls12_381") runBench("bench_pairing_bn254_nogami") runBench("bench_pairing_bn254_snarks") + runBench("bench_sha256") task test_parallel_no_assembler, "Run all tests (without macro assembler) in parallel (via GNU parallel)": # -d:testingCurves is configured in a *.nim.cfg for convenience - let cmdFile = true # open(buildParallel, mode = fmWrite) # Nimscript doesn't support IO :/ - exec "> " & buildParallel - - for td in testDesc: - if td.path in useDebug: - test "-d:debugConstantine -d:ConstantineASM=false", td.path, cmdFile - else: - test " -d:ConstantineASM=false", td.path, cmdFile - - # cmdFile.close() - # Execute everything in parallel with GNU parallel + clearParallelBuild() + runTests(requireGMP = true, dumpCmdFile = true, testASM = false) exec "parallel --keep-order --group < " & buildParallel exec "> " & buildParallel if sizeof(int) == 8: # 32-bit tests on 64-bit arch - for td in testDesc: - if td.path in useDebug: - test "-d:Constantine32 -d:debugConstantine -d:ConstantineASM=false", td.path, cmdFile - else: - test "-d:Constantine32 -d:ConstantineASM=false", td.path, cmdFile - # cmdFile.close() - # Execute everything in parallel with GNU parallel + clearParallelBuild() + runTests(requireGMP = true, dumpCmdFile = true, test32bit = true, testASM = false) exec "parallel --keep-order --group < " & buildParallel # Now run the benchmarks @@ -329,33 +321,17 @@ task test_parallel_no_assembler, "Run all tests (without macro assembler) in par runBench("bench_pairing_bls12_381") runBench("bench_pairing_bn254_nogami") runBench("bench_pairing_bn254_snarks") + runBench("bench_sha256") task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)": # -d:testingCurves is configured in a *.nim.cfg for convenience - let cmdFile = true # open(buildParallel, mode = fmWrite) # Nimscript doesn't support IO :/ - exec "> " & buildParallel - - for td in testDesc: - if not td.useGMP: - if td.path in useDebug: - test "-d:debugConstantine", td.path, cmdFile - else: - test "", td.path, cmdFile - - # 
cmdFile.close() - # Execute everything in parallel with GNU parallel + clearParallelBuild() + runTests(requireGMP = false, dumpCmdFile = true) exec "parallel --keep-order --group < " & buildParallel - exec "> " & buildParallel if sizeof(int) == 8: # 32-bit tests on 64-bit arch - for td in testDesc: - if not td.useGMP: - if td.path in useDebug: - test "-d:Constantine32 -d:debugConstantine", td.path, cmdFile - else: - test "-d:Constantine32", td.path, cmdFile - # cmdFile.close() - # Execute everything in parallel with GNU parallel + clearParallelBuild() + runTests(requireGMP = false, dumpCmdFile = true, test32bit = true) exec "parallel --keep-order --group < " & buildParallel # Now run the benchmarks @@ -374,33 +350,18 @@ task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)": runBench("bench_pairing_bls12_381") runBench("bench_pairing_bn254_nogami") runBench("bench_pairing_bn254_snarks") + runBench("bench_sha256") task test_parallel_no_gmp_no_assembler, "Run all tests in parallel (via GNU parallel)": # -d:testingCurves is configured in a *.nim.cfg for convenience - let cmdFile = true # open(buildParallel, mode = fmWrite) # Nimscript doesn't support IO :/ - exec "> " & buildParallel - - for td in testDesc: - if not td.useGMP: - if td.path in useDebug: - test "-d:debugConstantine -d:ConstantineASM=false", td.path, cmdFile - else: - test "-d:ConstantineASM=false", td.path, cmdFile - - # cmdFile.close() - # Execute everything in parallel with GNU parallel + clearParallelBuild() + runTests(requireGMP = false, dumpCmdFile = true, testASM = false) exec "parallel --keep-order --group < " & buildParallel exec "> " & buildParallel if sizeof(int) == 8: # 32-bit tests on 64-bit arch - for td in testDesc: - if not td.useGMP: - if td.path in useDebug: - test "-d:Constantine32 -d:debugConstantine", td.path, cmdFile - else: - test "-d:Constantine32", td.path, cmdFile - # cmdFile.close() - # Execute everything in parallel with GNU parallel + clearParallelBuild() + runTests(requireGMP = false, dumpCmdFile = true, test32bit = true, testASM = false) exec "parallel --keep-order --group < " & buildParallel # Now run the benchmarks @@ -419,6 +380,7 @@ task test_parallel_no_gmp_no_assembler, "Run all tests in parallel (via GNU para runBench("bench_pairing_bls12_381") runBench("bench_pairing_bn254_nogami") runBench("bench_pairing_bn254_snarks") + runBench("bench_sha256") task bench_fp, "Run benchmark 𝔽p with your default compiler": runBench("bench_fp") @@ -599,3 +561,6 @@ task bench_pairing_bn254_snarks_gcc_noasm, "Run pairings benchmarks for BN254-Sn task bench_pairing_bn254_snarks_clang_noasm, "Run pairings benchmarks for BN254-Snarks - Clang no Assembly": runBench("bench_pairing_bn254_snarks", "clang", useAsm = false) + +task bench_sha256, "Run SHA256 benchmarks": + runBench("bench_sha256") diff --git a/constantine/hashes/h_sha256.nim b/constantine/hashes/h_sha256.nim new file mode 100644 index 0000000..1eea6d2 --- /dev/null +++ b/constantine/hashes/h_sha256.nim @@ -0,0 +1,348 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+
+import
+  ../config/common,
+  ../io/endians
+
+# SHA256, a hash function from the SHA2 family
+# --------------------------------------------------------------------------------
+#
+# References:
+# - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
+# - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634
+# - Intel optimization https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/sha-256-implementations-paper.pdf
+# - Parallelizing message schedules
+#   to accelerate the computations of hash functions
+#   Shay Gueron, Vlad Krasnov, 2012
+#   https://eprint.iacr.org/2012/067.pdf
+#
+# Vectors:
+# - https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf
+
+# Types and constants
+# ----------------------------------------------------------------
+
+const
+  DigestSize = 32
+  BlockSize = 64
+  HashSize = DigestSize div sizeof(uint32) # 8
+
+type
+  Sha256Context* = object
+    ## Align to 64 for cache line and SIMD friendliness
+    H{.align: 64}: array[HashSize, uint32]
+    buf{.align: 64}: array[BlockSize, byte]
+    msgLen: uint64
+    bufIdx: uint8
+
+  sha256* = Sha256Context
+
+# Internal
+# ----------------------------------------------------------------
+# TODO: vectorized implementations
+
+# No exceptions allowed in core cryptographic operations
+{.push raises: [].}
+{.push checks: off.}
+
+template rotr(x, n: uint32): uint32 =
+  ## Rotate right the bits
+  # We always use it with constants in 0 ..< 32
+  # so no undefined behaviour.
+  (x shr n) or (x shl (32 - n))
+
+template ch(x, y, z: uint32): uint32 =
+  ## "Choose" function of SHA256
+  ## Choose bit i from yi or zi depending on xi
+  when false: # Spec FIPS 180-4
+    (x and y) xor (not(x) and z)
+  else: # RFC4634
+    ((x and (y xor z)) xor z)
+
+template maj(x, y, z: uint32): uint32 =
+  ## "Majority" function of SHA256
+  when false: # Spec FIPS 180-4
+    (x and y) xor (x and z) xor (y and z)
+  else: # RFC4634
+    (x and (y or z)) or (y and z)
+
+template S0(x: uint32): uint32 =
+  # Σ₀
+  rotr(x, 2) xor rotr(x, 13) xor rotr(x, 22)
+
+template S1(x: uint32): uint32 =
+  # Σ₁
+  rotr(x, 6) xor rotr(x, 11) xor rotr(x, 25)
+
+template s0(x: uint32): uint32 =
+  # σ₀
+  rotr(x, 7) xor rotr(x, 18) xor (x shr 3)
+
+template s1(x: uint32): uint32 =
+  # σ₁
+  rotr(x, 17) xor rotr(x, 19) xor (x shr 10)
+
+func setZero[N](a: var array[N, SomeNumber]){.inline.} =
+  for i in 0 ..< a.len:
+    a[i] = 0
+
+func hashMessageBlocks[T: byte|char](
+       H: var array[HashSize, uint32],
+       message: openarray[T]): uint =
+  ## Hash a message block by block
+  ## Sha256 block size is 64 bytes hence
+  ## a message is processed 64 bytes at a time.
+  ## FIPS.180-4 6.2.2.
SHA-256 Hash Computation + + result = 0 + let numBlocks = message.len.uint div BlockSize + if numBlocks == 0: + return 0 + + const K256 = [ + 0x428a2f98'u32, 0x71374491'u32, 0xb5c0fbcf'u32, 0xe9b5dba5'u32, 0x3956c25b'u32, 0x59f111f1'u32, 0x923f82a4'u32, 0xab1c5ed5'u32, + 0xd807aa98'u32, 0x12835b01'u32, 0x243185be'u32, 0x550c7dc3'u32, 0x72be5d74'u32, 0x80deb1fe'u32, 0x9bdc06a7'u32, 0xc19bf174'u32, + 0xe49b69c1'u32, 0xefbe4786'u32, 0x0fc19dc6'u32, 0x240ca1cc'u32, 0x2de92c6f'u32, 0x4a7484aa'u32, 0x5cb0a9dc'u32, 0x76f988da'u32, + 0x983e5152'u32, 0xa831c66d'u32, 0xb00327c8'u32, 0xbf597fc7'u32, 0xc6e00bf3'u32, 0xd5a79147'u32, 0x06ca6351'u32, 0x14292967'u32, + 0x27b70a85'u32, 0x2e1b2138'u32, 0x4d2c6dfc'u32, 0x53380d13'u32, 0x650a7354'u32, 0x766a0abb'u32, 0x81c2c92e'u32, 0x92722c85'u32, + 0xa2bfe8a1'u32, 0xa81a664b'u32, 0xc24b8b70'u32, 0xc76c51a3'u32, 0xd192e819'u32, 0xd6990624'u32, 0xf40e3585'u32, 0x106aa070'u32, + 0x19a4c116'u32, 0x1e376c08'u32, 0x2748774c'u32, 0x34b0bcb5'u32, 0x391c0cb3'u32, 0x4ed8aa4a'u32, 0x5b9cca4f'u32, 0x682e6ff3'u32, + 0x748f82ee'u32, 0x78a5636f'u32, 0x84c87814'u32, 0x8cc70208'u32, 0x90befffa'u32, 0xa4506ceb'u32, 0xbef9a3f7'u32, 0xc67178f2'u32 + ] + + var + a = H[0] + b = H[1] + c = H[2] + d = H[3] + e = H[4] + f = H[5] + g = H[6] + h = H[7] + + for _ in 0 ..< numBlocks: + # The first 16 bytes have different handling + # from bytes 16..<64. + # Using an array[64, uint32] will span it + # across 8 cache lines impacting performance + + # Workspace with message schedule Wₜ + var W{.noInit.}: array[16, uint32] + var t = 0'u32 + while t < 16: # Wₜ = Mⁱₜ + W[t].parseFromBlob(message, result, bigEndian) + let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t] + let T2 = S0(a) + maj(a, b, c) + h = g + g = f + f = e + e = d + T1 + d = c + c = b + b = a + a = T1+T2 + + t += 1 + + while t < 64: + W[t mod 16] += s1(W[(t-2) mod 16]) + + W[(t-7) mod 16] + + s0(W[(t-15) mod 16]) + let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t mod 16] + let T2 = S0(a) + maj(a, b, c) + h = g + g = f + f = e + e = d + T1 + d = c + c = b + b = a + a = T1+T2 + + t += 1 + + a += H[0]; H[0] = a + b += H[1]; H[1] = b + c += H[2]; H[2] = c + d += H[3]; H[3] = d + e += H[4]; H[4] = e + f += H[5]; H[5] = f + g += H[6]; H[6] = g + h += H[7]; H[7] = h + +func dumpHash( + digest: var array[DigestSize, byte], + H: array[HashSize, uint32]) = + ## Convert the internal hash into a message digest + var dstIdx = 0'u + for i in 0 ..< H.len: + digest.dumpRawInt(H[i], dstIdx, bigEndian) + dstIdx += uint sizeof(uint32) + +func copy[N: static int, T: byte|char]( + dst: var array[N, byte], + dStart: SomeInteger, + src: openArray[T], + sStart: SomeInteger, + len: SomeInteger + ) = + ## Copy dst[dStart ..< dStart+len] = src[sStart ..< sStart+len] + ## Unlike the standard library, this cannot throw + ## even a defect. 
+ ## It also handles copy of char into byte arrays + debug: + doAssert 0 <= dStart and dStart+len <= dst.len.uint + doAssert 0 <= sStart and sStart+len <= src.len.uint + + for i in 0 ..< len: + dst[dStart + i] = byte src[sStart + i] + +func hashBuffer(ctx: var Sha256Context) = + discard ctx.H.hashMessageBlocks(ctx.buf) + ctx.buf.setZero() + ctx.bufIdx = 0 + +# Public API +# ---------------------------------------------------------------- + +func init*(ctx: var Sha256Context) = + ## Initialize or reinitialize a Sha256 context + + ctx.msgLen = 0 + ctx.buf.setZero() + ctx.bufIdx = 0 + + ctx.H[0] = 0x6a09e667'u32; + ctx.H[1] = 0xbb67ae85'u32; + ctx.H[2] = 0x3c6ef372'u32; + ctx.H[3] = 0xa54ff53a'u32; + ctx.H[4] = 0x510e527f'u32; + ctx.H[5] = 0x9b05688c'u32; + ctx.H[6] = 0x1f83d9ab'u32; + ctx.H[7] = 0x5be0cd19'u32; + +func update*[T: char|byte](ctx: var Sha256Context, message: openarray[T]) = + ## Append a message to a SHA256 context + ## for incremental SHA256 computation + ## + ## Security note: the tail of your message might be stored + ## in an internal buffer. + ## if sensitive content is used, ensure that + ## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible. + ## Additionally ensure that the message(s) passed were stored + ## in memory considered secure for your threat model. + ## + ## For passwords and secret keys, you MUST NOT use raw SHA-256 + ## use a Key Derivation Function instead (KDF) + + debug: + doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len + for i in ctx.bufIdx ..< ctx.buf.len: + doAssert ctx.buf[i] == 0 + + var # Message processing state machine + cur = 0'u + bytesLeft = message.len.uint + + ctx.msgLen += bytesLeft + + if ctx.bufIdx != 0: # Previous partial update + let bufIdx = ctx.bufIdx.uint + let free = ctx.buf.sizeof().uint - bufIdx + + if free > bytesLeft: + # Enough free space, store in buffer + ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = bytesLeft) + ctx.bufIdx += bytesLeft.uint8 + return + else: + # Fill the buffer and do one sha256 hash + ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free) + ctx.hashBuffer() + + # Update message state for further processing + cur = free + bytesLeft -= free + + # Process n blocks (64 byte each) + let consumed = ctx.H.hashMessageBlocks( + message.toOpenArray(int cur, message.len-1)) + cur += consumed + bytesLeft -= consumed + + if bytesLeft != 0: + # Store the tail in buffer + debug: # TODO: state machine formal verification - https://nim-lang.org/docs/drnim.html + doAssert ctx.bufIdx == 0 + doAssert cur + bytesLeft == message.len.uint + + ctx.buf.copy(dStart = 0'u, message, sStart = cur, len = bytesLeft) + ctx.bufIdx = uint8 bytesLeft + +func finish*(ctx: var Sha256Context, digest: var array[32, byte]) = + ## Finalize a SHA256 computation and output the + ## message digest to the `digest` buffer. + ## + ## Security note: this does not clear the internal buffer. + ## if sensitive content is used, use "ctx.clear()" + ## and also make sure that the message(s) passed were stored + ## in memory considered secure for your threat model. 
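+  ##
+  ## Padding illustration: for the 3-byte message "abc", `bufIdx` is 3, so the
+  ## 0x80 marker byte is written at offset 3, offsets 4..55 remain zero, and
+  ## the message length in bits (24) is stored big-endian in the last 8 bytes
+  ## of the block before the final compression.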
+  ##
+  ## For passwords and secret keys, you MUST NOT use raw SHA-256
+  ## use a Key Derivation Function instead (KDF)
+
+  debug:
+    doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len
+    for i in ctx.bufIdx ..< ctx.buf.len:
+      doAssert ctx.buf[i] == 0
+
+  # Add '1' bit at the end of the message (+7 zero bits)
+  ctx.buf[ctx.bufIdx] = 0b1000_0000
+
+  # Add k bits so that msgLenBits + 1 + k ≡ 448 mod 512
+  # Hence in bytes msgLen + 1 + K ≡ 56 mod 64
+  const padZone = 56
+  if ctx.bufIdx >= padZone:
+    # We are in the 56..<64 mod 64 byte count
+    # and need to rollover to 0
+    ctx.hashBuffer()
+
+  let lenInBits = ctx.msgLen.uint64 * 8
+  ctx.buf.dumpRawInt(lenInBits, padZone, bigEndian)
+  discard ctx.H.hashMessageBlocks(ctx.buf)
+  digest.dumpHash(ctx.H)
+
+func clear*(ctx: var Sha256Context) =
+  ## Clear the context internal buffers
+  ## Security note:
+  ## For passwords and secret keys, you MUST NOT use raw SHA-256
+  ## use a Key Derivation Function instead (KDF)
+  # TODO: ensure compiler cannot optimize the code away
+  ctx.buf.setZero()
+
+func hash*[T: char|byte](
+       HashKind: type sha256,
+       digest: var array[32, byte],
+       message: openarray[T],
+       clearMem = false) =
+  ## Produce a SHA256 digest from a message
+  var ctx {.noInit.}: HashKind
+  ctx.init()
+  ctx.update(message)
+  ctx.finish(digest)
+
+  if clearMem:
+    ctx.clear()
+
+func hash*[T: char|byte](
+       HashKind: type sha256,
+       message: openarray[T],
+       clearMem = false): array[32, byte] =
+  ## Produce a SHA256 digest from a message
+  HashKind.hash(result, message, clearMem)
diff --git a/constantine/io/endians.nim b/constantine/io/endians.nim
new file mode 100644
index 0000000..bee18bd
--- /dev/null
+++ b/constantine/io/endians.nim
@@ -0,0 +1,78 @@
+# Constantine
+# Copyright (c) 2018-2019 Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import ../config/common
+
+# Performance critical: we don't want bound checks here.
+# So no checks, and we avoid signed int to ensure no exceptions.
+# TODO: Nim formal verification: https://nim-lang.org/docs/drnim.html
+{.push checks:off, raises: [].}
+
+template toByte*(x: SomeUnsignedInt): byte =
+  ## At compile-time, conversion to bytes checks the range
+  ## we want to ensure this is done at the register level
+  ## at runtime in a single "mov byte" instruction
+  when nimvm:
+    byte(x and 0xFF)
+  else:
+    byte(x)
+
+func parseFromBlob*[T: byte|char](
+       dst: var SomeUnsignedInt,
+       src: openArray[T],
+       cursor: var uint, endian: static Endianness) {.inline.} =
+  ## Read an unsigned integer from a raw binary blob.
+  ## The `cursor` represents the current index in the array and is updated
+  ## by N bytes where N is the size of `dst` type in bytes.
+  ## The binary blob is interpreted as:
+  ## - an array of words traversed from 0 ..< len (little-endian), via an incremented `cursor`
+  ## - with each word being of `endian` ordering for deserialization purpose.
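+  ##
+  ## For example, with `endian = bigEndian`, a `dst` of type uint32 and
+  ## `src = [byte 0x01, 0x02, 0x03, 0x04]`, a call starting at `cursor = 0`
+  ## sets `dst = 0x01020304'u32` and advances `cursor` to 4.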
+ debug: + doAssert 0 <= cursor and cursor < src.len.uint + doAssert cursor + sizeof(dst).uint <= src.len.uint, + "cursor (" & $cursor & ") + sizeof(dst) (" & $sizeof(dst) & + ") <= src.len (" & $src.len & ")" + + type U = typeof(dst) + const L = sizeof(dst) + + var accum: U = 0 + when endian == littleEndian: + for i in 0'u ..< L: + accum = accum or (U(src[cursor+i]) shl (i * 8)) + else: + for i in 0'u ..< L: + accum = accum or (U(src[cursor+i]) shl ((L - 1 - i) * 8)) + dst = accum + cursor.inc(L) + +func dumpRawInt*[T: byte|char]( + dst: var openArray[T], + src: SomeUnsignedInt, + cursor: uint, endian: static Endianness) {.inline.} = + ## Dump an integer into raw binary form + ## The `cursor` represents the current index in the array and is updated + ## by N bytes where N is the size of `src` type in bytes. + ## The binary blob is interpreted as: + ## - an array of words traversed from 0 ..< len (little-endian), via an incremented `cursor` + ## - with each word being of `endian` ordering for deserialization purpose. + debug: + doAssert 0 <= cursor and cursor < dst.len.uint + doAssert cursor + sizeof(src).uint <= dst.len.uint, + "cursor (" & $cursor & ") + sizeof(src) (" & $sizeof(src) & + ") <= dst.len (" & $dst.len & ")" + + type U = typeof(src) + const L = uint sizeof(src) + + when endian == littleEndian: + for i in 0'u ..< L: + dst[cursor+i] = toByte(src shr (i * 8)) + else: + for i in 0'u ..< L: + dst[cursor+i] = toByte(src shr ((L-i-1) * 8)) diff --git a/constantine/io/io_bigints.nim b/constantine/io/io_bigints.nim index aa8fa29..11d8cac 100644 --- a/constantine/io/io_bigints.nim +++ b/constantine/io/io_bigints.nim @@ -12,7 +12,8 @@ import ../primitives/constant_time, - ../config/[common, type_bigint] + ../config/[common, type_bigint], + ./endians # ############################################################ # @@ -152,24 +153,17 @@ func fromUint*( # # ############################################################ -template toByte(x: SomeUnsignedInt): byte = - ## At compile-time, conversion to bytes checks the range - ## we want to ensure this is done at the register level - ## at runtime in a single "mov byte" instruction - when nimvm: - byte(x and 0xFF) - else: - byte(x) - template blobFrom(dst: var openArray[byte], src: SomeUnsignedInt, startIdx: int, endian: static Endianness) = ## Write an integer into a raw binary blob ## Swapping endianness if needed + ## startidx is the first written array item if littleEndian is requested + ## or the last if bigEndian is requested when endian == cpuEndian: for i in 0 ..< sizeof(src): - dst[startIdx+i] = toByte((src shr (i * 8))) + dst[startIdx+i] = toByte(src shr (i * 8)) else: for i in 0 ..< sizeof(src): - dst[startIdx+sizeof(src)-1-i] = toByte((src shr (i * 8))) + dst[startIdx+sizeof(src)-1-i] = toByte(src shr (i * 8)) func exportRawUintLE( dst: var openarray[byte], diff --git a/helpers/prng_unsafe.nim b/helpers/prng_unsafe.nim index 2a7d7c6..9225444 100644 --- a/helpers/prng_unsafe.nim +++ b/helpers/prng_unsafe.nim @@ -364,6 +364,14 @@ func random_long01Seq_with_randZ*(rng: var RngState, T: typedesc[ECP_ShortW_Proj ## Skewed towards long bitstrings of 0 or 1 rng.random_long01Seq_with_randZ(result) +# Byte sequences +# ------------------------------------------------------------ + +func random_byte_seq*(rng: var RngState, length: int): seq[byte] = + result.newSeq(length) + for b in result.mitems: + b = byte rng.next() + # Sanity checks # ------------------------------------------------------------ diff --git 
a/tests/t_hash_sha256_vs_openssl.nim b/tests/t_hash_sha256_vs_openssl.nim new file mode 100644 index 0000000..2c48bc4 --- /dev/null +++ b/tests/t_hash_sha256_vs_openssl.nim @@ -0,0 +1,115 @@ +import + # Internals + ../constantine/hashes/h_sha256, + # Helpers + ../helpers/prng_unsafe, + # Third-party + stew/byteutils + +proc SHA256[T: byte|char]( + msg: openarray[T], + digest: ptr array[32, byte] = nil + ): ptr array[32, byte] {.cdecl, dynlib: "libssl.so", importc.} + +proc SHA256_OpenSSL[T: byte|char]( + digest: var array[32, byte], + s: openarray[T]) = + discard SHA256(s, digest.addr) + +echo "\n------------------------------------------------------\n" +const SmallSizeIters = 128 +const LargeSizeIters = 10 + +proc sanityABC = + var bufCt: array[32, byte] + let msg = "abc" + + let hashed = hexToByteArray[32]( + "BA7816BF8F01CFEA414140DE5DAE2223" & + "B00361A396177A9CB410FF61F20015AD") + + sha256.hash(bufCt, msg) + + doAssert bufCt == hashed + +proc sanityABC2 = + var bufCt: array[32, byte] + let msg = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" + + let hashed = hexToByteArray[32]( + "248D6A61D20638B8E5C026930C3E6039" & + "A33CE45964FF2167F6ECEDD419DB06C1") + + sha256.hash(bufCt, msg) + + doAssert bufCt == hashed + +proc innerTest(rng: var RngState, sizeRange: Slice[int]) = + let size = rng.random_unsafe(sizeRange) + let msg = rng.random_byte_seq(size) + + var bufCt, bufOssl: array[32, byte] + + sha256.hash(bufCt, msg) + SHA256_OpenSSL(bufOssl, msg) + doAssert bufCt == bufOssl + +proc chunkTest(rng: var RngState, sizeRange: Slice[int]) = + let size = rng.random_unsafe(sizeRange) + let msg = rng.random_byte_seq(size) + + let chunkSize = rng.random_unsafe(2 ..< 20) + + var bufOnePass: array[32, byte] + sha256.hash(bufOnePass, msg) + + var bufChunked: array[32, byte] + let maxChunk = max(2, sizeRange.b div 10) # Consume up to 10% at once + + var ctx: Sha256Context + ctx.init() + var cur = 0 + while size - cur > 0: + let chunkSize = rng.random_unsafe(0 ..< maxChunk) + let stop = min(cur+chunkSize-1, size-1) + let consumed = stop-cur+1 + ctx.update(msg.toOpenArray(cur, stop)) + cur += consumed + + ctx.finish(bufChunked) + + doAssert bufOnePass == bufChunked + +proc main() = + echo "SHA256 - sanity checks" + sanityABC() + sanityABC2() + + echo "SHA256 - Starting differential testing vs OpenSSL" + + var rng: RngState + rng.seed(0xFACADE) + + echo "SHA256 - 0 <= size < 64 - exhaustive" + for i in 0 ..< 64: + rng.innerTest(i .. i) + + echo "SHA256 - 0 <= size < 64 - exhaustive chunked" + for i in 0 ..< 64: + rng.chunkTest(i .. i) + + echo "SHA256 - 64 <= size < 1024B" + for _ in 0 ..< SmallSizeIters: + rng.innerTest(0 ..< 1024) + + echo "SHA256 - 64 <= size < 1024B - chunked" + for _ in 0 ..< SmallSizeIters: + rng.chunkTest(0 ..< 1024) + + echo "SHA256 - 1MB <= size < 50MB" + for _ in 0 ..< LargeSizeIters: + rng.innerTest(1_000_000 ..< 50_000_000) + + echo "SHA256 - Differential testing vs OpenSSL - SUCCESS" + +main()
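As a quick reference for reviewers, here is a minimal usage sketch of the hashing API this patch introduces (one-shot and streaming), mirroring what tests/t_hash_sha256_vs_openssl.nim exercises; the import path is relative and should be adjusted to the caller's location:

import ../constantine/hashes/h_sha256

var digest: array[32, byte]

# One-shot hashing; clearMem = true wipes the context's internal message buffer afterwards
sha256.hash(digest, "abc", clearMem = true)

# Streaming hashing over arbitrary chunk boundaries
var ctx: Sha256Context
ctx.init()
ctx.update("ab")
ctx.update("c")
ctx.finish(digest)
ctx.clear()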