diff --git a/benchmarks/bench_sha256.nim b/benchmarks/bench_sha256.nim index e6a31d3..6f2cc98 100644 --- a/benchmarks/bench_sha256.nim +++ b/benchmarks/bench_sha256.nim @@ -48,9 +48,9 @@ proc report(op: string, bytes: int, startTime, stopTime: MonoTime, startClk, sto when SupportsGetTicks: let cycles = (stopClk - startClk) div iters let cyclePerByte = cycles.float64 / bytes.float64 - echo &"{op:<30} {throughput:>15.3f} ops/s {ns:>9} ns/op {cycles:>10} cycles {cyclePerByte:>5.2f} cycles/byte" + echo &"{op:<50} {throughput:>15.3f} ops/s {ns:>9} ns/op {cycles:>10} cycles {cyclePerByte:>5.2f} cycles/byte" else: - echo &"{op:<30} {throughput:>15.3f} ops/s {ns:>9} ns/op" + echo &"{op:<50} {throughput:>15.3f} ops/s {ns:>9} ns/op" template bench(op: string, bytes: int, iters: int, body: untyped): untyped = measure(iters, startTime, stopTime, startClk, stopClk, body) @@ -63,41 +63,23 @@ proc benchSHA256_constantine[T](msg: openarray[T], msgComment: string, iters: in proc benchSHA256_openssl[T](msg: openarray[T], msgComment: string, iters: int) = var digest: array[32, byte] - bench("SHA256 - OpenSSL - " & msgComment, msg.len, iters): + bench("SHA256 - OpenSSL - " & msgComment, msg.len, iters): SHA256_OpenSSL(digest, msg) when isMainModule: proc main() = - block: - let msg32B = rng.random_byte_seq(32) - benchSHA256_constantine(msg32B, "32B", 100) - benchSHA256_openssl(msg32B, "32B", 100) - block: - let msg64B = rng.random_byte_seq(64) - benchSHA256_constantine(msg64B, "64B", 100) - benchSHA256_openssl(msg64B, "64B", 100) - block: - let msg128B = rng.random_byte_seq(128) - benchSHA256_constantine(msg128B, "128B", 100) - benchSHA256_openssl(msg128B, "128B", 100) - block: - let msg576B = rng.random_byte_seq(576) - benchSHA256_constantine(msg576B, "576B", 50) - benchSHA256_openssl(msg576B, "576B", 50) - block: - let msg8192B = rng.random_byte_seq(8192) - benchSHA256_constantine(msg8192B, "8192B", 25) - benchSHA256_openssl(msg8192B, "8192B", 25) - block: - let msg1MB = rng.random_byte_seq(1_000_000) - benchSHA256_constantine(msg1MB, "1MB", 16) - benchSHA256_openssl(msg1MB, "1MB", 16) - block: - let msg10MB = rng.random_byte_seq(10_000_000) - benchSHA256_constantine(msg10MB, "10MB", 16) - benchSHA256_openssl(msg10MB, "10MB", 16) - block: - let msg100MB = rng.random_byte_seq(100_000_000) - benchSHA256_constantine(msg100MB, "100MB", 3) - benchSHA256_openssl(msg100MB, "100MB", 3) + const sizes = [ + 32, 64, 128, 256, + 1024, 4096, 16384, 65536, + 1_000_000, 10_000_000 + ] + + const target_cycles = 1_000_000_000'i64 + const worst_cycles_per_bytes = 25'i64 + for s in sizes: + let msg = rng.random_byte_seq(s) + let iters = int(target_cycles div (s.int64 * worst_cycles_per_bytes)) + benchSHA256_constantine(msg, $s & "B", iters) + benchSHA256_openssl(msg, $s & "B", iters) + main() diff --git a/constantine/hashes/h_sha256.nim b/constantine/hashes/h_sha256.nim index 231776b..38b5b0e 100644 --- a/constantine/hashes/h_sha256.nim +++ b/constantine/hashes/h_sha256.nim @@ -7,8 +7,13 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
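Side note on the benchmark change above: instead of a hand-picked iteration count per message size, the loop now budgets roughly one billion cycles per measurement, assuming a pessimistic 25 cycles/byte. A minimal standalone sketch of that sizing formula (same constants and a subset of the sizes used in bench_sha256.nim; output values are just the arithmetic spelled out):

    # Sketch: how the per-size iteration count is derived in the new benchmark loop
    const target_cycles = 1_000_000_000'i64
    const worst_cycles_per_bytes = 25'i64
    for s in [32, 1024, 1_000_000, 10_000_000]:
      let iters = int(target_cycles div (s.int64 * worst_cycles_per_bytes))
      echo s, " B: ", iters, " iterations"  # 32 B: 1250000 iterations, 10 MB: 4 iterations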
import - std/macros, - ../platforms/[abstractions, endians] + ../platforms/[abstractions, endians], + ./sha256/sha256_generic + +when UseASM_X86_32: + import ./sha256/[ + sha256_x86_ssse3, + sha256_x86_shaext] # SHA256, a hash function from the SHA2 family # -------------------------------------------------------------------------------- @@ -16,11 +21,6 @@ import # References: # - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf # - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634 -# - Intel optimization https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/sha-256-implementations-paper.pdf -# - Parallelizing message schedules -# to accelerate the computations of hash functions -# Shay Gueron, Vlad Krasnov, 2012 -# https://eprint.iacr.org/2012/067.pdf # # Vectors: # - https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf @@ -28,277 +28,74 @@ import # Types and constants # ---------------------------------------------------------------- -const - DigestSize = 32 - BlockSize = 64 - HashSize = DigestSize div sizeof(uint32) # 8 - type Sha256Context* = object ## Align to 64 for cache line and SIMD friendliness - H{.align: 64}: array[HashSize, uint32] + s{.align: 64}: Sha256_state buf{.align: 64}: array[BlockSize, byte] msgLen: uint64 - bufIdx: uint8 sha256* = Sha256Context -# Internal +# Internals # ---------------------------------------------------------------- -# TODO: vectorized implementations # No exceptions allowed in core cryptographic operations {.push raises: [].} {.push checks: off.} -template rotr(x, n: uint32): uint32 = - ## Rotate right the bits - # We always use it with constants in 0 ..< 32 - # so undefined behaviour. 
- (x shr n) or (x shl (32 - n)) - -template ch(x, y, z: uint32): uint32 = - ## "Choose" function of SHA256 - ## Choose bit i from yi or zi depending on xi - when false: # Spec FIPS 180-4 - (x and y) xor (not(x) and z) - else: # RFC4634 - ((x and (y xor z)) xor z) - -template maj(x, y, z: uint32): uint32 = - ## "Majority" function of SHA256 - when false: # Spec FIPS 180-4 - (x and y) xor (x and z) xor (y and z) - else: # RFC4634 - (x and (y or z)) or (y and z) - -template S0(x: uint32): uint32 = - # Σ₀ - rotr(x, 2) xor rotr(x, 13) xor rotr(x, 22) - -template S1(x: uint32): uint32 = - # Σ₁ - rotr(x, 6) xor rotr(x, 11) xor rotr(x, 25) - -template s0(x: uint32): uint32 = - # σ₀ - rotr(x, 7) xor rotr(x, 18) xor (x shr 3) - -template s1(x: uint32): uint32 = - # σ₁ - rotr(x, 17) xor rotr(x, 19) xor (x shr 10) - -template u32BE(blob: array[4, byte]): uint32 = - ## Interpret a data blob as a big-endian uint32 - ## This should lower to - when nimvm: - (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32 - else: - when cpuEndian == littleEndian: - (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32 +func hashMessageBlocks( + s: var Sha256_state, + message: ptr UncheckedArray[byte], + numBlocks: uint) = + when UseASM_X86_32: + if ({.noSideEffect.}: hasSha()): + hashMessageBlocks_shaext(s, message, numBlocks) + elif ({.noSideEffect.}: hasSSSE3()): + hashMessageBlocks_ssse3(s, message, numBlocks) else: - cast[uint32](blob) - -template getU32at[T: byte|char](msg: openarray[T], pos: SomeInteger): uint32 = - u32BE(cast[ptr array[4, byte]](msg[pos].unsafeAddr())[]) - -func rotateRight[T](a: var openarray[T], k: int) = - ## Rotate a seuqnce by k - doAssert a.len > 0 - let k = k mod a.len - - for _ in 0 ..< k: - let tmp = a[^1] - for i in countdown(a.len-1, 1): - a[i] = a[i-1] - a[0] = tmp - -macro round(a, b, c, d, e, f, g, h: untyped, t: static int): untyped = - ## Unrolled and allocation efficient SHA256 round - var s = [a, b, c, d, e, f, g, h] - s.rotateRight(t) - let - a = s[0] - b = s[1] - c = s[2] - d = s[3] - e = s[4] - f = s[5] - g = s[6] - h = s[7] - - # W[t] - let w = nnkBracketExpr.newTree( - ident("W"), newLit(t mod 16) - ) - - result = newStmtList() - - if t < 16: - # Reading message phase - let msg = ident"message" - let curBlock = ident"curBlock" - result.add quote do: - `w` = getU32at(`msg`, `curBlock`*64 + `t`*4) + hashMessageBlocks_generic(s, message, numBlocks) else: - # Mixing - - # Wt-2, Wt-7, Wt-15, Wt-16 - let Wtm2 = nnkBracketExpr.newTree( - ident("W"), newLit((t-2) mod 16) - ) - let Wtm7 = nnkBracketExpr.newTree( - ident("W"), newLit((t-7) mod 16) - ) - let Wtm15 = nnkBracketExpr.newTree( - ident("W"), newLit((t-15) mod 16) - ) - # w is Wt-16 - result.add quote do: - `w` += s1(`Wtm2`) + `Wtm7` + s0(`Wtm15`) - - result.add quote do: - let T1 = `h` + S1(`e`) + ch(`e`, `f`, `g`) + K256[`t`] + `w` - let T2 = S0(`a`) + maj(`a`, `b`, `c`) - `d` += T1 - `h` = T1 + T2 - -func hashMessageBlocks[T: byte|char]( - H: var array[HashSize, uint32], - message: openarray[T]): uint = - ## Hash a message block by block - ## Sha256 block size is 64 bytes hence - ## a message will be process 64 by 64 bytes. - ## FIPS.180-4 6.2.2. 
SHA-256 Hash Computation - - result = 0 - let numBlocks = message.len.uint div BlockSize - if numBlocks == 0: - return 0 - - const K256 = [ - 0x428a2f98'u32, 0x71374491'u32, 0xb5c0fbcf'u32, 0xe9b5dba5'u32, 0x3956c25b'u32, 0x59f111f1'u32, 0x923f82a4'u32, 0xab1c5ed5'u32, - 0xd807aa98'u32, 0x12835b01'u32, 0x243185be'u32, 0x550c7dc3'u32, 0x72be5d74'u32, 0x80deb1fe'u32, 0x9bdc06a7'u32, 0xc19bf174'u32, - 0xe49b69c1'u32, 0xefbe4786'u32, 0x0fc19dc6'u32, 0x240ca1cc'u32, 0x2de92c6f'u32, 0x4a7484aa'u32, 0x5cb0a9dc'u32, 0x76f988da'u32, - 0x983e5152'u32, 0xa831c66d'u32, 0xb00327c8'u32, 0xbf597fc7'u32, 0xc6e00bf3'u32, 0xd5a79147'u32, 0x06ca6351'u32, 0x14292967'u32, - 0x27b70a85'u32, 0x2e1b2138'u32, 0x4d2c6dfc'u32, 0x53380d13'u32, 0x650a7354'u32, 0x766a0abb'u32, 0x81c2c92e'u32, 0x92722c85'u32, - 0xa2bfe8a1'u32, 0xa81a664b'u32, 0xc24b8b70'u32, 0xc76c51a3'u32, 0xd192e819'u32, 0xd6990624'u32, 0xf40e3585'u32, 0x106aa070'u32, - 0x19a4c116'u32, 0x1e376c08'u32, 0x2748774c'u32, 0x34b0bcb5'u32, 0x391c0cb3'u32, 0x4ed8aa4a'u32, 0x5b9cca4f'u32, 0x682e6ff3'u32, - 0x748f82ee'u32, 0x78a5636f'u32, 0x84c87814'u32, 0x8cc70208'u32, 0x90befffa'u32, 0xa4506ceb'u32, 0xbef9a3f7'u32, 0xc67178f2'u32 - ] - - var - a = H[0] - b = H[1] - c = H[2] - d = H[3] - e = H[4] - f = H[5] - g = H[6] - h = H[7] - - for curBlock in 0 ..< numBlocks: - # The first 16 bytes have different handling - # from bytes 16..<64. - # Using an array[64, uint32] will span it - # across 8 cache lines impacting performance - - # Workspace with message schedule Wₜ - var W{.noInit.}: array[16, uint32] - - when true: - # Translation of the spec - # This is faster than even OpenSSL for hashing just 32 bytes - # for example for HMAC and HKDF. - var t = 0'u32 - while t < 16: # Wₜ = Mⁱₜ - W[t].parseFromBlob(message, result, bigEndian) - let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t] - let T2 = S0(a) + maj(a, b, c) - h = g - g = f - f = e - e = d + T1 - d = c - c = b - b = a - a = T1+T2 - - t += 1 - - while t < 64: - W[t mod 16] += s1(W[(t-2) mod 16]) + - W[(t-7) mod 16] + - s0(W[(t-15) mod 16]) - let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t mod 16] - let T2 = S0(a) + maj(a, b, c) - h = g - g = f - f = e - e = d + T1 - d = c - c = b - b = a - a = T1+T2 - - t += 1 - else: - # optimized version for large hashes - # For hashing 32B, this is slower than the rough translation - # of spec, unless compiled with -mssse3 (but no vector instructions are used :/) - staticFor t, 0, 64: - round(a, b, c, d, e, f, g, h, t) - result += 64 - - a += H[0]; H[0] = a - b += H[1]; H[1] = b - c += H[2]; H[2] = c - d += H[3]; H[3] = d - e += H[4]; H[4] = e - f += H[5]; H[5] = f - g += H[6]; H[6] = g - h += H[7]; H[7] = h + hashMessageBlocks_generic(s, message, numBlocks) func dumpHash( digest: var array[DigestSize, byte], - H: array[HashSize, uint32]) = + s: Sha256_state) {.inline.} = ## Convert the internal hash into a message digest var dstIdx = 0'u - for i in 0 ..< H.len: - digest.dumpRawInt(H[i], dstIdx, bigEndian) + for i in 0 ..< s.H.len: + digest.dumpRawInt(s.H[i], dstIdx, bigEndian) dstIdx += uint sizeof(uint32) -func hashBuffer(ctx: var Sha256Context) = - discard ctx.H.hashMessageBlocks(ctx.buf) +func hashBuffer(ctx: var Sha256Context) {.inline.} = + ctx.s.hashMessageBlocks(ctx.buf.asUnchecked(), numBlocks = 1) ctx.buf.setZero() - ctx.bufIdx = 0 # Public API # ---------------------------------------------------------------- template digestSize*(H: type sha256): int = ## Returns the output size in bytes - 32 + DigestSize template internalBlockSize*(H: type sha256): int = ## 
Returns the byte size of the hash function ingested blocks - 64 + BlockSize func init*(ctx: var Sha256Context) = ## Initialize or reinitialize a Sha256 context ctx.msgLen = 0 ctx.buf.setZero() - ctx.bufIdx = 0 - ctx.H[0] = 0x6a09e667'u32 - ctx.H[1] = 0xbb67ae85'u32 - ctx.H[2] = 0x3c6ef372'u32 - ctx.H[3] = 0xa54ff53a'u32 - ctx.H[4] = 0x510e527f'u32 - ctx.H[5] = 0x9b05688c'u32 - ctx.H[6] = 0x1f83d9ab'u32 - ctx.H[7] = 0x5be0cd19'u32 + ctx.s.H[0] = 0x6a09e667'u32 + ctx.s.H[1] = 0xbb67ae85'u32 + ctx.s.H[2] = 0x3c6ef372'u32 + ctx.s.H[3] = 0xa54ff53a'u32 + ctx.s.H[4] = 0x510e527f'u32 + ctx.s.H[5] = 0x9b05688c'u32 + ctx.s.H[6] = 0x1f83d9ab'u32 + ctx.s.H[7] = 0x5be0cd19'u32 func initZeroPadded*(ctx: var Sha256Context) = ## Initialize a Sha256 context @@ -312,18 +109,17 @@ func initZeroPadded*(ctx: var Sha256Context) = ctx.msgLen = 64 ctx.buf.setZero() - ctx.bufIdx = 0 - ctx.H[0] = 0xda5698be'u32 - ctx.H[1] = 0x17b9b469'u32 - ctx.H[2] = 0x62335799'u32 - ctx.H[3] = 0x779fbeca'u32 - ctx.H[4] = 0x8ce5d491'u32 - ctx.H[5] = 0xc0d26243'u32 - ctx.H[6] = 0xbafef9ea'u32 - ctx.H[7] = 0x1837a9d8'u32 + ctx.s.H[0] = 0xda5698be'u32 + ctx.s.H[1] = 0x17b9b469'u32 + ctx.s.H[2] = 0x62335799'u32 + ctx.s.H[3] = 0x779fbeca'u32 + ctx.s.H[4] = 0x8ce5d491'u32 + ctx.s.H[5] = 0xc0d26243'u32 + ctx.s.H[6] = 0xbafef9ea'u32 + ctx.s.H[7] = 0x1837a9d8'u32 -func update*[T: char|byte](ctx: var Sha256Context, message: openarray[T]) = +func update*(ctx: var Sha256Context, message: openarray[byte]) = ## Append a message to a SHA256 context ## for incremental SHA256 computation ## @@ -336,53 +132,48 @@ func update*[T: char|byte](ctx: var Sha256Context, message: openarray[T]) = ## ## For passwords and secret keys, you MUST NOT use raw SHA-256 ## use a Key Derivation Function instead (KDF) + + # Message processing state machine + var bufIdx = uint(ctx.msgLen mod BlockSize) + var cur = 0'u + var bytesLeft = message.len.uint - debug: - doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len - for i in ctx.bufIdx ..< ctx.buf.len: - doAssert ctx.buf[i] == 0 - - if message.len == 0: - return - - var # Message processing state machine - cur = 0'u - bytesLeft = message.len.uint - - ctx.msgLen += bytesLeft - - if ctx.bufIdx != 0: # Previous partial update - let bufIdx = ctx.bufIdx.uint - let free = ctx.buf.sizeof().uint - bufIdx - - if free > bytesLeft: - # Enough free space, store in buffer - ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = bytesLeft) - ctx.bufIdx += bytesLeft.uint8 - return - else: - # Fill the buffer and do one sha256 hash - ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free) - ctx.hashBuffer() - - # Update message state for further processing - cur = free - bytesLeft -= free - - # Process n blocks (64 byte each) - let consumed = ctx.H.hashMessageBlocks( - message.toOpenArray(int cur, message.len-1)) - cur += consumed - bytesLeft -= consumed + if bufIdx != 0 and bufIdx+bytesLeft >= BlockSize: + # Previous partial update, fill the buffer and do one sha256 hash + let free = BlockSize - bufIdx + ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free) + ctx.hashBuffer() + bufIdx = 0 + cur = free + bytesLeft -= free + + if bytesLeft >= BlockSize: + # Process n blocks (64 byte each) + let numBlocks = bytesLeft div BlockSize + ctx.s.hashMessageBlocks(message.asUnchecked +% cur, numBlocks) + cur += numBlocks * BlockSize + bytesLeft -= numBlocks * BlockSize if bytesLeft != 0: # Store the tail in buffer - debug: # TODO: state machine formal verification - https://nim-lang.org/docs/drnim.html - doAssert 
ctx.bufIdx == 0 - doAssert cur + bytesLeft == message.len.uint + ctx.buf.copy(dStart = bufIdx, message, sStart = cur, len = bytesLeft) - ctx.buf.copy(dStart = 0'u, message, sStart = cur, len = bytesLeft) - ctx.bufIdx = uint8 bytesLeft + ctx.msgLen += message.len.uint + +func update*(ctx: var Sha256Context, message: openarray[char]) {.inline.} = + ## Append a message to a SHA256 context + ## for incremental SHA256 computation + ## + ## Security note: the tail of your message might be stored + ## in an internal buffer. + ## if sensitive content is used, ensure that + ## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible. + ## Additionally ensure that the message(s) passed were stored + ## in memory considered secure for your threat model. + ## + ## For passwords and secret keys, you MUST NOT use raw SHA-256 + ## use a Key Derivation Function instead (KDF) + ctx.update(message.toOpenArrayByte(message.low, message.high)) func finish*(ctx: var Sha256Context, digest: var array[32, byte]) = ## Finalize a SHA256 computation and output the @@ -396,26 +187,23 @@ func finish*(ctx: var Sha256Context, digest: var array[32, byte]) = ## For passwords and secret keys, you MUST NOT use raw SHA-256 ## use a Key Derivation Function instead (KDF) - debug: - doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len - for i in ctx.bufIdx ..< ctx.buf.len: - doAssert ctx.buf[i] == 0 + let bufIdx = uint(ctx.msgLen mod BlockSize) # Add '1' bit at the end of the message (+7 zero bits) - ctx.buf[ctx.bufIdx] = 0b1000_0000 + ctx.buf[bufIdx] = 0b1000_0000 # Add k bits so that msgLenBits + 1 + k ≡ 448 mod 512 # Hence in bytes msgLen + 1 + K ≡ 56 mod 64 const padZone = 56 - if ctx.bufIdx >= padZone: + if bufIdx >= padZone: # We are in the 56..<64 mod 64 byte count # and need to rollover to 0 ctx.hashBuffer() let lenInBits = ctx.msgLen.uint64 * 8 ctx.buf.dumpRawInt(lenInBits, padZone, bigEndian) - discard ctx.H.hashMessageBlocks(ctx.buf) - digest.dumpHash(ctx.H) + ctx.s.hashMessageBlocks(ctx.buf.asUnchecked(), numBlocks = 1) + digest.dumpHash(ctx.s) func clear*(ctx: var Sha256Context) = ## Clear the context internal buffers @@ -423,7 +211,6 @@ func clear*(ctx: var Sha256Context) = ## For passwords and secret keys, you MUST NOT use raw SHA-256 ## use a Key Derivation Function instead (KDF) # TODO: ensure compiler cannot optimize the code away - ctx.H.setZero() + ctx.s.H.setZero() ctx.buf.setZero() ctx.msgLen = 0 - ctx.bufIdx = 0 \ No newline at end of file diff --git a/constantine/hashes/sha256/sha256_generic.nim b/constantine/hashes/sha256/sha256_generic.nim new file mode 100644 index 0000000..2e8dbd4 --- /dev/null +++ b/constantine/hashes/sha256/sha256_generic.nim @@ -0,0 +1,185 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
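The h_sha256.nim changes above keep the public incremental API intact while the compression core moves into sha256/. For reference, a minimal usage sketch of that interface (imports omitted; the data hashed here is just an illustrative string):

    # Usage sketch of the incremental API exercised above
    var ctx: Sha256Context
    var digest: array[32, byte]
    ctx.init()
    ctx.update("hello ")   # char overload, forwards to the byte overload
    ctx.update("world")
    ctx.finish(digest)
    ctx.clear()            # wipe internal state if the input was sensitive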
+ +import + ../../platforms/primitives + +# SHA256, a hash function from the SHA2 family +# -------------------------------------------------------------------------------- +# +# References: +# - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +# - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634 +# Vectors: +# - https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf + +# No exceptions allowed in core cryptographic operations +{.push raises: [].} +{.push checks: off.} + +# Types & Constants +# ------------------------------------------------ +# The enforced alignment should help the compiler produce optimized code + +type Word* = uint32 + +const + DigestSize* = 32 + BlockSize* = 64 + HashSize* = DigestSize div sizeof(Word) # 8 + +type Sha256_MessageSchedule* = object + w*{.align: 64.}: array[BlockSize div sizeof(Word), Word] + +type Sha256_state* = object + H*{.align: 64.}: array[HashSize, Word] + +const K256* = [ + 0x428a2f98'u32, 0x71374491'u32, 0xb5c0fbcf'u32, 0xe9b5dba5'u32, 0x3956c25b'u32, 0x59f111f1'u32, 0x923f82a4'u32, 0xab1c5ed5'u32, + 0xd807aa98'u32, 0x12835b01'u32, 0x243185be'u32, 0x550c7dc3'u32, 0x72be5d74'u32, 0x80deb1fe'u32, 0x9bdc06a7'u32, 0xc19bf174'u32, + 0xe49b69c1'u32, 0xefbe4786'u32, 0x0fc19dc6'u32, 0x240ca1cc'u32, 0x2de92c6f'u32, 0x4a7484aa'u32, 0x5cb0a9dc'u32, 0x76f988da'u32, + 0x983e5152'u32, 0xa831c66d'u32, 0xb00327c8'u32, 0xbf597fc7'u32, 0xc6e00bf3'u32, 0xd5a79147'u32, 0x06ca6351'u32, 0x14292967'u32, + 0x27b70a85'u32, 0x2e1b2138'u32, 0x4d2c6dfc'u32, 0x53380d13'u32, 0x650a7354'u32, 0x766a0abb'u32, 0x81c2c92e'u32, 0x92722c85'u32, + 0xa2bfe8a1'u32, 0xa81a664b'u32, 0xc24b8b70'u32, 0xc76c51a3'u32, 0xd192e819'u32, 0xd6990624'u32, 0xf40e3585'u32, 0x106aa070'u32, + 0x19a4c116'u32, 0x1e376c08'u32, 0x2748774c'u32, 0x34b0bcb5'u32, 0x391c0cb3'u32, 0x4ed8aa4a'u32, 0x5b9cca4f'u32, 0x682e6ff3'u32, + 0x748f82ee'u32, 0x78a5636f'u32, 0x84c87814'u32, 0x8cc70208'u32, 0x90befffa'u32, 0xa4506ceb'u32, 0xbef9a3f7'u32, 0xc67178f2'u32 +] + +# Primitives +# ------------------------------------------------ + +template rotr(x, n: uint32): uint32 = + ## Rotate right the bits + # We always use it with constants in 0 ..< 32 + # so no undefined behaviour. 
+ (x shr n) or (x shl (32 - n)) + +template ch(x, y, z: uint32): uint32 = + ## "Choose" function of SHA256 + ## Choose bit i from yi or zi depending on xi + when false: # Spec FIPS 180-4 + (x and y) xor (not(x) and z) + else: # RFC4634 + ((x and (y xor z)) xor z) + +template maj(x, y, z: uint32): uint32 = + ## "Majority" function of SHA256 + when false: # Spec FIPS 180-4 + (x and y) xor (x and z) xor (y and z) + else: # RFC4634 + (x and (y or z)) or (y and z) + +template S0(x: uint32): uint32 = + # Σ₀ + rotr(x, 2) xor rotr(x, 13) xor rotr(x, 22) + +template S1(x: uint32): uint32 = + # Σ₁ + rotr(x, 6) xor rotr(x, 11) xor rotr(x, 25) + +template s0(x: uint32): uint32 = + # σ₀ + rotr(x, 7) xor rotr(x, 18) xor (x shr 3) + +template s1(x: uint32): uint32 = + # σ₁ + rotr(x, 17) xor rotr(x, 19) xor (x shr 10) + +# Message schedule +# ------------------------------------------------ + +template u32BE(blob: array[4, byte]): uint32 = + ## Interpret a data blob as a big-endian uint32 + when nimvm: + (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32 + else: + when cpuEndian == littleEndian: + (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32 + else: + cast[uint32](blob) + +template getU32at(msg: ptr UncheckedArray[byte], pos: SomeInteger): uint32 = + u32BE(cast[ptr array[4, byte]](msg[pos].addr)[]) + +# State updates +# ------------------------------------------------ + +template copy*(dst: var Sha256_state, src: Sha256_state) = + ## State copy + # Should compile with a specialized aligned copy. + # No bounds check + for i in 0 ..< HashSize: + dst.H[i] = src.H[i] + +template accumulate*(dst: var Sha256_state, src: Sha256_state) = + ## State accumulation + # No bounds check + for i in 0 ..< HashSize: + dst.H[i] += src.H[i] + +template sha256_round*(s: var Sha256_state, wt, kt: Word) = + template a: Word = s.H[0] + template b: Word = s.H[1] + template c: Word = s.H[2] + template d: Word = s.H[3] + template e: Word = s.H[4] + template f: Word = s.H[5] + template g: Word = s.H[6] + template h: Word = s.H[7] + + let T1 = h + S1(e) + ch(e, f, g) + kt + wt + let T2 = S0(a) + maj(a, b, c) + d += T1 + h = T1 + T2 + + s.H.rotateRight() + +# Hash Computation +# ------------------------------------------------ + +func sha256_rounds_0_15( + s: var Sha256_state, + ms: var Sha256_MessageSchedule, + message: ptr UncheckedArray[byte]) {.inline.} = + staticFor t, 0, 16: + ms.w[t] = message.getU32at(t * sizeof(Word)) + sha256_round(s, ms.w[t], K256[t]) + +func sha256_rounds_16_63( + s: var Sha256_state, + ms: var Sha256_MessageSchedule) {.inline.} = + staticFor t, 16, 64: + ms.w[t and 15] += s1(ms.w[(t - 2) and 15])+ + ms.w[(t - 7) and 15] + + s0(ms.w[(t - 15) and 15]) + + sha256_round(s, ms.w[t and 15], K256[t]) + +func hashMessageBlocks_generic*( + H: var Sha256_state, + message: ptr UncheckedArray[byte], + numBlocks: uint) = + ## Hash a message block by block + ## Sha256 block size is 64 bytes hence + ## a message will be process 64 by 64 bytes. + ## FIPS.180-4 6.2.2. 
SHA-256 Hash Computation + + var msg = message + var ms{.noInit.}: Sha256_MessageSchedule + var s{.noInit.}: Sha256_state + + s.copy(H) + + for _ in 0 ..< numBlocks: + sha256_rounds_0_15(s, ms, msg) + msg +%= BlockSize + + sha256_rounds_16_63(s, ms) + + s.accumulate(H) # accumulate on register variables + H.copy(s) diff --git a/constantine/hashes/sha256/sha256_x86_shaext.nim b/constantine/hashes/sha256/sha256_x86_shaext.nim new file mode 100644 index 0000000..4d38117 --- /dev/null +++ b/constantine/hashes/sha256/sha256_x86_shaext.nim @@ -0,0 +1,130 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + ../../platforms/isa/simd_x86, + ../../platforms/primitives, + ./sha256_generic + +{.localpassC:"-msse4.1 -msha".} + +# SHA256, a hash function from the SHA2 family +# -------------------------------------------------------------------------------- +# +# References: +# - Intel SHA extensions whitepaper +# https://www.intel.com/content/dam/develop/external/us/en/documents/intel-sha-extensions-white-paper-402097.pdf +# - Intel SHA extensions article +# https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html + +# No exceptions allowed in core cryptographic operations +{.push raises: [].} +{.push checks: off.} + +# Primitives +# ------------------------------------------------ + +template setr_K(i: int): m128i = + setr_u32x4(K256[4*i], K256[4*i+1], K256[4*i+2], K256[4*i+3]) + +# Hash Computation +# ------------------------------------------------ + +func hashMessageBlocks_shaext*( + H: var Sha256_state, + message: ptr UncheckedArray[byte], + numBlocks: uint)= + ## Hash a message block by block + ## Sha256 block size is 64 bytes hence + ## a message will be process 64 by 64 bytes. 
+ + var + abef_save {.noInit.}: m128i + cdgh_save {.noInit.}: m128i + state0 {.noInit.}: m128i + state1 {.noInit.}: m128i + msgtmp {.noInit.}: array[4, m128i] + msg {.noInit.}: m128i + tmp {.noInit.}: m128i + + data = message + + let shuf_mask = set_u64x2(0x0c0d0e0f08090a0b, 0x0405060700010203) + + # The SHA state is stored in this order: + # D, C, B, A, H, G, F, E + # + # state0 contains ABEF, state1 contains CDGH + + tmp = shuf_u32x4(loada_u128(H.H[0].addr), 0xB1) # CDAB + state1 = shuf_u32x4(loada_u128(H.H[4].addr), 0x1B) # EFGH + state0 = alignr_u128(tmp, state1, 8) # ABEF + state1 = blend_u16x8(state1, tmp, 0xF0) # CDGH + + for _ in 0 ..< numBlocks: + # Save current state + abef_save = state0 + cdgh_save = state1 + + # Rounds 0-3 + msgtmp[0] = shuf_u8x16(loadu_u128(data[0].addr), shuf_mask) + msg = add_u32x4(msgtmp[0], setr_K(0)) + state1 = sha256_2rounds(state1, state0, msg) + msg = shuf_u32x4(msg, 0x0E) + state0 = sha256_2rounds(state0, state1, msg) + + # Rounds 4-7 and 8-11 + staticFor i, 1, 3: + msgtmp[i] = shuf_u8x16(loadu_u128(data[16*i].addr), shuf_mask) + msg = add_u32x4(msgtmp[i], setr_K(i)) + state1 = sha256_2rounds(state1, state0, msg) + msg = shuf_u32x4(msg, 0x0E) + state0 = sha256_2rounds(state0, state1, msg) + msgtmp[i-1] = sha256_msg1(msgtmp[i-1], msgtmp[i]) + + # Rounds 12-59 + msgtmp[3] = shuf_u8x16(loadu_u128(data[16*3].addr), shuf_mask) + + staticFor i, 3, 15: + let prev = (i-1) and 3 # mod 4, we rotate buffers + let curr = i and 3 + let next = (i+1) and 3 + + msg = add_u32x4(msgtmp[curr], setr_K(i)) + state1 = sha256_2rounds(state1, state0, msg) + tmp = alignr_u128(msgtmp[curr], msgtmp[prev], 4) + msgtmp[next] = add_u32x4(msgtmp[next], tmp) + msgtmp[next] = sha256_msg2(msgtmp[next], msgtmp[curr]) + msg = shuf_u32x4(msg, 0x0E) + state0 = sha256_2rounds(state0, state1, msg) + msgtmp[prev] = sha256_msg1(msgtmp[prev], msgtmp[curr]) + + # Rounds 60-63 + msg = add_u32x4(msgtmp[3], setr_K(15)) + state1 = sha256_2rounds(state1, state0, msg) + msg = shuf_u32x4(msg, 0x0E) + state0 = sha256_2rounds(state0, state1, msg) + + # Accumulate + state0 = add_u32x4(state0, abef_save) + state1 = add_u32x4(state1, cdgh_save) + + data +%= BlockSize + + # The SHA state is stored in this order: + # D, C, B, A, H, G, F, E + # + # state0 contains ABEF, state1 contains CDGH + + tmp = shuf_u32x4(state0, 0x1B) # FEBA + state1 = shuf_u32x4(state1, 0xB1) # DCHG + state0 = blend_u16x8(tmp, state1, 0xF0) # DCBA + state1 = alignr_u128(state1, tmp, 8) # HGFE + + storea_u128(H.H[0].addr, state0) + storea_u128(H.H[4].addr, state1) \ No newline at end of file diff --git a/constantine/hashes/sha256/sha256_x86_ssse3.nim b/constantine/hashes/sha256/sha256_x86_ssse3.nim new file mode 100644 index 0000000..a74b60c --- /dev/null +++ b/constantine/hashes/sha256/sha256_x86_ssse3.nim @@ -0,0 +1,211 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
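The SHA-extensions kernel above keeps the eight state words packed into two 128-bit registers, state0 = ABEF and state1 = CDGH, as required by sha256_2rounds. As an illustration only (a scalar reading of the shuf_u32x4/alignr_u128/blend_u16x8 sequence, not code used by the implementation; the helper name is hypothetical), the load-time packing amounts to:

    # Illustrative scalar equivalent of the ABEF/CDGH packing in hashMessageBlocks_shaext.
    # Lanes are listed low -> high; the input H is [a, b, c, d, e, f, g, h].
    func packShaextState(H: array[8, uint32]): tuple[abef, cdgh: array[4, uint32]] =
      result.abef = [H[5], H[4], H[1], H[0]]  # F, E, B, A  (A in the top lane)
      result.cdgh = [H[7], H[6], H[3], H[2]]  # H, G, D, C  (C in the top lane)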
+ +import + ../../platforms/isa/simd_x86, + ../../platforms/primitives, + ./sha256_generic + +{.localpassC:"-mssse3".} + +# SHA256, SSSE3 optimizations +# -------------------------------------------------------------------------------- +# +# References: +# - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +# - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634 +# - Fast SHA-256 Implementations on Intel® Architecture Processors +# https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/sha-256-implementations-paper.pdf + +# Following the intel whitepaper we split our code into: +# We keep track of a 256-bit state vector corresponding +# to {a, b, c, d, e, f, g, h} in specification +# +# Processing is done in 2 steps +# - Message scheduler: +# Takes the input 16 DWORDs and +# computes 48 new DWORDs. Together with the original 16 DWORDs, these +# form a vector of 64 DWORDs that is the input to the second step. +# This can be vectorized. +# - 64 SHA rounds: +# This code is scalar. + +# No exceptions allowed in core cryptographic operations +{.push raises: [].} +{.push checks: off.} + +# Vectorized message scheduler +# ------------------------------------------------ + +const VecNum = BlockSize div 16 # BlockSize / sizeof(m128i) +const VecWords = 16 div sizeof(Word) # sizeof(m128i) / sizeof(Word) + +func initMessageSchedule( + msnext: var array[VecNum, m128i], + ms: var Sha256_MessageSchedule, + message: ptr UncheckedArray[byte]) {.inline.} = + ## Initial state, from data + ## - Precompute steps for the future message schedule `msnext` + ## - compute the current message schedule `ms` + + let mask = setr_u32x4(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f) + let pK256 = K256.unsafeAddr() + + staticFor i, 0, VecNum: + msnext[i] = loadu_u128(message[i * sizeof(m128i)].addr) + msnext[i] = shuf_u8x16(msnext[i], mask) + storea_u128(ms.w[VecWords*i].addr, add_u32x4(msnext[i], loadu_u128(pK256[VecWords*i].addr))) + +func updateMessageSchedule( + W: var array[4, m128i], + loMask, hiMask: m128i) {.inline.} = + # Steady state + # ------------ + # The message schedule workspace W[16:0] + # is updated with + # W[t mod 16] += s0 + s1 + W[(t-7) mod 16] + # with + # s0 = σ₀(W[(t-15) mod 16]) + # s1 = σ₁(W[(t-2) mod 16]) + # by denoting the right rotation >>>, and xor ⊕ + # σ₀(x) = (x >>> 7) ⊕ (x >>> 18) ⊕ (x >>> 3) + # σ₁(x) = (x >>> 17) ⊕ (x >>> 19) ⊕ (x >>> 10) + + const rot0 = [int32 7, 18, 3] + const rot1 = [int32 17, 19, 10] + + var v{.noInit.}: array[4, m128i] + + v[0] = alignr_u128(W[1], W[0], 4) + v[3] = alignr_u128(W[3], W[2], 4) + v[2] = shr_u32x4(v[0], rot0[0]) + W[0] = add_u32x4(W[0], v[3]) + + v[3] = shr_u32x4(v[0], rot0[2]) + v[1] = shl_u32x4(v[0], 32-rot0[1]) + v[0] = xor_u128(v[3], v[2]) + + v[3] = shuf_u32x4(W[3], 0xfa) + v[2] = shr_u32x4(v[2], rot0[1] - rot0[0]) + v[0] = xor_u128(v[0], v[1]) + v[0] = xor_u128(v[0], v[2]) + + v[1] = shl_u32x4(v[1], rot0[1] - rot0[0]) + v[2] = shr_u32x4(v[3], rot1[2]) + v[3] = shr_u64x2(v[3], rot1[0]) + W[0] = add_u32x4(W[0], xor_u128(v[0], v[1])) + + v[2] = xor_u128(v[2], v[3]) + v[3] = shr_u64x2(v[3], rot1[1] - rot1[0]) + v[2] = shuf_u8x16(xor_u128(v[2], v[3]), lo_mask) + W[0] = add_u32x4(W[0], v[2]) + + v[3] = shuf_u32x4(W[0], 0x50) + v[2] = shr_u32x4(v[3], rot1[2]) + v[3] = shr_u64x2(v[3], rot1[0]) + v[2] = xor_u128(v[2], v[3]) + v[3] = shr_u64x2(v[3], rot1[1] - rot1[0]) + + W[0] = add_u32x4(W[0], shuf_u8x16(xor_u128(v[2], v[3]), hi_mask)) + + W.rotateLeft() + +# Hash Computation +# 
------------------------------------------------ + +func sha256_rounds_0_47( + s: var Sha256_state, + ms: var Sha256_MessageSchedule, + msnext: var array[VecNum, m128i]) {.inline.} = + ## Process Sha256 rounds 0 to 47 + + let loMask = setr_u32x4(0x03020100, 0x0b0a0908, -1, -1) + let hiMask = setr_u32x4(-1, -1, 0x03020100, 0x0b0a0908) + + # The first items of K256 were processed in initMessageSchedule + var k256_idx = 16 + + # Rounds 0-15, 16-31, 32-47 + for r in 0 ..< 3: + + # Important unrolling for 2 reasons, see Intel paper + # - State updates: + # In each round calculation six out of the eight state variables are shifted to the + # next state variable. Rather than do these using mov instructions, we rename + # the virtual registers (symbols) to effect this “shift”. Thus each round + # effectively rotates the set of state register names by one place. By doing 8 or + # 16 rounds in the body of the loop, the names have rotated back to their + # starting values, so no register moves are needed before looping. + # + # - Message schedule: + # Similarly on the vector unit for the message scheduling, the 16 necessary + # scheduled DWORDs are stored in four XMM registers, as described earlier. For + # example, the initial data DWORDs are stored in order as {X0, X1, X2, X3}. + # When we compute four new scheduled DWORDs, we store them in X0 + # (overwriting the “oldest” data DWORDs), so now the scheduled DWORDs are + # stored in order in {X1, X2, X3, X0}. Once again, we handle this by “rotating” + # the four names, where in this case the names rotate one place every four + # rounds (because we compute four scheduled DWORDs in each calculation). + # By having 16 rounds (four scheduling operations) in the body of the loop, + # these XMM register names rotate back to their initial value, and again no + # register moves are needed before looping. + + staticFor i, 0, VecNum: + # We interleave computing the message scheduled at {t+4, t+5, t+6, t+7} + # with SHA256 state update for {t, t+1, t+2, t+3} + + # As they are independent, hopefully the compiler reorders instructions + # for maximum throughput. + # Also it optimize away the moves and use register renaming to avoid rotations + const pos = VecWords * i + + msnext.updateMessageSchedule(loMask, hiMask) + let wnext = add_u32x4(msnext[3], loadu_u128(K256[k256_idx].unsafeAddr)) + + # K256 was already included in the computation of wnext, hence kt = 0 + s.sha256_round(wt = ms.w[pos + 0], kt = 0) + s.sha256_round(wt = ms.w[pos + 1], kt = 0) + s.sha256_round(wt = ms.w[pos + 2], kt = 0) + s.sha256_round(wt = ms.w[pos + 3], kt = 0) + + storea_u128(ms.w[pos].addr, wnext) + k256_idx += VecWords + + +func sha256_rounds_48_63( + s: var Sha256_state, + ms: var Sha256_MessageSchedule) {.inline.} = + ## Process Sha256 rounds 48 to 63 + staticFor t, 48, 64: + # Wt[i mod 16] and K256 was already integrated in the computation of wnext + s.sha256_round(wt = ms.w[t and 15], kt = 0) + +func hashMessageBlocks_ssse3*( + H: var Sha256_state, + message: ptr UncheckedArray[byte], + numBlocks: uint)= + ## Hash a message block by block + ## Sha256 block size is 64 bytes hence + ## a message will be process 64 by 64 bytes. 
+ + var msg = message + var ms{.noInit.}: Sha256_MessageSchedule + var msnext{.noInit.}: array[VecNum, m128i] + var s{.noInit.}: Sha256_state + + s.copy(H) + + for _ in 0 ..< numBlocks: + initMessageSchedule(msnext, ms, msg) + msg +%= BlockSize + + sha256_rounds_0_47(s, ms, msnext) + sha256_rounds_48_63(s, ms) + + s.accumulate(H) # accumulate on register variables + H.copy(s) diff --git a/constantine/math/config/type_bigint.nim b/constantine/math/config/type_bigint.nim index 33ed2b5..5ecd105 100644 --- a/constantine/math/config/type_bigint.nim +++ b/constantine/math/config/type_bigint.nim @@ -36,7 +36,7 @@ debug: result[0] = '0' result[1] = 'x' var a = a - for j in countdown(L-1, 0): + for j in countdown(result.len-1, 2): result[j] = hexChars.secretLookup(a and SecretWord 0xF) a = a shr 4 @@ -45,7 +45,7 @@ debug: result.add " " & toHex(a[0]) for i in 1 ..< a.len: result.add ", " & toHex(a[i]) - result.add "])" + result.add "]" func `$`*(a: BigInt): string = result = "BigInt[" diff --git a/constantine/platforms/isa/simd_x86.nim b/constantine/platforms/isa/simd_x86.nim new file mode 100644 index 0000000..487637d --- /dev/null +++ b/constantine/platforms/isa/simd_x86.nim @@ -0,0 +1,277 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +static: doAssert defined(i386) or defined(amd64) + +# SIMD throughput and latency: +# - https://software.intel.com/sites/landingpage/IntrinsicsGuide/ +# - https://www.agner.org/optimize/instruction_tables.pdf + +# Reminder: x86 is little-endian, order is [low part, high part] +# Documentation at https://software.intel.com/sites/landingpage/IntrinsicsGuide/ + +when defined(vcc): + {.pragma: x86_type, byCopy, header:"".} + {.pragma: x86, noDecl, header:"".} +else: + {.pragma: x86_type, byCopy, header:"".} + {.pragma: x86, noDecl, header:"".} + +type + m128* {.importc: "__m128", x86_type.} = object + raw: array[4, float32] + m128d* {.importc: "__m128d", x86_type.} = object + raw: array[2, float64] + m128i* {.importc: "__m128i", x86_type.} = object + raw: array[16, byte] + m256* {.importc: "__m256", x86_type.} = object + raw: array[8, float32] + m256d* {.importc: "__m256d", x86_type.} = object + raw: array[4, float64] + m256i* {.importc: "__m256i", x86_type.} = object + raw: array[32, byte] + m512* {.importc: "__m512", x86_type.} = object + raw: array[16, float32] + m512d* {.importc: "__m512d", x86_type.} = object + raw: array[8, float64] + m512i* {.importc: "__m512i", x86_type.} = object + raw: array[64, byte] + mmask8* {.importc: "__mmask8", x86_type.} = uint8 + mmask16* {.importc: "__mmask16", x86_type.} = uint16 + mmask64* {.importc: "__mmask64", x86_type.} = uint64 + +# ############################################################ +# +# SSE2 - integer - packed +# +# ############################################################ + +func mm_setzero_si128(): m128i {.importc: "_mm_setzero_si128", x86.} +func mm_set1_epi8(a: int8 or uint8): m128i {.importc: "_mm_set1_epi8", x86.} +func mm_set1_epi16(a: int16 or uint16): m128i {.importc: "_mm_set1_epi16", x86.} +func mm_set1_epi32(a: int32 or uint32): m128i {.importc: 
"_mm_set1_epi32", x86.} +func mm_set1_epi64x(a: int64 or uint64): m128i {.importc: "_mm_set1_epi64x", x86.} +func mm_set_epi64x(e1, e0: int64 or uint64): m128i {.importc: "_mm_set_epi64x", x86.} +func mm_load_si128(mem_addr: ptr m128i): m128i {.importc: "_mm_load_si128", x86.} +func mm_loadu_si128(mem_addr: ptr m128i): m128i {.importc: "_mm_loadu_si128", x86.} +func mm_store_si128(mem_addr: ptr m128i, a: m128i) {.importc: "_mm_store_si128", x86.} +func mm_storeu_si128(mem_addr: ptr m128i, a: m128i) {.importc: "_mm_storeu_si128", x86.} + +func mm_set_epi32(e3, e2, e1, e0: int32 or uint32): m128i {.importc: "_mm_set_epi32", x86.} + ## Initialize m128i with {e3, e2, e1, e0} (big endian order) + ## in order [e0, e1, e2, e3] +func mm_setr_epi32(e3, e2, e1, e0: int32 or uint32): m128i {.importc: "_mm_setr_epi32", x86.} + ## Initialize m128i with {e3, e2, e1, e0} (big endian order) + ## in order [e3, e2, e1, e0] + +func mm_xor_si128(a, b: m128i): m128i {.importc: "_mm_xor_si128", x86.} + +func mm_add_epi8(a, b: m128i): m128i {.importc: "_mm_add_epi8", x86.} +func mm_add_epi16(a, b: m128i): m128i {.importc: "_mm_add_epi16", x86.} +func mm_add_epi32(a, b: m128i): m128i {.importc: "_mm_add_epi32", x86.} +func mm_add_epi64(a, b: m128i): m128i {.importc: "_mm_add_epi64", x86.} + +func mm_slli_epi64(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_slli_epi64", x86.} + ## Shift 2xint64 left +func mm_srli_epi64(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_srli_epi64", x86.} + ## Shift 2xint64 right +func mm_srli_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_srli_epi32", x86.} + ## Shift 4xint32 left +func mm_slli_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_slli_epi32", x86.} + ## Shift 4xint32 right + +func mm_shuffle_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_shuffle_epi32", x86.} + ## Shuffle 32-bit integers in a according to the control in imm8 + ## Formula is in big endian representation + ## a = {a3, a2, a1, a0} + ## dst = {d3, d2, d1, d0} + ## imm8 = {bits[7:6], bits[5:4], bits[3:2], bits[1:0]} + ## d0 will refer a[bits[1:0]] + ## d1 a[bits[3:2]] + +# ############################################################ +# +# SSSE3 - integer - packed +# +# ############################################################ + +func mm_alignr_epi8(a, b: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_alignr_epi8", x86.} + ## Concatenate 16-byte blocks in a and b into a 32-byte temporary result, + ## shift the result right by imm8 bytes, and return the low 16 bytes + ## Input: + ## a[127:0], b[127:0] + ## Result: + ## tmp[255:128] = a + ## tmp[127:0] = b + ## tmp[255:0] = tmp[255:0] >> (imm8*8) + ## dst[127:0] = tmp[127:0] + +func mm_shuffle_epi8(a, b: m128i): m128i {.importc: "_mm_shuffle_epi8", x86.} + ## Shuffle 8-bit integers in a according to the control mask in b + ## Formula is in big endian representation + ## a = {a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0} + ## b = {b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0} + ## dst = {d15, d14, d13, d12, d11, d10, d9, d8, d7, d6, d5, d4, d3, d2, d1, d0} + ## + ## The control mask b0 ... b15 have the shape: + ## bits z000uvwx + ## if z is set, the corresponding d is set to zero. 
+ ## otherwise uvwx represents a binary number in 0..15, + ## the corresponding d will be set to a(uvwx) + ## + ## for i in 0 ..< 16: + ## if bitand(b[i], 0x80) != 0: + ## dst[i] = 0 + ## else: + ## dst[i] = a[bitand(b[i], 0x0F)] + +# ############################################################ +# +# SSE4.1 - integer - packed +# +# ############################################################ + +func mm_blend_epi16(a, b: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_blend_epi16", x86.} + ## Blend packed 16-bit integers from a and b using control mask imm8, + ## and store the results in dst. + ## + ## FOR j := 0 to 7 + ## i := j*16 + ## IF imm8[j] + ## dst[i+15:i] := b[i+15:i] + ## ELSE + ## dst[i+15:i] := a[i+15:i] + +# ############################################################ +# +# AVX512F - integer - packed +# +# ############################################################ + +func mm_ror_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_ror_epi32", x86.} + ## Rotate 4xint32 right + +func mm_mask_add_epi32(src: m128i, mask: mmask8, a, b: m128i): m128i {.importc: "_mm_mask_add_epi32", x86.} + ## Add packed 32-bit integers in a and b, and store the results in dst using writemask mask + ## (elements are copied from src when the corresponding mask bit is not set). + ## for j in 0 ..< 4: + ## let i = j*32 + ## if k[j]: + ## dst[i+31:i] := a[i+31:i] + b[i+31:i] + ## else: + ## dst[i+31:i] := src[i+31:i] + +# ############################################################ +# +# SHA extensions +# +# ############################################################ + +func mm_sha256msg1_epu32(a, b: m128i): m128i {.importc: "_mm_sha256msg1_epu32", x86.} + ## Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers) + ## using previous message values from a and b, and store the result in dst. + ## + ## W4 := b[31:0] + ## W3 := a[127:96] + ## W2 := a[95:64] + ## W1 := a[63:32] + ## W0 := a[31:0] + ## dst[127:96] := W3 + sigma0(W4) + ## dst[95:64] := W2 + sigma0(W3) + ## dst[63:32] := W1 + sigma0(W2) + ## dst[31:0] := W0 + sigma0(W1) + +func mm_sha256msg2_epu32(a, b: m128i): m128i {.importc: "_mm_sha256msg2_epu32", x86.} + ## Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers) + ## using previous message values from a and b, and store the result in dst. + ## + ## W14 := b[95:64] + ## W15 := b[127:96] + ## W16 := a[31:0] + sigma1(W14) + ## W17 := a[63:32] + sigma1(W15) + ## W18 := a[95:64] + sigma1(W16) + ## W19 := a[127:96] + sigma1(W17) + ## dst[127:96] := W19 + ## dst[95:64] := W18 + ## dst[63:32] := W17 + ## dst[31:0] := W16 + +func mm_sha256rnds2_epu32(cdgh, abef, k: m128i): m128i {.importc: "_mm_sha256rnds2_epu32", x86.} + ## Perform 2 rounds of SHA256 operation using + ## an initial SHA256 state (C,D,G,H) from a, + ## an initial SHA256 state (A,B,E,F) from b, + ## and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers) + ## and the corresponding round constants from k, + ## and store the updated SHA256 state (A,B,E,F) in dst. 
+ ## + ## A[0] := b[127:96] + ## B[0] := b[95:64] + ## C[0] := a[127:96] + ## D[0] := a[95:64] + ## E[0] := b[63:32] + ## F[0] := b[31:0] + ## G[0] := a[63:32] + ## H[0] := a[31:0] + ## W_K[0] := k[31:0] + ## W_K[1] := k[63:32] + ## FOR i := 0 to 1 + ## A[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i]) + ## B[i+1] := A[i] + ## C[i+1] := B[i] + ## D[i+1] := C[i] + ## E[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i] + ## F[i+1] := E[i] + ## G[i+1] := F[i] + ## H[i+1] := G[i] + ## ENDFOR + ## dst[127:96] := A[2] + ## dst[95:64] := B[2] + ## dst[63:32] := E[2] + ## dst[31:0] := F[2] + +# Aliases +# ------------------------------------------------ + +template set_u64x2*(e1, e0: int64 or uint64): m128i = + mm_set_epi64x(e1, e0) +template setr_u32x4*(e3, e2, e1, e0: int32 or uint32): m128i = + mm_setr_epi32(e3, e2, e1, e0) +template loada_u128*(data: pointer): m128i = + mm_load_si128(cast[ptr m128i](data)) +template loadu_u128*(data: pointer): m128i = + mm_loadu_si128(cast[ptr m128i](data)) +template storea_u128*(mem_addr: pointer, a: m128i) = + mm_store_si128(cast[ptr m128i](mem_addr), a) + +template xor_u128*(a, b: m128i): m128i = + mm_xor_si128(a, b) + +template add_u32x4*(a, b: m128i): m128i = + mm_add_epi32(a, b) +template shl_u32x4*(a: m128i, imm8: int32 or uint32): m128i = + mm_slli_epi32(a, imm8) +template shr_u32x4*(a: m128i, imm8: int32 or uint32): m128i = + mm_srli_epi32(a, imm8) +template shr_u64x2*(a: m128i, imm8: int32 or uint32): m128i = + mm_srli_epi64(a, imm8) + +template alignr_u128*(a, b: m128i, shiftRightByBytes: int32 or uint32): m128i = + mm_alignr_epi8(a, b, shiftRightByBytes) +template shuf_u8x16*(a: m128i, mask: m128i): m128i = + mm_shuffle_epi8(a, mask) +template shuf_u32x4*(a: m128i, mask: int32 or uint32): m128i = + mm_shuffle_epi32(a, mask) +template blend_u16x8*(a, b: m128i, mask: int32 or uint32): m128i = + mm_blend_epi16(a, b, mask) + +template sha256_msg1*(a, b: m128i): m128i = + mm_sha256msg1_epu32(a, b) +template sha256_msg2*(a, b: m128i): m128i = + mm_sha256msg2_epu32(a, b) +template sha256_2rounds*(cdgh, abef, k: m128i): m128i = + mm_sha256rnds2_epu32(cdgh, abef, k) \ No newline at end of file diff --git a/constantine/platforms/primitives.nim b/constantine/platforms/primitives.nim index b6ee9b3..b3460e5 100644 --- a/constantine/platforms/primitives.nim +++ b/constantine/platforms/primitives.nim @@ -34,6 +34,10 @@ when X86 and GCC_Compatible: import isa/[cpuinfo_x86, macro_assembler_x86] export cpuinfo_x86, macro_assembler_x86 +# No exceptions allowed in core cryptographic operations +{.push raises: [].} +{.push checks: off.} + # ############################################################ # # Instrumentation @@ -71,4 +75,46 @@ func copy*[T: byte|char]( {.push checks: off.} # No OverflowError or IndexError allowed for i in 0 ..< len: - dst[dStart + i] = byte src[sStart + i] \ No newline at end of file + dst[dStart + i] = byte src[sStart + i] + +func rotateRight*[N: static int, T](a: var array[N, T]) {.inline.} = + # Rotate right (Somehow we can't use a generic template here) + # Inline + # Hopefully we want the compiler to see that N rounds of rotation + # can be optimized away with register renaming + let tmp = a[a.len-1] + staticForCountdown i, a.len-1, 1: + a[i] = a[i-1] + a[0] = tmp + +func rotateLeft*[N: static int, T](a: var array[N, T]) {.inline.} = + # Rotate left (Somehow we can't use a generic template here) + # Inline + # Hopefully we want the compiler to see that N rounds of rotation + 
# can be optimized away with register renaming + let tmp = a[0] + staticFor i, 0, a.len-1: + a[i] = a[i+1] + a[a.len-1] = tmp + +# ############################################################ +# +# Pointer arithmetics +# +# ############################################################ + +template asUnchecked*[T](a: openArray[T]): ptr UncheckedArray[T] = + cast[ptr UncheckedArray[T]](a[0].unsafeAddr) + +# Warning for pointer arithmetics via inline C +# be careful of not passing a `var ptr` +# to a function as `var` are passed by hidden pointers in Nim and the wrong +# pointer will be modified. Templates are fine. + +func `+%`*(p: ptr, offset: SomeInteger): type(p) {.inline, noInit.}= + ## Pointer increment + {.emit: [result, " = ", p, " + ", offset, ";"].} + +func `+%=`*(p: var ptr, offset: SomeInteger){.inline.}= + ## Pointer increment + p = p +% offset \ No newline at end of file diff --git a/helpers/static_for.nim b/helpers/static_for.nim index ab988a0..4dca6e6 100644 --- a/helpers/static_for.nim +++ b/helpers/static_for.nim @@ -35,6 +35,14 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped body.replaceNodes(idx, newLit i) ) +macro staticForCountdown*(idx: untyped{nkIdent}, start, stopIncl: static int, body: untyped): untyped = + result = newStmtList() + for i in countdown(start, stopIncl): + result.add nnkBlockStmt.newTree( + ident("unrolledIter_" & $idx & $i), + body.replaceNodes(idx, newLit i) + ) + {.experimental: "dynamicBindSym".} macro staticFor*(ident: untyped{nkIdent}, choices: typed, body: untyped): untyped = diff --git a/tests/t_hash_sha256_vs_openssl.nim b/tests/t_hash_sha256_vs_openssl.nim index ce18520..463b0ac 100644 --- a/tests/t_hash_sha256_vs_openssl.nim +++ b/tests/t_hash_sha256_vs_openssl.nim @@ -34,7 +34,7 @@ else: # But the new API isn't expose on Linux :/ # TODO: fix Windows -when not defined(Windows): +when not defined(windows): proc SHA256[T: byte|char]( msg: openarray[T], digest: ptr array[32, byte] = nil