unrolled SHA256 (for 32B faster only if using ssse3)

2025-02-18 15:16:21 +00:00 · 2021-02-15 18:43:35 +01:00 · 2021-02-15 18:43:35 +01:00 · 18069e54d3
commit 18069e54d3
parent 976edb64bb
2 changed files with 128 additions and 35 deletions
--- a/benchmarks/bench_sha256.nim
+++ b/benchmarks/bench_sha256.nim
@ -68,6 +68,10 @@ proc benchSHA256_openssl[T](msg: openarray[T], msgComment: string, iters: int) =
 when isMainModule:
  proc main() =
    block:
      let msg128B = rng.random_byte_seq(32)
      benchSHA256_constantine(msg128B, "32B", 32)
      benchSHA256_openssl(msg128B, "32B", 32)
    block:
      let msg128B = rng.random_byte_seq(128)
      benchSHA256_constantine(msg128B, "128B", 128)
--- a/constantine/hashes/h_sha256.nim
+++ b/constantine/hashes/h_sha256.nim
@ -7,8 +7,10 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  std/macros,
  ../config/common,
-  ../io/endians
+  ../io/endians,
  ../primitives/static_for
 # SHA256, a hash function from the SHA2 family
 # --------------------------------------------------------------------------------
@ -51,6 +53,10 @@ type
 {.push raises: [].}
 {.push checks: off.}
 func setZero[N](a: var array[N, SomeNumber]){.inline.} =
  for i in 0 ..< a.len:
    a[i] = 0
 template rotr(x, n: uint32): uint32 =
  ## Rotate right the bits
  # We always use it with constants in 0 ..< 32
@ -62,7 +68,7 @@ template ch(x, y, z: uint32): uint32 =
  ## Choose bit i from yi or zi depending on xi
  when false: # Spec FIPS 180-4
    (x and y) xor (not(x) and z)
-  else:      # RFC4634
+  else:       # RFC4634
    ((x and (y xor z)) xor z)
 template maj(x, y, z: uint32): uint32 =
@ -88,9 +94,80 @@ template s1(x: uint32): uint32 =
  # σ₁
  rotr(x, 17) xor rotr(x, 19) xor (x shr 10)
-func setZero[N](a: var array[N, SomeNumber]){.inline.} =
+template u32BE(blob: array[4, byte]): uint32 =
-  for i in 0 ..< a.len:
+  ## Interpret a data blob as a big-endian uint32
-    a[i] = 0
+  ## This should lower to
  when nimvm:
    (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
  else:
    when cpuEndian == littleEndian:
      (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
    else:
      cast[uint32](blob)
 template getU32at[T: byte|char](msg: openarray[T], pos: SomeInteger): uint32 =
  u32BE(cast[ptr array[4, byte]](msg[pos].unsafeAddr())[])
 func rotateRight[T](a: var openarray[T], k: int) =
  ## Rotate a seuqnce by k
  doAssert a.len > 0
  let k = k mod a.len
  for _ in 0 ..< k:
    let tmp = a[^1]
    for i in countdown(a.len-1, 1):
      a[i] = a[i-1]
    a[0] = tmp
 macro round(a, b, c, d, e, f, g, h: untyped, t: static int): untyped =
  ## Unrolled and allocation efficient SHA256 round
  var s = [a, b, c, d, e, f, g, h]
  s.rotateRight(t)
  let
    a = s[0]
    b = s[1]
    c = s[2]
    d = s[3]
    e = s[4]
    f = s[5]
    g = s[6]
    h = s[7]
  # W[t]
  let w = nnkBracketExpr.newTree(
    ident("W"), newLit(t mod 16)
  )
  result = newStmtList()
  if t < 16:
    # Reading message phase
    let msg = ident"message"
    let curBlock = ident"curBlock"
    result.add quote do:
      `w` = getU32at(`msg`, `curBlock`*64 + `t`*4)
  else:
    # Mixing
    # Wt-2, Wt-7, Wt-15, Wt-16
    let Wtm2 = nnkBracketExpr.newTree(
      ident("W"), newLit((t-2) mod 16)
    )
    let Wtm7 = nnkBracketExpr.newTree(
      ident("W"), newLit((t-7) mod 16)
    )
    let Wtm15 = nnkBracketExpr.newTree(
      ident("W"), newLit((t-15) mod 16)
    )
    # w is Wt-16
    result.add quote do:
      `w` += s1(`Wtm2`) + `Wtm7` + s0(`Wtm15`)
  result.add quote do:
    let T1 = `h` + S1(`e`) + ch(`e`, `f`, `g`) + K256[`t`] + `w`
    let T2 = S0(`a`) + maj(`a`, `b`, `c`)
    `d` += T1
    `h` = T1 + T2
 func hashMessageBlocks[T: byte|char](
       H: var array[HashSize, uint32],
@ -126,7 +203,7 @@ func hashMessageBlocks[T: byte|char](
    g = H[6]
    h = H[7]
-  for _ in 0 ..< numBlocks:
+  for curBlock in 0 ..< numBlocks:
    # The first 16 bytes have different handling
    # from bytes 16..<64.
    # Using an array[64, uint32] will span it
@ -134,38 +211,50 @@ func hashMessageBlocks[T: byte|char](
    # Workspace with message schedule Wₜ
    var W{.noInit.}: array[16, uint32]
    var t = 0'u32
    while t < 16: # Wₜ = Mⁱₜ
      W[t].parseFromBlob(message, result, bigEndian)
      let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t]
      let T2 = S0(a) + maj(a, b, c)
      h = g
      g = f
      f = e
      e = d + T1
      d = c
      c = b
      b = a
      a = T1+T2
-      t += 1
+    when true:
      # Translation of the spec
      # This is faster than even OpenSSL for hashing just 32 bytes
      # for example for HMAC and HKDF.
      var t = 0'u32
      while t < 16: # Wₜ = Mⁱₜ
        W[t].parseFromBlob(message, result, bigEndian)
        let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t]
        let T2 = S0(a) + maj(a, b, c)
        h = g
        g = f
        f = e
        e = d + T1
        d = c
        c = b
        b = a
        a = T1+T2
-    while t < 64:
+        t += 1
      W[t mod 16] += s1(W[(t-2) mod 16]) +
                     W[(t-7) mod 16] +
                     s0(W[(t-15) mod 16])
      let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t mod 16]
      let T2 = S0(a) + maj(a, b, c)
      h = g
      g = f
      f = e
      e = d + T1
      d = c
      c = b
      b = a
      a = T1+T2
-      t += 1
+      while t < 64:
        W[t mod 16] += s1(W[(t-2) mod 16]) +
                      W[(t-7) mod 16] +
                      s0(W[(t-15) mod 16])
        let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t mod 16]
        let T2 = S0(a) + maj(a, b, c)
        h = g
        g = f
        f = e
        e = d + T1
        d = c
        c = b
        b = a
        a = T1+T2
        t += 1
    else:
      # optimized version for large hashes
      # For hashing 32B, this is slower than the rough translation
      # of spec, unless compiled with -mssse3 (but no vector instructions are used :/)
      staticFor t, 0, 64:
        round(a, b, c, d, e, f, g, h, t)
      result += 64
    a += H[0]; H[0] = a
    b += H[1]; H[1] = b