Sha256 refactor (#206)
* sha256: separate message scheduling and state updates to help implement specific use-cases like #205; also implement SSSE3 acceleration (2006, Intel Core 2 Duo) * sha256: simplify update flow, store less metadata in context * sha256: Fix reworked update function * Implement x86 hardware SHA acceleration * typo
This commit is contained in:
parent
b901dd5878
commit
351a3f6bd2
|
@ -48,9 +48,9 @@ proc report(op: string, bytes: int, startTime, stopTime: MonoTime, startClk, sto
|
||||||
when SupportsGetTicks:
|
when SupportsGetTicks:
|
||||||
let cycles = (stopClk - startClk) div iters
|
let cycles = (stopClk - startClk) div iters
|
||||||
let cyclePerByte = cycles.float64 / bytes.float64
|
let cyclePerByte = cycles.float64 / bytes.float64
|
||||||
echo &"{op:<30} {throughput:>15.3f} ops/s {ns:>9} ns/op {cycles:>10} cycles {cyclePerByte:>5.2f} cycles/byte"
|
echo &"{op:<50} {throughput:>15.3f} ops/s {ns:>9} ns/op {cycles:>10} cycles {cyclePerByte:>5.2f} cycles/byte"
|
||||||
else:
|
else:
|
||||||
echo &"{op:<30} {throughput:>15.3f} ops/s {ns:>9} ns/op"
|
echo &"{op:<50} {throughput:>15.3f} ops/s {ns:>9} ns/op"
|
||||||
|
|
||||||
template bench(op: string, bytes: int, iters: int, body: untyped): untyped =
|
template bench(op: string, bytes: int, iters: int, body: untyped): untyped =
|
||||||
measure(iters, startTime, stopTime, startClk, stopClk, body)
|
measure(iters, startTime, stopTime, startClk, stopClk, body)
|
||||||
|
@ -63,41 +63,23 @@ proc benchSHA256_constantine[T](msg: openarray[T], msgComment: string, iters: in
|
||||||
|
|
||||||
proc benchSHA256_openssl[T](msg: openarray[T], msgComment: string, iters: int) =
|
proc benchSHA256_openssl[T](msg: openarray[T], msgComment: string, iters: int) =
|
||||||
var digest: array[32, byte]
|
var digest: array[32, byte]
|
||||||
bench("SHA256 - OpenSSL - " & msgComment, msg.len, iters):
|
bench("SHA256 - OpenSSL - " & msgComment, msg.len, iters):
|
||||||
SHA256_OpenSSL(digest, msg)
|
SHA256_OpenSSL(digest, msg)
|
||||||
|
|
||||||
when isMainModule:
|
when isMainModule:
|
||||||
proc main() =
|
proc main() =
|
||||||
block:
|
const sizes = [
|
||||||
let msg32B = rng.random_byte_seq(32)
|
32, 64, 128, 256,
|
||||||
benchSHA256_constantine(msg32B, "32B", 100)
|
1024, 4096, 16384, 65536,
|
||||||
benchSHA256_openssl(msg32B, "32B", 100)
|
1_000_000, 10_000_000
|
||||||
block:
|
]
|
||||||
let msg64B = rng.random_byte_seq(64)
|
|
||||||
benchSHA256_constantine(msg64B, "64B", 100)
|
const target_cycles = 1_000_000_000'i64
|
||||||
benchSHA256_openssl(msg64B, "64B", 100)
|
const worst_cycles_per_bytes = 25'i64
|
||||||
block:
|
for s in sizes:
|
||||||
let msg128B = rng.random_byte_seq(128)
|
let msg = rng.random_byte_seq(s)
|
||||||
benchSHA256_constantine(msg128B, "128B", 100)
|
let iters = int(target_cycles div (s.int64 * worst_cycles_per_bytes))
|
||||||
benchSHA256_openssl(msg128B, "128B", 100)
|
benchSHA256_constantine(msg, $s & "B", iters)
|
||||||
block:
|
benchSHA256_openssl(msg, $s & "B", iters)
|
||||||
let msg576B = rng.random_byte_seq(576)
|
|
||||||
benchSHA256_constantine(msg576B, "576B", 50)
|
|
||||||
benchSHA256_openssl(msg576B, "576B", 50)
|
|
||||||
block:
|
|
||||||
let msg8192B = rng.random_byte_seq(8192)
|
|
||||||
benchSHA256_constantine(msg8192B, "8192B", 25)
|
|
||||||
benchSHA256_openssl(msg8192B, "8192B", 25)
|
|
||||||
block:
|
|
||||||
let msg1MB = rng.random_byte_seq(1_000_000)
|
|
||||||
benchSHA256_constantine(msg1MB, "1MB", 16)
|
|
||||||
benchSHA256_openssl(msg1MB, "1MB", 16)
|
|
||||||
block:
|
|
||||||
let msg10MB = rng.random_byte_seq(10_000_000)
|
|
||||||
benchSHA256_constantine(msg10MB, "10MB", 16)
|
|
||||||
benchSHA256_openssl(msg10MB, "10MB", 16)
|
|
||||||
block:
|
|
||||||
let msg100MB = rng.random_byte_seq(100_000_000)
|
|
||||||
benchSHA256_constantine(msg100MB, "100MB", 3)
|
|
||||||
benchSHA256_openssl(msg100MB, "100MB", 3)
|
|
||||||
main()
|
main()
|
||||||
|
|
|
@ -7,8 +7,13 @@
|
||||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
import
|
import
|
||||||
std/macros,
|
../platforms/[abstractions, endians],
|
||||||
../platforms/[abstractions, endians]
|
./sha256/sha256_generic
|
||||||
|
|
||||||
|
when UseASM_X86_32:
|
||||||
|
import ./sha256/[
|
||||||
|
sha256_x86_ssse3,
|
||||||
|
sha256_x86_shaext]
|
||||||
|
|
||||||
# SHA256, a hash function from the SHA2 family
|
# SHA256, a hash function from the SHA2 family
|
||||||
# --------------------------------------------------------------------------------
|
# --------------------------------------------------------------------------------
|
||||||
|
@ -16,11 +21,6 @@ import
|
||||||
# References:
|
# References:
|
||||||
# - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
|
# - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
|
||||||
# - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634
|
# - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634
|
||||||
# - Intel optimization https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/sha-256-implementations-paper.pdf
|
|
||||||
# - Parallelizing message schedules
|
|
||||||
# to accelerate the computations of hash functions
|
|
||||||
# Shay Gueron, Vlad Krasnov, 2012
|
|
||||||
# https://eprint.iacr.org/2012/067.pdf
|
|
||||||
#
|
#
|
||||||
# Vectors:
|
# Vectors:
|
||||||
# - https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf
|
# - https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf
|
||||||
|
@ -28,277 +28,74 @@ import
|
||||||
# Types and constants
|
# Types and constants
|
||||||
# ----------------------------------------------------------------
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
const
|
|
||||||
DigestSize = 32
|
|
||||||
BlockSize = 64
|
|
||||||
HashSize = DigestSize div sizeof(uint32) # 8
|
|
||||||
|
|
||||||
type
|
type
|
||||||
Sha256Context* = object
|
Sha256Context* = object
|
||||||
## Align to 64 for cache line and SIMD friendliness
|
## Align to 64 for cache line and SIMD friendliness
|
||||||
H{.align: 64}: array[HashSize, uint32]
|
s{.align: 64}: Sha256_state
|
||||||
buf{.align: 64}: array[BlockSize, byte]
|
buf{.align: 64}: array[BlockSize, byte]
|
||||||
msgLen: uint64
|
msgLen: uint64
|
||||||
bufIdx: uint8
|
|
||||||
|
|
||||||
sha256* = Sha256Context
|
sha256* = Sha256Context
|
||||||
|
|
||||||
# Internal
|
# Internals
|
||||||
# ----------------------------------------------------------------
|
# ----------------------------------------------------------------
|
||||||
# TODO: vectorized implementations
|
|
||||||
|
|
||||||
# No exceptions allowed in core cryptographic operations
|
# No exceptions allowed in core cryptographic operations
|
||||||
{.push raises: [].}
|
{.push raises: [].}
|
||||||
{.push checks: off.}
|
{.push checks: off.}
|
||||||
|
|
||||||
template rotr(x, n: uint32): uint32 =
|
func hashMessageBlocks(
|
||||||
## Rotate right the bits
|
s: var Sha256_state,
|
||||||
# We always use it with constants in 0 ..< 32
|
message: ptr UncheckedArray[byte],
|
||||||
# so undefined behaviour.
|
numBlocks: uint) =
|
||||||
(x shr n) or (x shl (32 - n))
|
when UseASM_X86_32:
|
||||||
|
if ({.noSideEffect.}: hasSha()):
|
||||||
template ch(x, y, z: uint32): uint32 =
|
hashMessageBlocks_shaext(s, message, numBlocks)
|
||||||
## "Choose" function of SHA256
|
elif ({.noSideEffect.}: hasSSSE3()):
|
||||||
## Choose bit i from yi or zi depending on xi
|
hashMessageBlocks_ssse3(s, message, numBlocks)
|
||||||
when false: # Spec FIPS 180-4
|
|
||||||
(x and y) xor (not(x) and z)
|
|
||||||
else: # RFC4634
|
|
||||||
((x and (y xor z)) xor z)
|
|
||||||
|
|
||||||
template maj(x, y, z: uint32): uint32 =
|
|
||||||
## "Majority" function of SHA256
|
|
||||||
when false: # Spec FIPS 180-4
|
|
||||||
(x and y) xor (x and z) xor (y and z)
|
|
||||||
else: # RFC4634
|
|
||||||
(x and (y or z)) or (y and z)
|
|
||||||
|
|
||||||
template S0(x: uint32): uint32 =
|
|
||||||
# Σ₀
|
|
||||||
rotr(x, 2) xor rotr(x, 13) xor rotr(x, 22)
|
|
||||||
|
|
||||||
template S1(x: uint32): uint32 =
|
|
||||||
# Σ₁
|
|
||||||
rotr(x, 6) xor rotr(x, 11) xor rotr(x, 25)
|
|
||||||
|
|
||||||
template s0(x: uint32): uint32 =
|
|
||||||
# σ₀
|
|
||||||
rotr(x, 7) xor rotr(x, 18) xor (x shr 3)
|
|
||||||
|
|
||||||
template s1(x: uint32): uint32 =
|
|
||||||
# σ₁
|
|
||||||
rotr(x, 17) xor rotr(x, 19) xor (x shr 10)
|
|
||||||
|
|
||||||
template u32BE(blob: array[4, byte]): uint32 =
|
|
||||||
## Interpret a data blob as a big-endian uint32
|
|
||||||
## This should lower to
|
|
||||||
when nimvm:
|
|
||||||
(blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
|
|
||||||
else:
|
|
||||||
when cpuEndian == littleEndian:
|
|
||||||
(blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
|
|
||||||
else:
|
else:
|
||||||
cast[uint32](blob)
|
hashMessageBlocks_generic(s, message, numBlocks)
|
||||||
|
|
||||||
template getU32at[T: byte|char](msg: openarray[T], pos: SomeInteger): uint32 =
|
|
||||||
u32BE(cast[ptr array[4, byte]](msg[pos].unsafeAddr())[])
|
|
||||||
|
|
||||||
func rotateRight[T](a: var openarray[T], k: int) =
|
|
||||||
## Rotate a seuqnce by k
|
|
||||||
doAssert a.len > 0
|
|
||||||
let k = k mod a.len
|
|
||||||
|
|
||||||
for _ in 0 ..< k:
|
|
||||||
let tmp = a[^1]
|
|
||||||
for i in countdown(a.len-1, 1):
|
|
||||||
a[i] = a[i-1]
|
|
||||||
a[0] = tmp
|
|
||||||
|
|
||||||
macro round(a, b, c, d, e, f, g, h: untyped, t: static int): untyped =
|
|
||||||
## Unrolled and allocation efficient SHA256 round
|
|
||||||
var s = [a, b, c, d, e, f, g, h]
|
|
||||||
s.rotateRight(t)
|
|
||||||
let
|
|
||||||
a = s[0]
|
|
||||||
b = s[1]
|
|
||||||
c = s[2]
|
|
||||||
d = s[3]
|
|
||||||
e = s[4]
|
|
||||||
f = s[5]
|
|
||||||
g = s[6]
|
|
||||||
h = s[7]
|
|
||||||
|
|
||||||
# W[t]
|
|
||||||
let w = nnkBracketExpr.newTree(
|
|
||||||
ident("W"), newLit(t mod 16)
|
|
||||||
)
|
|
||||||
|
|
||||||
result = newStmtList()
|
|
||||||
|
|
||||||
if t < 16:
|
|
||||||
# Reading message phase
|
|
||||||
let msg = ident"message"
|
|
||||||
let curBlock = ident"curBlock"
|
|
||||||
result.add quote do:
|
|
||||||
`w` = getU32at(`msg`, `curBlock`*64 + `t`*4)
|
|
||||||
else:
|
else:
|
||||||
# Mixing
|
hashMessageBlocks_generic(s, message, numBlocks)
|
||||||
|
|
||||||
# Wt-2, Wt-7, Wt-15, Wt-16
|
|
||||||
let Wtm2 = nnkBracketExpr.newTree(
|
|
||||||
ident("W"), newLit((t-2) mod 16)
|
|
||||||
)
|
|
||||||
let Wtm7 = nnkBracketExpr.newTree(
|
|
||||||
ident("W"), newLit((t-7) mod 16)
|
|
||||||
)
|
|
||||||
let Wtm15 = nnkBracketExpr.newTree(
|
|
||||||
ident("W"), newLit((t-15) mod 16)
|
|
||||||
)
|
|
||||||
# w is Wt-16
|
|
||||||
result.add quote do:
|
|
||||||
`w` += s1(`Wtm2`) + `Wtm7` + s0(`Wtm15`)
|
|
||||||
|
|
||||||
result.add quote do:
|
|
||||||
let T1 = `h` + S1(`e`) + ch(`e`, `f`, `g`) + K256[`t`] + `w`
|
|
||||||
let T2 = S0(`a`) + maj(`a`, `b`, `c`)
|
|
||||||
`d` += T1
|
|
||||||
`h` = T1 + T2
|
|
||||||
|
|
||||||
func hashMessageBlocks[T: byte|char](
|
|
||||||
H: var array[HashSize, uint32],
|
|
||||||
message: openarray[T]): uint =
|
|
||||||
## Hash a message block by block
|
|
||||||
## Sha256 block size is 64 bytes hence
|
|
||||||
## a message will be process 64 by 64 bytes.
|
|
||||||
## FIPS.180-4 6.2.2. SHA-256 Hash Computation
|
|
||||||
|
|
||||||
result = 0
|
|
||||||
let numBlocks = message.len.uint div BlockSize
|
|
||||||
if numBlocks == 0:
|
|
||||||
return 0
|
|
||||||
|
|
||||||
const K256 = [
|
|
||||||
0x428a2f98'u32, 0x71374491'u32, 0xb5c0fbcf'u32, 0xe9b5dba5'u32, 0x3956c25b'u32, 0x59f111f1'u32, 0x923f82a4'u32, 0xab1c5ed5'u32,
|
|
||||||
0xd807aa98'u32, 0x12835b01'u32, 0x243185be'u32, 0x550c7dc3'u32, 0x72be5d74'u32, 0x80deb1fe'u32, 0x9bdc06a7'u32, 0xc19bf174'u32,
|
|
||||||
0xe49b69c1'u32, 0xefbe4786'u32, 0x0fc19dc6'u32, 0x240ca1cc'u32, 0x2de92c6f'u32, 0x4a7484aa'u32, 0x5cb0a9dc'u32, 0x76f988da'u32,
|
|
||||||
0x983e5152'u32, 0xa831c66d'u32, 0xb00327c8'u32, 0xbf597fc7'u32, 0xc6e00bf3'u32, 0xd5a79147'u32, 0x06ca6351'u32, 0x14292967'u32,
|
|
||||||
0x27b70a85'u32, 0x2e1b2138'u32, 0x4d2c6dfc'u32, 0x53380d13'u32, 0x650a7354'u32, 0x766a0abb'u32, 0x81c2c92e'u32, 0x92722c85'u32,
|
|
||||||
0xa2bfe8a1'u32, 0xa81a664b'u32, 0xc24b8b70'u32, 0xc76c51a3'u32, 0xd192e819'u32, 0xd6990624'u32, 0xf40e3585'u32, 0x106aa070'u32,
|
|
||||||
0x19a4c116'u32, 0x1e376c08'u32, 0x2748774c'u32, 0x34b0bcb5'u32, 0x391c0cb3'u32, 0x4ed8aa4a'u32, 0x5b9cca4f'u32, 0x682e6ff3'u32,
|
|
||||||
0x748f82ee'u32, 0x78a5636f'u32, 0x84c87814'u32, 0x8cc70208'u32, 0x90befffa'u32, 0xa4506ceb'u32, 0xbef9a3f7'u32, 0xc67178f2'u32
|
|
||||||
]
|
|
||||||
|
|
||||||
var
|
|
||||||
a = H[0]
|
|
||||||
b = H[1]
|
|
||||||
c = H[2]
|
|
||||||
d = H[3]
|
|
||||||
e = H[4]
|
|
||||||
f = H[5]
|
|
||||||
g = H[6]
|
|
||||||
h = H[7]
|
|
||||||
|
|
||||||
for curBlock in 0 ..< numBlocks:
|
|
||||||
# The first 16 bytes have different handling
|
|
||||||
# from bytes 16..<64.
|
|
||||||
# Using an array[64, uint32] will span it
|
|
||||||
# across 8 cache lines impacting performance
|
|
||||||
|
|
||||||
# Workspace with message schedule Wₜ
|
|
||||||
var W{.noInit.}: array[16, uint32]
|
|
||||||
|
|
||||||
when true:
|
|
||||||
# Translation of the spec
|
|
||||||
# This is faster than even OpenSSL for hashing just 32 bytes
|
|
||||||
# for example for HMAC and HKDF.
|
|
||||||
var t = 0'u32
|
|
||||||
while t < 16: # Wₜ = Mⁱₜ
|
|
||||||
W[t].parseFromBlob(message, result, bigEndian)
|
|
||||||
let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t]
|
|
||||||
let T2 = S0(a) + maj(a, b, c)
|
|
||||||
h = g
|
|
||||||
g = f
|
|
||||||
f = e
|
|
||||||
e = d + T1
|
|
||||||
d = c
|
|
||||||
c = b
|
|
||||||
b = a
|
|
||||||
a = T1+T2
|
|
||||||
|
|
||||||
t += 1
|
|
||||||
|
|
||||||
while t < 64:
|
|
||||||
W[t mod 16] += s1(W[(t-2) mod 16]) +
|
|
||||||
W[(t-7) mod 16] +
|
|
||||||
s0(W[(t-15) mod 16])
|
|
||||||
let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t mod 16]
|
|
||||||
let T2 = S0(a) + maj(a, b, c)
|
|
||||||
h = g
|
|
||||||
g = f
|
|
||||||
f = e
|
|
||||||
e = d + T1
|
|
||||||
d = c
|
|
||||||
c = b
|
|
||||||
b = a
|
|
||||||
a = T1+T2
|
|
||||||
|
|
||||||
t += 1
|
|
||||||
else:
|
|
||||||
# optimized version for large hashes
|
|
||||||
# For hashing 32B, this is slower than the rough translation
|
|
||||||
# of spec, unless compiled with -mssse3 (but no vector instructions are used :/)
|
|
||||||
staticFor t, 0, 64:
|
|
||||||
round(a, b, c, d, e, f, g, h, t)
|
|
||||||
result += 64
|
|
||||||
|
|
||||||
a += H[0]; H[0] = a
|
|
||||||
b += H[1]; H[1] = b
|
|
||||||
c += H[2]; H[2] = c
|
|
||||||
d += H[3]; H[3] = d
|
|
||||||
e += H[4]; H[4] = e
|
|
||||||
f += H[5]; H[5] = f
|
|
||||||
g += H[6]; H[6] = g
|
|
||||||
h += H[7]; H[7] = h
|
|
||||||
|
|
||||||
func dumpHash(
|
func dumpHash(
|
||||||
digest: var array[DigestSize, byte],
|
digest: var array[DigestSize, byte],
|
||||||
H: array[HashSize, uint32]) =
|
s: Sha256_state) {.inline.} =
|
||||||
## Convert the internal hash into a message digest
|
## Convert the internal hash into a message digest
|
||||||
var dstIdx = 0'u
|
var dstIdx = 0'u
|
||||||
for i in 0 ..< H.len:
|
for i in 0 ..< s.H.len:
|
||||||
digest.dumpRawInt(H[i], dstIdx, bigEndian)
|
digest.dumpRawInt(s.H[i], dstIdx, bigEndian)
|
||||||
dstIdx += uint sizeof(uint32)
|
dstIdx += uint sizeof(uint32)
|
||||||
|
|
||||||
func hashBuffer(ctx: var Sha256Context) =
|
func hashBuffer(ctx: var Sha256Context) {.inline.} =
|
||||||
discard ctx.H.hashMessageBlocks(ctx.buf)
|
ctx.s.hashMessageBlocks(ctx.buf.asUnchecked(), numBlocks = 1)
|
||||||
ctx.buf.setZero()
|
ctx.buf.setZero()
|
||||||
ctx.bufIdx = 0
|
|
||||||
|
|
||||||
# Public API
|
# Public API
|
||||||
# ----------------------------------------------------------------
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
template digestSize*(H: type sha256): int =
|
template digestSize*(H: type sha256): int =
|
||||||
## Returns the output size in bytes
|
## Returns the output size in bytes
|
||||||
32
|
DigestSize
|
||||||
|
|
||||||
template internalBlockSize*(H: type sha256): int =
|
template internalBlockSize*(H: type sha256): int =
|
||||||
## Returns the byte size of the hash function ingested blocks
|
## Returns the byte size of the hash function ingested blocks
|
||||||
64
|
BlockSize
|
||||||
|
|
||||||
func init*(ctx: var Sha256Context) =
|
func init*(ctx: var Sha256Context) =
|
||||||
## Initialize or reinitialize a Sha256 context
|
## Initialize or reinitialize a Sha256 context
|
||||||
|
|
||||||
ctx.msgLen = 0
|
ctx.msgLen = 0
|
||||||
ctx.buf.setZero()
|
ctx.buf.setZero()
|
||||||
ctx.bufIdx = 0
|
|
||||||
|
|
||||||
ctx.H[0] = 0x6a09e667'u32
|
ctx.s.H[0] = 0x6a09e667'u32
|
||||||
ctx.H[1] = 0xbb67ae85'u32
|
ctx.s.H[1] = 0xbb67ae85'u32
|
||||||
ctx.H[2] = 0x3c6ef372'u32
|
ctx.s.H[2] = 0x3c6ef372'u32
|
||||||
ctx.H[3] = 0xa54ff53a'u32
|
ctx.s.H[3] = 0xa54ff53a'u32
|
||||||
ctx.H[4] = 0x510e527f'u32
|
ctx.s.H[4] = 0x510e527f'u32
|
||||||
ctx.H[5] = 0x9b05688c'u32
|
ctx.s.H[5] = 0x9b05688c'u32
|
||||||
ctx.H[6] = 0x1f83d9ab'u32
|
ctx.s.H[6] = 0x1f83d9ab'u32
|
||||||
ctx.H[7] = 0x5be0cd19'u32
|
ctx.s.H[7] = 0x5be0cd19'u32
|
||||||
|
|
||||||
func initZeroPadded*(ctx: var Sha256Context) =
|
func initZeroPadded*(ctx: var Sha256Context) =
|
||||||
## Initialize a Sha256 context
|
## Initialize a Sha256 context
|
||||||
|
@ -312,18 +109,17 @@ func initZeroPadded*(ctx: var Sha256Context) =
|
||||||
|
|
||||||
ctx.msgLen = 64
|
ctx.msgLen = 64
|
||||||
ctx.buf.setZero()
|
ctx.buf.setZero()
|
||||||
ctx.bufIdx = 0
|
|
||||||
|
|
||||||
ctx.H[0] = 0xda5698be'u32
|
ctx.s.H[0] = 0xda5698be'u32
|
||||||
ctx.H[1] = 0x17b9b469'u32
|
ctx.s.H[1] = 0x17b9b469'u32
|
||||||
ctx.H[2] = 0x62335799'u32
|
ctx.s.H[2] = 0x62335799'u32
|
||||||
ctx.H[3] = 0x779fbeca'u32
|
ctx.s.H[3] = 0x779fbeca'u32
|
||||||
ctx.H[4] = 0x8ce5d491'u32
|
ctx.s.H[4] = 0x8ce5d491'u32
|
||||||
ctx.H[5] = 0xc0d26243'u32
|
ctx.s.H[5] = 0xc0d26243'u32
|
||||||
ctx.H[6] = 0xbafef9ea'u32
|
ctx.s.H[6] = 0xbafef9ea'u32
|
||||||
ctx.H[7] = 0x1837a9d8'u32
|
ctx.s.H[7] = 0x1837a9d8'u32
|
||||||
|
|
||||||
func update*[T: char|byte](ctx: var Sha256Context, message: openarray[T]) =
|
func update*(ctx: var Sha256Context, message: openarray[byte]) =
|
||||||
## Append a message to a SHA256 context
|
## Append a message to a SHA256 context
|
||||||
## for incremental SHA256 computation
|
## for incremental SHA256 computation
|
||||||
##
|
##
|
||||||
|
@ -337,52 +133,47 @@ func update*[T: char|byte](ctx: var Sha256Context, message: openarray[T]) =
|
||||||
## For passwords and secret keys, you MUST NOT use raw SHA-256
|
## For passwords and secret keys, you MUST NOT use raw SHA-256
|
||||||
## use a Key Derivation Function instead (KDF)
|
## use a Key Derivation Function instead (KDF)
|
||||||
|
|
||||||
debug:
|
# Message processing state machine
|
||||||
doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len
|
var bufIdx = uint(ctx.msgLen mod BlockSize)
|
||||||
for i in ctx.bufIdx ..< ctx.buf.len:
|
var cur = 0'u
|
||||||
doAssert ctx.buf[i] == 0
|
var bytesLeft = message.len.uint
|
||||||
|
|
||||||
if message.len == 0:
|
if bufIdx != 0 and bufIdx+bytesLeft >= BlockSize:
|
||||||
return
|
# Previous partial update, fill the buffer and do one sha256 hash
|
||||||
|
let free = BlockSize - bufIdx
|
||||||
|
ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free)
|
||||||
|
ctx.hashBuffer()
|
||||||
|
bufIdx = 0
|
||||||
|
cur = free
|
||||||
|
bytesLeft -= free
|
||||||
|
|
||||||
var # Message processing state machine
|
if bytesLeft >= BlockSize:
|
||||||
cur = 0'u
|
# Process n blocks (64 byte each)
|
||||||
bytesLeft = message.len.uint
|
let numBlocks = bytesLeft div BlockSize
|
||||||
|
ctx.s.hashMessageBlocks(message.asUnchecked +% cur, numBlocks)
|
||||||
ctx.msgLen += bytesLeft
|
cur += numBlocks * BlockSize
|
||||||
|
bytesLeft -= numBlocks * BlockSize
|
||||||
if ctx.bufIdx != 0: # Previous partial update
|
|
||||||
let bufIdx = ctx.bufIdx.uint
|
|
||||||
let free = ctx.buf.sizeof().uint - bufIdx
|
|
||||||
|
|
||||||
if free > bytesLeft:
|
|
||||||
# Enough free space, store in buffer
|
|
||||||
ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = bytesLeft)
|
|
||||||
ctx.bufIdx += bytesLeft.uint8
|
|
||||||
return
|
|
||||||
else:
|
|
||||||
# Fill the buffer and do one sha256 hash
|
|
||||||
ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free)
|
|
||||||
ctx.hashBuffer()
|
|
||||||
|
|
||||||
# Update message state for further processing
|
|
||||||
cur = free
|
|
||||||
bytesLeft -= free
|
|
||||||
|
|
||||||
# Process n blocks (64 byte each)
|
|
||||||
let consumed = ctx.H.hashMessageBlocks(
|
|
||||||
message.toOpenArray(int cur, message.len-1))
|
|
||||||
cur += consumed
|
|
||||||
bytesLeft -= consumed
|
|
||||||
|
|
||||||
if bytesLeft != 0:
|
if bytesLeft != 0:
|
||||||
# Store the tail in buffer
|
# Store the tail in buffer
|
||||||
debug: # TODO: state machine formal verification - https://nim-lang.org/docs/drnim.html
|
ctx.buf.copy(dStart = bufIdx, message, sStart = cur, len = bytesLeft)
|
||||||
doAssert ctx.bufIdx == 0
|
|
||||||
doAssert cur + bytesLeft == message.len.uint
|
|
||||||
|
|
||||||
ctx.buf.copy(dStart = 0'u, message, sStart = cur, len = bytesLeft)
|
ctx.msgLen += message.len.uint
|
||||||
ctx.bufIdx = uint8 bytesLeft
|
|
||||||
|
func update*(ctx: var Sha256Context, message: openarray[char]) {.inline.} =
|
||||||
|
## Append a message to a SHA256 context
|
||||||
|
## for incremental SHA256 computation
|
||||||
|
##
|
||||||
|
## Security note: the tail of your message might be stored
|
||||||
|
## in an internal buffer.
|
||||||
|
## if sensitive content is used, ensure that
|
||||||
|
## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible.
|
||||||
|
## Additionally ensure that the message(s) passed were stored
|
||||||
|
## in memory considered secure for your threat model.
|
||||||
|
##
|
||||||
|
## For passwords and secret keys, you MUST NOT use raw SHA-256
|
||||||
|
## use a Key Derivation Function instead (KDF)
|
||||||
|
ctx.update(message.toOpenArrayByte(message.low, message.high))
|
||||||
|
|
||||||
func finish*(ctx: var Sha256Context, digest: var array[32, byte]) =
|
func finish*(ctx: var Sha256Context, digest: var array[32, byte]) =
|
||||||
## Finalize a SHA256 computation and output the
|
## Finalize a SHA256 computation and output the
|
||||||
|
@ -396,26 +187,23 @@ func finish*(ctx: var Sha256Context, digest: var array[32, byte]) =
|
||||||
## For passwords and secret keys, you MUST NOT use raw SHA-256
|
## For passwords and secret keys, you MUST NOT use raw SHA-256
|
||||||
## use a Key Derivation Function instead (KDF)
|
## use a Key Derivation Function instead (KDF)
|
||||||
|
|
||||||
debug:
|
let bufIdx = uint(ctx.msgLen mod BlockSize)
|
||||||
doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len
|
|
||||||
for i in ctx.bufIdx ..< ctx.buf.len:
|
|
||||||
doAssert ctx.buf[i] == 0
|
|
||||||
|
|
||||||
# Add '1' bit at the end of the message (+7 zero bits)
|
# Add '1' bit at the end of the message (+7 zero bits)
|
||||||
ctx.buf[ctx.bufIdx] = 0b1000_0000
|
ctx.buf[bufIdx] = 0b1000_0000
|
||||||
|
|
||||||
# Add k bits so that msgLenBits + 1 + k ≡ 448 mod 512
|
# Add k bits so that msgLenBits + 1 + k ≡ 448 mod 512
|
||||||
# Hence in bytes msgLen + 1 + K ≡ 56 mod 64
|
# Hence in bytes msgLen + 1 + K ≡ 56 mod 64
|
||||||
const padZone = 56
|
const padZone = 56
|
||||||
if ctx.bufIdx >= padZone:
|
if bufIdx >= padZone:
|
||||||
# We are in the 56..<64 mod 64 byte count
|
# We are in the 56..<64 mod 64 byte count
|
||||||
# and need to rollover to 0
|
# and need to rollover to 0
|
||||||
ctx.hashBuffer()
|
ctx.hashBuffer()
|
||||||
|
|
||||||
let lenInBits = ctx.msgLen.uint64 * 8
|
let lenInBits = ctx.msgLen.uint64 * 8
|
||||||
ctx.buf.dumpRawInt(lenInBits, padZone, bigEndian)
|
ctx.buf.dumpRawInt(lenInBits, padZone, bigEndian)
|
||||||
discard ctx.H.hashMessageBlocks(ctx.buf)
|
ctx.s.hashMessageBlocks(ctx.buf.asUnchecked(), numBlocks = 1)
|
||||||
digest.dumpHash(ctx.H)
|
digest.dumpHash(ctx.s)
|
||||||
|
|
||||||
func clear*(ctx: var Sha256Context) =
|
func clear*(ctx: var Sha256Context) =
|
||||||
## Clear the context internal buffers
|
## Clear the context internal buffers
|
||||||
|
@ -423,7 +211,6 @@ func clear*(ctx: var Sha256Context) =
|
||||||
## For passwords and secret keys, you MUST NOT use raw SHA-256
|
## For passwords and secret keys, you MUST NOT use raw SHA-256
|
||||||
## use a Key Derivation Function instead (KDF)
|
## use a Key Derivation Function instead (KDF)
|
||||||
# TODO: ensure compiler cannot optimize the code away
|
# TODO: ensure compiler cannot optimize the code away
|
||||||
ctx.H.setZero()
|
ctx.s.H.setZero()
|
||||||
ctx.buf.setZero()
|
ctx.buf.setZero()
|
||||||
ctx.msgLen = 0
|
ctx.msgLen = 0
|
||||||
ctx.bufIdx = 0
|
|
|
@ -0,0 +1,185 @@
|
||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
../../platforms/primitives
|
||||||
|
|
||||||
|
# SHA256, a hash function from the SHA2 family
|
||||||
|
# --------------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# References:
|
||||||
|
# - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
|
||||||
|
# - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634
|
||||||
|
# Vectors:
|
||||||
|
# - https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf
|
||||||
|
|
||||||
|
# No exceptions allowed in core cryptographic operations
|
||||||
|
{.push raises: [].}
|
||||||
|
{.push checks: off.}
|
||||||
|
|
||||||
|
# Types & Constants
|
||||||
|
# ------------------------------------------------
|
||||||
|
# The enforced alignment should help the compiler produce optimized code
|
||||||
|
|
||||||
|
type Word* = uint32
|
||||||
|
|
||||||
|
const
|
||||||
|
DigestSize* = 32
|
||||||
|
BlockSize* = 64
|
||||||
|
HashSize* = DigestSize div sizeof(Word) # 8
|
||||||
|
|
||||||
|
type Sha256_MessageSchedule* = object
|
||||||
|
w*{.align: 64.}: array[BlockSize div sizeof(Word), Word]
|
||||||
|
|
||||||
|
type Sha256_state* = object
|
||||||
|
H*{.align: 64.}: array[HashSize, Word]
|
||||||
|
|
||||||
|
const K256* = [
|
||||||
|
0x428a2f98'u32, 0x71374491'u32, 0xb5c0fbcf'u32, 0xe9b5dba5'u32, 0x3956c25b'u32, 0x59f111f1'u32, 0x923f82a4'u32, 0xab1c5ed5'u32,
|
||||||
|
0xd807aa98'u32, 0x12835b01'u32, 0x243185be'u32, 0x550c7dc3'u32, 0x72be5d74'u32, 0x80deb1fe'u32, 0x9bdc06a7'u32, 0xc19bf174'u32,
|
||||||
|
0xe49b69c1'u32, 0xefbe4786'u32, 0x0fc19dc6'u32, 0x240ca1cc'u32, 0x2de92c6f'u32, 0x4a7484aa'u32, 0x5cb0a9dc'u32, 0x76f988da'u32,
|
||||||
|
0x983e5152'u32, 0xa831c66d'u32, 0xb00327c8'u32, 0xbf597fc7'u32, 0xc6e00bf3'u32, 0xd5a79147'u32, 0x06ca6351'u32, 0x14292967'u32,
|
||||||
|
0x27b70a85'u32, 0x2e1b2138'u32, 0x4d2c6dfc'u32, 0x53380d13'u32, 0x650a7354'u32, 0x766a0abb'u32, 0x81c2c92e'u32, 0x92722c85'u32,
|
||||||
|
0xa2bfe8a1'u32, 0xa81a664b'u32, 0xc24b8b70'u32, 0xc76c51a3'u32, 0xd192e819'u32, 0xd6990624'u32, 0xf40e3585'u32, 0x106aa070'u32,
|
||||||
|
0x19a4c116'u32, 0x1e376c08'u32, 0x2748774c'u32, 0x34b0bcb5'u32, 0x391c0cb3'u32, 0x4ed8aa4a'u32, 0x5b9cca4f'u32, 0x682e6ff3'u32,
|
||||||
|
0x748f82ee'u32, 0x78a5636f'u32, 0x84c87814'u32, 0x8cc70208'u32, 0x90befffa'u32, 0xa4506ceb'u32, 0xbef9a3f7'u32, 0xc67178f2'u32
|
||||||
|
]
|
||||||
|
|
||||||
|
# Primitives
|
||||||
|
# ------------------------------------------------
|
||||||
|
|
||||||
|
template rotr(x, n: uint32): uint32 =
|
||||||
|
## Rotate right the bits
|
||||||
|
# We always use it with constants in 0 ..< 32
|
||||||
|
# so no undefined behaviour.
|
||||||
|
(x shr n) or (x shl (32 - n))
|
||||||
|
|
||||||
|
template ch(x, y, z: uint32): uint32 =
|
||||||
|
## "Choose" function of SHA256
|
||||||
|
## Choose bit i from yi or zi depending on xi
|
||||||
|
when false: # Spec FIPS 180-4
|
||||||
|
(x and y) xor (not(x) and z)
|
||||||
|
else: # RFC4634
|
||||||
|
((x and (y xor z)) xor z)
|
||||||
|
|
||||||
|
template maj(x, y, z: uint32): uint32 =
|
||||||
|
## "Majority" function of SHA256
|
||||||
|
when false: # Spec FIPS 180-4
|
||||||
|
(x and y) xor (x and z) xor (y and z)
|
||||||
|
else: # RFC4634
|
||||||
|
(x and (y or z)) or (y and z)
|
||||||
|
|
||||||
|
template S0(x: uint32): uint32 =
|
||||||
|
# Σ₀
|
||||||
|
rotr(x, 2) xor rotr(x, 13) xor rotr(x, 22)
|
||||||
|
|
||||||
|
template S1(x: uint32): uint32 =
|
||||||
|
# Σ₁
|
||||||
|
rotr(x, 6) xor rotr(x, 11) xor rotr(x, 25)
|
||||||
|
|
||||||
|
template s0(x: uint32): uint32 =
|
||||||
|
# σ₀
|
||||||
|
rotr(x, 7) xor rotr(x, 18) xor (x shr 3)
|
||||||
|
|
||||||
|
template s1(x: uint32): uint32 =
|
||||||
|
# σ₁
|
||||||
|
rotr(x, 17) xor rotr(x, 19) xor (x shr 10)
|
||||||
|
|
||||||
|
# Message schedule
|
||||||
|
# ------------------------------------------------
|
||||||
|
|
||||||
|
template u32BE(blob: array[4, byte]): uint32 =
|
||||||
|
## Interpret a data blob as a big-endian uint32
|
||||||
|
when nimvm:
|
||||||
|
(blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
|
||||||
|
else:
|
||||||
|
when cpuEndian == littleEndian:
|
||||||
|
(blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
|
||||||
|
else:
|
||||||
|
cast[uint32](blob)
|
||||||
|
|
||||||
|
template getU32at(msg: ptr UncheckedArray[byte], pos: SomeInteger): uint32 =
|
||||||
|
u32BE(cast[ptr array[4, byte]](msg[pos].addr)[])
|
||||||
|
|
||||||
|
# State updates
|
||||||
|
# ------------------------------------------------
|
||||||
|
|
||||||
|
template copy*(dst: var Sha256_state, src: Sha256_state) =
|
||||||
|
## State copy
|
||||||
|
# Should compile with a specialized aligned copy.
|
||||||
|
# No bounds check
|
||||||
|
for i in 0 ..< HashSize:
|
||||||
|
dst.H[i] = src.H[i]
|
||||||
|
|
||||||
|
template accumulate*(dst: var Sha256_state, src: Sha256_state) =
|
||||||
|
## State accumulation
|
||||||
|
# No bounds check
|
||||||
|
for i in 0 ..< HashSize:
|
||||||
|
dst.H[i] += src.H[i]
|
||||||
|
|
||||||
|
template sha256_round*(s: var Sha256_state, wt, kt: Word) =
|
||||||
|
template a: Word = s.H[0]
|
||||||
|
template b: Word = s.H[1]
|
||||||
|
template c: Word = s.H[2]
|
||||||
|
template d: Word = s.H[3]
|
||||||
|
template e: Word = s.H[4]
|
||||||
|
template f: Word = s.H[5]
|
||||||
|
template g: Word = s.H[6]
|
||||||
|
template h: Word = s.H[7]
|
||||||
|
|
||||||
|
let T1 = h + S1(e) + ch(e, f, g) + kt + wt
|
||||||
|
let T2 = S0(a) + maj(a, b, c)
|
||||||
|
d += T1
|
||||||
|
h = T1 + T2
|
||||||
|
|
||||||
|
s.H.rotateRight()
|
||||||
|
|
||||||
|
# Hash Computation
|
||||||
|
# ------------------------------------------------
|
||||||
|
|
||||||
|
func sha256_rounds_0_15(
|
||||||
|
s: var Sha256_state,
|
||||||
|
ms: var Sha256_MessageSchedule,
|
||||||
|
message: ptr UncheckedArray[byte]) {.inline.} =
|
||||||
|
staticFor t, 0, 16:
|
||||||
|
ms.w[t] = message.getU32at(t * sizeof(Word))
|
||||||
|
sha256_round(s, ms.w[t], K256[t])
|
||||||
|
|
||||||
|
func sha256_rounds_16_63(
|
||||||
|
s: var Sha256_state,
|
||||||
|
ms: var Sha256_MessageSchedule) {.inline.} =
|
||||||
|
staticFor t, 16, 64:
|
||||||
|
ms.w[t and 15] += s1(ms.w[(t - 2) and 15])+
|
||||||
|
ms.w[(t - 7) and 15] +
|
||||||
|
s0(ms.w[(t - 15) and 15])
|
||||||
|
|
||||||
|
sha256_round(s, ms.w[t and 15], K256[t])
|
||||||
|
|
||||||
|
func hashMessageBlocks_generic*(
|
||||||
|
H: var Sha256_state,
|
||||||
|
message: ptr UncheckedArray[byte],
|
||||||
|
numBlocks: uint) =
|
||||||
|
## Hash a message block by block
|
||||||
|
## Sha256 block size is 64 bytes hence
|
||||||
|
## a message will be process 64 by 64 bytes.
|
||||||
|
## FIPS.180-4 6.2.2. SHA-256 Hash Computation
|
||||||
|
|
||||||
|
var msg = message
|
||||||
|
var ms{.noInit.}: Sha256_MessageSchedule
|
||||||
|
var s{.noInit.}: Sha256_state
|
||||||
|
|
||||||
|
s.copy(H)
|
||||||
|
|
||||||
|
for _ in 0 ..< numBlocks:
|
||||||
|
sha256_rounds_0_15(s, ms, msg)
|
||||||
|
msg +%= BlockSize
|
||||||
|
|
||||||
|
sha256_rounds_16_63(s, ms)
|
||||||
|
|
||||||
|
s.accumulate(H) # accumulate on register variables
|
||||||
|
H.copy(s)
|
|
@ -0,0 +1,130 @@
|
||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
../../platforms/isa/simd_x86,
|
||||||
|
../../platforms/primitives,
|
||||||
|
./sha256_generic
|
||||||
|
|
||||||
|
{.localpassC:"-msse4.1 -msha".}
|
||||||
|
|
||||||
|
# SHA256, a hash function from the SHA2 family
|
||||||
|
# --------------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# References:
|
||||||
|
# - Intel SHA extensions whitepaper
|
||||||
|
# https://www.intel.com/content/dam/develop/external/us/en/documents/intel-sha-extensions-white-paper-402097.pdf
|
||||||
|
# - Intel SHA extensions article
|
||||||
|
# https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
|
||||||
|
|
||||||
|
# No exceptions allowed in core cryptographic operations
|
||||||
|
{.push raises: [].}
|
||||||
|
{.push checks: off.}
|
||||||
|
|
||||||
|
# Primitives
|
||||||
|
# ------------------------------------------------
|
||||||
|
|
||||||
|
template setr_K(i: int): m128i =
|
||||||
|
setr_u32x4(K256[4*i], K256[4*i+1], K256[4*i+2], K256[4*i+3])
|
||||||
|
|
||||||
|
# Hash Computation
|
||||||
|
# ------------------------------------------------
|
||||||
|
|
||||||
|
func hashMessageBlocks_shaext*(
|
||||||
|
H: var Sha256_state,
|
||||||
|
message: ptr UncheckedArray[byte],
|
||||||
|
numBlocks: uint)=
|
||||||
|
## Hash a message block by block
|
||||||
|
## Sha256 block size is 64 bytes hence
|
||||||
|
## a message will be process 64 by 64 bytes.
|
||||||
|
|
||||||
|
var
|
||||||
|
abef_save {.noInit.}: m128i
|
||||||
|
cdgh_save {.noInit.}: m128i
|
||||||
|
state0 {.noInit.}: m128i
|
||||||
|
state1 {.noInit.}: m128i
|
||||||
|
msgtmp {.noInit.}: array[4, m128i]
|
||||||
|
msg {.noInit.}: m128i
|
||||||
|
tmp {.noInit.}: m128i
|
||||||
|
|
||||||
|
data = message
|
||||||
|
|
||||||
|
let shuf_mask = set_u64x2(0x0c0d0e0f08090a0b, 0x0405060700010203)
|
||||||
|
|
||||||
|
# The SHA state is stored in this order:
|
||||||
|
# D, C, B, A, H, G, F, E
|
||||||
|
#
|
||||||
|
# state0 contains ABEF, state1 contains CDGH
|
||||||
|
|
||||||
|
tmp = shuf_u32x4(loada_u128(H.H[0].addr), 0xB1) # CDAB
|
||||||
|
state1 = shuf_u32x4(loada_u128(H.H[4].addr), 0x1B) # EFGH
|
||||||
|
state0 = alignr_u128(tmp, state1, 8) # ABEF
|
||||||
|
state1 = blend_u16x8(state1, tmp, 0xF0) # CDGH
|
||||||
|
|
||||||
|
for _ in 0 ..< numBlocks:
|
||||||
|
# Save current state
|
||||||
|
abef_save = state0
|
||||||
|
cdgh_save = state1
|
||||||
|
|
||||||
|
# Rounds 0-3
|
||||||
|
msgtmp[0] = shuf_u8x16(loadu_u128(data[0].addr), shuf_mask)
|
||||||
|
msg = add_u32x4(msgtmp[0], setr_K(0))
|
||||||
|
state1 = sha256_2rounds(state1, state0, msg)
|
||||||
|
msg = shuf_u32x4(msg, 0x0E)
|
||||||
|
state0 = sha256_2rounds(state0, state1, msg)
|
||||||
|
|
||||||
|
# Rounds 4-7 and 8-11
|
||||||
|
staticFor i, 1, 3:
|
||||||
|
msgtmp[i] = shuf_u8x16(loadu_u128(data[16*i].addr), shuf_mask)
|
||||||
|
msg = add_u32x4(msgtmp[i], setr_K(i))
|
||||||
|
state1 = sha256_2rounds(state1, state0, msg)
|
||||||
|
msg = shuf_u32x4(msg, 0x0E)
|
||||||
|
state0 = sha256_2rounds(state0, state1, msg)
|
||||||
|
msgtmp[i-1] = sha256_msg1(msgtmp[i-1], msgtmp[i])
|
||||||
|
|
||||||
|
# Rounds 12-59
|
||||||
|
msgtmp[3] = shuf_u8x16(loadu_u128(data[16*3].addr), shuf_mask)
|
||||||
|
|
||||||
|
staticFor i, 3, 15:
|
||||||
|
let prev = (i-1) and 3 # mod 4, we rotate buffers
|
||||||
|
let curr = i and 3
|
||||||
|
let next = (i+1) and 3
|
||||||
|
|
||||||
|
msg = add_u32x4(msgtmp[curr], setr_K(i))
|
||||||
|
state1 = sha256_2rounds(state1, state0, msg)
|
||||||
|
tmp = alignr_u128(msgtmp[curr], msgtmp[prev], 4)
|
||||||
|
msgtmp[next] = add_u32x4(msgtmp[next], tmp)
|
||||||
|
msgtmp[next] = sha256_msg2(msgtmp[next], msgtmp[curr])
|
||||||
|
msg = shuf_u32x4(msg, 0x0E)
|
||||||
|
state0 = sha256_2rounds(state0, state1, msg)
|
||||||
|
msgtmp[prev] = sha256_msg1(msgtmp[prev], msgtmp[curr])
|
||||||
|
|
||||||
|
# Rounds 60-63
|
||||||
|
msg = add_u32x4(msgtmp[3], setr_K(15))
|
||||||
|
state1 = sha256_2rounds(state1, state0, msg)
|
||||||
|
msg = shuf_u32x4(msg, 0x0E)
|
||||||
|
state0 = sha256_2rounds(state0, state1, msg)
|
||||||
|
|
||||||
|
# Accumulate
|
||||||
|
state0 = add_u32x4(state0, abef_save)
|
||||||
|
state1 = add_u32x4(state1, cdgh_save)
|
||||||
|
|
||||||
|
data +%= BlockSize
|
||||||
|
|
||||||
|
# The SHA state is stored in this order:
|
||||||
|
# D, C, B, A, H, G, F, E
|
||||||
|
#
|
||||||
|
# state0 contains ABEF, state1 contains CDGH
|
||||||
|
|
||||||
|
tmp = shuf_u32x4(state0, 0x1B) # FEBA
|
||||||
|
state1 = shuf_u32x4(state1, 0xB1) # DCHG
|
||||||
|
state0 = blend_u16x8(tmp, state1, 0xF0) # DCBA
|
||||||
|
state1 = alignr_u128(state1, tmp, 8) # HGFE
|
||||||
|
|
||||||
|
storea_u128(H.H[0].addr, state0)
|
||||||
|
storea_u128(H.H[4].addr, state1)
|
|
@ -0,0 +1,211 @@
|
||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
../../platforms/isa/simd_x86,
|
||||||
|
../../platforms/primitives,
|
||||||
|
./sha256_generic
|
||||||
|
|
||||||
|
{.localpassC:"-mssse3".}
|
||||||
|
|
||||||
|
# SHA256, SSSE3 optimizations
|
||||||
|
# --------------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# References:
|
||||||
|
# - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
|
||||||
|
# - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634
|
||||||
|
# - Fast SHA-256 Implementations on Intel® Architecture Processors
|
||||||
|
# https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/sha-256-implementations-paper.pdf
|
||||||
|
|
||||||
|
# Following the intel whitepaper we split our code into:
|
||||||
|
# We keep track of a 256-bit state vector corresponding
|
||||||
|
# to {a, b, c, d, e, f, g, h} in specification
|
||||||
|
#
|
||||||
|
# Processing is done in 2 steps
|
||||||
|
# - Message scheduler:
|
||||||
|
# Takes the input 16 DWORDs and
|
||||||
|
# computes 48 new DWORDs. Together with the original 16 DWORDs, these
|
||||||
|
# form a vector of 64 DWORDs that is the input to the second step.
|
||||||
|
# This can be vectorized.
|
||||||
|
# - 64 SHA rounds:
|
||||||
|
# This code is scalar.
|
||||||
|
|
||||||
|
# No exceptions allowed in core cryptographic operations
|
||||||
|
{.push raises: [].}
|
||||||
|
{.push checks: off.}
|
||||||
|
|
||||||
|
# Vectorized message scheduler
|
||||||
|
# ------------------------------------------------
|
||||||
|
|
||||||
|
const VecNum = BlockSize div 16 # BlockSize / sizeof(m128i)
|
||||||
|
const VecWords = 16 div sizeof(Word) # sizeof(m128i) / sizeof(Word)
|
||||||
|
|
||||||
|
func initMessageSchedule(
|
||||||
|
msnext: var array[VecNum, m128i],
|
||||||
|
ms: var Sha256_MessageSchedule,
|
||||||
|
message: ptr UncheckedArray[byte]) {.inline.} =
|
||||||
|
## Initial state, from data
|
||||||
|
## - Precompute steps for the future message schedule `msnext`
|
||||||
|
## - compute the current message schedule `ms`
|
||||||
|
|
||||||
|
let mask = setr_u32x4(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)
|
||||||
|
let pK256 = K256.unsafeAddr()
|
||||||
|
|
||||||
|
staticFor i, 0, VecNum:
|
||||||
|
msnext[i] = loadu_u128(message[i * sizeof(m128i)].addr)
|
||||||
|
msnext[i] = shuf_u8x16(msnext[i], mask)
|
||||||
|
storea_u128(ms.w[VecWords*i].addr, add_u32x4(msnext[i], loadu_u128(pK256[VecWords*i].addr)))
|
||||||
|
|
||||||
|
func updateMessageSchedule(
|
||||||
|
W: var array[4, m128i],
|
||||||
|
loMask, hiMask: m128i) {.inline.} =
|
||||||
|
# Steady state
|
||||||
|
# ------------
|
||||||
|
# The message schedule workspace W[16:0]
|
||||||
|
# is updated with
|
||||||
|
# W[t mod 16] += s0 + s1 + W[(t-7) mod 16]
|
||||||
|
# with
|
||||||
|
# s0 = σ₀(W[(t-15) mod 16])
|
||||||
|
# s1 = σ₁(W[(t-2) mod 16])
|
||||||
|
# by denoting the right rotation >>>, and xor ⊕
|
||||||
|
# σ₀(x) = (x >>> 7) ⊕ (x >>> 18) ⊕ (x >>> 3)
|
||||||
|
# σ₁(x) = (x >>> 17) ⊕ (x >>> 19) ⊕ (x >>> 10)
|
||||||
|
|
||||||
|
const rot0 = [int32 7, 18, 3]
|
||||||
|
const rot1 = [int32 17, 19, 10]
|
||||||
|
|
||||||
|
var v{.noInit.}: array[4, m128i]
|
||||||
|
|
||||||
|
v[0] = alignr_u128(W[1], W[0], 4)
|
||||||
|
v[3] = alignr_u128(W[3], W[2], 4)
|
||||||
|
v[2] = shr_u32x4(v[0], rot0[0])
|
||||||
|
W[0] = add_u32x4(W[0], v[3])
|
||||||
|
|
||||||
|
v[3] = shr_u32x4(v[0], rot0[2])
|
||||||
|
v[1] = shl_u32x4(v[0], 32-rot0[1])
|
||||||
|
v[0] = xor_u128(v[3], v[2])
|
||||||
|
|
||||||
|
v[3] = shuf_u32x4(W[3], 0xfa)
|
||||||
|
v[2] = shr_u32x4(v[2], rot0[1] - rot0[0])
|
||||||
|
v[0] = xor_u128(v[0], v[1])
|
||||||
|
v[0] = xor_u128(v[0], v[2])
|
||||||
|
|
||||||
|
v[1] = shl_u32x4(v[1], rot0[1] - rot0[0])
|
||||||
|
v[2] = shr_u32x4(v[3], rot1[2])
|
||||||
|
v[3] = shr_u64x2(v[3], rot1[0])
|
||||||
|
W[0] = add_u32x4(W[0], xor_u128(v[0], v[1]))
|
||||||
|
|
||||||
|
v[2] = xor_u128(v[2], v[3])
|
||||||
|
v[3] = shr_u64x2(v[3], rot1[1] - rot1[0])
|
||||||
|
v[2] = shuf_u8x16(xor_u128(v[2], v[3]), lo_mask)
|
||||||
|
W[0] = add_u32x4(W[0], v[2])
|
||||||
|
|
||||||
|
v[3] = shuf_u32x4(W[0], 0x50)
|
||||||
|
v[2] = shr_u32x4(v[3], rot1[2])
|
||||||
|
v[3] = shr_u64x2(v[3], rot1[0])
|
||||||
|
v[2] = xor_u128(v[2], v[3])
|
||||||
|
v[3] = shr_u64x2(v[3], rot1[1] - rot1[0])
|
||||||
|
|
||||||
|
W[0] = add_u32x4(W[0], shuf_u8x16(xor_u128(v[2], v[3]), hi_mask))
|
||||||
|
|
||||||
|
W.rotateLeft()
|
||||||
|
|
||||||
|
# Hash Computation
|
||||||
|
# ------------------------------------------------
|
||||||
|
|
||||||
|
func sha256_rounds_0_47(
|
||||||
|
s: var Sha256_state,
|
||||||
|
ms: var Sha256_MessageSchedule,
|
||||||
|
msnext: var array[VecNum, m128i]) {.inline.} =
|
||||||
|
## Process Sha256 rounds 0 to 47
|
||||||
|
|
||||||
|
let loMask = setr_u32x4(0x03020100, 0x0b0a0908, -1, -1)
|
||||||
|
let hiMask = setr_u32x4(-1, -1, 0x03020100, 0x0b0a0908)
|
||||||
|
|
||||||
|
# The first items of K256 were processed in initMessageSchedule
|
||||||
|
var k256_idx = 16
|
||||||
|
|
||||||
|
# Rounds 0-15, 16-31, 32-47
|
||||||
|
for r in 0 ..< 3:
|
||||||
|
|
||||||
|
# Important unrolling for 2 reasons, see Intel paper
|
||||||
|
# - State updates:
|
||||||
|
# In each round calculation six out of the eight state variables are shifted to the
|
||||||
|
# next state variable. Rather than do these using mov instructions, we rename
|
||||||
|
# the virtual registers (symbols) to effect this “shift”. Thus each round
|
||||||
|
# effectively rotates the set of state register names by one place. By doing 8 or
|
||||||
|
# 16 rounds in the body of the loop, the names have rotated back to their
|
||||||
|
# starting values, so no register moves are needed before looping.
|
||||||
|
#
|
||||||
|
# - Message schedule:
|
||||||
|
# Similarly on the vector unit for the message scheduling, the 16 necessary
|
||||||
|
# scheduled DWORDs are stored in four XMM registers, as described earlier. For
|
||||||
|
# example, the initial data DWORDs are stored in order as {X0, X1, X2, X3}.
|
||||||
|
# When we compute four new scheduled DWORDs, we store them in X0
|
||||||
|
# (overwriting the “oldest” data DWORDs), so now the scheduled DWORDs are
|
||||||
|
# stored in order in {X1, X2, X3, X0}. Once again, we handle this by “rotating”
|
||||||
|
# the four names, where in this case the names rotate one place every four
|
||||||
|
# rounds (because we compute four scheduled DWORDs in each calculation).
|
||||||
|
# By having 16 rounds (four scheduling operations) in the body of the loop,
|
||||||
|
# these XMM register names rotate back to their initial value, and again no
|
||||||
|
# register moves are needed before looping.
|
||||||
|
|
||||||
|
staticFor i, 0, VecNum:
|
||||||
|
# We interleave computing the message scheduled at {t+4, t+5, t+6, t+7}
|
||||||
|
# with SHA256 state update for {t, t+1, t+2, t+3}
|
||||||
|
|
||||||
|
# As they are independent, hopefully the compiler reorders instructions
|
||||||
|
# for maximum throughput.
|
||||||
|
# Also it optimize away the moves and use register renaming to avoid rotations
|
||||||
|
const pos = VecWords * i
|
||||||
|
|
||||||
|
msnext.updateMessageSchedule(loMask, hiMask)
|
||||||
|
let wnext = add_u32x4(msnext[3], loadu_u128(K256[k256_idx].unsafeAddr))
|
||||||
|
|
||||||
|
# K256 was already included in the computation of wnext, hence kt = 0
|
||||||
|
s.sha256_round(wt = ms.w[pos + 0], kt = 0)
|
||||||
|
s.sha256_round(wt = ms.w[pos + 1], kt = 0)
|
||||||
|
s.sha256_round(wt = ms.w[pos + 2], kt = 0)
|
||||||
|
s.sha256_round(wt = ms.w[pos + 3], kt = 0)
|
||||||
|
|
||||||
|
storea_u128(ms.w[pos].addr, wnext)
|
||||||
|
k256_idx += VecWords
|
||||||
|
|
||||||
|
|
||||||
|
func sha256_rounds_48_63(
|
||||||
|
s: var Sha256_state,
|
||||||
|
ms: var Sha256_MessageSchedule) {.inline.} =
|
||||||
|
## Process Sha256 rounds 48 to 63
|
||||||
|
staticFor t, 48, 64:
|
||||||
|
# Wt[i mod 16] and K256 was already integrated in the computation of wnext
|
||||||
|
s.sha256_round(wt = ms.w[t and 15], kt = 0)
|
||||||
|
|
||||||
|
func hashMessageBlocks_ssse3*(
|
||||||
|
H: var Sha256_state,
|
||||||
|
message: ptr UncheckedArray[byte],
|
||||||
|
numBlocks: uint)=
|
||||||
|
## Hash a message block by block
|
||||||
|
## Sha256 block size is 64 bytes hence
|
||||||
|
## a message will be process 64 by 64 bytes.
|
||||||
|
|
||||||
|
var msg = message
|
||||||
|
var ms{.noInit.}: Sha256_MessageSchedule
|
||||||
|
var msnext{.noInit.}: array[VecNum, m128i]
|
||||||
|
var s{.noInit.}: Sha256_state
|
||||||
|
|
||||||
|
s.copy(H)
|
||||||
|
|
||||||
|
for _ in 0 ..< numBlocks:
|
||||||
|
initMessageSchedule(msnext, ms, msg)
|
||||||
|
msg +%= BlockSize
|
||||||
|
|
||||||
|
sha256_rounds_0_47(s, ms, msnext)
|
||||||
|
sha256_rounds_48_63(s, ms)
|
||||||
|
|
||||||
|
s.accumulate(H) # accumulate on register variables
|
||||||
|
H.copy(s)
|
|
@ -36,7 +36,7 @@ debug:
|
||||||
result[0] = '0'
|
result[0] = '0'
|
||||||
result[1] = 'x'
|
result[1] = 'x'
|
||||||
var a = a
|
var a = a
|
||||||
for j in countdown(L-1, 0):
|
for j in countdown(result.len-1, 2):
|
||||||
result[j] = hexChars.secretLookup(a and SecretWord 0xF)
|
result[j] = hexChars.secretLookup(a and SecretWord 0xF)
|
||||||
a = a shr 4
|
a = a shr 4
|
||||||
|
|
||||||
|
@ -45,7 +45,7 @@ debug:
|
||||||
result.add " " & toHex(a[0])
|
result.add " " & toHex(a[0])
|
||||||
for i in 1 ..< a.len:
|
for i in 1 ..< a.len:
|
||||||
result.add ", " & toHex(a[i])
|
result.add ", " & toHex(a[i])
|
||||||
result.add "])"
|
result.add "]"
|
||||||
|
|
||||||
func `$`*(a: BigInt): string =
|
func `$`*(a: BigInt): string =
|
||||||
result = "BigInt["
|
result = "BigInt["
|
||||||
|
|
|
@ -0,0 +1,277 @@
|
||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
static: doAssert defined(i386) or defined(amd64)
|
||||||
|
|
||||||
|
# SIMD throughput and latency:
|
||||||
|
# - https://software.intel.com/sites/landingpage/IntrinsicsGuide/
|
||||||
|
# - https://www.agner.org/optimize/instruction_tables.pdf
|
||||||
|
|
||||||
|
# Reminder: x86 is little-endian, order is [low part, high part]
|
||||||
|
# Documentation at https://software.intel.com/sites/landingpage/IntrinsicsGuide/
|
||||||
|
|
||||||
|
when defined(vcc):
|
||||||
|
{.pragma: x86_type, byCopy, header:"<intrin.h>".}
|
||||||
|
{.pragma: x86, noDecl, header:"<intrin.h>".}
|
||||||
|
else:
|
||||||
|
{.pragma: x86_type, byCopy, header:"<x86intrin.h>".}
|
||||||
|
{.pragma: x86, noDecl, header:"<x86intrin.h>".}
|
||||||
|
|
||||||
|
type
|
||||||
|
m128* {.importc: "__m128", x86_type.} = object
|
||||||
|
raw: array[4, float32]
|
||||||
|
m128d* {.importc: "__m128d", x86_type.} = object
|
||||||
|
raw: array[2, float64]
|
||||||
|
m128i* {.importc: "__m128i", x86_type.} = object
|
||||||
|
raw: array[16, byte]
|
||||||
|
m256* {.importc: "__m256", x86_type.} = object
|
||||||
|
raw: array[8, float32]
|
||||||
|
m256d* {.importc: "__m256d", x86_type.} = object
|
||||||
|
raw: array[4, float64]
|
||||||
|
m256i* {.importc: "__m256i", x86_type.} = object
|
||||||
|
raw: array[32, byte]
|
||||||
|
m512* {.importc: "__m512", x86_type.} = object
|
||||||
|
raw: array[16, float32]
|
||||||
|
m512d* {.importc: "__m512d", x86_type.} = object
|
||||||
|
raw: array[8, float64]
|
||||||
|
m512i* {.importc: "__m512i", x86_type.} = object
|
||||||
|
raw: array[64, byte]
|
||||||
|
mmask8* {.importc: "__mmask8", x86_type.} = uint8
|
||||||
|
mmask16* {.importc: "__mmask16", x86_type.} = uint16
|
||||||
|
mmask64* {.importc: "__mmask64", x86_type.} = uint64
|
||||||
|
|
||||||
|
# ############################################################
|
||||||
|
#
|
||||||
|
# SSE2 - integer - packed
|
||||||
|
#
|
||||||
|
# ############################################################
|
||||||
|
|
||||||
|
func mm_setzero_si128(): m128i {.importc: "_mm_setzero_si128", x86.}
|
||||||
|
func mm_set1_epi8(a: int8 or uint8): m128i {.importc: "_mm_set1_epi8", x86.}
|
||||||
|
func mm_set1_epi16(a: int16 or uint16): m128i {.importc: "_mm_set1_epi16", x86.}
|
||||||
|
func mm_set1_epi32(a: int32 or uint32): m128i {.importc: "_mm_set1_epi32", x86.}
|
||||||
|
func mm_set1_epi64x(a: int64 or uint64): m128i {.importc: "_mm_set1_epi64x", x86.}
|
||||||
|
func mm_set_epi64x(e1, e0: int64 or uint64): m128i {.importc: "_mm_set_epi64x", x86.}
|
||||||
|
func mm_load_si128(mem_addr: ptr m128i): m128i {.importc: "_mm_load_si128", x86.}
|
||||||
|
func mm_loadu_si128(mem_addr: ptr m128i): m128i {.importc: "_mm_loadu_si128", x86.}
|
||||||
|
func mm_store_si128(mem_addr: ptr m128i, a: m128i) {.importc: "_mm_store_si128", x86.}
|
||||||
|
func mm_storeu_si128(mem_addr: ptr m128i, a: m128i) {.importc: "_mm_storeu_si128", x86.}
|
||||||
|
|
||||||
|
func mm_set_epi32(e3, e2, e1, e0: int32 or uint32): m128i {.importc: "_mm_set_epi32", x86.}
|
||||||
|
## Initialize m128i with {e3, e2, e1, e0} (big endian order)
|
||||||
|
## in order [e0, e1, e2, e3]
|
||||||
|
func mm_setr_epi32(e3, e2, e1, e0: int32 or uint32): m128i {.importc: "_mm_setr_epi32", x86.}
|
||||||
|
## Initialize m128i with {e3, e2, e1, e0} (big endian order)
|
||||||
|
## in order [e3, e2, e1, e0]
|
||||||
|
|
||||||
|
func mm_xor_si128(a, b: m128i): m128i {.importc: "_mm_xor_si128", x86.}
|
||||||
|
|
||||||
|
func mm_add_epi8(a, b: m128i): m128i {.importc: "_mm_add_epi8", x86.}
|
||||||
|
func mm_add_epi16(a, b: m128i): m128i {.importc: "_mm_add_epi16", x86.}
|
||||||
|
func mm_add_epi32(a, b: m128i): m128i {.importc: "_mm_add_epi32", x86.}
|
||||||
|
func mm_add_epi64(a, b: m128i): m128i {.importc: "_mm_add_epi64", x86.}
|
||||||
|
|
||||||
|
func mm_slli_epi64(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_slli_epi64", x86.}
|
||||||
|
## Shift 2xint64 left
|
||||||
|
func mm_srli_epi64(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_srli_epi64", x86.}
|
||||||
|
## Shift 2xint64 right
|
||||||
|
func mm_srli_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_srli_epi32", x86.}
|
||||||
|
## Shift 4xint32 left
|
||||||
|
func mm_slli_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_slli_epi32", x86.}
|
||||||
|
## Shift 4xint32 right
|
||||||
|
|
||||||
|
func mm_shuffle_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_shuffle_epi32", x86.}
|
||||||
|
## Shuffle 32-bit integers in a according to the control in imm8
|
||||||
|
## Formula is in big endian representation
|
||||||
|
## a = {a3, a2, a1, a0}
|
||||||
|
## dst = {d3, d2, d1, d0}
|
||||||
|
## imm8 = {bits[7:6], bits[5:4], bits[3:2], bits[1:0]}
|
||||||
|
## d0 will refer a[bits[1:0]]
|
||||||
|
## d1 a[bits[3:2]]
|
||||||
|
|
||||||
|
# ############################################################
|
||||||
|
#
|
||||||
|
# SSSE3 - integer - packed
|
||||||
|
#
|
||||||
|
# ############################################################
|
||||||
|
|
||||||
|
func mm_alignr_epi8(a, b: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_alignr_epi8", x86.}
|
||||||
|
## Concatenate 16-byte blocks in a and b into a 32-byte temporary result,
|
||||||
|
## shift the result right by imm8 bytes, and return the low 16 bytes
|
||||||
|
## Input:
|
||||||
|
## a[127:0], b[127:0]
|
||||||
|
## Result:
|
||||||
|
## tmp[255:128] = a
|
||||||
|
## tmp[127:0] = b
|
||||||
|
## tmp[255:0] = tmp[255:0] >> (imm8*8)
|
||||||
|
## dst[127:0] = tmp[127:0]
|
||||||
|
|
||||||
|
func mm_shuffle_epi8(a, b: m128i): m128i {.importc: "_mm_shuffle_epi8", x86.}
|
||||||
|
## Shuffle 8-bit integers in a according to the control mask in b
|
||||||
|
## Formula is in big endian representation
|
||||||
|
## a = {a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0}
|
||||||
|
## b = {b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0}
|
||||||
|
## dst = {d15, d14, d13, d12, d11, d10, d9, d8, d7, d6, d5, d4, d3, d2, d1, d0}
|
||||||
|
##
|
||||||
|
## The control mask b0 ... b15 have the shape:
|
||||||
|
## bits z000uvwx
|
||||||
|
## if z is set, the corresponding d is set to zero.
|
||||||
|
## otherwise uvwx represents a binary number in 0..15,
|
||||||
|
## the corresponding d will be set to a(uvwx)
|
||||||
|
##
|
||||||
|
## for i in 0 ..< 16:
|
||||||
|
## if bitand(b[i], 0x80) != 0:
|
||||||
|
## dst[i] = 0
|
||||||
|
## else:
|
||||||
|
## dst[i] = a[bitand(b[i], 0x0F)]
|
||||||
|
|
||||||
|
# ############################################################
|
||||||
|
#
|
||||||
|
# SSE4.1 - integer - packed
|
||||||
|
#
|
||||||
|
# ############################################################
|
||||||
|
|
||||||
|
func mm_blend_epi16(a, b: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_blend_epi16", x86.}
|
||||||
|
## Blend packed 16-bit integers from a and b using control mask imm8,
|
||||||
|
## and store the results in dst.
|
||||||
|
##
|
||||||
|
## FOR j := 0 to 7
|
||||||
|
## i := j*16
|
||||||
|
## IF imm8[j]
|
||||||
|
## dst[i+15:i] := b[i+15:i]
|
||||||
|
## ELSE
|
||||||
|
## dst[i+15:i] := a[i+15:i]
|
||||||
|
|
||||||
|
# ############################################################
|
||||||
|
#
|
||||||
|
# AVX512F - integer - packed
|
||||||
|
#
|
||||||
|
# ############################################################
|
||||||
|
|
||||||
|
func mm_ror_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_ror_epi32", x86.}
|
||||||
|
## Rotate 4xint32 right
|
||||||
|
|
||||||
|
func mm_mask_add_epi32(src: m128i, mask: mmask8, a, b: m128i): m128i {.importc: "_mm_mask_add_epi32", x86.}
|
||||||
|
## Add packed 32-bit integers in a and b, and store the results in dst using writemask mask
|
||||||
|
## (elements are copied from src when the corresponding mask bit is not set).
|
||||||
|
## for j in 0 ..< 4:
|
||||||
|
## let i = j*32
|
||||||
|
## if k[j]:
|
||||||
|
## dst[i+31:i] := a[i+31:i] + b[i+31:i]
|
||||||
|
## else:
|
||||||
|
## dst[i+31:i] := src[i+31:i]
|
||||||
|
|
||||||
|
# ############################################################
|
||||||
|
#
|
||||||
|
# SHA extensions
|
||||||
|
#
|
||||||
|
# ############################################################
|
||||||
|
|
||||||
|
func mm_sha256msg1_epu32(a, b: m128i): m128i {.importc: "_mm_sha256msg1_epu32", x86.}
|
||||||
|
## Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers)
|
||||||
|
## using previous message values from a and b, and store the result in dst.
|
||||||
|
##
|
||||||
|
## W4 := b[31:0]
|
||||||
|
## W3 := a[127:96]
|
||||||
|
## W2 := a[95:64]
|
||||||
|
## W1 := a[63:32]
|
||||||
|
## W0 := a[31:0]
|
||||||
|
## dst[127:96] := W3 + sigma0(W4)
|
||||||
|
## dst[95:64] := W2 + sigma0(W3)
|
||||||
|
## dst[63:32] := W1 + sigma0(W2)
|
||||||
|
## dst[31:0] := W0 + sigma0(W1)
|
||||||
|
|
||||||
|
func mm_sha256msg2_epu32(a, b: m128i): m128i {.importc: "_mm_sha256msg2_epu32", x86.}
|
||||||
|
## Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers)
|
||||||
|
## using previous message values from a and b, and store the result in dst.
|
||||||
|
##
|
||||||
|
## W14 := b[95:64]
|
||||||
|
## W15 := b[127:96]
|
||||||
|
## W16 := a[31:0] + sigma1(W14)
|
||||||
|
## W17 := a[63:32] + sigma1(W15)
|
||||||
|
## W18 := a[95:64] + sigma1(W16)
|
||||||
|
## W19 := a[127:96] + sigma1(W17)
|
||||||
|
## dst[127:96] := W19
|
||||||
|
## dst[95:64] := W18
|
||||||
|
## dst[63:32] := W17
|
||||||
|
## dst[31:0] := W16
|
||||||
|
|
||||||
|
func mm_sha256rnds2_epu32(cdgh, abef, k: m128i): m128i {.importc: "_mm_sha256rnds2_epu32", x86.}
|
||||||
|
## Perform 2 rounds of SHA256 operation using
|
||||||
|
## an initial SHA256 state (C,D,G,H) from a,
|
||||||
|
## an initial SHA256 state (A,B,E,F) from b,
|
||||||
|
## and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers)
|
||||||
|
## and the corresponding round constants from k,
|
||||||
|
## and store the updated SHA256 state (A,B,E,F) in dst.
|
||||||
|
##
|
||||||
|
## A[0] := b[127:96]
|
||||||
|
## B[0] := b[95:64]
|
||||||
|
## C[0] := a[127:96]
|
||||||
|
## D[0] := a[95:64]
|
||||||
|
## E[0] := b[63:32]
|
||||||
|
## F[0] := b[31:0]
|
||||||
|
## G[0] := a[63:32]
|
||||||
|
## H[0] := a[31:0]
|
||||||
|
## W_K[0] := k[31:0]
|
||||||
|
## W_K[1] := k[63:32]
|
||||||
|
## FOR i := 0 to 1
|
||||||
|
## A[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i])
|
||||||
|
## B[i+1] := A[i]
|
||||||
|
## C[i+1] := B[i]
|
||||||
|
## D[i+1] := C[i]
|
||||||
|
## E[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i]
|
||||||
|
## F[i+1] := E[i]
|
||||||
|
## G[i+1] := F[i]
|
||||||
|
## H[i+1] := G[i]
|
||||||
|
## ENDFOR
|
||||||
|
## dst[127:96] := A[2]
|
||||||
|
## dst[95:64] := B[2]
|
||||||
|
## dst[63:32] := E[2]
|
||||||
|
## dst[31:0] := F[2]
|
||||||
|
|
||||||
|
# Aliases
|
||||||
|
# ------------------------------------------------
|
||||||
|
|
||||||
|
template set_u64x2*(e1, e0: int64 or uint64): m128i =
|
||||||
|
mm_set_epi64x(e1, e0)
|
||||||
|
template setr_u32x4*(e3, e2, e1, e0: int32 or uint32): m128i =
|
||||||
|
mm_setr_epi32(e3, e2, e1, e0)
|
||||||
|
template loada_u128*(data: pointer): m128i =
|
||||||
|
mm_load_si128(cast[ptr m128i](data))
|
||||||
|
template loadu_u128*(data: pointer): m128i =
|
||||||
|
mm_loadu_si128(cast[ptr m128i](data))
|
||||||
|
template storea_u128*(mem_addr: pointer, a: m128i) =
|
||||||
|
mm_store_si128(cast[ptr m128i](mem_addr), a)
|
||||||
|
|
||||||
|
template xor_u128*(a, b: m128i): m128i =
|
||||||
|
mm_xor_si128(a, b)
|
||||||
|
|
||||||
|
template add_u32x4*(a, b: m128i): m128i =
|
||||||
|
mm_add_epi32(a, b)
|
||||||
|
template shl_u32x4*(a: m128i, imm8: int32 or uint32): m128i =
|
||||||
|
mm_slli_epi32(a, imm8)
|
||||||
|
template shr_u32x4*(a: m128i, imm8: int32 or uint32): m128i =
|
||||||
|
mm_srli_epi32(a, imm8)
|
||||||
|
template shr_u64x2*(a: m128i, imm8: int32 or uint32): m128i =
|
||||||
|
mm_srli_epi64(a, imm8)
|
||||||
|
|
||||||
|
template alignr_u128*(a, b: m128i, shiftRightByBytes: int32 or uint32): m128i =
|
||||||
|
mm_alignr_epi8(a, b, shiftRightByBytes)
|
||||||
|
template shuf_u8x16*(a: m128i, mask: m128i): m128i =
|
||||||
|
mm_shuffle_epi8(a, mask)
|
||||||
|
template shuf_u32x4*(a: m128i, mask: int32 or uint32): m128i =
|
||||||
|
mm_shuffle_epi32(a, mask)
|
||||||
|
template blend_u16x8*(a, b: m128i, mask: int32 or uint32): m128i =
|
||||||
|
mm_blend_epi16(a, b, mask)
|
||||||
|
|
||||||
|
template sha256_msg1*(a, b: m128i): m128i =
|
||||||
|
mm_sha256msg1_epu32(a, b)
|
||||||
|
template sha256_msg2*(a, b: m128i): m128i =
|
||||||
|
mm_sha256msg2_epu32(a, b)
|
||||||
|
template sha256_2rounds*(cdgh, abef, k: m128i): m128i =
|
||||||
|
mm_sha256rnds2_epu32(cdgh, abef, k)
|
|
@ -34,6 +34,10 @@ when X86 and GCC_Compatible:
|
||||||
import isa/[cpuinfo_x86, macro_assembler_x86]
|
import isa/[cpuinfo_x86, macro_assembler_x86]
|
||||||
export cpuinfo_x86, macro_assembler_x86
|
export cpuinfo_x86, macro_assembler_x86
|
||||||
|
|
||||||
|
# No exceptions allowed in core cryptographic operations
|
||||||
|
{.push raises: [].}
|
||||||
|
{.push checks: off.}
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
# Instrumentation
|
# Instrumentation
|
||||||
|
@ -72,3 +76,45 @@ func copy*[T: byte|char](
|
||||||
{.push checks: off.} # No OverflowError or IndexError allowed
|
{.push checks: off.} # No OverflowError or IndexError allowed
|
||||||
for i in 0 ..< len:
|
for i in 0 ..< len:
|
||||||
dst[dStart + i] = byte src[sStart + i]
|
dst[dStart + i] = byte src[sStart + i]
|
||||||
|
|
||||||
|
func rotateRight*[N: static int, T](a: var array[N, T]) {.inline.} =
|
||||||
|
# Rotate right (Somehow we can't use a generic template here)
|
||||||
|
# Inline
|
||||||
|
# Hopefully we want the compiler to see that N rounds of rotation
|
||||||
|
# can be optimized away with register renaming
|
||||||
|
let tmp = a[a.len-1]
|
||||||
|
staticForCountdown i, a.len-1, 1:
|
||||||
|
a[i] = a[i-1]
|
||||||
|
a[0] = tmp
|
||||||
|
|
||||||
|
func rotateLeft*[N: static int, T](a: var array[N, T]) {.inline.} =
|
||||||
|
# Rotate left (Somehow we can't use a generic template here)
|
||||||
|
# Inline
|
||||||
|
# Hopefully we want the compiler to see that N rounds of rotation
|
||||||
|
# can be optimized away with register renaming
|
||||||
|
let tmp = a[0]
|
||||||
|
staticFor i, 0, a.len-1:
|
||||||
|
a[i] = a[i+1]
|
||||||
|
a[a.len-1] = tmp
|
||||||
|
|
||||||
|
# ############################################################
|
||||||
|
#
|
||||||
|
# Pointer arithmetics
|
||||||
|
#
|
||||||
|
# ############################################################
|
||||||
|
|
||||||
|
template asUnchecked*[T](a: openArray[T]): ptr UncheckedArray[T] =
|
||||||
|
cast[ptr UncheckedArray[T]](a[0].unsafeAddr)
|
||||||
|
|
||||||
|
# Warning for pointer arithmetics via inline C
|
||||||
|
# be careful of not passing a `var ptr`
|
||||||
|
# to a function as `var` are passed by hidden pointers in Nim and the wrong
|
||||||
|
# pointer will be modified. Templates are fine.
|
||||||
|
|
||||||
|
func `+%`*(p: ptr, offset: SomeInteger): type(p) {.inline, noInit.}=
|
||||||
|
## Pointer increment
|
||||||
|
{.emit: [result, " = ", p, " + ", offset, ";"].}
|
||||||
|
|
||||||
|
func `+%=`*(p: var ptr, offset: SomeInteger){.inline.}=
|
||||||
|
## Pointer increment
|
||||||
|
p = p +% offset
|
|
@ -35,6 +35,14 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped
|
||||||
body.replaceNodes(idx, newLit i)
|
body.replaceNodes(idx, newLit i)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
macro staticForCountdown*(idx: untyped{nkIdent}, start, stopIncl: static int, body: untyped): untyped =
|
||||||
|
result = newStmtList()
|
||||||
|
for i in countdown(start, stopIncl):
|
||||||
|
result.add nnkBlockStmt.newTree(
|
||||||
|
ident("unrolledIter_" & $idx & $i),
|
||||||
|
body.replaceNodes(idx, newLit i)
|
||||||
|
)
|
||||||
|
|
||||||
{.experimental: "dynamicBindSym".}
|
{.experimental: "dynamicBindSym".}
|
||||||
|
|
||||||
macro staticFor*(ident: untyped{nkIdent}, choices: typed, body: untyped): untyped =
|
macro staticFor*(ident: untyped{nkIdent}, choices: typed, body: untyped): untyped =
|
||||||
|
|
|
@ -34,7 +34,7 @@ else:
|
||||||
# But the new API isn't expose on Linux :/
|
# But the new API isn't expose on Linux :/
|
||||||
|
|
||||||
# TODO: fix Windows
|
# TODO: fix Windows
|
||||||
when not defined(Windows):
|
when not defined(windows):
|
||||||
proc SHA256[T: byte|char](
|
proc SHA256[T: byte|char](
|
||||||
msg: openarray[T],
|
msg: openarray[T],
|
||||||
digest: ptr array[32, byte] = nil
|
digest: ptr array[32, byte] = nil
|
||||||
|
|
Loading…
Reference in New Issue