Sha256 refactor (#206)

* sha256: separate message scheduling and state updates to help implement specific use-cases like #205; also implement SSSE3 acceleration (2006, Intel Core 2 Duo)

* sha256: simplify update flow, store less metadata in context

* sha256: Fix reworked update function

* Implement x86 hardware SHA acceleration

* typo
This commit is contained in:
Mamy Ratsimbazafy 2022-09-19 02:02:57 +02:00 committed by GitHub
parent b901dd5878
commit 351a3f6bd2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 965 additions and 339 deletions

View File

@ -48,9 +48,9 @@ proc report(op: string, bytes: int, startTime, stopTime: MonoTime, startClk, sto
when SupportsGetTicks:
let cycles = (stopClk - startClk) div iters
let cyclePerByte = cycles.float64 / bytes.float64
echo &"{op:<30} {throughput:>15.3f} ops/s {ns:>9} ns/op {cycles:>10} cycles {cyclePerByte:>5.2f} cycles/byte"
echo &"{op:<50} {throughput:>15.3f} ops/s {ns:>9} ns/op {cycles:>10} cycles {cyclePerByte:>5.2f} cycles/byte"
else:
echo &"{op:<30} {throughput:>15.3f} ops/s {ns:>9} ns/op"
echo &"{op:<50} {throughput:>15.3f} ops/s {ns:>9} ns/op"
template bench(op: string, bytes: int, iters: int, body: untyped): untyped =
measure(iters, startTime, stopTime, startClk, stopClk, body)
@ -63,41 +63,23 @@ proc benchSHA256_constantine[T](msg: openarray[T], msgComment: string, iters: in
proc benchSHA256_openssl[T](msg: openarray[T], msgComment: string, iters: int) =
var digest: array[32, byte]
bench("SHA256 - OpenSSL - " & msgComment, msg.len, iters):
bench("SHA256 - OpenSSL - " & msgComment, msg.len, iters):
SHA256_OpenSSL(digest, msg)
when isMainModule:
proc main() =
block:
let msg32B = rng.random_byte_seq(32)
benchSHA256_constantine(msg32B, "32B", 100)
benchSHA256_openssl(msg32B, "32B", 100)
block:
let msg64B = rng.random_byte_seq(64)
benchSHA256_constantine(msg64B, "64B", 100)
benchSHA256_openssl(msg64B, "64B", 100)
block:
let msg128B = rng.random_byte_seq(128)
benchSHA256_constantine(msg128B, "128B", 100)
benchSHA256_openssl(msg128B, "128B", 100)
block:
let msg576B = rng.random_byte_seq(576)
benchSHA256_constantine(msg576B, "576B", 50)
benchSHA256_openssl(msg576B, "576B", 50)
block:
let msg8192B = rng.random_byte_seq(8192)
benchSHA256_constantine(msg8192B, "8192B", 25)
benchSHA256_openssl(msg8192B, "8192B", 25)
block:
let msg1MB = rng.random_byte_seq(1_000_000)
benchSHA256_constantine(msg1MB, "1MB", 16)
benchSHA256_openssl(msg1MB, "1MB", 16)
block:
let msg10MB = rng.random_byte_seq(10_000_000)
benchSHA256_constantine(msg10MB, "10MB", 16)
benchSHA256_openssl(msg10MB, "10MB", 16)
block:
let msg100MB = rng.random_byte_seq(100_000_000)
benchSHA256_constantine(msg100MB, "100MB", 3)
benchSHA256_openssl(msg100MB, "100MB", 3)
const sizes = [
32, 64, 128, 256,
1024, 4096, 16384, 65536,
1_000_000, 10_000_000
]
const target_cycles = 1_000_000_000'i64
const worst_cycles_per_bytes = 25'i64
for s in sizes:
let msg = rng.random_byte_seq(s)
let iters = int(target_cycles div (s.int64 * worst_cycles_per_bytes))
benchSHA256_constantine(msg, $s & "B", iters)
benchSHA256_openssl(msg, $s & "B", iters)
main()

View File

@ -7,8 +7,13 @@
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
std/macros,
../platforms/[abstractions, endians]
../platforms/[abstractions, endians],
./sha256/sha256_generic
when UseASM_X86_32:
import ./sha256/[
sha256_x86_ssse3,
sha256_x86_shaext]
# SHA256, a hash function from the SHA2 family
# --------------------------------------------------------------------------------
@ -16,11 +21,6 @@ import
# References:
# - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
# - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634
# - Intel optimization https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/sha-256-implementations-paper.pdf
# - Parallelizing message schedules
# to accelerate the computations of hash functions
# Shay Gueron, Vlad Krasnov, 2012
# https://eprint.iacr.org/2012/067.pdf
#
# Vectors:
# - https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf
@ -28,277 +28,74 @@ import
# Types and constants
# ----------------------------------------------------------------
const
DigestSize = 32
BlockSize = 64
HashSize = DigestSize div sizeof(uint32) # 8
type
Sha256Context* = object
## Align to 64 for cache line and SIMD friendliness
H{.align: 64}: array[HashSize, uint32]
s{.align: 64}: Sha256_state
buf{.align: 64}: array[BlockSize, byte]
msgLen: uint64
bufIdx: uint8
sha256* = Sha256Context
# Internal
# Internals
# ----------------------------------------------------------------
# TODO: vectorized implementations
# No exceptions allowed in core cryptographic operations
{.push raises: [].}
{.push checks: off.}
template rotr(x, n: uint32): uint32 =
## Rotate right the bits
# We always use it with constants in 0 ..< 32
# so undefined behaviour.
(x shr n) or (x shl (32 - n))
template ch(x, y, z: uint32): uint32 =
## "Choose" function of SHA256
## Choose bit i from yi or zi depending on xi
when false: # Spec FIPS 180-4
(x and y) xor (not(x) and z)
else: # RFC4634
((x and (y xor z)) xor z)
template maj(x, y, z: uint32): uint32 =
## "Majority" function of SHA256
when false: # Spec FIPS 180-4
(x and y) xor (x and z) xor (y and z)
else: # RFC4634
(x and (y or z)) or (y and z)
template S0(x: uint32): uint32 =
# Σ₀
rotr(x, 2) xor rotr(x, 13) xor rotr(x, 22)
template S1(x: uint32): uint32 =
# Σ₁
rotr(x, 6) xor rotr(x, 11) xor rotr(x, 25)
template s0(x: uint32): uint32 =
# σ₀
rotr(x, 7) xor rotr(x, 18) xor (x shr 3)
template s1(x: uint32): uint32 =
# σ₁
rotr(x, 17) xor rotr(x, 19) xor (x shr 10)
template u32BE(blob: array[4, byte]): uint32 =
## Interpret a data blob as a big-endian uint32
## This should lower to
when nimvm:
(blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
else:
when cpuEndian == littleEndian:
(blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
func hashMessageBlocks(
s: var Sha256_state,
message: ptr UncheckedArray[byte],
numBlocks: uint) =
when UseASM_X86_32:
if ({.noSideEffect.}: hasSha()):
hashMessageBlocks_shaext(s, message, numBlocks)
elif ({.noSideEffect.}: hasSSSE3()):
hashMessageBlocks_ssse3(s, message, numBlocks)
else:
cast[uint32](blob)
template getU32at[T: byte|char](msg: openarray[T], pos: SomeInteger): uint32 =
u32BE(cast[ptr array[4, byte]](msg[pos].unsafeAddr())[])
func rotateRight[T](a: var openarray[T], k: int) =
## Rotate a seuqnce by k
doAssert a.len > 0
let k = k mod a.len
for _ in 0 ..< k:
let tmp = a[^1]
for i in countdown(a.len-1, 1):
a[i] = a[i-1]
a[0] = tmp
macro round(a, b, c, d, e, f, g, h: untyped, t: static int): untyped =
## Unrolled and allocation efficient SHA256 round
var s = [a, b, c, d, e, f, g, h]
s.rotateRight(t)
let
a = s[0]
b = s[1]
c = s[2]
d = s[3]
e = s[4]
f = s[5]
g = s[6]
h = s[7]
# W[t]
let w = nnkBracketExpr.newTree(
ident("W"), newLit(t mod 16)
)
result = newStmtList()
if t < 16:
# Reading message phase
let msg = ident"message"
let curBlock = ident"curBlock"
result.add quote do:
`w` = getU32at(`msg`, `curBlock`*64 + `t`*4)
hashMessageBlocks_generic(s, message, numBlocks)
else:
# Mixing
# Wt-2, Wt-7, Wt-15, Wt-16
let Wtm2 = nnkBracketExpr.newTree(
ident("W"), newLit((t-2) mod 16)
)
let Wtm7 = nnkBracketExpr.newTree(
ident("W"), newLit((t-7) mod 16)
)
let Wtm15 = nnkBracketExpr.newTree(
ident("W"), newLit((t-15) mod 16)
)
# w is Wt-16
result.add quote do:
`w` += s1(`Wtm2`) + `Wtm7` + s0(`Wtm15`)
result.add quote do:
let T1 = `h` + S1(`e`) + ch(`e`, `f`, `g`) + K256[`t`] + `w`
let T2 = S0(`a`) + maj(`a`, `b`, `c`)
`d` += T1
`h` = T1 + T2
func hashMessageBlocks[T: byte|char](
H: var array[HashSize, uint32],
message: openarray[T]): uint =
## Hash a message block by block
## Sha256 block size is 64 bytes hence
## a message will be process 64 by 64 bytes.
## FIPS.180-4 6.2.2. SHA-256 Hash Computation
result = 0
let numBlocks = message.len.uint div BlockSize
if numBlocks == 0:
return 0
const K256 = [
0x428a2f98'u32, 0x71374491'u32, 0xb5c0fbcf'u32, 0xe9b5dba5'u32, 0x3956c25b'u32, 0x59f111f1'u32, 0x923f82a4'u32, 0xab1c5ed5'u32,
0xd807aa98'u32, 0x12835b01'u32, 0x243185be'u32, 0x550c7dc3'u32, 0x72be5d74'u32, 0x80deb1fe'u32, 0x9bdc06a7'u32, 0xc19bf174'u32,
0xe49b69c1'u32, 0xefbe4786'u32, 0x0fc19dc6'u32, 0x240ca1cc'u32, 0x2de92c6f'u32, 0x4a7484aa'u32, 0x5cb0a9dc'u32, 0x76f988da'u32,
0x983e5152'u32, 0xa831c66d'u32, 0xb00327c8'u32, 0xbf597fc7'u32, 0xc6e00bf3'u32, 0xd5a79147'u32, 0x06ca6351'u32, 0x14292967'u32,
0x27b70a85'u32, 0x2e1b2138'u32, 0x4d2c6dfc'u32, 0x53380d13'u32, 0x650a7354'u32, 0x766a0abb'u32, 0x81c2c92e'u32, 0x92722c85'u32,
0xa2bfe8a1'u32, 0xa81a664b'u32, 0xc24b8b70'u32, 0xc76c51a3'u32, 0xd192e819'u32, 0xd6990624'u32, 0xf40e3585'u32, 0x106aa070'u32,
0x19a4c116'u32, 0x1e376c08'u32, 0x2748774c'u32, 0x34b0bcb5'u32, 0x391c0cb3'u32, 0x4ed8aa4a'u32, 0x5b9cca4f'u32, 0x682e6ff3'u32,
0x748f82ee'u32, 0x78a5636f'u32, 0x84c87814'u32, 0x8cc70208'u32, 0x90befffa'u32, 0xa4506ceb'u32, 0xbef9a3f7'u32, 0xc67178f2'u32
]
var
a = H[0]
b = H[1]
c = H[2]
d = H[3]
e = H[4]
f = H[5]
g = H[6]
h = H[7]
for curBlock in 0 ..< numBlocks:
# The first 16 bytes have different handling
# from bytes 16..<64.
# Using an array[64, uint32] will span it
# across 8 cache lines impacting performance
# Workspace with message schedule Wₜ
var W{.noInit.}: array[16, uint32]
when true:
# Translation of the spec
# This is faster than even OpenSSL for hashing just 32 bytes
# for example for HMAC and HKDF.
var t = 0'u32
while t < 16: # Wₜ = Mⁱₜ
W[t].parseFromBlob(message, result, bigEndian)
let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t]
let T2 = S0(a) + maj(a, b, c)
h = g
g = f
f = e
e = d + T1
d = c
c = b
b = a
a = T1+T2
t += 1
while t < 64:
W[t mod 16] += s1(W[(t-2) mod 16]) +
W[(t-7) mod 16] +
s0(W[(t-15) mod 16])
let T1 = h + S1(e) + ch(e, f, g) + K256[t] + W[t mod 16]
let T2 = S0(a) + maj(a, b, c)
h = g
g = f
f = e
e = d + T1
d = c
c = b
b = a
a = T1+T2
t += 1
else:
# optimized version for large hashes
# For hashing 32B, this is slower than the rough translation
# of spec, unless compiled with -mssse3 (but no vector instructions are used :/)
staticFor t, 0, 64:
round(a, b, c, d, e, f, g, h, t)
result += 64
a += H[0]; H[0] = a
b += H[1]; H[1] = b
c += H[2]; H[2] = c
d += H[3]; H[3] = d
e += H[4]; H[4] = e
f += H[5]; H[5] = f
g += H[6]; H[6] = g
h += H[7]; H[7] = h
hashMessageBlocks_generic(s, message, numBlocks)
func dumpHash(
digest: var array[DigestSize, byte],
H: array[HashSize, uint32]) =
s: Sha256_state) {.inline.} =
## Convert the internal hash into a message digest
var dstIdx = 0'u
for i in 0 ..< H.len:
digest.dumpRawInt(H[i], dstIdx, bigEndian)
for i in 0 ..< s.H.len:
digest.dumpRawInt(s.H[i], dstIdx, bigEndian)
dstIdx += uint sizeof(uint32)
func hashBuffer(ctx: var Sha256Context) =
discard ctx.H.hashMessageBlocks(ctx.buf)
func hashBuffer(ctx: var Sha256Context) {.inline.} =
ctx.s.hashMessageBlocks(ctx.buf.asUnchecked(), numBlocks = 1)
ctx.buf.setZero()
ctx.bufIdx = 0
# Public API
# ----------------------------------------------------------------
template digestSize*(H: type sha256): int =
## Returns the output size in bytes
32
DigestSize
template internalBlockSize*(H: type sha256): int =
## Returns the byte size of the hash function ingested blocks
64
BlockSize
func init*(ctx: var Sha256Context) =
## Initialize or reinitialize a Sha256 context
ctx.msgLen = 0
ctx.buf.setZero()
ctx.bufIdx = 0
ctx.H[0] = 0x6a09e667'u32
ctx.H[1] = 0xbb67ae85'u32
ctx.H[2] = 0x3c6ef372'u32
ctx.H[3] = 0xa54ff53a'u32
ctx.H[4] = 0x510e527f'u32
ctx.H[5] = 0x9b05688c'u32
ctx.H[6] = 0x1f83d9ab'u32
ctx.H[7] = 0x5be0cd19'u32
ctx.s.H[0] = 0x6a09e667'u32
ctx.s.H[1] = 0xbb67ae85'u32
ctx.s.H[2] = 0x3c6ef372'u32
ctx.s.H[3] = 0xa54ff53a'u32
ctx.s.H[4] = 0x510e527f'u32
ctx.s.H[5] = 0x9b05688c'u32
ctx.s.H[6] = 0x1f83d9ab'u32
ctx.s.H[7] = 0x5be0cd19'u32
func initZeroPadded*(ctx: var Sha256Context) =
## Initialize a Sha256 context
@ -312,18 +109,17 @@ func initZeroPadded*(ctx: var Sha256Context) =
ctx.msgLen = 64
ctx.buf.setZero()
ctx.bufIdx = 0
ctx.H[0] = 0xda5698be'u32
ctx.H[1] = 0x17b9b469'u32
ctx.H[2] = 0x62335799'u32
ctx.H[3] = 0x779fbeca'u32
ctx.H[4] = 0x8ce5d491'u32
ctx.H[5] = 0xc0d26243'u32
ctx.H[6] = 0xbafef9ea'u32
ctx.H[7] = 0x1837a9d8'u32
ctx.s.H[0] = 0xda5698be'u32
ctx.s.H[1] = 0x17b9b469'u32
ctx.s.H[2] = 0x62335799'u32
ctx.s.H[3] = 0x779fbeca'u32
ctx.s.H[4] = 0x8ce5d491'u32
ctx.s.H[5] = 0xc0d26243'u32
ctx.s.H[6] = 0xbafef9ea'u32
ctx.s.H[7] = 0x1837a9d8'u32
func update*[T: char|byte](ctx: var Sha256Context, message: openarray[T]) =
func update*(ctx: var Sha256Context, message: openarray[byte]) =
## Append a message to a SHA256 context
## for incremental SHA256 computation
##
@ -337,52 +133,47 @@ func update*[T: char|byte](ctx: var Sha256Context, message: openarray[T]) =
## For passwords and secret keys, you MUST NOT use raw SHA-256
## use a Key Derivation Function instead (KDF)
debug:
doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len
for i in ctx.bufIdx ..< ctx.buf.len:
doAssert ctx.buf[i] == 0
# Message processing state machine
var bufIdx = uint(ctx.msgLen mod BlockSize)
var cur = 0'u
var bytesLeft = message.len.uint
if message.len == 0:
return
if bufIdx != 0 and bufIdx+bytesLeft >= BlockSize:
# Previous partial update, fill the buffer and do one sha256 hash
let free = BlockSize - bufIdx
ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free)
ctx.hashBuffer()
bufIdx = 0
cur = free
bytesLeft -= free
var # Message processing state machine
cur = 0'u
bytesLeft = message.len.uint
ctx.msgLen += bytesLeft
if ctx.bufIdx != 0: # Previous partial update
let bufIdx = ctx.bufIdx.uint
let free = ctx.buf.sizeof().uint - bufIdx
if free > bytesLeft:
# Enough free space, store in buffer
ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = bytesLeft)
ctx.bufIdx += bytesLeft.uint8
return
else:
# Fill the buffer and do one sha256 hash
ctx.buf.copy(dStart = bufIdx, message, sStart = 0, len = free)
ctx.hashBuffer()
# Update message state for further processing
cur = free
bytesLeft -= free
# Process n blocks (64 byte each)
let consumed = ctx.H.hashMessageBlocks(
message.toOpenArray(int cur, message.len-1))
cur += consumed
bytesLeft -= consumed
if bytesLeft >= BlockSize:
# Process n blocks (64 byte each)
let numBlocks = bytesLeft div BlockSize
ctx.s.hashMessageBlocks(message.asUnchecked +% cur, numBlocks)
cur += numBlocks * BlockSize
bytesLeft -= numBlocks * BlockSize
if bytesLeft != 0:
# Store the tail in buffer
debug: # TODO: state machine formal verification - https://nim-lang.org/docs/drnim.html
doAssert ctx.bufIdx == 0
doAssert cur + bytesLeft == message.len.uint
ctx.buf.copy(dStart = bufIdx, message, sStart = cur, len = bytesLeft)
ctx.buf.copy(dStart = 0'u, message, sStart = cur, len = bytesLeft)
ctx.bufIdx = uint8 bytesLeft
ctx.msgLen += message.len.uint
func update*(ctx: var Sha256Context, message: openarray[char]) {.inline.} =
## Append a message to a SHA256 context
## for incremental SHA256 computation
##
## Security note: the tail of your message might be stored
## in an internal buffer.
## if sensitive content is used, ensure that
## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible.
## Additionally ensure that the message(s) passed were stored
## in memory considered secure for your threat model.
##
## For passwords and secret keys, you MUST NOT use raw SHA-256
## use a Key Derivation Function instead (KDF)
ctx.update(message.toOpenArrayByte(message.low, message.high))
func finish*(ctx: var Sha256Context, digest: var array[32, byte]) =
## Finalize a SHA256 computation and output the
@ -396,26 +187,23 @@ func finish*(ctx: var Sha256Context, digest: var array[32, byte]) =
## For passwords and secret keys, you MUST NOT use raw SHA-256
## use a Key Derivation Function instead (KDF)
debug:
doAssert: 0 <= ctx.bufIdx and ctx.bufIdx.int < ctx.buf.len
for i in ctx.bufIdx ..< ctx.buf.len:
doAssert ctx.buf[i] == 0
let bufIdx = uint(ctx.msgLen mod BlockSize)
# Add '1' bit at the end of the message (+7 zero bits)
ctx.buf[ctx.bufIdx] = 0b1000_0000
ctx.buf[bufIdx] = 0b1000_0000
# Add k bits so that msgLenBits + 1 + k ≡ 448 mod 512
# Hence in bytes msgLen + 1 + K ≡ 56 mod 64
const padZone = 56
if ctx.bufIdx >= padZone:
if bufIdx >= padZone:
# We are in the 56..<64 mod 64 byte count
# and need to rollover to 0
ctx.hashBuffer()
let lenInBits = ctx.msgLen.uint64 * 8
ctx.buf.dumpRawInt(lenInBits, padZone, bigEndian)
discard ctx.H.hashMessageBlocks(ctx.buf)
digest.dumpHash(ctx.H)
ctx.s.hashMessageBlocks(ctx.buf.asUnchecked(), numBlocks = 1)
digest.dumpHash(ctx.s)
func clear*(ctx: var Sha256Context) =
## Clear the context internal buffers
@ -423,7 +211,6 @@ func clear*(ctx: var Sha256Context) =
## For passwords and secret keys, you MUST NOT use raw SHA-256
## use a Key Derivation Function instead (KDF)
# TODO: ensure compiler cannot optimize the code away
ctx.H.setZero()
ctx.s.H.setZero()
ctx.buf.setZero()
ctx.msgLen = 0
ctx.bufIdx = 0

View File

@ -0,0 +1,185 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
../../platforms/primitives
# SHA256, a hash function from the SHA2 family
# --------------------------------------------------------------------------------
#
# References:
# - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
# - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634
# Vectors:
# - https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf
# No exceptions allowed in core cryptographic operations
{.push raises: [].}
{.push checks: off.}
# Types & Constants
# ------------------------------------------------
# The enforced alignment should help the compiler produce optimized code
type Word* = uint32
const
DigestSize* = 32
BlockSize* = 64
HashSize* = DigestSize div sizeof(Word) # 8
type Sha256_MessageSchedule* = object
w*{.align: 64.}: array[BlockSize div sizeof(Word), Word]
type Sha256_state* = object
H*{.align: 64.}: array[HashSize, Word]
const K256* = [
0x428a2f98'u32, 0x71374491'u32, 0xb5c0fbcf'u32, 0xe9b5dba5'u32, 0x3956c25b'u32, 0x59f111f1'u32, 0x923f82a4'u32, 0xab1c5ed5'u32,
0xd807aa98'u32, 0x12835b01'u32, 0x243185be'u32, 0x550c7dc3'u32, 0x72be5d74'u32, 0x80deb1fe'u32, 0x9bdc06a7'u32, 0xc19bf174'u32,
0xe49b69c1'u32, 0xefbe4786'u32, 0x0fc19dc6'u32, 0x240ca1cc'u32, 0x2de92c6f'u32, 0x4a7484aa'u32, 0x5cb0a9dc'u32, 0x76f988da'u32,
0x983e5152'u32, 0xa831c66d'u32, 0xb00327c8'u32, 0xbf597fc7'u32, 0xc6e00bf3'u32, 0xd5a79147'u32, 0x06ca6351'u32, 0x14292967'u32,
0x27b70a85'u32, 0x2e1b2138'u32, 0x4d2c6dfc'u32, 0x53380d13'u32, 0x650a7354'u32, 0x766a0abb'u32, 0x81c2c92e'u32, 0x92722c85'u32,
0xa2bfe8a1'u32, 0xa81a664b'u32, 0xc24b8b70'u32, 0xc76c51a3'u32, 0xd192e819'u32, 0xd6990624'u32, 0xf40e3585'u32, 0x106aa070'u32,
0x19a4c116'u32, 0x1e376c08'u32, 0x2748774c'u32, 0x34b0bcb5'u32, 0x391c0cb3'u32, 0x4ed8aa4a'u32, 0x5b9cca4f'u32, 0x682e6ff3'u32,
0x748f82ee'u32, 0x78a5636f'u32, 0x84c87814'u32, 0x8cc70208'u32, 0x90befffa'u32, 0xa4506ceb'u32, 0xbef9a3f7'u32, 0xc67178f2'u32
]
# Primitives
# ------------------------------------------------
template rotr(x, n: uint32): uint32 =
## Rotate right the bits
# We always use it with constants in 0 ..< 32
# so no undefined behaviour.
(x shr n) or (x shl (32 - n))
template ch(x, y, z: uint32): uint32 =
## "Choose" function of SHA256
## Choose bit i from yi or zi depending on xi
when false: # Spec FIPS 180-4
(x and y) xor (not(x) and z)
else: # RFC4634
((x and (y xor z)) xor z)
template maj(x, y, z: uint32): uint32 =
## "Majority" function of SHA256
when false: # Spec FIPS 180-4
(x and y) xor (x and z) xor (y and z)
else: # RFC4634
(x and (y or z)) or (y and z)
template S0(x: uint32): uint32 =
# Σ₀
rotr(x, 2) xor rotr(x, 13) xor rotr(x, 22)
template S1(x: uint32): uint32 =
# Σ₁
rotr(x, 6) xor rotr(x, 11) xor rotr(x, 25)
template s0(x: uint32): uint32 =
# σ₀
rotr(x, 7) xor rotr(x, 18) xor (x shr 3)
template s1(x: uint32): uint32 =
# σ₁
rotr(x, 17) xor rotr(x, 19) xor (x shr 10)
# Message schedule
# ------------------------------------------------
template u32BE(blob: array[4, byte]): uint32 =
## Interpret a data blob as a big-endian uint32
when nimvm:
(blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
else:
when cpuEndian == littleEndian:
(blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
else:
cast[uint32](blob)
template getU32at(msg: ptr UncheckedArray[byte], pos: SomeInteger): uint32 =
u32BE(cast[ptr array[4, byte]](msg[pos].addr)[])
# State updates
# ------------------------------------------------
template copy*(dst: var Sha256_state, src: Sha256_state) =
## State copy
# Should compile with a specialized aligned copy.
# No bounds check
for i in 0 ..< HashSize:
dst.H[i] = src.H[i]
template accumulate*(dst: var Sha256_state, src: Sha256_state) =
## State accumulation
# No bounds check
for i in 0 ..< HashSize:
dst.H[i] += src.H[i]
template sha256_round*(s: var Sha256_state, wt, kt: Word) =
template a: Word = s.H[0]
template b: Word = s.H[1]
template c: Word = s.H[2]
template d: Word = s.H[3]
template e: Word = s.H[4]
template f: Word = s.H[5]
template g: Word = s.H[6]
template h: Word = s.H[7]
let T1 = h + S1(e) + ch(e, f, g) + kt + wt
let T2 = S0(a) + maj(a, b, c)
d += T1
h = T1 + T2
s.H.rotateRight()
# Hash Computation
# ------------------------------------------------
func sha256_rounds_0_15(
s: var Sha256_state,
ms: var Sha256_MessageSchedule,
message: ptr UncheckedArray[byte]) {.inline.} =
staticFor t, 0, 16:
ms.w[t] = message.getU32at(t * sizeof(Word))
sha256_round(s, ms.w[t], K256[t])
func sha256_rounds_16_63(
s: var Sha256_state,
ms: var Sha256_MessageSchedule) {.inline.} =
staticFor t, 16, 64:
ms.w[t and 15] += s1(ms.w[(t - 2) and 15])+
ms.w[(t - 7) and 15] +
s0(ms.w[(t - 15) and 15])
sha256_round(s, ms.w[t and 15], K256[t])
func hashMessageBlocks_generic*(
H: var Sha256_state,
message: ptr UncheckedArray[byte],
numBlocks: uint) =
## Hash a message block by block
## Sha256 block size is 64 bytes hence
## a message will be process 64 by 64 bytes.
## FIPS.180-4 6.2.2. SHA-256 Hash Computation
var msg = message
var ms{.noInit.}: Sha256_MessageSchedule
var s{.noInit.}: Sha256_state
s.copy(H)
for _ in 0 ..< numBlocks:
sha256_rounds_0_15(s, ms, msg)
msg +%= BlockSize
sha256_rounds_16_63(s, ms)
s.accumulate(H) # accumulate on register variables
H.copy(s)

View File

@ -0,0 +1,130 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
../../platforms/isa/simd_x86,
../../platforms/primitives,
./sha256_generic
{.localpassC:"-msse4.1 -msha".}
# SHA256, a hash function from the SHA2 family
# --------------------------------------------------------------------------------
#
# References:
# - Intel SHA extensions whitepaper
# https://www.intel.com/content/dam/develop/external/us/en/documents/intel-sha-extensions-white-paper-402097.pdf
# - Intel SHA extensions article
# https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
# No exceptions allowed in core cryptographic operations
{.push raises: [].}
{.push checks: off.}
# Primitives
# ------------------------------------------------
template setr_K(i: int): m128i =
setr_u32x4(K256[4*i], K256[4*i+1], K256[4*i+2], K256[4*i+3])
# Hash Computation
# ------------------------------------------------
func hashMessageBlocks_shaext*(
H: var Sha256_state,
message: ptr UncheckedArray[byte],
numBlocks: uint)=
## Hash a message block by block
## Sha256 block size is 64 bytes hence
## a message will be process 64 by 64 bytes.
var
abef_save {.noInit.}: m128i
cdgh_save {.noInit.}: m128i
state0 {.noInit.}: m128i
state1 {.noInit.}: m128i
msgtmp {.noInit.}: array[4, m128i]
msg {.noInit.}: m128i
tmp {.noInit.}: m128i
data = message
let shuf_mask = set_u64x2(0x0c0d0e0f08090a0b, 0x0405060700010203)
# The SHA state is stored in this order:
# D, C, B, A, H, G, F, E
#
# state0 contains ABEF, state1 contains CDGH
tmp = shuf_u32x4(loada_u128(H.H[0].addr), 0xB1) # CDAB
state1 = shuf_u32x4(loada_u128(H.H[4].addr), 0x1B) # EFGH
state0 = alignr_u128(tmp, state1, 8) # ABEF
state1 = blend_u16x8(state1, tmp, 0xF0) # CDGH
for _ in 0 ..< numBlocks:
# Save current state
abef_save = state0
cdgh_save = state1
# Rounds 0-3
msgtmp[0] = shuf_u8x16(loadu_u128(data[0].addr), shuf_mask)
msg = add_u32x4(msgtmp[0], setr_K(0))
state1 = sha256_2rounds(state1, state0, msg)
msg = shuf_u32x4(msg, 0x0E)
state0 = sha256_2rounds(state0, state1, msg)
# Rounds 4-7 and 8-11
staticFor i, 1, 3:
msgtmp[i] = shuf_u8x16(loadu_u128(data[16*i].addr), shuf_mask)
msg = add_u32x4(msgtmp[i], setr_K(i))
state1 = sha256_2rounds(state1, state0, msg)
msg = shuf_u32x4(msg, 0x0E)
state0 = sha256_2rounds(state0, state1, msg)
msgtmp[i-1] = sha256_msg1(msgtmp[i-1], msgtmp[i])
# Rounds 12-59
msgtmp[3] = shuf_u8x16(loadu_u128(data[16*3].addr), shuf_mask)
staticFor i, 3, 15:
let prev = (i-1) and 3 # mod 4, we rotate buffers
let curr = i and 3
let next = (i+1) and 3
msg = add_u32x4(msgtmp[curr], setr_K(i))
state1 = sha256_2rounds(state1, state0, msg)
tmp = alignr_u128(msgtmp[curr], msgtmp[prev], 4)
msgtmp[next] = add_u32x4(msgtmp[next], tmp)
msgtmp[next] = sha256_msg2(msgtmp[next], msgtmp[curr])
msg = shuf_u32x4(msg, 0x0E)
state0 = sha256_2rounds(state0, state1, msg)
msgtmp[prev] = sha256_msg1(msgtmp[prev], msgtmp[curr])
# Rounds 60-63
msg = add_u32x4(msgtmp[3], setr_K(15))
state1 = sha256_2rounds(state1, state0, msg)
msg = shuf_u32x4(msg, 0x0E)
state0 = sha256_2rounds(state0, state1, msg)
# Accumulate
state0 = add_u32x4(state0, abef_save)
state1 = add_u32x4(state1, cdgh_save)
data +%= BlockSize
# The SHA state is stored in this order:
# D, C, B, A, H, G, F, E
#
# state0 contains ABEF, state1 contains CDGH
tmp = shuf_u32x4(state0, 0x1B) # FEBA
state1 = shuf_u32x4(state1, 0xB1) # DCHG
state0 = blend_u16x8(tmp, state1, 0xF0) # DCBA
state1 = alignr_u128(state1, tmp, 8) # HGFE
storea_u128(H.H[0].addr, state0)
storea_u128(H.H[4].addr, state1)

View File

@ -0,0 +1,211 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
../../platforms/isa/simd_x86,
../../platforms/primitives,
./sha256_generic
{.localpassC:"-mssse3".}
# SHA256, SSSE3 optimizations
# --------------------------------------------------------------------------------
#
# References:
# - NIST: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
# - IETF: US Secure Hash Algorithms (SHA and HMAC-SHA) https://tools.ietf.org/html/rfc4634
# - Fast SHA-256 Implementations on Intel® Architecture Processors
# https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/sha-256-implementations-paper.pdf
# Following the intel whitepaper we split our code into:
# We keep track of a 256-bit state vector corresponding
# to {a, b, c, d, e, f, g, h} in specification
#
# Processing is done in 2 steps
# - Message scheduler:
# Takes the input 16 DWORDs and
# computes 48 new DWORDs. Together with the original 16 DWORDs, these
# form a vector of 64 DWORDs that is the input to the second step.
# This can be vectorized.
# - 64 SHA rounds:
# This code is scalar.
# No exceptions allowed in core cryptographic operations
{.push raises: [].}
{.push checks: off.}
# Vectorized message scheduler
# ------------------------------------------------
const VecNum = BlockSize div 16 # BlockSize / sizeof(m128i)
const VecWords = 16 div sizeof(Word) # sizeof(m128i) / sizeof(Word)
func initMessageSchedule(
msnext: var array[VecNum, m128i],
ms: var Sha256_MessageSchedule,
message: ptr UncheckedArray[byte]) {.inline.} =
## Initial state, from data
## - Precompute steps for the future message schedule `msnext`
## - compute the current message schedule `ms`
let mask = setr_u32x4(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)
let pK256 = K256.unsafeAddr()
staticFor i, 0, VecNum:
msnext[i] = loadu_u128(message[i * sizeof(m128i)].addr)
msnext[i] = shuf_u8x16(msnext[i], mask)
storea_u128(ms.w[VecWords*i].addr, add_u32x4(msnext[i], loadu_u128(pK256[VecWords*i].addr)))
func updateMessageSchedule(
W: var array[4, m128i],
loMask, hiMask: m128i) {.inline.} =
# Steady state
# ------------
# The message schedule workspace W[16:0]
# is updated with
# W[t mod 16] += s0 + s1 + W[(t-7) mod 16]
# with
# s0 = σ₀(W[(t-15) mod 16])
# s1 = σ₁(W[(t-2) mod 16])
# by denoting the right rotation >>>, and xor ⊕
# σ₀(x) = (x >>> 7) ⊕ (x >>> 18) ⊕ (x >>> 3)
# σ₁(x) = (x >>> 17) ⊕ (x >>> 19) ⊕ (x >>> 10)
const rot0 = [int32 7, 18, 3]
const rot1 = [int32 17, 19, 10]
var v{.noInit.}: array[4, m128i]
v[0] = alignr_u128(W[1], W[0], 4)
v[3] = alignr_u128(W[3], W[2], 4)
v[2] = shr_u32x4(v[0], rot0[0])
W[0] = add_u32x4(W[0], v[3])
v[3] = shr_u32x4(v[0], rot0[2])
v[1] = shl_u32x4(v[0], 32-rot0[1])
v[0] = xor_u128(v[3], v[2])
v[3] = shuf_u32x4(W[3], 0xfa)
v[2] = shr_u32x4(v[2], rot0[1] - rot0[0])
v[0] = xor_u128(v[0], v[1])
v[0] = xor_u128(v[0], v[2])
v[1] = shl_u32x4(v[1], rot0[1] - rot0[0])
v[2] = shr_u32x4(v[3], rot1[2])
v[3] = shr_u64x2(v[3], rot1[0])
W[0] = add_u32x4(W[0], xor_u128(v[0], v[1]))
v[2] = xor_u128(v[2], v[3])
v[3] = shr_u64x2(v[3], rot1[1] - rot1[0])
v[2] = shuf_u8x16(xor_u128(v[2], v[3]), lo_mask)
W[0] = add_u32x4(W[0], v[2])
v[3] = shuf_u32x4(W[0], 0x50)
v[2] = shr_u32x4(v[3], rot1[2])
v[3] = shr_u64x2(v[3], rot1[0])
v[2] = xor_u128(v[2], v[3])
v[3] = shr_u64x2(v[3], rot1[1] - rot1[0])
W[0] = add_u32x4(W[0], shuf_u8x16(xor_u128(v[2], v[3]), hi_mask))
W.rotateLeft()
# Hash Computation
# ------------------------------------------------
func sha256_rounds_0_47(
s: var Sha256_state,
ms: var Sha256_MessageSchedule,
msnext: var array[VecNum, m128i]) {.inline.} =
## Process Sha256 rounds 0 to 47
let loMask = setr_u32x4(0x03020100, 0x0b0a0908, -1, -1)
let hiMask = setr_u32x4(-1, -1, 0x03020100, 0x0b0a0908)
# The first items of K256 were processed in initMessageSchedule
var k256_idx = 16
# Rounds 0-15, 16-31, 32-47
for r in 0 ..< 3:
# Important unrolling for 2 reasons, see Intel paper
# - State updates:
# In each round calculation six out of the eight state variables are shifted to the
# next state variable. Rather than do these using mov instructions, we rename
# the virtual registers (symbols) to effect this “shift”. Thus each round
# effectively rotates the set of state register names by one place. By doing 8 or
# 16 rounds in the body of the loop, the names have rotated back to their
# starting values, so no register moves are needed before looping.
#
# - Message schedule:
# Similarly on the vector unit for the message scheduling, the 16 necessary
# scheduled DWORDs are stored in four XMM registers, as described earlier. For
# example, the initial data DWORDs are stored in order as {X0, X1, X2, X3}.
# When we compute four new scheduled DWORDs, we store them in X0
# (overwriting the “oldest” data DWORDs), so now the scheduled DWORDs are
# stored in order in {X1, X2, X3, X0}. Once again, we handle this by “rotating”
# the four names, where in this case the names rotate one place every four
# rounds (because we compute four scheduled DWORDs in each calculation).
# By having 16 rounds (four scheduling operations) in the body of the loop,
# these XMM register names rotate back to their initial value, and again no
# register moves are needed before looping.
staticFor i, 0, VecNum:
# We interleave computing the message scheduled at {t+4, t+5, t+6, t+7}
# with SHA256 state update for {t, t+1, t+2, t+3}
# As they are independent, hopefully the compiler reorders instructions
# for maximum throughput.
# Also it optimize away the moves and use register renaming to avoid rotations
const pos = VecWords * i
msnext.updateMessageSchedule(loMask, hiMask)
let wnext = add_u32x4(msnext[3], loadu_u128(K256[k256_idx].unsafeAddr))
# K256 was already included in the computation of wnext, hence kt = 0
s.sha256_round(wt = ms.w[pos + 0], kt = 0)
s.sha256_round(wt = ms.w[pos + 1], kt = 0)
s.sha256_round(wt = ms.w[pos + 2], kt = 0)
s.sha256_round(wt = ms.w[pos + 3], kt = 0)
storea_u128(ms.w[pos].addr, wnext)
k256_idx += VecWords
func sha256_rounds_48_63(
s: var Sha256_state,
ms: var Sha256_MessageSchedule) {.inline.} =
## Process Sha256 rounds 48 to 63
staticFor t, 48, 64:
# Wt[i mod 16] and K256 was already integrated in the computation of wnext
s.sha256_round(wt = ms.w[t and 15], kt = 0)
func hashMessageBlocks_ssse3*(
H: var Sha256_state,
message: ptr UncheckedArray[byte],
numBlocks: uint)=
## Hash a message block by block
## Sha256 block size is 64 bytes hence
## a message will be process 64 by 64 bytes.
var msg = message
var ms{.noInit.}: Sha256_MessageSchedule
var msnext{.noInit.}: array[VecNum, m128i]
var s{.noInit.}: Sha256_state
s.copy(H)
for _ in 0 ..< numBlocks:
initMessageSchedule(msnext, ms, msg)
msg +%= BlockSize
sha256_rounds_0_47(s, ms, msnext)
sha256_rounds_48_63(s, ms)
s.accumulate(H) # accumulate on register variables
H.copy(s)

View File

@ -36,7 +36,7 @@ debug:
result[0] = '0'
result[1] = 'x'
var a = a
for j in countdown(L-1, 0):
for j in countdown(result.len-1, 2):
result[j] = hexChars.secretLookup(a and SecretWord 0xF)
a = a shr 4
@ -45,7 +45,7 @@ debug:
result.add " " & toHex(a[0])
for i in 1 ..< a.len:
result.add ", " & toHex(a[i])
result.add "])"
result.add "]"
func `$`*(a: BigInt): string =
result = "BigInt["

View File

@ -0,0 +1,277 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
static: doAssert defined(i386) or defined(amd64)
# SIMD throughput and latency:
# - https://software.intel.com/sites/landingpage/IntrinsicsGuide/
# - https://www.agner.org/optimize/instruction_tables.pdf
# Reminder: x86 is little-endian, order is [low part, high part]
# Documentation at https://software.intel.com/sites/landingpage/IntrinsicsGuide/
when defined(vcc):
{.pragma: x86_type, byCopy, header:"<intrin.h>".}
{.pragma: x86, noDecl, header:"<intrin.h>".}
else:
{.pragma: x86_type, byCopy, header:"<x86intrin.h>".}
{.pragma: x86, noDecl, header:"<x86intrin.h>".}
type
m128* {.importc: "__m128", x86_type.} = object
raw: array[4, float32]
m128d* {.importc: "__m128d", x86_type.} = object
raw: array[2, float64]
m128i* {.importc: "__m128i", x86_type.} = object
raw: array[16, byte]
m256* {.importc: "__m256", x86_type.} = object
raw: array[8, float32]
m256d* {.importc: "__m256d", x86_type.} = object
raw: array[4, float64]
m256i* {.importc: "__m256i", x86_type.} = object
raw: array[32, byte]
m512* {.importc: "__m512", x86_type.} = object
raw: array[16, float32]
m512d* {.importc: "__m512d", x86_type.} = object
raw: array[8, float64]
m512i* {.importc: "__m512i", x86_type.} = object
raw: array[64, byte]
mmask8* {.importc: "__mmask8", x86_type.} = uint8
mmask16* {.importc: "__mmask16", x86_type.} = uint16
mmask64* {.importc: "__mmask64", x86_type.} = uint64
# ############################################################
#
# SSE2 - integer - packed
#
# ############################################################
func mm_setzero_si128(): m128i {.importc: "_mm_setzero_si128", x86.}
func mm_set1_epi8(a: int8 or uint8): m128i {.importc: "_mm_set1_epi8", x86.}
func mm_set1_epi16(a: int16 or uint16): m128i {.importc: "_mm_set1_epi16", x86.}
func mm_set1_epi32(a: int32 or uint32): m128i {.importc: "_mm_set1_epi32", x86.}
func mm_set1_epi64x(a: int64 or uint64): m128i {.importc: "_mm_set1_epi64x", x86.}
func mm_set_epi64x(e1, e0: int64 or uint64): m128i {.importc: "_mm_set_epi64x", x86.}
func mm_load_si128(mem_addr: ptr m128i): m128i {.importc: "_mm_load_si128", x86.}
func mm_loadu_si128(mem_addr: ptr m128i): m128i {.importc: "_mm_loadu_si128", x86.}
func mm_store_si128(mem_addr: ptr m128i, a: m128i) {.importc: "_mm_store_si128", x86.}
func mm_storeu_si128(mem_addr: ptr m128i, a: m128i) {.importc: "_mm_storeu_si128", x86.}
func mm_set_epi32(e3, e2, e1, e0: int32 or uint32): m128i {.importc: "_mm_set_epi32", x86.}
## Initialize m128i with {e3, e2, e1, e0} (big endian order)
## in order [e0, e1, e2, e3]
func mm_setr_epi32(e3, e2, e1, e0: int32 or uint32): m128i {.importc: "_mm_setr_epi32", x86.}
## Initialize m128i with {e3, e2, e1, e0} (big endian order)
## in order [e3, e2, e1, e0]
func mm_xor_si128(a, b: m128i): m128i {.importc: "_mm_xor_si128", x86.}
func mm_add_epi8(a, b: m128i): m128i {.importc: "_mm_add_epi8", x86.}
func mm_add_epi16(a, b: m128i): m128i {.importc: "_mm_add_epi16", x86.}
func mm_add_epi32(a, b: m128i): m128i {.importc: "_mm_add_epi32", x86.}
func mm_add_epi64(a, b: m128i): m128i {.importc: "_mm_add_epi64", x86.}
func mm_slli_epi64(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_slli_epi64", x86.}
## Shift 2xint64 left
func mm_srli_epi64(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_srli_epi64", x86.}
## Shift 2xint64 right
func mm_srli_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_srli_epi32", x86.}
## Shift 4xint32 left
func mm_slli_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_slli_epi32", x86.}
## Shift 4xint32 right
func mm_shuffle_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_shuffle_epi32", x86.}
## Shuffle 32-bit integers in a according to the control in imm8
## Formula is in big endian representation
## a = {a3, a2, a1, a0}
## dst = {d3, d2, d1, d0}
## imm8 = {bits[7:6], bits[5:4], bits[3:2], bits[1:0]}
## d0 will refer a[bits[1:0]]
## d1 a[bits[3:2]]
# ############################################################
#
# SSSE3 - integer - packed
#
# ############################################################
func mm_alignr_epi8(a, b: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_alignr_epi8", x86.}
## Concatenate 16-byte blocks in a and b into a 32-byte temporary result,
## shift the result right by imm8 bytes, and return the low 16 bytes
## Input:
## a[127:0], b[127:0]
## Result:
## tmp[255:128] = a
## tmp[127:0] = b
## tmp[255:0] = tmp[255:0] >> (imm8*8)
## dst[127:0] = tmp[127:0]
func mm_shuffle_epi8(a, b: m128i): m128i {.importc: "_mm_shuffle_epi8", x86.}
## Shuffle 8-bit integers in a according to the control mask in b
## Formula is in big endian representation
## a = {a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0}
## b = {b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0}
## dst = {d15, d14, d13, d12, d11, d10, d9, d8, d7, d6, d5, d4, d3, d2, d1, d0}
##
## The control mask b0 ... b15 have the shape:
## bits z000uvwx
## if z is set, the corresponding d is set to zero.
## otherwise uvwx represents a binary number in 0..15,
## the corresponding d will be set to a(uvwx)
##
## for i in 0 ..< 16:
## if bitand(b[i], 0x80) != 0:
## dst[i] = 0
## else:
## dst[i] = a[bitand(b[i], 0x0F)]
# ############################################################
#
# SSE4.1 - integer - packed
#
# ############################################################
func mm_blend_epi16(a, b: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_blend_epi16", x86.}
## Blend packed 16-bit integers from a and b using control mask imm8,
## and store the results in dst.
##
## FOR j := 0 to 7
## i := j*16
## IF imm8[j]
## dst[i+15:i] := b[i+15:i]
## ELSE
## dst[i+15:i] := a[i+15:i]
# ############################################################
#
# AVX512F - integer - packed
#
# ############################################################
func mm_ror_epi32(a: m128i, imm8: int32 or uint32): m128i {.importc: "_mm_ror_epi32", x86.}
## Rotate 4xint32 right
func mm_mask_add_epi32(src: m128i, mask: mmask8, a, b: m128i): m128i {.importc: "_mm_mask_add_epi32", x86.}
## Add packed 32-bit integers in a and b, and store the results in dst using writemask mask
## (elements are copied from src when the corresponding mask bit is not set).
## for j in 0 ..< 4:
## let i = j*32
## if k[j]:
## dst[i+31:i] := a[i+31:i] + b[i+31:i]
## else:
## dst[i+31:i] := src[i+31:i]
# ############################################################
#
# SHA extensions
#
# ############################################################
func mm_sha256msg1_epu32(a, b: m128i): m128i {.importc: "_mm_sha256msg1_epu32", x86.}
## Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers)
## using previous message values from a and b, and store the result in dst.
##
## W4 := b[31:0]
## W3 := a[127:96]
## W2 := a[95:64]
## W1 := a[63:32]
## W0 := a[31:0]
## dst[127:96] := W3 + sigma0(W4)
## dst[95:64] := W2 + sigma0(W3)
## dst[63:32] := W1 + sigma0(W2)
## dst[31:0] := W0 + sigma0(W1)
func mm_sha256msg2_epu32(a, b: m128i): m128i {.importc: "_mm_sha256msg2_epu32", x86.}
## Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers)
## using previous message values from a and b, and store the result in dst.
##
## W14 := b[95:64]
## W15 := b[127:96]
## W16 := a[31:0] + sigma1(W14)
## W17 := a[63:32] + sigma1(W15)
## W18 := a[95:64] + sigma1(W16)
## W19 := a[127:96] + sigma1(W17)
## dst[127:96] := W19
## dst[95:64] := W18
## dst[63:32] := W17
## dst[31:0] := W16
func mm_sha256rnds2_epu32(cdgh, abef, k: m128i): m128i {.importc: "_mm_sha256rnds2_epu32", x86.}
## Perform 2 rounds of SHA256 operation using
## an initial SHA256 state (C,D,G,H) from a,
## an initial SHA256 state (A,B,E,F) from b,
## and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers)
## and the corresponding round constants from k,
## and store the updated SHA256 state (A,B,E,F) in dst.
##
## A[0] := b[127:96]
## B[0] := b[95:64]
## C[0] := a[127:96]
## D[0] := a[95:64]
## E[0] := b[63:32]
## F[0] := b[31:0]
## G[0] := a[63:32]
## H[0] := a[31:0]
## W_K[0] := k[31:0]
## W_K[1] := k[63:32]
## FOR i := 0 to 1
## A[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i])
## B[i+1] := A[i]
## C[i+1] := B[i]
## D[i+1] := C[i]
## E[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i]
## F[i+1] := E[i]
## G[i+1] := F[i]
## H[i+1] := G[i]
## ENDFOR
## dst[127:96] := A[2]
## dst[95:64] := B[2]
## dst[63:32] := E[2]
## dst[31:0] := F[2]
# Aliases
# ------------------------------------------------
template set_u64x2*(e1, e0: int64 or uint64): m128i =
mm_set_epi64x(e1, e0)
template setr_u32x4*(e3, e2, e1, e0: int32 or uint32): m128i =
mm_setr_epi32(e3, e2, e1, e0)
template loada_u128*(data: pointer): m128i =
mm_load_si128(cast[ptr m128i](data))
template loadu_u128*(data: pointer): m128i =
mm_loadu_si128(cast[ptr m128i](data))
template storea_u128*(mem_addr: pointer, a: m128i) =
mm_store_si128(cast[ptr m128i](mem_addr), a)
template xor_u128*(a, b: m128i): m128i =
mm_xor_si128(a, b)
template add_u32x4*(a, b: m128i): m128i =
mm_add_epi32(a, b)
template shl_u32x4*(a: m128i, imm8: int32 or uint32): m128i =
mm_slli_epi32(a, imm8)
template shr_u32x4*(a: m128i, imm8: int32 or uint32): m128i =
mm_srli_epi32(a, imm8)
template shr_u64x2*(a: m128i, imm8: int32 or uint32): m128i =
mm_srli_epi64(a, imm8)
template alignr_u128*(a, b: m128i, shiftRightByBytes: int32 or uint32): m128i =
mm_alignr_epi8(a, b, shiftRightByBytes)
template shuf_u8x16*(a: m128i, mask: m128i): m128i =
mm_shuffle_epi8(a, mask)
template shuf_u32x4*(a: m128i, mask: int32 or uint32): m128i =
mm_shuffle_epi32(a, mask)
template blend_u16x8*(a, b: m128i, mask: int32 or uint32): m128i =
mm_blend_epi16(a, b, mask)
template sha256_msg1*(a, b: m128i): m128i =
mm_sha256msg1_epu32(a, b)
template sha256_msg2*(a, b: m128i): m128i =
mm_sha256msg2_epu32(a, b)
template sha256_2rounds*(cdgh, abef, k: m128i): m128i =
mm_sha256rnds2_epu32(cdgh, abef, k)

View File

@ -34,6 +34,10 @@ when X86 and GCC_Compatible:
import isa/[cpuinfo_x86, macro_assembler_x86]
export cpuinfo_x86, macro_assembler_x86
# No exceptions allowed in core cryptographic operations
{.push raises: [].}
{.push checks: off.}
# ############################################################
#
# Instrumentation
@ -72,3 +76,45 @@ func copy*[T: byte|char](
{.push checks: off.} # No OverflowError or IndexError allowed
for i in 0 ..< len:
dst[dStart + i] = byte src[sStart + i]
func rotateRight*[N: static int, T](a: var array[N, T]) {.inline.} =
# Rotate right (Somehow we can't use a generic template here)
# Inline
# Hopefully we want the compiler to see that N rounds of rotation
# can be optimized away with register renaming
let tmp = a[a.len-1]
staticForCountdown i, a.len-1, 1:
a[i] = a[i-1]
a[0] = tmp
func rotateLeft*[N: static int, T](a: var array[N, T]) {.inline.} =
# Rotate left (Somehow we can't use a generic template here)
# Inline
# Hopefully we want the compiler to see that N rounds of rotation
# can be optimized away with register renaming
let tmp = a[0]
staticFor i, 0, a.len-1:
a[i] = a[i+1]
a[a.len-1] = tmp
# ############################################################
#
# Pointer arithmetics
#
# ############################################################
template asUnchecked*[T](a: openArray[T]): ptr UncheckedArray[T] =
cast[ptr UncheckedArray[T]](a[0].unsafeAddr)
# Warning for pointer arithmetics via inline C
# be careful of not passing a `var ptr`
# to a function as `var` are passed by hidden pointers in Nim and the wrong
# pointer will be modified. Templates are fine.
func `+%`*(p: ptr, offset: SomeInteger): type(p) {.inline, noInit.}=
## Pointer increment
{.emit: [result, " = ", p, " + ", offset, ";"].}
func `+%=`*(p: var ptr, offset: SomeInteger){.inline.}=
## Pointer increment
p = p +% offset

View File

@ -35,6 +35,14 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped
body.replaceNodes(idx, newLit i)
)
macro staticForCountdown*(idx: untyped{nkIdent}, start, stopIncl: static int, body: untyped): untyped =
result = newStmtList()
for i in countdown(start, stopIncl):
result.add nnkBlockStmt.newTree(
ident("unrolledIter_" & $idx & $i),
body.replaceNodes(idx, newLit i)
)
{.experimental: "dynamicBindSym".}
macro staticFor*(ident: untyped{nkIdent}, choices: typed, body: untyped): untyped =

View File

@ -34,7 +34,7 @@ else:
# But the new API isn't expose on Linux :/
# TODO: fix Windows
when not defined(Windows):
when not defined(windows):
proc SHA256[T: byte|char](
msg: openarray[T],
digest: ptr array[32, byte] = nil