144 lines
5.5 KiB
Go
144 lines
5.5 KiB
Go
package blake3
|
|
|
|
import (
|
|
"bytes"
|
|
"math/bits"
|
|
)
|
|
|
|
func compressNodeGeneric(out *[16]uint32, n node) {
|
|
g := func(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) {
|
|
a += b + mx
|
|
d = bits.RotateLeft32(d^a, -16)
|
|
c += d
|
|
b = bits.RotateLeft32(b^c, -12)
|
|
a += b + my
|
|
d = bits.RotateLeft32(d^a, -8)
|
|
c += d
|
|
b = bits.RotateLeft32(b^c, -7)
|
|
return a, b, c, d
|
|
}
|
|
|
|
// NOTE: we unroll all of the rounds, as well as the permutations that occur
|
|
// between rounds.
|
|
|
|
// round 1 (also initializes state)
|
|
// columns
|
|
s0, s4, s8, s12 := g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[0], n.block[1])
|
|
s1, s5, s9, s13 := g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[2], n.block[3])
|
|
s2, s6, s10, s14 := g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[4], n.block[5])
|
|
s3, s7, s11, s15 := g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[6], n.block[7])
|
|
// diagonals
|
|
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[8], n.block[9])
|
|
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[10], n.block[11])
|
|
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[12], n.block[13])
|
|
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[14], n.block[15])
|
|
|
|
// round 2
|
|
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[2], n.block[6])
|
|
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[3], n.block[10])
|
|
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[7], n.block[0])
|
|
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[4], n.block[13])
|
|
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[1], n.block[11])
|
|
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[12], n.block[5])
|
|
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[9], n.block[14])
|
|
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[15], n.block[8])
|
|
|
|
// round 3
|
|
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[3], n.block[4])
|
|
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[10], n.block[12])
|
|
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[13], n.block[2])
|
|
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[7], n.block[14])
|
|
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[6], n.block[5])
|
|
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[9], n.block[0])
|
|
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[11], n.block[15])
|
|
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[8], n.block[1])
|
|
|
|
// round 4
|
|
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[10], n.block[7])
|
|
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[12], n.block[9])
|
|
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[14], n.block[3])
|
|
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[13], n.block[15])
|
|
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[4], n.block[0])
|
|
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[11], n.block[2])
|
|
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[5], n.block[8])
|
|
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[1], n.block[6])
|
|
|
|
// round 5
|
|
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[12], n.block[13])
|
|
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[9], n.block[11])
|
|
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[15], n.block[10])
|
|
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[14], n.block[8])
|
|
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[7], n.block[2])
|
|
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[5], n.block[3])
|
|
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[0], n.block[1])
|
|
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[6], n.block[4])
|
|
|
|
// round 6
|
|
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[9], n.block[14])
|
|
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[11], n.block[5])
|
|
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[8], n.block[12])
|
|
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[15], n.block[1])
|
|
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[13], n.block[3])
|
|
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[0], n.block[10])
|
|
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[2], n.block[6])
|
|
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[4], n.block[7])
|
|
|
|
// round 7
|
|
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[11], n.block[15])
|
|
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[5], n.block[0])
|
|
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[1], n.block[9])
|
|
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[8], n.block[6])
|
|
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[14], n.block[10])
|
|
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[2], n.block[12])
|
|
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[3], n.block[4])
|
|
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[7], n.block[13])
|
|
|
|
// finalization
|
|
*out = [16]uint32{
|
|
s0 ^ s8, s1 ^ s9, s2 ^ s10, s3 ^ s11,
|
|
s4 ^ s12, s5 ^ s13, s6 ^ s14, s7 ^ s15,
|
|
s8 ^ n.cv[0], s9 ^ n.cv[1], s10 ^ n.cv[2], s11 ^ n.cv[3],
|
|
s12 ^ n.cv[4], s13 ^ n.cv[5], s14 ^ n.cv[6], s15 ^ n.cv[7],
|
|
}
|
|
}
|
|
|
|
func chainingValue(n node) (cv [8]uint32) {
|
|
full := compressNode(n)
|
|
copy(cv[:], full[:])
|
|
return
|
|
}
|
|
|
|
func compressBufferGeneric(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) (n node) {
|
|
if buflen <= chunkSize {
|
|
return compressChunk(buf[:buflen], key, counter, flags)
|
|
}
|
|
var cvs [maxSIMD][8]uint32
|
|
var numCVs uint64
|
|
for bb := bytes.NewBuffer(buf[:buflen]); bb.Len() > 0; numCVs++ {
|
|
cvs[numCVs] = chainingValue(compressChunk(bb.Next(chunkSize), key, counter+numCVs, flags))
|
|
}
|
|
return mergeSubtrees(&cvs, numCVs, key, flags)
|
|
}
|
|
|
|
func compressBlocksGeneric(outs *[maxSIMD][64]byte, n node) {
|
|
for i := range outs {
|
|
wordsToBytes(compressNode(n), &outs[i])
|
|
n.counter++
|
|
}
|
|
}
|
|
|
|
func mergeSubtreesGeneric(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node {
|
|
for numCVs > 2 {
|
|
rem := numCVs / 2
|
|
for i := range cvs[:rem] {
|
|
cvs[i] = chainingValue(parentNode(cvs[i*2], cvs[i*2+1], *key, flags))
|
|
}
|
|
if numCVs%2 != 0 {
|
|
cvs[rem] = cvs[rem*2]
|
|
rem++
|
|
}
|
|
numCVs = rem
|
|
}
|
|
return parentNode(cvs[0], cvs[1], *key, flags)
|
|
}
|