136 lines
4.6 KiB
Go
136 lines
4.6 KiB
Go
|
package guts
|
||
|
|
||
|
import "unsafe"
|
||
|
|
||
|
//go:generate go run avo/gen.go -out blake3_amd64.s
|
||
|
|
||
|
// compressChunksAVX512 is declared without a Go body; its implementation is
// expected to live in assembly (the go:generate line in this file emits
// blake3_amd64.s).
//
// NOTE(review): judging by its caller compressBufferAVX512, it appears to
// write into cvs[i] the chaining value of the i-th ChunkSize-byte chunk of
// buf, using key and flags and a per-chunk counter of counter+i. Confirm
// against the assembly source.
//
// The go:noescape directive tells the compiler that none of the pointer
// arguments escape through this call.
//
//go:noescape
func compressChunksAVX512(cvs *[16][8]uint32, buf *[16 * ChunkSize]byte, key *[8]uint32, counter uint64, flags uint32)
|
||
|
|
||
|
// compressChunksAVX2 is declared without a Go body; its implementation is
// expected to live in assembly (the go:generate line in this file emits
// blake3_amd64.s).
//
// NOTE(review): judging by its caller compressBufferAVX2, it appears to be
// the 8-chunk counterpart of compressChunksAVX512: cvs[i] receives the
// chaining value of the i-th chunk of buf, with a per-chunk counter of
// counter+i (the caller passes counter+8 for the second half of a 16-chunk
// buffer). Confirm against the assembly source.
//
// The go:noescape directive tells the compiler that none of the pointer
// arguments escape through this call.
//
//go:noescape
func compressChunksAVX2(cvs *[8][8]uint32, buf *[8 * ChunkSize]byte, key *[8]uint32, counter uint64, flags uint32)
|
||
|
|
||
|
// compressBlocksAVX512 is declared without a Go body; its implementation is
// expected to live in assembly (the go:generate line in this file emits
// blake3_amd64.s).
//
// NOTE(review): CompressBlocks passes the fields of a single Node (block, cv,
// counter, blockLen, flags) and a 1024-byte output array, so this presumably
// writes 16 compressed 64-byte blocks for counter values counter..counter+15.
// Confirm against the assembly source.
//
// The go:noescape directive tells the compiler that none of the pointer
// arguments escape through this call.
//
//go:noescape
func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
|
||
|
|
||
|
// compressBlocksAVX2 is declared without a Go body; its implementation is
// expected to live in assembly (the go:generate line in this file emits
// blake3_amd64.s).
//
// NOTE(review): CompressBlocks calls it twice, once per 512-byte half of the
// output and with counter advanced by 8 for the second half, so each call
// presumably writes 8 compressed 64-byte blocks for counter values
// counter..counter+7. Confirm against the assembly source.
//
// The go:noescape directive tells the compiler that none of the pointer
// arguments escape through this call.
//
//go:noescape
func compressBlocksAVX2(out *[512]byte, msgs *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
|
||
|
|
||
|
// compressParentsAVX2 is declared without a Go body; its implementation is
// expected to live in assembly (the go:generate line in this file emits
// blake3_amd64.s).
//
// NOTE(review): mergeSubtrees passes the same memory for parents and cvs and
// halves its CV count after each call, so this presumably compresses the
// adjacent pair cvs[2i], cvs[2i+1] into parents[i] and must tolerate that
// aliasing. Confirm against the assembly source.
//
// The go:noescape directive tells the compiler that none of the pointer
// arguments escape through this call.
//
//go:noescape
func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)
|
||
|
|
||
|
func compressBufferAVX512(buf *[MaxSIMD * ChunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) Node {
|
||
|
var cvs [MaxSIMD][8]uint32
|
||
|
compressChunksAVX512(&cvs, buf, key, counter, flags)
|
||
|
numChunks := uint64(buflen / ChunkSize)
|
||
|
if buflen%ChunkSize != 0 {
|
||
|
// use non-asm for remainder
|
||
|
partialChunk := buf[buflen-buflen%ChunkSize : buflen]
|
||
|
cvs[numChunks] = ChainingValue(CompressChunk(partialChunk, key, counter+numChunks, flags))
|
||
|
numChunks++
|
||
|
}
|
||
|
return mergeSubtrees(&cvs, numChunks, key, flags)
|
||
|
}
|
||
|
|
||
|
func compressBufferAVX2(buf *[MaxSIMD * ChunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) Node {
|
||
|
var cvs [MaxSIMD][8]uint32
|
||
|
cvHalves := (*[2][8][8]uint32)(unsafe.Pointer(&cvs))
|
||
|
bufHalves := (*[2][8 * ChunkSize]byte)(unsafe.Pointer(buf))
|
||
|
compressChunksAVX2(&cvHalves[0], &bufHalves[0], key, counter, flags)
|
||
|
numChunks := uint64(buflen / ChunkSize)
|
||
|
if numChunks > 8 {
|
||
|
compressChunksAVX2(&cvHalves[1], &bufHalves[1], key, counter+8, flags)
|
||
|
}
|
||
|
if buflen%ChunkSize != 0 {
|
||
|
// use non-asm for remainder
|
||
|
partialChunk := buf[buflen-buflen%ChunkSize : buflen]
|
||
|
cvs[numChunks] = ChainingValue(CompressChunk(partialChunk, key, counter+numChunks, flags))
|
||
|
numChunks++
|
||
|
}
|
||
|
return mergeSubtrees(&cvs, numChunks, key, flags)
|
||
|
}
|
||
|
|
||
|
// CompressBuffer compresses up to MaxSIMD chunks in parallel and returns their
|
||
|
// root node.
|
||
|
func CompressBuffer(buf *[MaxSIMD * ChunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) Node {
|
||
|
if buflen <= ChunkSize {
|
||
|
return CompressChunk(buf[:buflen], key, counter, flags)
|
||
|
}
|
||
|
switch {
|
||
|
case haveAVX512 && buflen >= ChunkSize*2:
|
||
|
return compressBufferAVX512(buf, buflen, key, counter, flags)
|
||
|
case haveAVX2 && buflen >= ChunkSize*2:
|
||
|
return compressBufferAVX2(buf, buflen, key, counter, flags)
|
||
|
default:
|
||
|
return compressBufferGeneric(buf, buflen, key, counter, flags)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// CompressChunk compresses a single chunk, returning its final (uncompressed)
|
||
|
// node.
|
||
|
func CompressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) Node {
|
||
|
n := Node{
|
||
|
CV: *key,
|
||
|
Counter: counter,
|
||
|
BlockLen: BlockSize,
|
||
|
Flags: flags | FlagChunkStart,
|
||
|
}
|
||
|
blockBytes := (*[64]byte)(unsafe.Pointer(&n.Block))[:]
|
||
|
for len(chunk) > BlockSize {
|
||
|
copy(blockBytes, chunk)
|
||
|
chunk = chunk[BlockSize:]
|
||
|
n.CV = ChainingValue(n)
|
||
|
n.Flags &^= FlagChunkStart
|
||
|
}
|
||
|
// pad last block with zeros
|
||
|
n.Block = [16]uint32{}
|
||
|
copy(blockBytes, chunk)
|
||
|
n.BlockLen = uint32(len(chunk))
|
||
|
n.Flags |= FlagChunkEnd
|
||
|
return n
|
||
|
}
|
||
|
|
||
|
// CompressBlocks compresses MaxSIMD copies of n with successive counter values,
|
||
|
// storing the results in out.
|
||
|
func CompressBlocks(out *[MaxSIMD * BlockSize]byte, n Node) {
|
||
|
switch {
|
||
|
case haveAVX512:
|
||
|
compressBlocksAVX512(out, &n.Block, &n.CV, n.Counter, n.BlockLen, n.Flags)
|
||
|
case haveAVX2:
|
||
|
outs := (*[2][512]byte)(unsafe.Pointer(out))
|
||
|
compressBlocksAVX2(&outs[0], &n.Block, &n.CV, n.Counter, n.BlockLen, n.Flags)
|
||
|
compressBlocksAVX2(&outs[1], &n.Block, &n.CV, n.Counter+8, n.BlockLen, n.Flags)
|
||
|
default:
|
||
|
outs := (*[MaxSIMD][64]byte)(unsafe.Pointer(out))
|
||
|
compressBlocksGeneric(outs, n)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func mergeSubtrees(cvs *[MaxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) Node {
|
||
|
if !haveAVX2 {
|
||
|
return mergeSubtreesGeneric(cvs, numCVs, key, flags)
|
||
|
}
|
||
|
for numCVs > 2 {
|
||
|
if numCVs%2 == 0 {
|
||
|
compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags)
|
||
|
} else {
|
||
|
keep := cvs[numCVs-1]
|
||
|
compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags)
|
||
|
cvs[numCVs/2] = keep
|
||
|
numCVs++
|
||
|
}
|
||
|
numCVs /= 2
|
||
|
}
|
||
|
return ParentNode(cvs[0], cvs[1], key, flags)
|
||
|
}
|
||
|
|
||
|
// BytesToWords converts an array of 64 bytes to an array of 16 words.
//
// The bytes are reinterpreted in place rather than decoded, so each word uses
// the machine's native byte order.
func BytesToWords(bytes [64]byte) [16]uint32 {
	words := (*[16]uint32)(unsafe.Pointer(&bytes))
	return *words
}
|
||
|
|
||
|
// WordsToBytes converts an array of 16 words to an array of 64 bytes.
//
// The words are reinterpreted in place rather than encoded, so each word's
// bytes appear in the machine's native byte order.
func WordsToBytes(words [16]uint32) [64]byte {
	raw := (*[64]byte)(unsafe.Pointer(&words))
	return *raw
}
|